diff --git a/.gitattributes b/.gitattributes
index 54a153654417ddaea006800ea5ae57912e55de2d..e4c1d11d06f6212941481ca70eb5d4e9442acb35 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -809,3 +809,251 @@ illustrious_generated/3e2afaad2b7d.png filter=lfs diff=lfs merge=lfs -text
 illustrious_generated/04d6bfa98264.png filter=lfs diff=lfs merge=lfs -text
 illustrious_generated/62a8fa0ac7dd.png filter=lfs diff=lfs merge=lfs -text
 illustrious_generated/d190d03f64a7.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/f6342e8db68a.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/f7ca451e1933.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/6b3c44df8332.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/ed13e74032fb.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/faa1e7049117.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/c17212cc7fda.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/6c268f463a2b.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/a364591ba4c1.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/2ea3ba7918b4.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/2ffb09f5cbc0.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/0d55065059c0.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/85e9723ae8cf.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/e89ab638d462.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/224c2084abb8.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/0b77d88bc5f0.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/91076903bce5.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/7acda55248bc.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/ee32c9618a12.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/698a4bf05f13.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/bf97f1eaffeb.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/62daa562132c.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/9ee7e057c8a2.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/427d956c743b.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/06da7f820423.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/92bcab0aaba1.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/502a84449b45.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/d99abaed93ba.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/3a12bf82c05e.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/433a115b55a3.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/574012fe8664.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/7d22dc2a6fb2.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/4f23c350b644.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/e24085ea542f.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/3cc7f3366f7a.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/5242430c6777.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/6fe5f96649a3.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/12875eda15eb.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/eac29190186c.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/c1276a9fc21b.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/a891e5d92031.png filter=lfs diff=lfs merge=lfs -text
+illustrious_generated/0367ba694b76.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f84f116882be.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b8e81c1a4bd1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/392a7a129a01.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1506e01a5598.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cbd5827b38ea.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b80b59fe722f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a2ca03055273.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b58cf17494db.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4c587778617b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7c5200560049.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b78d0c1f0687.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5c6f22f08540.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9b2b12c21a2b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ec96a311c2cb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a28e4715fc8c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/00f5e16a2236.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0ef8c1ed2c6c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f214facc5681.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f41b4fc2c7d5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9e9a0ce3d676.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/26d2ef2d7d03.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1e774fcc188d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7eab3f4f0c8e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f8631de95d70.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8d95e57fcb27.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7ac791baad53.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7b8529c066a0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7d8509931e4e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9fafd1175b72.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7023242de1c0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/99d5b088ccd4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2bac6ab4413e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/00ff6449b55d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7b900f6e27b1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/69e10254baf5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/93d9e9abc98e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/095dc81d1160.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3315198d28df.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2549abad7eff.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8a90db3476ef.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/72473c769552.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bbf3fb096202.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c5e0eb8a2241.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8fa96985fc06.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/645e3b996530.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b9fdc64b985c.png filter=lfs 
diff=lfs merge=lfs -text +illustrious_generated/fa67e15ca2bf.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9f5c49f2e362.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e8318516b273.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e801a5ce2da6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cd9145683d1e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/275253c8ad6b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f2a6e0c5c432.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/586dbda7c6ff.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dff506d177c0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c8846919f3a8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/afbdb8dce1e5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fd4c46f2141f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ee36cea22c91.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6ca60a86b836.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/11c7f55b2aab.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d684bc0d0627.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4f1602c01d5b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/45c709323899.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d7bc7c5ba632.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0e0acc59ef85.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1c7a7ed6f359.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/31cbd66704bb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dd8a48931525.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7368d4c82b5f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c7e1a60c0f5d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/be56d67f1e08.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/269ee6e9a79c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2bb0e99b92bc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/afd28993674d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/585afc2017e2.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f9c5bdc8bef5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8f338d47820a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e0443895d658.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/67ea9c16fed3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/78dfdb4f0521.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fff7c0390e8a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c63799030196.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fc061ac787c7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/26185801988b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/656abae8d0b6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5c4a2ea8f842.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2286bf835a6b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dc7501a6f47f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/38b5363061d5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/451e48977b1a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f7621703575c.png filter=lfs diff=lfs merge=lfs -text 
+illustrious_generated/891dc839571c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d1e30fd687b5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d1413371999b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0ad3307ea09c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6fba429dafc5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/481f3834876a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1e54c0c78134.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a564e408f362.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ec6650b62802.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9f447e4cf3d7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/790ece21df10.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/75e576f27cb6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/205b715d279f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/060e926dcc0a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/733c86338921.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b9f37572031b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/43eeb1fb403b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d22ef7243fac.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/162e3face5a7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/765bf9d23c7e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/47418c15a58f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3030bee9df5a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e4acb93d313c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/08e454ab01c2.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3f43e650c7d7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/085929212457.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/91d346543b7c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/891abd7c9fa3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1927adcb399a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7e49e6b5a30b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2cd36314054f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b569d3590c66.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9e8dc59217e8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c2c3bea0e9d5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/05972b153525.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c9bf921e364a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/13cdedc9c525.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d8641bfcdd46.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/34afbd2725c8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f0d97f98333f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/76b2de1037cb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a370eb471cd7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f5ab32c63fb8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5718f8172842.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b7f508ecce88.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5f147d77f3ed.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ac9d950baac7.png filter=lfs 
diff=lfs merge=lfs -text +illustrious_generated/8b674edb3a4e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8ad0a744de62.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5b8f74bcc260.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/78026f131004.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d305fe437c6f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7cce990ade4c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c76729f0f827.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0706f94ebdc3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/22af9def0424.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/43877698ad33.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5a0201bebc6d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7ad096e9b528.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/46edb49b5dbf.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bd65b176bfe6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/073f299a3b06.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fc885c9be9af.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bcfc32b88c98.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e55e6cf94025.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b4a9600f3647.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d7ef34bf47ee.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8cbc6e1dbe62.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8633a3dff7ea.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cb335826ba02.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3048ba382498.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/eca43ddadd85.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/365e7d0f97c2.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e71b25950c5d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/59a595c825c8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/82ee8177ef04.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/36915299353b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ca07713b354c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fbebd175667e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dacfbbcd3fb3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8a371dac467c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/40c498965cbd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/190beb9306ef.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bb2041beb345.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6f1c05af41ca.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9f741bd68919.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9bb815cccb98.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/41d42d8f4842.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/13166cbea867.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e2812aff73e9.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/954594f7f0a6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c4b5bff2dbc1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/980b174e831c.png filter=lfs diff=lfs merge=lfs -text 
+illustrious_generated/ed89a47fd589.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a8e5c9011eef.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f1de13ffcad6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/574fba2c6515.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/591e156ad5fd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/aef907db00ce.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3967f8d787ab.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a1ec0d3b0b0e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9da135f5f21e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8fd9fbffb954.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/24e5b9fe7d38.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/495f1b55919f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/19ff2ce2a961.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e39fecdd2676.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7663094bacec.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6d5feb7de870.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/abe90752beb0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cae43d7fd0f8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3f5c59c8ee7b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/49712a2e71f1.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6346f39915f3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4c6ea9681419.png filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2899647a6e82fbf639b7d7bc14d07b03c0384be1 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1f7eb47d3bfa78f0838a340db3dda9e38e560b5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efb7dc12217ffeb49f6cf64ea9341ff1837b2dba Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcc8d6a70c088e238daaeea958534a147ba900cc Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc 
b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7564852d126318056eedd2950936883061927698 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd26baf8ce93b33c66c0a3df4c31242ace7d0575 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a04c10ccfc8c22e8d479ef52468f7fd27b897ab0 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6697fe51d614b5f5bcf0619a5b9f1731f12b002 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6987f964bfdb3450551bddfd37211382a06c1ef6 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a349c77d717de0f69c478b62b328f5f0eba2847 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f12e92b4553516c241336205c5fc505df171eafc Binary files /dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53edb8ac07b7843a354bf656fbf2b0e0a6df7176 Binary files 
/dev/null and b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc differ
diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a5327436f9b1d9eae371e321c491a270634b3cf
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py
@@ -0,0 +1,22 @@
+import os
+import pathlib
+import sys
+import types
+
+
+def wrap(path):  # pragma: no cover
+    """
+    Workaround for https://github.com/python/cpython/issues/84538
+    to add backward compatibility for walk_up=True.
+    An example affected package is dask-labextension, which uses
+    jupyter-packaging to install JupyterLab javascript files outside
+    of site-packages.
+    """
+
+    def relative_to(root, *, walk_up=False):
+        return pathlib.Path(os.path.relpath(path, root))
+
+    return types.SimpleNamespace(relative_to=relative_to)
+
+
+relative_fix = wrap if sys.version_info < (3, 12) else lambda x: x
diff --git a/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eb9c01ecbbdcdf7b79d8840ee91c2fe7a734a1c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py
@@ -0,0 +1,42 @@
+"""
+Compatibility layer with Python 3.8/3.9
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:  # pragma: no cover
+    # Prevent circular imports on runtime.
+    from .. import Distribution, EntryPoint
+else:
+    Distribution = EntryPoint = Any
+
+from .._typing import md_none
+
+
+def normalized_name(dist: Distribution) -> str | None:
+    """
+    Honor name normalization for distributions that don't provide ``_normalized_name``.
+    """
+    try:
+        return dist._normalized_name
+    except AttributeError:
+        from .. import Prepared  # -> delay to prevent circular imports.
+
+        return Prepared.normalize(
+            getattr(dist, "name", None) or md_none(dist.metadata)['Name']
+        )
+
+
+def ep_matches(ep: EntryPoint, **params) -> bool:
+    """
+    Workaround for ``EntryPoint`` objects without the ``matches`` method.
+    """
+    try:
+        return ep.matches(**params)
+    except AttributeError:
+        from .. import EntryPoint  # -> delay to prevent circular imports.
+
+        # Reconstruct the EntryPoint object to make sure it is compatible.
+        return EntryPoint(ep.name, ep.value, ep.group).matches(**params)
diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e08847c95f1294bc99e96e737a53cc6ebb7a458
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn : Neural Networks Library */ + +#if !defined(CUDNN_H_) +#define CUDNN_H_ +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "cudnn_version.h" +#include "cudnn_graph.h" +#include "cudnn_ops.h" +#include "cudnn_adv.h" +#include "cudnn_cnn.h" + +#if defined(__cplusplus) +} +#endif +#endif /* CUDNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h new file mode 100644 index 0000000000000000000000000000000000000000..5d9bef65d5323dd3354299569d869191a07615cf --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h @@ -0,0 +1,669 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn_adv : cuDNN's advanced and experimental features. + +*/ + +#if !defined(CUDNN_ADV_H_) +#define CUDNN_ADV_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_ADV_MAJOR 9 +#define CUDNN_ADV_MINOR 10 +#define CUDNN_ADV_PATCH 2 + +#if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN ADV INFER!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* BASIC RNN API */ + +typedef enum { + CUDNN_RNN_ALGO_STANDARD = 0, + CUDNN_RNN_ALGO_PERSIST_STATIC = 1, + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2, + CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3, + CUDNN_RNN_ALGO_COUNT = 4, +} cudnnRNNAlgo_t; + +typedef enum { + CUDNN_FWD_MODE_INFERENCE = 0, + CUDNN_FWD_MODE_TRAINING = 1, +} cudnnForwardMode_t; + +typedef enum { + CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */ + CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */ + CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */ + CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ +} cudnnRNNMode_t; + +typedef enum { + CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */ + CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */ + CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */ + CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */ +} cudnnRNNBiasMode_t; + +typedef enum { + CUDNN_UNIDIRECTIONAL = 0, /* single direction network */ + CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */ +} cudnnDirectionMode_t; + +typedef enum { + CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */ + CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */ +} cudnnRNNInputMode_t; + +typedef enum { + CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */ + CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */ +} cudnnRNNClipMode_t; + +typedef enum { + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */ + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */ + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */ +} cudnnRNNDataLayout_t; + +/* For auxFlags in cudnnSetRNNDescriptor_v8() */ +#define CUDNN_RNN_PADDED_IO_DISABLED 0 +#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0) + +struct cudnnRNNStruct; +typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t; + +struct cudnnRNNDataStruct; +typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); + +/* + * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision. + * Compute precision is further modified by mathType that sets the + * preferred option for using NVIDIA Tensor Cores. dataType specify + * input/output data type and weight/bias type. 
+ */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t algo, + cudnnRNNMode_t cellMode, + cudnnRNNBiasMode_t biasMode, + cudnnDirectionMode_t dirMode, + cudnnRNNInputMode_t inputMode, + cudnnDataType_t dataType, + cudnnDataType_t mathPrec, + cudnnMathType_t mathType, + int32_t inputSize, + int32_t hiddenSize, + int32_t projSize, + int32_t numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + uint32_t auxFlags); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t *algo, + cudnnRNNMode_t *cellMode, + cudnnRNNBiasMode_t *biasMode, + cudnnDirectionMode_t *dirMode, + cudnnRNNInputMode_t *inputMode, + cudnnDataType_t *dataType, + cudnnDataType_t *mathPrec, + cudnnMathType_t *mathType, + int32_t *inputSize, + int32_t *hiddenSize, + int32_t *projSize, + int32_t *numLayers, + cudnnDropoutDescriptor_t *dropoutDesc, + uint32_t *auxFlags); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, + double rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, + double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + cudnnRNNDataDescriptor_t xDesc, + size_t *workSpaceSize, + size_t *reserveSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightParams(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + int32_t pseudoLayer, + size_t weightSpaceSize, + const void *weightSpace, + int32_t linLayerID, + cudnnTensorDescriptor_t mDesc, + void **mAddr, + cudnnTensorDescriptor_t bDesc, + void **bAddr); + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, + int maxSeqLength, + int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill); /* symbol for filling padding position in output */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, + int *maxSeqLength, + int *batchSize, + int *vectorSize, + int arrayLengthRequested, + int seqLengthArray[], + void *paddingFill); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForward(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnRNNDataDescriptor_t yDesc, + void *y, + cudnnTensorDescriptor_t hDesc, + const void *hx, + 
void *hy, + cudnnTensorDescriptor_t cDesc, + const void *cx, + void *cy, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +/* Sequence data descriptor */ + +typedef enum { + CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */ + CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */ + CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */ + CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */ +} cudnnSeqDataAxis_t; + +struct cudnnSeqDataStruct; +typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED; + +#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const cudnnSeqDataAxis_t axes[], + size_t seqLengthArraySize, + const int seqLengthArray[], + void *paddingFill); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t *dataType, + int *nbDims, + int nbDimsRequested, + int dimA[], + cudnnSeqDataAxis_t axes[], + size_t *seqLengthArraySize, + size_t seqLengthSizeRequested, + int seqLengthArray[], + void *paddingFill); + +/* Multihead Attention */ + +/* + * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor(). + * Use the bitwise OR operator to combine several settings listed below. Additional + * minor options can be added here w/o changing or introducing new API functions. 
+ */ +#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */ +#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */ +#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */ +#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */ + +struct cudnnAttnStruct; +typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned attnMode, + int nHeads, + double smScaler, + cudnnDataType_t dataType, + cudnnDataType_t computePrec, + cudnnMathType_t mathType, + cudnnDropoutDescriptor_t attnDropoutDesc, + cudnnDropoutDescriptor_t postDropoutDesc, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoMaxSeqLength, + int kvMaxSeqLength, + int maxBatchSize, + int maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned *attnMode, + int *nHeads, + double *smScaler, + cudnnDataType_t *dataType, + cudnnDataType_t *computePrec, + cudnnMathType_t *mathType, + cudnnDropoutDescriptor_t *attnDropoutDesc, + cudnnDropoutDescriptor_t *postDropoutDesc, + int *qSize, + int *kSize, + int *vSize, + int *qProjSize, + int *kProjSize, + int *vProjSize, + int *oProjSize, + int *qoMaxSeqLength, + int *kvMaxSeqLength, + int *maxBatchSize, + int *maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + size_t *weightSizeInBytes, + size_t *workSpaceSizeInBytes, + size_t *reserveSpaceSizeInBytes); + +typedef enum { + CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */ + CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */ + CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */ + CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */ + CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */ + CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */ + CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */ + CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */ +} cudnnMultiHeadAttnWeightKind_t; + +#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnMultiHeadAttnWeightKind_t wKind, + size_t weightSizeInBytes, + const void *weights, + cudnnTensorDescriptor_t wDesc, + void **wAddr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnForward(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + int currIdx, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsQO[], + const int devSeqLengthsKV[], + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const void *residuals, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t oDesc, + void *out, + 
size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnAdvVersionCheck(void); + +typedef enum { + CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */ + CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */ +} cudnnWgradMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t yDesc, + const void *y, + const void *dy, + cudnnRNNDataDescriptor_t xDesc, + void *dx, + cudnnTensorDescriptor_t hDesc, + const void *hx, + const void *dhy, + void *dhx, + cudnnTensorDescriptor_t cDesc, + const void *cx, + const void *dcy, + void *dcx, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeights_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnWgradMode_t addGrad, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnTensorDescriptor_t hDesc, + const void *hx, + cudnnRNNDataDescriptor_t yDesc, + const void *y, + size_t weightSpaceSize, + void *dweightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsDQDO[], + const int devSeqLengthsDKDV[], + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + const cudnnSeqDataDescriptor_t dqDesc, + void *dqueries, + const void *queries, + const cudnnSeqDataDescriptor_t dkDesc, + void *dkeys, + const void *keys, + const cudnnSeqDataDescriptor_t dvDesc, + void *dvalues, + const void *values, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnWgradMode_t addGrad, + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + size_t weightSizeInBytes, + const void *weights, + void *dweights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions + */ +/* Input normalization mode for loss function */ +typedef enum { + CUDNN_LOSS_NORMALIZATION_NONE = 0, + CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1, +} cudnnLossNormalizationMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode, + int maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnCTCGradMode_t ctcGradMode, + int maxLabelLength); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnCTCGradMode_t *ctcGradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc); + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int hostLabels[], /* labels, in CPU memory */ + const int hostLabelLengths[], /* the length of each label, in CPU memory */ + const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes); /* size of the workspace */ + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int labels[], /* labels, in GPU memory */ + const int labelLengths[], /* the length of each label, in GPU memory */ + const int inputLengths[], /* the lengths of timing steps 
in each batch, in GPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + size_t workSpaceSizeInBytes, /* size of the workspace */ + void *workspace); /* pointer to the workspace, in GPU memory */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_ADV_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..5d9bef65d5323dd3354299569d869191a07615cf --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h @@ -0,0 +1,669 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* cudnn_adv : cuDNN's advanced and experimental features. + +*/ + +#if !defined(CUDNN_ADV_H_) +#define CUDNN_ADV_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_ADV_MAJOR 9 +#define CUDNN_ADV_MINOR 10 +#define CUDNN_ADV_PATCH 2 + +#if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN ADV INFER!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* BASIC RNN API */ + +typedef enum { + CUDNN_RNN_ALGO_STANDARD = 0, + CUDNN_RNN_ALGO_PERSIST_STATIC = 1, + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2, + CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3, + CUDNN_RNN_ALGO_COUNT = 4, +} cudnnRNNAlgo_t; + +typedef enum { + CUDNN_FWD_MODE_INFERENCE = 0, + CUDNN_FWD_MODE_TRAINING = 1, +} cudnnForwardMode_t; + +typedef enum { + CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */ + CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */ + CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */ + CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ +} cudnnRNNMode_t; + +typedef enum { + CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */ + CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */ + CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */ + CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */ +} cudnnRNNBiasMode_t; + +typedef enum { + CUDNN_UNIDIRECTIONAL = 0, /* single direction network */ + CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */ +} cudnnDirectionMode_t; + +typedef enum { + CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */ + CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */ +} cudnnRNNInputMode_t; + +typedef enum { + CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */ + CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */ +} cudnnRNNClipMode_t; + +typedef enum { + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */ + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */ + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */ +} cudnnRNNDataLayout_t; + +/* For auxFlags in cudnnSetRNNDescriptor_v8() */ +#define CUDNN_RNN_PADDED_IO_DISABLED 0 +#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0) + +struct cudnnRNNStruct; +typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t; + +struct cudnnRNNDataStruct; +typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); + +/* + * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision. + * Compute precision is further modified by mathType that sets the + * preferred option for using NVIDIA Tensor Cores. dataType specify + * input/output data type and weight/bias type. 
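+ *
+ * A minimal illustrative call (a sketch only; the sizes below are
+ * placeholders, not recommended values, and rnnDesc / dropoutDesc are
+ * assumed to have been created earlier):
+ *
+ *   cudnnSetRNNDescriptor_v8(rnnDesc,
+ *                            CUDNN_RNN_ALGO_STANDARD, CUDNN_LSTM,
+ *                            CUDNN_RNN_DOUBLE_BIAS, CUDNN_UNIDIRECTIONAL,
+ *                            CUDNN_LINEAR_INPUT,
+ *                            CUDNN_DATA_FLOAT,    // dataType
+ *                            CUDNN_DATA_FLOAT,    // mathPrec
+ *                            CUDNN_DEFAULT_MATH,  // mathType
+ *                            512,                 // inputSize
+ *                            1024,                // hiddenSize
+ *                            1024,                // projSize (== hiddenSize disables projection)
+ *                            2,                   // numLayers
+ *                            dropoutDesc,
+ *                            CUDNN_RNN_PADDED_IO_ENABLED);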
+ */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t algo, + cudnnRNNMode_t cellMode, + cudnnRNNBiasMode_t biasMode, + cudnnDirectionMode_t dirMode, + cudnnRNNInputMode_t inputMode, + cudnnDataType_t dataType, + cudnnDataType_t mathPrec, + cudnnMathType_t mathType, + int32_t inputSize, + int32_t hiddenSize, + int32_t projSize, + int32_t numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + uint32_t auxFlags); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t *algo, + cudnnRNNMode_t *cellMode, + cudnnRNNBiasMode_t *biasMode, + cudnnDirectionMode_t *dirMode, + cudnnRNNInputMode_t *inputMode, + cudnnDataType_t *dataType, + cudnnDataType_t *mathPrec, + cudnnMathType_t *mathType, + int32_t *inputSize, + int32_t *hiddenSize, + int32_t *projSize, + int32_t *numLayers, + cudnnDropoutDescriptor_t *dropoutDesc, + uint32_t *auxFlags); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, + double rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, + double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip); + +cudnnStatus_t CUDNNWINAPI +cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + cudnnRNNDataDescriptor_t xDesc, + size_t *workSpaceSize, + size_t *reserveSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize); + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWeightParams(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + int32_t pseudoLayer, + size_t weightSpaceSize, + const void *weightSpace, + int32_t linLayerID, + cudnnTensorDescriptor_t mDesc, + void **mAddr, + cudnnTensorDescriptor_t bDesc, + void **bAddr); + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, + int maxSeqLength, + int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill); /* symbol for filling padding position in output */ + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, + int *maxSeqLength, + int *batchSize, + int *vectorSize, + int arrayLengthRequested, + int seqLengthArray[], + void *paddingFill); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForward(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnForwardMode_t fwdMode, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnRNNDataDescriptor_t yDesc, + void *y, + cudnnTensorDescriptor_t hDesc, + const void *hx, + 
void *hy, + cudnnTensorDescriptor_t cDesc, + const void *cx, + void *cy, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +/* Sequence data descriptor */ + +typedef enum { + CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */ + CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */ + CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */ + CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */ +} cudnnSeqDataAxis_t; + +struct cudnnSeqDataStruct; +typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED; + +#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const cudnnSeqDataAxis_t axes[], + size_t seqLengthArraySize, + const int seqLengthArray[], + void *paddingFill); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t *dataType, + int *nbDims, + int nbDimsRequested, + int dimA[], + cudnnSeqDataAxis_t axes[], + size_t *seqLengthArraySize, + size_t seqLengthSizeRequested, + int seqLengthArray[], + void *paddingFill); + +/* Multihead Attention */ + +/* + * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor(). + * Use the bitwise OR operator to combine several settings listed below. Additional + * minor options can be added here w/o changing or introducing new API functions. 
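+ *
+ * For example (illustrative only), the combination below gives each query
+ * beam its own (K,V) set and enables projection biases; the resulting value
+ * is passed as the 'attnMode' argument of cudnnSetAttnDescriptor():
+ *
+ *   unsigned attnMode = CUDNN_ATTN_QUERYMAP_ONE_TO_ONE | CUDNN_ATTN_ENABLE_PROJ_BIASES;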
+ */ +#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */ +#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */ +#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */ +#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */ + +struct cudnnAttnStruct; +typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned attnMode, + int nHeads, + double smScaler, + cudnnDataType_t dataType, + cudnnDataType_t computePrec, + cudnnMathType_t mathType, + cudnnDropoutDescriptor_t attnDropoutDesc, + cudnnDropoutDescriptor_t postDropoutDesc, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoMaxSeqLength, + int kvMaxSeqLength, + int maxBatchSize, + int maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned *attnMode, + int *nHeads, + double *smScaler, + cudnnDataType_t *dataType, + cudnnDataType_t *computePrec, + cudnnMathType_t *mathType, + cudnnDropoutDescriptor_t *attnDropoutDesc, + cudnnDropoutDescriptor_t *postDropoutDesc, + int *qSize, + int *kSize, + int *vSize, + int *qProjSize, + int *kProjSize, + int *vProjSize, + int *oProjSize, + int *qoMaxSeqLength, + int *kvMaxSeqLength, + int *maxBatchSize, + int *maxBeamSize); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + size_t *weightSizeInBytes, + size_t *workSpaceSizeInBytes, + size_t *reserveSpaceSizeInBytes); + +typedef enum { + CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */ + CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */ + CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */ + CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */ + CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */ + CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */ + CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */ + CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */ +} cudnnMultiHeadAttnWeightKind_t; + +#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnMultiHeadAttnWeightKind_t wKind, + size_t weightSizeInBytes, + const void *weights, + cudnnTensorDescriptor_t wDesc, + void **wAddr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnForward(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + int currIdx, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsQO[], + const int devSeqLengthsKV[], + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const void *residuals, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t oDesc, + void *out, + 
size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnAdvVersionCheck(void); + +typedef enum { + CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */ + CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */ +} cudnnWgradMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t yDesc, + const void *y, + const void *dy, + cudnnRNNDataDescriptor_t xDesc, + void *dx, + cudnnTensorDescriptor_t hDesc, + const void *hx, + const void *dhy, + void *dhx, + cudnnTensorDescriptor_t cDesc, + const void *cx, + const void *dcy, + void *dcx, + size_t weightSpaceSize, + const void *weightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeights_v8(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnWgradMode_t addGrad, + const int32_t devSeqLengths[], + cudnnRNNDataDescriptor_t xDesc, + const void *x, + cudnnTensorDescriptor_t hDesc, + const void *hx, + cudnnRNNDataDescriptor_t yDesc, + const void *y, + size_t weightSpaceSize, + void *dweightSpace, + size_t workSpaceSize, + void *workSpace, + size_t reserveSpaceSize, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsDQDO[], + const int devSeqLengthsDKDV[], + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + const cudnnSeqDataDescriptor_t dqDesc, + void *dqueries, + const void *queries, + const cudnnSeqDataDescriptor_t dkDesc, + void *dkeys, + const void *keys, + const cudnnSeqDataDescriptor_t dvDesc, + void *dvalues, + const void *values, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnWgradMode_t addGrad, + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + size_t weightSizeInBytes, + const void *weights, + void *dweights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace); + +/* + * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions + */ +/* Input normalization mode for loss function */ +typedef enum { + CUDNN_LOSS_NORMALIZATION_NONE = 0, + CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1, +} cudnnLossNormalizationMode_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode, + int maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnCTCGradMode_t ctcGradMode, + int maxLabelLength); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnCTCGradMode_t *ctcGradMode, + int *maxLabelLength); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc); + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int hostLabels[], /* labels, in CPU memory */ + const int hostLabelLengths[], /* the length of each label, in CPU memory */ + const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes); /* size of the workspace */ + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int labels[], /* labels, in GPU memory */ + const int labelLengths[], /* the length of each label, in GPU memory */ + const int inputLengths[], /* the lengths of timing steps 
in each batch, in GPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + size_t workSpaceSizeInBytes, /* size of the workspace */ + void *workspace); /* pointer to the workspace, in GPU memory */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + size_t *sizeInBytes); /* pointer to the returned workspace size */ + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_ADV_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h new file mode 100644 index 0000000000000000000000000000000000000000..5a378e2087f7a45c423f65d213d98c4fa20f3a52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h @@ -0,0 +1,60 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDNN_BACKEND_H_ +#define _CUDNN_BACKEND_H_ + +/* + * The content of this header has been moved into cudnn_graph.h. + * This header is kept for the backward compatibility purpose. + */ + +#include "cudnn_graph.h" + +#endif /* _CUDNN_BACKEND_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..5a378e2087f7a45c423f65d213d98c4fa20f3a52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h @@ -0,0 +1,60 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDNN_BACKEND_H_ +#define _CUDNN_BACKEND_H_ + +/* + * The content of this header has been moved into cudnn_graph.h. + * This header is kept for the backward compatibility purpose. + */ + +#include "cudnn_graph.h" + +#endif /* _CUDNN_BACKEND_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h new file mode 100644 index 0000000000000000000000000000000000000000..e988a8a033df31e35a37aeba12b9a7cdc1d7ed60 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h @@ -0,0 +1,693 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_cnn : cuDNN's basic definitions and CNN functions. + */ + +#if !defined(CUDNN_CNN_H_) +#define CUDNN_CNN_H_ + +#pragma once +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_CNN_MAJOR 9 +#define CUDNN_CNN_MINOR 10 +#define CUDNN_CNN_PATCH 2 + +#if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN CNN INFER!!! 
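+/*
+ * Illustrative sketch of the legacy convolution forward workflow declared in
+ * this header (all of these entry points are marked CUDNN_DEPRECATED in
+ * cuDNN 9). handle, xDesc/x, wDesc/w, yDesc/y, workSpace, alpha and beta are
+ * assumed to exist; the mode and algorithm enumerators come from the core
+ * cuDNN headers, and the chosen values are placeholders, not recommendations:
+ *
+ *   cudnnConvolutionDescriptor_t convDesc;
+ *   cudnnCreateConvolutionDescriptor(&convDesc);
+ *   cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1,
+ *                                   CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
+ *   size_t wsSize = 0;
+ *   cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, wDesc, convDesc, yDesc,
+ *                                           CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, &wsSize);
+ *   cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc,
+ *                           CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
+ *                           workSpace, wsSize, &beta, yDesc, y);
+ *   cudnnDestroyConvolutionDescriptor(convDesc);
+ */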
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED; + +typedef struct cudnnConvolutionFwdAlgoPerfStruct { + cudnnConvolutionFwdAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED; + +/* Create an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc); + +/* Destroy an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); /* convolution data type */ + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); /* convolution data type */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, 
+ int *h, + int *w); + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[]); + +/* helper function to provide the convolution forward algo that fit best the requirement */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReorderFilterAndBias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData); + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes); + +/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform the forward pass for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionForward(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* helper function to provide the convolution backward data algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdDataAlgoPerfStruct { + cudnnConvolutionBwdDataAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardData(cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Helper function to calculate folding descriptors for dgrad */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, + const cudnnFilterDescriptor_t 
filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc); + +/* cudnnFusedOps... */ +struct cudnnFusedOpsConstParamStruct; +typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsVariantParamStruct; +typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsPlanStruct; +typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED; + +typedef enum { + /* each op in [ ] can be disabled by passing NULL ptr */ + /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0, + /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1, + /* utility for BN training in BN-conv fusion */ + /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */ + /* optionally update running stats and generate saved stats */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2, + /* utility for BN inference in BN-conv fusion */ + /* computes the equivalent scale and bias from learned running stats and learned scale, bias */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3, + /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */ + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4, + /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */ + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5, + /* reserved for future use */ + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6, +} cudnnFusedOps_t CUDNN_DEPRECATED; + +typedef enum { + /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get XDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_XDESC = 0, + /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_XDATA_PLACEHOLDER = 1, + /* set/get BN_MODE: pass cudnnBatchNormMode_t* */ + CUDNN_PARAM_BN_MODE = 2, + /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3, + /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4, + /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5, + /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */ + /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */ + CUDNN_PARAM_ACTIVATION_DESC = 6, + /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */ + /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */ + CUDNN_PARAM_CONV_DESC = 7, + /* set WDESC: pass previously initialized 
cudnnFilterDescriptor_t */ + /* get WDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_WDESC = 8, + /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_WDATA_PLACEHOLDER = 9, + /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get DWDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_DWDESC = 10, + /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11, + /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YDESC = 12, + /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YDATA_PLACEHOLDER = 13, + /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DYDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DYDESC = 14, + /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15, + /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YSTATS_DESC = 16, + /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSUM_PLACEHOLDER = 17, + /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18, + /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19, + /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20, + /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21, + /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22, + /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23, + /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24, + /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25, + + /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ZDESC = 26, + /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27, + /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28, + /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29, + /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30, + + /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31, + /* set/get 
ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32, + + /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DXDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DXDESC = 33, + /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34, + /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DZDESC = 35, + /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36, + /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37, + /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38, +} cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_PTR_NULL = 0, + CUDNN_PTR_ELEM_ALIGNED = 1, + CUDNN_PTR_16B_ALIGNED = 2, +} cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED; + +typedef enum { + /* set: pass void* pointing to dev memory */ + /* get: pass void** pointing to host memory */ + CUDNN_PTR_XDATA = 0, + CUDNN_PTR_BN_EQSCALE = 1, + CUDNN_PTR_BN_EQBIAS = 2, + CUDNN_PTR_WDATA = 3, + CUDNN_PTR_DWDATA = 4, + CUDNN_PTR_YDATA = 5, + CUDNN_PTR_DYDATA = 6, + CUDNN_PTR_YSUM = 7, + CUDNN_PTR_YSQSUM = 8, + CUDNN_PTR_WORKSPACE = 9, + CUDNN_PTR_BN_SCALE = 10, + CUDNN_PTR_BN_BIAS = 11, + CUDNN_PTR_BN_SAVED_MEAN = 12, + CUDNN_PTR_BN_SAVED_INVSTD = 13, + CUDNN_PTR_BN_RUNNING_MEAN = 14, + CUDNN_PTR_BN_RUNNING_VAR = 15, + CUDNN_PTR_ZDATA = 16, + CUDNN_PTR_BN_Z_EQSCALE = 17, + CUDNN_PTR_BN_Z_EQBIAS = 18, + CUDNN_PTR_ACTIVATION_BITMASK = 19, + CUDNN_PTR_DXDATA = 20, + CUDNN_PTR_DZDATA = 21, + CUDNN_PTR_BN_DSCALE = 22, + CUDNN_PTR_BN_DBIAS = 23, + + /* set/get: pass size_t* pointing to host memory */ + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100, + /* set/get: pass int64_t* pointing to host memory */ + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103, +} cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED; + +cudnnStatus_t CUDNNWINAPI +cudnnCnnVersionCheck(void); + +/* helper function to provide the convolution backward filter algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct { + cudnnConvolutionBwdFilterAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const 
cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardFilter(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc, + void *dw); + +/* Function to compute the bias gradient for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardBias(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dbDesc, + void *db); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + const void *param); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + void *param, + int *isNULL); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMakeFusedOpsPlan(cudnnHandle_t handle, + cudnnFusedOpsPlan_t plan, + const cudnnFusedOpsConstParamPack_t constPack, + size_t *workspaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_CNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..e988a8a033df31e35a37aeba12b9a7cdc1d7ed60 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h @@ -0,0 +1,693 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_cnn : cuDNN's basic definitions and CNN functions. 
+ */ + +#if !defined(CUDNN_CNN_H_) +#define CUDNN_CNN_H_ + +#pragma once +#include + +#include "cudnn_version.h" +#include "cudnn_ops.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_CNN_MAJOR 9 +#define CUDNN_CNN_MINOR 10 +#define CUDNN_CNN_PATCH 2 + +#if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN CNN INFER!!! +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED; + +typedef struct cudnnConvolutionFwdAlgoPerfStruct { + cudnnConvolutionFwdAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED; + +/* Create an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc); + +/* Destroy an instance of convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType); /* convolution data type */ + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdDescriptor(const 
cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType); /* convolution data type */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w); + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[]); + +/* helper function to provide the convolution forward algo that fit best the requirement */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReorderFilterAndBias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData); + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes); + +/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform the forward pass for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionForward(cudnnHandle_t handle, + 
const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* helper function to provide the convolution backward data algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdDataAlgoPerfStruct { + cudnnConvolutionBwdDataAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardData(cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const 
cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Helper function to calculate folding descriptors for dgrad */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc); + +/* cudnnFusedOps... */ +struct cudnnFusedOpsConstParamStruct; +typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsVariantParamStruct; +typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED; + +struct cudnnFusedOpsPlanStruct; +typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED; + +typedef enum { + /* each op in [ ] can be disabled by passing NULL ptr */ + /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0, + /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */ + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1, + /* utility for BN training in BN-conv fusion */ + /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */ + /* optionally update running stats and generate saved stats */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2, + /* utility for BN inference in BN-conv fusion */ + /* computes the equivalent scale and bias from learned running stats and learned scale, bias */ + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3, + /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */ + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4, + /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */ + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5, + /* reserved for future use */ + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6, +} cudnnFusedOps_t CUDNN_DEPRECATED; + +typedef enum { + /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get XDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_XDESC = 0, + /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_XDATA_PLACEHOLDER = 1, + /* set/get BN_MODE: pass cudnnBatchNormMode_t* */ + CUDNN_PARAM_BN_MODE = 2, + /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3, + /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4, + /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + 
CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5, + /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */ + /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */ + CUDNN_PARAM_ACTIVATION_DESC = 6, + /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */ + /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */ + CUDNN_PARAM_CONV_DESC = 7, + /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get WDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_WDESC = 8, + /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_WDATA_PLACEHOLDER = 9, + /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */ + /* get DWDESC: pass previously created cudnnFilterDescriptor_t */ + CUDNN_PARAM_DWDESC = 10, + /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11, + /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YDESC = 12, + /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YDATA_PLACEHOLDER = 13, + /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DYDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DYDESC = 14, + /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15, + /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_YSTATS_DESC = 16, + /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSUM_PLACEHOLDER = 17, + /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18, + /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19, + /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20, + /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21, + /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22, + /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23, + /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24, + /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25, + + /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ZDESC = 26, + /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27, + /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28, + /* set/get 
BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29, + /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30, + + /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31, + /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32, + + /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DXDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DXDESC = 33, + /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34, + /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */ + /* get DZDESC: pass previously created cudnnTensorDescriptor_t */ + CUDNN_PARAM_DZDESC = 35, + /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36, + /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37, + /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */ + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38, +} cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_PTR_NULL = 0, + CUDNN_PTR_ELEM_ALIGNED = 1, + CUDNN_PTR_16B_ALIGNED = 2, +} cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED; + +typedef enum { + /* set: pass void* pointing to dev memory */ + /* get: pass void** pointing to host memory */ + CUDNN_PTR_XDATA = 0, + CUDNN_PTR_BN_EQSCALE = 1, + CUDNN_PTR_BN_EQBIAS = 2, + CUDNN_PTR_WDATA = 3, + CUDNN_PTR_DWDATA = 4, + CUDNN_PTR_YDATA = 5, + CUDNN_PTR_DYDATA = 6, + CUDNN_PTR_YSUM = 7, + CUDNN_PTR_YSQSUM = 8, + CUDNN_PTR_WORKSPACE = 9, + CUDNN_PTR_BN_SCALE = 10, + CUDNN_PTR_BN_BIAS = 11, + CUDNN_PTR_BN_SAVED_MEAN = 12, + CUDNN_PTR_BN_SAVED_INVSTD = 13, + CUDNN_PTR_BN_RUNNING_MEAN = 14, + CUDNN_PTR_BN_RUNNING_VAR = 15, + CUDNN_PTR_ZDATA = 16, + CUDNN_PTR_BN_Z_EQSCALE = 17, + CUDNN_PTR_BN_Z_EQBIAS = 18, + CUDNN_PTR_ACTIVATION_BITMASK = 19, + CUDNN_PTR_DXDATA = 20, + CUDNN_PTR_DZDATA = 21, + CUDNN_PTR_BN_DSCALE = 22, + CUDNN_PTR_BN_DBIAS = 23, + + /* set/get: pass size_t* pointing to host memory */ + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100, + /* set/get: pass int64_t* pointing to host memory */ + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102, + /* set/get: pass double* pointing to host memory */ + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103, +} cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED; + +cudnnStatus_t CUDNNWINAPI +cudnnCnnVersionCheck(void); + +/* helper function to provide the convolution backward filter algo that fit best the requirement */ + +typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct { + cudnnConvolutionBwdFilterAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + +/* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardFilter(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc, + void *dw); + +/* Function to compute the bias gradient for batch convolution */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardBias(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dbDesc, + void *db); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + const void *param); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + void *param, + int *isNULL); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI 
+cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnMakeFusedOpsPlan(cudnnHandle_t handle, + cudnnFusedOpsPlan_t plan, + const cudnnFusedOpsConstParamPack_t constPack, + size_t *workspaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_CNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h new file mode 100644 index 0000000000000000000000000000000000000000..389fba220c579e08519072255f2aea9a5da2d3e5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h @@ -0,0 +1,992 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 
227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_graph : cuDNN's basic definitions operations. + */ + +#if !defined(CUDNN_GRAPH_H_) +#define CUDNN_GRAPH_H_ + +#include +#include + +#include + +#include "cudnn_version.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_GRAPH_MAJOR 9 +#define CUDNN_GRAPH_MINOR 10 +#define CUDNN_GRAPH_PATCH 2 + +#if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN GRAPH!!! +#endif + +#ifndef CUDNNWINAPI +#ifdef _WIN32 +#define CUDNNWINAPI __stdcall +#else +#define CUDNNWINAPI +#endif +#endif + +/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */ +#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__)) +/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */ +#define CUDNN_DEPRECATED __attribute__((deprecated)) +#define CUDNN_DEPRECATED_ENUM __attribute__((deprecated)) +#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER) +/* Microsoft Visual C++ */ +#define CUDNN_DEPRECATED __declspec(deprecated) +#define CUDNN_DEPRECATED_ENUM __declspec(deprecated) +#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L) +/* C++14 compilers */ +#define CUDNN_DEPRECATED [[deprecated]] +#define CUDNN_DEPRECATED_ENUM [[deprecated]] +#else +/* No support for the deprecated attribute */ +#define CUDNN_DEPRECATED +#define CUDNN_DEPRECATED_ENUM +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudnnContext; +typedef struct cudnnContext *cudnnHandle_t; + +size_t CUDNNWINAPI +cudnnGetVersion(void); + +size_t CUDNNWINAPI +cudnnGetMaxDeviceVersion(void); + +/* Returns CUDA Runtime version statically linked against cudnn */ +size_t CUDNNWINAPI +cudnnGetCudartVersion(void); + +/* + * CUDNN return codes + */ +typedef enum { + CUDNN_STATUS_SUCCESS = 0, + + /* Uncategorized errors */ + CUDNN_STATUS_NOT_INITIALIZED = 1001, + CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002, + CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003, + CUDNN_STATUS_DEPRECATED = 1004, + CUDNN_STATUS_LICENSE_ERROR = 1005, + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006, + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007, + CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008, + + CUDNN_STATUS_BAD_PARAM = 2000, + CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002, + CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003, + CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004, + CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005, + CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006, + CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007, + CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008, + CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009, + CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010, + CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011, + CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012, + + CUDNN_STATUS_NOT_SUPPORTED = 3000, + CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001, + CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002, + CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003, + CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004, + CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005, + 
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006, + CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007, + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008, + CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009, + CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010, + CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011, + CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012, + CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013, + + CUDNN_STATUS_INTERNAL_ERROR = 4000, + CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001, + CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002, + CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003, + CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004, + CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005, + CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006, + + CUDNN_STATUS_EXECUTION_FAILED = 5000, + CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001, + CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002, + CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003, + CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004, + + CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED, + CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */, + CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH, + CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED, + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM = + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING, + CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH, +} cudnnStatus_t; + +#define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err))) +#define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000) +#define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000) + +/* human-readable error messages */ +const char *CUDNNWINAPI +cudnnGetErrorString(cudnnStatus_t status); + +void CUDNNWINAPI +cudnnGetLastErrorString(char *message, size_t max_size); + +/* Forward definition in this version only */ +typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_ERRQUERY_RAWCODE = 0, + CUDNN_ERRQUERY_NONBLOCKING = 1, + CUDNN_ERRQUERY_BLOCKING = 2, +} cudnnErrQueryMode_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag); + +cudnnStatus_t CUDNNWINAPI +cudnnGetProperty(libraryPropertyType type, int *value); + +cudnnStatus_t CUDNNWINAPI +cudnnCreate(cudnnHandle_t *handle); +cudnnStatus_t CUDNNWINAPI +cudnnDestroy(cudnnHandle_t handle); +cudnnStatus_t CUDNNWINAPI +cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t CUDNNWINAPI +cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); +/* + * CUDNN data type + */ +typedef enum { + CUDNN_DATA_FLOAT = 0, + CUDNN_DATA_DOUBLE = 1, + CUDNN_DATA_HALF = 2, + CUDNN_DATA_INT8 = 3, + CUDNN_DATA_INT32 = 4, + CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5, + CUDNN_DATA_UINT8 = 6, + CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7, + CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8, + CUDNN_DATA_BFLOAT16 = 9, + CUDNN_DATA_INT64 = 10, + CUDNN_DATA_BOOLEAN = 11, + CUDNN_DATA_FP8_E4M3 = 12, + CUDNN_DATA_FP8_E5M2 = 13, + CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14, + 
CUDNN_DATA_FP8_E8M0 = 15, + CUDNN_DATA_FP4_E2M1 = 16, +} cudnnDataType_t; + +/* + * CUDNN math type + */ +typedef enum { + CUDNN_DEFAULT_MATH = 0, + CUDNN_TENSOR_OP_MATH = 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2, + CUDNN_FMA_MATH = 3, +} cudnnMathType_t; + +/* + * CUDNN propagate Nan + */ +typedef enum { + CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0, + CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1, +} cudnnNanPropagation_t; + +/* + * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If + * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for + * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to + * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient. + */ +typedef enum { + CUDNN_CTC_ZERO_OOB_GRADIENTS = 0, + CUDNN_CTC_SKIP_OOB_GRADIENTS = 1, +} cudnnCTCGradMode_t; + +typedef enum { + CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */ +} cudnnTensorFormat_t; + +/* + * CUDNN ReduceTensor op type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_ADD = 0, + CUDNN_REDUCE_TENSOR_MUL = 1, + CUDNN_REDUCE_TENSOR_MIN = 2, + CUDNN_REDUCE_TENSOR_MAX = 3, + CUDNN_REDUCE_TENSOR_AMAX = 4, + CUDNN_REDUCE_TENSOR_AVG = 5, + CUDNN_REDUCE_TENSOR_NORM1 = 6, + CUDNN_REDUCE_TENSOR_NORM2 = 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, +} cudnnReduceTensorOp_t; + +/* + * activation mode + */ +typedef enum { + CUDNN_ACTIVATION_SIGMOID = 0, + CUDNN_ACTIVATION_RELU = 1, + CUDNN_ACTIVATION_TANH = 2, + CUDNN_ACTIVATION_CLIPPED_RELU = 3, + CUDNN_ACTIVATION_ELU = 4, + CUDNN_ACTIVATION_IDENTITY = 5, + CUDNN_ACTIVATION_SWISH = 6 +} cudnnActivationMode_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_SEV_FATAL = 0, + CUDNN_SEV_ERROR = 1, + CUDNN_SEV_WARNING = 2, + CUDNN_SEV_INFO = 3, +} cudnnSeverity_t; + +/* Message masks to be used with cudnnSetCallback() */ +#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR) +#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING) +#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO) + +/* struct containing useful informaiton for each API call */ +typedef struct cudnnDebugStruct { + unsigned cudnn_version; + cudnnStatus_t cudnnStatus; + unsigned time_sec; /* epoch time in seconds */ + unsigned time_usec; /* microseconds part of epoch time */ + unsigned time_delta; /* time since start in seconds */ + cudnnHandle_t handle; /* cudnn handle */ + cudaStream_t stream; /* cuda stream ID */ + unsigned long long pid; /* process ID */ + unsigned long long tid; /* thread ID */ + int cudaDeviceId; /* CUDA device ID */ + int reserved[15]; /* reserved for future use */ +} cudnnDebug_t; + +typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. 
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. + */ +cudnnStatus_t CUDNNWINAPI +cudnnGraphVersionCheck(void); + +/* Maximum supported number of tensor dimensions */ +#define CUDNN_DIM_MAX 8 + +/* + * convolution mode + */ +typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t; + +/* + * CUDNN Reorder + */ +typedef enum { + CUDNN_DEFAULT_REORDER = 0, + CUDNN_NO_REORDER = 1, +} cudnnReorderType_t CUDNN_DEPRECATED; + +typedef void *cudnnBackendDescriptor_t; + +typedef struct cudnnFractionStruct { + int64_t numerator; + int64_t denominator; +} cudnnFraction_t; + +typedef enum { + CUDNN_POINTWISE_ADD = 0, + CUDNN_POINTWISE_ADD_SQUARE = 5, + CUDNN_POINTWISE_DIV = 6, + CUDNN_POINTWISE_MAX = 3, + CUDNN_POINTWISE_MIN = 2, + CUDNN_POINTWISE_MOD = 7, + CUDNN_POINTWISE_MUL = 1, + CUDNN_POINTWISE_POW = 8, + CUDNN_POINTWISE_SUB = 9, + + CUDNN_POINTWISE_ABS = 10, + CUDNN_POINTWISE_CEIL = 11, + CUDNN_POINTWISE_COS = 12, + CUDNN_POINTWISE_EXP = 13, + CUDNN_POINTWISE_FLOOR = 14, + CUDNN_POINTWISE_LOG = 15, + CUDNN_POINTWISE_NEG = 16, + CUDNN_POINTWISE_RSQRT = 17, + CUDNN_POINTWISE_SIN = 18, + CUDNN_POINTWISE_SQRT = 4, + CUDNN_POINTWISE_TAN = 19, + CUDNN_POINTWISE_ERF = 20, + CUDNN_POINTWISE_IDENTITY = 21, + CUDNN_POINTWISE_RECIPROCAL = 22, + CUDNN_POINTWISE_ATAN2 = 23, + + CUDNN_POINTWISE_RELU_FWD = 100, + CUDNN_POINTWISE_TANH_FWD = 101, + CUDNN_POINTWISE_SIGMOID_FWD = 102, + CUDNN_POINTWISE_ELU_FWD = 103, + CUDNN_POINTWISE_GELU_FWD = 104, + CUDNN_POINTWISE_SOFTPLUS_FWD = 105, + CUDNN_POINTWISE_SWISH_FWD = 106, + CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107, + + CUDNN_POINTWISE_RELU_BWD = 200, + CUDNN_POINTWISE_TANH_BWD = 201, + CUDNN_POINTWISE_SIGMOID_BWD = 202, + CUDNN_POINTWISE_ELU_BWD = 203, + CUDNN_POINTWISE_GELU_BWD = 204, + CUDNN_POINTWISE_SOFTPLUS_BWD = 205, + CUDNN_POINTWISE_SWISH_BWD = 206, + CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207, + + CUDNN_POINTWISE_CMP_EQ = 300, + CUDNN_POINTWISE_CMP_NEQ = 301, + CUDNN_POINTWISE_CMP_GT = 302, + CUDNN_POINTWISE_CMP_GE = 303, + CUDNN_POINTWISE_CMP_LT = 304, + CUDNN_POINTWISE_CMP_LE = 305, + + CUDNN_POINTWISE_LOGICAL_AND = 400, + CUDNN_POINTWISE_LOGICAL_OR = 401, + CUDNN_POINTWISE_LOGICAL_NOT = 402, + + CUDNN_POINTWISE_GEN_INDEX = 501, + + CUDNN_POINTWISE_BINARY_SELECT = 601, +} cudnnPointwiseMode_t; + +typedef enum { + CUDNN_RESAMPLE_NEAREST = 0, + CUDNN_RESAMPLE_BILINEAR = 1, + CUDNN_RESAMPLE_AVGPOOL = 2, + CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, + CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4, + CUDNN_RESAMPLE_MAXPOOL = 3, +} cudnnResampleMode_t; + +typedef enum { + CUDNN_SIGNAL_SET = 0, + CUDNN_SIGNAL_WAIT = 1, +} cudnnSignalMode_t; + +typedef enum { + CUDNN_GENSTATS_SUM_SQSUM = 0, +} cudnnGenStatsMode_t; + +typedef enum { + CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0, + CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1, +} cudnnBnFinalizeStatsMode_t; + +typedef enum { + CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0, + CUDNN_RNG_DISTRIBUTION_UNIFORM = 1, + CUDNN_RNG_DISTRIBUTION_NORMAL = 2, +} cudnnRngDistribution_t; + +typedef enum { + CUDNN_ATTR_POINTWISE_MODE = 0, + CUDNN_ATTR_POINTWISE_MATH_PREC = 1, + CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3, + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5, + CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6, + CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7, + CUDNN_ATTR_POINTWISE_SWISH_BETA = 8, + 
CUDNN_ATTR_POINTWISE_AXIS = 9, + + CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100, + CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101, + CUDNN_ATTR_CONVOLUTION_DILATIONS = 102, + CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103, + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104, + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105, + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106, + + CUDNN_ATTR_ENGINEHEUR_MODE = 200, + CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201, + CUDNN_ATTR_ENGINEHEUR_RESULTS = 202, + CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203, + CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204, + + CUDNN_ATTR_ENGINECFG_ENGINE = 300, + CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301, + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302, + CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303, + CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304, + + CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400, + CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401, + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, + CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, + CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, + CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406, + CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407, + + CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, + CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503, + + CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600, + CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601, + + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717, + + CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750, + CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751, + CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752, + CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755, + CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756, + CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757, + CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758, + + CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770, + CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771, + CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772, + CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773, + CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774, + + CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780, + CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784, + CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785, + 
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793, + CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796, + + CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800, + CUDNN_ATTR_OPERATIONGRAPH_OPS = 801, + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802, + CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803, + CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804, + + CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900, + CUDNN_ATTR_TENSOR_DATA_TYPE = 901, + CUDNN_ATTR_TENSOR_DIMENSIONS = 902, + CUDNN_ATTR_TENSOR_STRIDES = 903, + CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904, + CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905, + CUDNN_ATTR_TENSOR_UNIQUE_ID = 906, + CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907, + CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908, + CUDNN_ATTR_TENSOR_REORDERING_MODE = 909, + CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913, + + CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000, + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001, + CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002, + CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003, + + CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100, + CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101, + + CUDNN_ATTR_KNOB_INFO_TYPE = 1200, + CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201, + CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202, + CUDNN_ATTR_KNOB_INFO_STRIDE = 1203, + + CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300, + CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301, + CUDNN_ATTR_ENGINE_KNOB_INFO = 1302, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303, + CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305, + CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306, + CUDNN_ATTR_ENGINE_DEVICEPROP = 1307, + + CUDNN_ATTR_MATMUL_COMP_TYPE = 1500, + CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503, + + CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520, + CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521, + CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522, + CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523, + CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527, + + CUDNN_ATTR_REDUCTION_OPERATOR = 1600, + CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601, + + CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610, + CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611, + CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612, + + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630, + 
+ CUDNN_ATTR_RESAMPLE_MODE = 1700, + CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701, + CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702, + CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703, + CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704, + CUDNN_ATTR_RESAMPLE_STRIDES = 1705, + CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706, + CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707, + CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708, + + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716, + + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727, + + CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800, + CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801, + CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802, + CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803, + + CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900, + CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901, + CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902, + CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903, + CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904, + + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953, + + CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000, + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001, + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002, + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003, + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004, + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005, + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006, + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007, + CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012, + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013, + CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014, + + CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100, + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101, + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102, + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103, + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104, + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105, + CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106, + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107, + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108, + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109, + CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110, + + CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200, + CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201, + + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253, + 
CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256, + + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276, + + CUDNN_ATTR_RNG_DISTRIBUTION = 2300, + CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301, + CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302, + CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303, + CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304, + CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305, + + CUDNN_ATTR_OPERATION_RNG_YDESC = 2310, + CUDNN_ATTR_OPERATION_RNG_SEED = 2311, + CUDNN_ATTR_OPERATION_RNG_DESC = 2312, + CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313, + + CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400, + CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401, + CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604, + + CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700, + CUDNN_ATTR_DEVICEPROP_HANDLE = 2701, + CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702, +} cudnnBackendAttributeName_t; + +typedef enum { + CUDNN_TYPE_HANDLE = 0, + CUDNN_TYPE_DATA_TYPE = 1, + CUDNN_TYPE_BOOLEAN = 2, + CUDNN_TYPE_INT64 = 3, + CUDNN_TYPE_FLOAT = 4, + CUDNN_TYPE_DOUBLE = 5, + CUDNN_TYPE_VOID_PTR = 6, + CUDNN_TYPE_CONVOLUTION_MODE = 7, + CUDNN_TYPE_HEUR_MODE = 8, + CUDNN_TYPE_KNOB_TYPE = 9, + CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10, + CUDNN_TYPE_NUMERICAL_NOTE = 11, + CUDNN_TYPE_LAYOUT_TYPE = 12, + CUDNN_TYPE_ATTRIB_NAME = 13, + CUDNN_TYPE_POINTWISE_MODE = 14, + CUDNN_TYPE_BACKEND_DESCRIPTOR = 15, + CUDNN_TYPE_GENSTATS_MODE = 16, + CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17, + CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18, + CUDNN_TYPE_BEHAVIOR_NOTE = 19, + CUDNN_TYPE_TENSOR_REORDERING_MODE = 20, + CUDNN_TYPE_RESAMPLE_MODE = 21, + CUDNN_TYPE_PADDING_MODE = 22, + CUDNN_TYPE_INT32 = 23, + CUDNN_TYPE_CHAR = 24, + CUDNN_TYPE_SIGNAL_MODE = 25, + CUDNN_TYPE_FRACTION = 26, + CUDNN_TYPE_NORM_MODE = 27, + CUDNN_TYPE_NORM_FWD_PHASE = 28, + CUDNN_TYPE_RNG_DISTRIBUTION = 29, +} cudnnBackendAttributeType_t; + +typedef enum { + CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, + CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1, + CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2, + CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3, + CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4, + CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5, + CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6, + CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7, + CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8, + CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9, + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10, 
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12, + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 13, + CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14, + CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15, + CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16, + CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17, + CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18, + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19, + CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20, + CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21, + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22, + CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23, + CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24, + CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25, + CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26, + CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27, + CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28, + CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29, + CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30, + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31, + CUDNN_BACKEND_RNG_DESCRIPTOR = 32, + CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33, + CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34, + CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37, + CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38, + CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39, + CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40, +} cudnnBackendDescriptorType_t; + +typedef enum { + CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, + CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1, + CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2, + CUDNN_NUMERICAL_NOTE_FFT = 3, + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4, + CUDNN_NUMERICAL_NOTE_WINOGRAD = 5, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8, + CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10, +} cudnnBackendNumericalNote_t; + +typedef enum { + CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0, + CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1, + CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2, + CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4, +} cudnnBackendBehaviorNote_t; + +typedef enum { + CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0, + CUDNN_KNOB_TYPE_SWIZZLE = 1, + CUDNN_KNOB_TYPE_TILE_SIZE = 2, + CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3, + CUDNN_KNOB_TYPE_EDGE = 4, + CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5, + CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6, + CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7, + CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8, + CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9, + CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10, + CUDNN_KNOB_TYPE_MULTIPLY = 11, + CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12, + CUDNN_KNOB_TYPE_TILEK = 13, + CUDNN_KNOB_TYPE_STAGES = 14, + CUDNN_KNOB_TYPE_REDUCTION_MODE = 15, + CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16, + CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17, + CUDNN_KNOB_TYPE_IDX_MODE = 18, + CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19, + CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20, + CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21, + CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22, + CUDNN_KNOB_TYPE_SPECFILT = 23, + 
CUDNN_KNOB_TYPE_KERNEL_CFG = 24, + CUDNN_KNOB_TYPE_WORKSPACE = 25, + CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26, + CUDNN_KNOB_TYPE_TILE_CGA_M = 27, + CUDNN_KNOB_TYPE_TILE_CGA_N = 28, + CUDNN_KNOB_TYPE_BLOCK_SIZE = 29, + CUDNN_KNOB_TYPE_OCCUPANCY = 30, + CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31, + CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32, + CUDNN_KNOB_TYPE_SPLIT_COLS = 33, + CUDNN_KNOB_TYPE_TILE_ROWS = 34, + CUDNN_KNOB_TYPE_TILE_COLS = 35, + CUDNN_KNOB_TYPE_LOAD_SIZE = 36, + CUDNN_KNOB_TYPE_CTA_COUNT = 37, + CUDNN_KNOB_TYPE_STREAM_K = 38, + CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39, + CUDNN_KNOB_TYPE_TILE_M = 40, + CUDNN_KNOB_TYPE_TILE_N = 41, + CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42, + CUDNN_KNOB_TYPE_COUNTS = 43, +} cudnnBackendKnobType_t; + +typedef enum { + CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0, + CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3, + CUDNN_LAYOUT_TYPE_COUNT = 4, +} cudnnBackendLayoutType_t; + +typedef enum { + CUDNN_HEUR_MODE_INSTANT = 0, + CUDNN_HEUR_MODE_B = 1, + CUDNN_HEUR_MODE_FALLBACK = 2, + CUDNN_HEUR_MODE_A = 3, + CUDNN_HEUR_MODES_COUNT = 4, +} cudnnBackendHeurMode_t; + +typedef enum { + CUDNN_TENSOR_REORDERING_NONE = 0, + CUDNN_TENSOR_REORDERING_INT8x32 = 1, + CUDNN_TENSOR_REORDERING_F16x16 = 2, + CUDNN_TENSOR_REORDERING_F8_128x4 = 3, +} cudnnBackendTensorReordering_t; + +typedef enum { + CUDNN_ZERO_PAD = 0, + CUDNN_NEG_INF_PAD = 1, + CUDNN_EDGE_VAL_PAD = 2, +} cudnnPaddingMode_t; + +typedef enum { + CUDNN_LAYER_NORM = 0, + CUDNN_INSTANCE_NORM = 1, + CUDNN_BATCH_NORM = 2, + CUDNN_GROUP_NORM = 3, + CUDNN_RMS_NORM = 4, + CUDNN_ADA_LAYER_NORM = 5, +} cudnnBackendNormMode_t; + +typedef enum { + CUDNN_NORM_FWD_INFERENCE = 0, + CUDNN_NORM_FWD_TRAINING = 1, +} cudnnBackendNormFwdPhase_t; + +cudnnStatus_t CUDNNWINAPI +cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_GRAPH_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h new file mode 100644 index 
0000000000000000000000000000000000000000..389fba220c579e08519072255f2aea9a5da2d3e5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h @@ -0,0 +1,992 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_graph : cuDNN's basic definitions operations. + */ + +#if !defined(CUDNN_GRAPH_H_) +#define CUDNN_GRAPH_H_ + +#include +#include + +#include + +#include "cudnn_version.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_GRAPH_MAJOR 9 +#define CUDNN_GRAPH_MINOR 10 +#define CUDNN_GRAPH_PATCH 2 + +#if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN GRAPH!!! 
+#endif + +#ifndef CUDNNWINAPI +#ifdef _WIN32 +#define CUDNNWINAPI __stdcall +#else +#define CUDNNWINAPI +#endif +#endif + +/* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */ +#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__)) +/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */ +#define CUDNN_DEPRECATED __attribute__((deprecated)) +#define CUDNN_DEPRECATED_ENUM __attribute__((deprecated)) +#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER) +/* Microsoft Visual C++ */ +#define CUDNN_DEPRECATED __declspec(deprecated) +#define CUDNN_DEPRECATED_ENUM __declspec(deprecated) +#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L) +/* C++14 compilers */ +#define CUDNN_DEPRECATED [[deprecated]] +#define CUDNN_DEPRECATED_ENUM [[deprecated]] +#else +/* No support for the deprecated attribute */ +#define CUDNN_DEPRECATED +#define CUDNN_DEPRECATED_ENUM +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudnnContext; +typedef struct cudnnContext *cudnnHandle_t; + +size_t CUDNNWINAPI +cudnnGetVersion(void); + +size_t CUDNNWINAPI +cudnnGetMaxDeviceVersion(void); + +/* Returns CUDA Runtime version statically linked against cudnn */ +size_t CUDNNWINAPI +cudnnGetCudartVersion(void); + +/* + * CUDNN return codes + */ +typedef enum { + CUDNN_STATUS_SUCCESS = 0, + + /* Uncategorized errors */ + CUDNN_STATUS_NOT_INITIALIZED = 1001, + CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002, + CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003, + CUDNN_STATUS_DEPRECATED = 1004, + CUDNN_STATUS_LICENSE_ERROR = 1005, + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006, + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007, + CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008, + + CUDNN_STATUS_BAD_PARAM = 2000, + CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002, + CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003, + CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004, + CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005, + CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006, + CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007, + CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008, + CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009, + CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010, + CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011, + CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012, + + CUDNN_STATUS_NOT_SUPPORTED = 3000, + CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001, + CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002, + CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003, + CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004, + CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005, + CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006, + CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007, + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008, + CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009, + CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010, + CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011, + CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012, + CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013, + + CUDNN_STATUS_INTERNAL_ERROR = 4000, + CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001, + CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002, + CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003, + CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004, + CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005, + CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006, + + CUDNN_STATUS_EXECUTION_FAILED = 5000, + 
CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001, + CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002, + CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003, + CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004, + + CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED, + CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */, + CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH, + CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED, + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM = + CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING, + CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH, +} cudnnStatus_t; + +#define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err))) +#define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000) +#define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000) + +/* human-readable error messages */ +const char *CUDNNWINAPI +cudnnGetErrorString(cudnnStatus_t status); + +void CUDNNWINAPI +cudnnGetLastErrorString(char *message, size_t max_size); + +/* Forward definition in this version only */ +typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_ERRQUERY_RAWCODE = 0, + CUDNN_ERRQUERY_NONBLOCKING = 1, + CUDNN_ERRQUERY_BLOCKING = 2, +} cudnnErrQueryMode_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag); + +cudnnStatus_t CUDNNWINAPI +cudnnGetProperty(libraryPropertyType type, int *value); + +cudnnStatus_t CUDNNWINAPI +cudnnCreate(cudnnHandle_t *handle); +cudnnStatus_t CUDNNWINAPI +cudnnDestroy(cudnnHandle_t handle); +cudnnStatus_t CUDNNWINAPI +cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t CUDNNWINAPI +cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); +/* + * CUDNN data type + */ +typedef enum { + CUDNN_DATA_FLOAT = 0, + CUDNN_DATA_DOUBLE = 1, + CUDNN_DATA_HALF = 2, + CUDNN_DATA_INT8 = 3, + CUDNN_DATA_INT32 = 4, + CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5, + CUDNN_DATA_UINT8 = 6, + CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7, + CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8, + CUDNN_DATA_BFLOAT16 = 9, + CUDNN_DATA_INT64 = 10, + CUDNN_DATA_BOOLEAN = 11, + CUDNN_DATA_FP8_E4M3 = 12, + CUDNN_DATA_FP8_E5M2 = 13, + CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14, + CUDNN_DATA_FP8_E8M0 = 15, + CUDNN_DATA_FP4_E2M1 = 16, +} cudnnDataType_t; + +/* + * CUDNN math type + */ +typedef enum { + CUDNN_DEFAULT_MATH = 0, + CUDNN_TENSOR_OP_MATH = 1, + CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2, + CUDNN_FMA_MATH = 3, +} cudnnMathType_t; + +/* + * CUDNN propagate Nan + */ +typedef enum { + CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0, + CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1, +} cudnnNanPropagation_t; + +/* + * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If + * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for + * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to + * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient. 
+ */ +typedef enum { + CUDNN_CTC_ZERO_OOB_GRADIENTS = 0, + CUDNN_CTC_SKIP_OOB_GRADIENTS = 1, +} cudnnCTCGradMode_t; + +typedef enum { + CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */ +} cudnnTensorFormat_t; + +/* + * CUDNN ReduceTensor op type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_ADD = 0, + CUDNN_REDUCE_TENSOR_MUL = 1, + CUDNN_REDUCE_TENSOR_MIN = 2, + CUDNN_REDUCE_TENSOR_MAX = 3, + CUDNN_REDUCE_TENSOR_AMAX = 4, + CUDNN_REDUCE_TENSOR_AVG = 5, + CUDNN_REDUCE_TENSOR_NORM1 = 6, + CUDNN_REDUCE_TENSOR_NORM2 = 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, +} cudnnReduceTensorOp_t; + +/* + * activation mode + */ +typedef enum { + CUDNN_ACTIVATION_SIGMOID = 0, + CUDNN_ACTIVATION_RELU = 1, + CUDNN_ACTIVATION_TANH = 2, + CUDNN_ACTIVATION_CLIPPED_RELU = 3, + CUDNN_ACTIVATION_ELU = 4, + CUDNN_ACTIVATION_IDENTITY = 5, + CUDNN_ACTIVATION_SWISH = 6 +} cudnnActivationMode_t CUDNN_DEPRECATED; + +typedef enum { + CUDNN_SEV_FATAL = 0, + CUDNN_SEV_ERROR = 1, + CUDNN_SEV_WARNING = 2, + CUDNN_SEV_INFO = 3, +} cudnnSeverity_t; + +/* Message masks to be used with cudnnSetCallback() */ +#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR) +#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING) +#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO) + +/* struct containing useful informaiton for each API call */ +typedef struct cudnnDebugStruct { + unsigned cudnn_version; + cudnnStatus_t cudnnStatus; + unsigned time_sec; /* epoch time in seconds */ + unsigned time_usec; /* microseconds part of epoch time */ + unsigned time_delta; /* time since start in seconds */ + cudnnHandle_t handle; /* cudnn handle */ + cudaStream_t stream; /* cuda stream ID */ + unsigned long long pid; /* process ID */ + unsigned long long tid; /* thread ID */ + int cudaDeviceId; /* CUDA device ID */ + int reserved[15]; /* reserved for future use */ +} cudnnDebug_t; + +typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg); + +cudnnStatus_t CUDNNWINAPI +cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr); + +cudnnStatus_t CUDNNWINAPI +cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr); + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnGraphVersionCheck(void); + +/* Maximum supported number of tensor dimensions */ +#define CUDNN_DIM_MAX 8 + +/* + * convolution mode + */ +typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t; + +/* + * CUDNN Reorder + */ +typedef enum { + CUDNN_DEFAULT_REORDER = 0, + CUDNN_NO_REORDER = 1, +} cudnnReorderType_t CUDNN_DEPRECATED; + +typedef void *cudnnBackendDescriptor_t; + +typedef struct cudnnFractionStruct { + int64_t numerator; + int64_t denominator; +} cudnnFraction_t; + +typedef enum { + CUDNN_POINTWISE_ADD = 0, + CUDNN_POINTWISE_ADD_SQUARE = 5, + CUDNN_POINTWISE_DIV = 6, + CUDNN_POINTWISE_MAX = 3, + CUDNN_POINTWISE_MIN = 2, + CUDNN_POINTWISE_MOD = 7, + CUDNN_POINTWISE_MUL = 1, + CUDNN_POINTWISE_POW = 8, + CUDNN_POINTWISE_SUB = 9, + + CUDNN_POINTWISE_ABS = 10, + CUDNN_POINTWISE_CEIL = 11, + CUDNN_POINTWISE_COS = 12, + CUDNN_POINTWISE_EXP = 13, + CUDNN_POINTWISE_FLOOR = 14, + CUDNN_POINTWISE_LOG = 15, + CUDNN_POINTWISE_NEG = 16, + CUDNN_POINTWISE_RSQRT = 17, + CUDNN_POINTWISE_SIN = 18, + CUDNN_POINTWISE_SQRT = 4, + CUDNN_POINTWISE_TAN = 19, + CUDNN_POINTWISE_ERF = 20, + CUDNN_POINTWISE_IDENTITY = 21, + CUDNN_POINTWISE_RECIPROCAL = 22, + CUDNN_POINTWISE_ATAN2 = 23, + + CUDNN_POINTWISE_RELU_FWD = 100, + CUDNN_POINTWISE_TANH_FWD = 101, + CUDNN_POINTWISE_SIGMOID_FWD = 102, + CUDNN_POINTWISE_ELU_FWD = 103, + CUDNN_POINTWISE_GELU_FWD = 104, + CUDNN_POINTWISE_SOFTPLUS_FWD = 105, + CUDNN_POINTWISE_SWISH_FWD = 106, + CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107, + + CUDNN_POINTWISE_RELU_BWD = 200, + CUDNN_POINTWISE_TANH_BWD = 201, + CUDNN_POINTWISE_SIGMOID_BWD = 202, + CUDNN_POINTWISE_ELU_BWD = 203, + CUDNN_POINTWISE_GELU_BWD = 204, + CUDNN_POINTWISE_SOFTPLUS_BWD = 205, + CUDNN_POINTWISE_SWISH_BWD = 206, + CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207, + + CUDNN_POINTWISE_CMP_EQ = 300, + CUDNN_POINTWISE_CMP_NEQ = 301, + CUDNN_POINTWISE_CMP_GT = 302, + CUDNN_POINTWISE_CMP_GE = 303, + CUDNN_POINTWISE_CMP_LT = 304, + CUDNN_POINTWISE_CMP_LE = 305, + + CUDNN_POINTWISE_LOGICAL_AND = 400, + CUDNN_POINTWISE_LOGICAL_OR = 401, + CUDNN_POINTWISE_LOGICAL_NOT = 402, + + CUDNN_POINTWISE_GEN_INDEX = 501, + + CUDNN_POINTWISE_BINARY_SELECT = 601, +} cudnnPointwiseMode_t; + +typedef enum { + CUDNN_RESAMPLE_NEAREST = 0, + CUDNN_RESAMPLE_BILINEAR = 1, + CUDNN_RESAMPLE_AVGPOOL = 2, + CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, + CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4, + CUDNN_RESAMPLE_MAXPOOL = 3, +} cudnnResampleMode_t; + +typedef enum { + CUDNN_SIGNAL_SET = 0, + CUDNN_SIGNAL_WAIT = 1, +} cudnnSignalMode_t; + +typedef enum { + CUDNN_GENSTATS_SUM_SQSUM = 0, +} cudnnGenStatsMode_t; + +typedef enum { + CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0, + CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1, +} cudnnBnFinalizeStatsMode_t; + +typedef enum { + CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0, + CUDNN_RNG_DISTRIBUTION_UNIFORM = 1, + CUDNN_RNG_DISTRIBUTION_NORMAL = 2, +} cudnnRngDistribution_t; + +typedef enum { + CUDNN_ATTR_POINTWISE_MODE = 0, + CUDNN_ATTR_POINTWISE_MATH_PREC = 1, + CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3, + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4, + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5, + CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6, + CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7, + CUDNN_ATTR_POINTWISE_SWISH_BETA = 8, + CUDNN_ATTR_POINTWISE_AXIS = 9, + + CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100, + CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101, + CUDNN_ATTR_CONVOLUTION_DILATIONS = 102, + 
CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103, + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104, + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105, + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106, + + CUDNN_ATTR_ENGINEHEUR_MODE = 200, + CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201, + CUDNN_ATTR_ENGINEHEUR_RESULTS = 202, + CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203, + CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204, + + CUDNN_ATTR_ENGINECFG_ENGINE = 300, + CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301, + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302, + CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303, + CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304, + + CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400, + CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401, + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, + CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, + CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, + CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406, + CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407, + + CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, + CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502, + CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503, + + CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600, + CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601, + + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704, + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716, + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717, + + CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750, + CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751, + CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752, + CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754, + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755, + CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756, + CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757, + CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758, + + CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770, + CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771, + CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772, + CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773, + CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774, + + CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780, + CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782, + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784, + CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786, + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787, + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788, + 
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790, + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793, + CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795, + CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796, + + CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800, + CUDNN_ATTR_OPERATIONGRAPH_OPS = 801, + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802, + CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803, + CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804, + + CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900, + CUDNN_ATTR_TENSOR_DATA_TYPE = 901, + CUDNN_ATTR_TENSOR_DIMENSIONS = 902, + CUDNN_ATTR_TENSOR_STRIDES = 903, + CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904, + CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905, + CUDNN_ATTR_TENSOR_UNIQUE_ID = 906, + CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907, + CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908, + CUDNN_ATTR_TENSOR_REORDERING_MODE = 909, + CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913, + + CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000, + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001, + CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002, + CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003, + + CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100, + CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101, + + CUDNN_ATTR_KNOB_INFO_TYPE = 1200, + CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201, + CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202, + CUDNN_ATTR_KNOB_INFO_STRIDE = 1203, + + CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300, + CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301, + CUDNN_ATTR_ENGINE_KNOB_INFO = 1302, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303, + CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305, + CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306, + CUDNN_ATTR_ENGINE_DEVICEPROP = 1307, + + CUDNN_ATTR_MATMUL_COMP_TYPE = 1500, + CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503, + + CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520, + CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521, + CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522, + CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523, + CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526, + CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527, + + CUDNN_ATTR_REDUCTION_OPERATOR = 1600, + CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601, + + CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610, + CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611, + CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612, + + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629, + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630, + + CUDNN_ATTR_RESAMPLE_MODE = 1700, + CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701, + CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702, + CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703, + CUDNN_ATTR_RESAMPLE_PRE_PADDINGS 
= 1704, + CUDNN_ATTR_RESAMPLE_STRIDES = 1705, + CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706, + CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707, + CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708, + + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714, + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716, + + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726, + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727, + + CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800, + CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801, + CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802, + CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803, + + CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900, + CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901, + CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902, + CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903, + CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904, + + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952, + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953, + + CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000, + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001, + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002, + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003, + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004, + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005, + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006, + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007, + CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009, + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011, + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012, + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013, + CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014, + + CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100, + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101, + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102, + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103, + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104, + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105, + CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106, + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107, + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108, + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109, + CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110, + + CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200, + CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201, + + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255, + CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256, + + 
CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275, + CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276, + + CUDNN_ATTR_RNG_DISTRIBUTION = 2300, + CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301, + CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302, + CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303, + CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304, + CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305, + + CUDNN_ATTR_OPERATION_RNG_YDESC = 2310, + CUDNN_ATTR_OPERATION_RNG_SEED = 2311, + CUDNN_ATTR_OPERATION_RNG_DESC = 2312, + CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313, + + CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400, + CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401, + CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504, + + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603, + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604, + + CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700, + CUDNN_ATTR_DEVICEPROP_HANDLE = 2701, + CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702, +} cudnnBackendAttributeName_t; + +typedef enum { + CUDNN_TYPE_HANDLE = 0, + CUDNN_TYPE_DATA_TYPE = 1, + CUDNN_TYPE_BOOLEAN = 2, + CUDNN_TYPE_INT64 = 3, + CUDNN_TYPE_FLOAT = 4, + CUDNN_TYPE_DOUBLE = 5, + CUDNN_TYPE_VOID_PTR = 6, + CUDNN_TYPE_CONVOLUTION_MODE = 7, + CUDNN_TYPE_HEUR_MODE = 8, + CUDNN_TYPE_KNOB_TYPE = 9, + CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10, + CUDNN_TYPE_NUMERICAL_NOTE = 11, + CUDNN_TYPE_LAYOUT_TYPE = 12, + CUDNN_TYPE_ATTRIB_NAME = 13, + CUDNN_TYPE_POINTWISE_MODE = 14, + CUDNN_TYPE_BACKEND_DESCRIPTOR = 15, + CUDNN_TYPE_GENSTATS_MODE = 16, + CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17, + CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18, + CUDNN_TYPE_BEHAVIOR_NOTE = 19, + CUDNN_TYPE_TENSOR_REORDERING_MODE = 20, + CUDNN_TYPE_RESAMPLE_MODE = 21, + CUDNN_TYPE_PADDING_MODE = 22, + CUDNN_TYPE_INT32 = 23, + CUDNN_TYPE_CHAR = 24, + CUDNN_TYPE_SIGNAL_MODE = 25, + CUDNN_TYPE_FRACTION = 26, + CUDNN_TYPE_NORM_MODE = 27, + CUDNN_TYPE_NORM_FWD_PHASE = 28, + CUDNN_TYPE_RNG_DISTRIBUTION = 29, +} cudnnBackendAttributeType_t; + +typedef enum { + CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, + CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1, + CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2, + CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3, + CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4, + CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5, + CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6, + CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7, + CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8, + CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9, + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11, + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12, + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 
13, + CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14, + CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15, + CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16, + CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17, + CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18, + CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19, + CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20, + CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21, + CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22, + CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23, + CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24, + CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25, + CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26, + CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27, + CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28, + CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29, + CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30, + CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31, + CUDNN_BACKEND_RNG_DESCRIPTOR = 32, + CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33, + CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34, + CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36, + CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37, + CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38, + CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39, + CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40, +} cudnnBackendDescriptorType_t; + +typedef enum { + CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, + CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1, + CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2, + CUDNN_NUMERICAL_NOTE_FFT = 3, + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4, + CUDNN_NUMERICAL_NOTE_WINOGRAD = 5, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7, + CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8, + CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10, +} cudnnBackendNumericalNote_t; + +typedef enum { + CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0, + CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1, + CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2, + CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4, +} cudnnBackendBehaviorNote_t; + +typedef enum { + CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0, + CUDNN_KNOB_TYPE_SWIZZLE = 1, + CUDNN_KNOB_TYPE_TILE_SIZE = 2, + CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3, + CUDNN_KNOB_TYPE_EDGE = 4, + CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5, + CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6, + CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7, + CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8, + CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9, + CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10, + CUDNN_KNOB_TYPE_MULTIPLY = 11, + CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12, + CUDNN_KNOB_TYPE_TILEK = 13, + CUDNN_KNOB_TYPE_STAGES = 14, + CUDNN_KNOB_TYPE_REDUCTION_MODE = 15, + CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16, + CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17, + CUDNN_KNOB_TYPE_IDX_MODE = 18, + CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19, + CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20, + CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21, + CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22, + CUDNN_KNOB_TYPE_SPECFILT = 23, + CUDNN_KNOB_TYPE_KERNEL_CFG = 24, + CUDNN_KNOB_TYPE_WORKSPACE = 25, + CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26, + CUDNN_KNOB_TYPE_TILE_CGA_M = 27, + CUDNN_KNOB_TYPE_TILE_CGA_N = 
28, + CUDNN_KNOB_TYPE_BLOCK_SIZE = 29, + CUDNN_KNOB_TYPE_OCCUPANCY = 30, + CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31, + CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32, + CUDNN_KNOB_TYPE_SPLIT_COLS = 33, + CUDNN_KNOB_TYPE_TILE_ROWS = 34, + CUDNN_KNOB_TYPE_TILE_COLS = 35, + CUDNN_KNOB_TYPE_LOAD_SIZE = 36, + CUDNN_KNOB_TYPE_CTA_COUNT = 37, + CUDNN_KNOB_TYPE_STREAM_K = 38, + CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39, + CUDNN_KNOB_TYPE_TILE_M = 40, + CUDNN_KNOB_TYPE_TILE_N = 41, + CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42, + CUDNN_KNOB_TYPE_COUNTS = 43, +} cudnnBackendKnobType_t; + +typedef enum { + CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0, + CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2, + CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3, + CUDNN_LAYOUT_TYPE_COUNT = 4, +} cudnnBackendLayoutType_t; + +typedef enum { + CUDNN_HEUR_MODE_INSTANT = 0, + CUDNN_HEUR_MODE_B = 1, + CUDNN_HEUR_MODE_FALLBACK = 2, + CUDNN_HEUR_MODE_A = 3, + CUDNN_HEUR_MODES_COUNT = 4, +} cudnnBackendHeurMode_t; + +typedef enum { + CUDNN_TENSOR_REORDERING_NONE = 0, + CUDNN_TENSOR_REORDERING_INT8x32 = 1, + CUDNN_TENSOR_REORDERING_F16x16 = 2, + CUDNN_TENSOR_REORDERING_F8_128x4 = 3, +} cudnnBackendTensorReordering_t; + +typedef enum { + CUDNN_ZERO_PAD = 0, + CUDNN_NEG_INF_PAD = 1, + CUDNN_EDGE_VAL_PAD = 2, +} cudnnPaddingMode_t; + +typedef enum { + CUDNN_LAYER_NORM = 0, + CUDNN_INSTANCE_NORM = 1, + CUDNN_BATCH_NORM = 2, + CUDNN_GROUP_NORM = 3, + CUDNN_RMS_NORM = 4, + CUDNN_ADA_LAYER_NORM = 5, +} cudnnBackendNormMode_t; + +typedef enum { + CUDNN_NORM_FWD_INFERENCE = 0, + CUDNN_NORM_FWD_TRAINING = 1, +} cudnnBackendNormFwdPhase_t; + +cudnnStatus_t CUDNNWINAPI +cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +cudnnStatus_t CUDNNWINAPI +cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, + cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t variantPack, + cudaGraph_t graph); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_GRAPH_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..471a0e59d67228ab8a74159517418f217ab86324 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h @@ -0,0 +1,1316 @@ +/* + * 
Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_ops : cuDNN's basic definitions and basic operations. + */ + +#if !defined(CUDNN_OPS_H_) +#define CUDNN_OPS_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_graph.h" + +/* These version numbers are autogenerated, do not edit manually. */ +#define CUDNN_OPS_MAJOR 9 +#define CUDNN_OPS_MINOR 10 +#define CUDNN_OPS_PATCH 2 + +#if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN OPS INFER!!! 
+#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Data structures to represent Image/Filter and the Neural Network Layer */ +typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t; +typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t; +typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t; +typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED; +/* + * CUDNN Determinism + */ +typedef enum { + CUDNN_NON_DETERMINISTIC = 0, + CUDNN_DETERMINISTIC = 1, +} cudnnDeterminism_t; + +/* Create an instance of a generic Tensor descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w); /* width of input section */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, + int *cStride, + int *hStride, + int *wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size); + +/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride + + 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) + input_stride : c x h x h_stride + feature_stride : h x h_stride + h_stride : >= w ( h_stride = w if no padding) + w_stride : 1 + + + 2)Example of all images in row major with features maps interleaved + input_stride : c x h x h_stride + feature_stride : 1 + h_stride : w x c + w_stride : c + + 3)Example of all images in column major order one batch of features after the 
other (with optional padding on column) + input_stride : c x w x w_stride + feature_stride : w x w_stride + h_stride : 1 + w_stride : >= h + +*/ + +/* Destroy an instance of Tensor4d descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc); + +/* Fold/unfold transforms */ +typedef enum { + CUDNN_TRANSFORM_FOLD = 0U, + CUDNN_TRANSFORM_UNFOLD = 1U, +} cudnnFoldingDirection_t; + +/** Create a destination descriptor for cudnnTransformTensor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t destDesc, + size_t *destSizeInBytes); + +/** Create an empty tensor transform descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc); + +/** Initialize a previously created tensor transform descriptor. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], + const int32_t padAfterA[], + const uint32_t foldA[], + const cudnnFoldingDirection_t direction); + +/** + * Retrieves the values stored in a previously initialized tensor transform + * descriptor. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, + int32_t padBeforeA[], + int32_t padAfterA[], + uint32_t foldA[], + cudnnFoldingDirection_t *direction); + +/** + * Destroys a previously created tensor transform descriptor. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc); + +/* Tensor layout conversion helper (y = alpha * x + beta * y) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensorEx(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData); + +/* Tensor Bias addition : C = alpha * A + beta * C */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN OpTensor op type + */ +typedef enum { + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, + CUDNN_OP_TENSOR_SQRT = 4, + CUDNN_OP_TENSOR_NOT = 5, +} cudnnOpTensorOp_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + 
cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc); + +/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnOpTensor(cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN ReduceTensor indices type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_NO_INDICES = 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, +} cudnnReduceTensorIndices_t CUDNN_DEPRECATED; + +/* + * CUDNN tensor indices type size (all unsigned) + * Currently not supported, default is 32 bit unsigned. + */ +typedef enum { + CUDNN_32BIT_INDICES = 0, + CUDNN_64BIT_INDICES = 1, + CUDNN_16BIT_INDICES = 2, + CUDNN_8BIT_INDICES = 3, +} cudnnIndicesType_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc); + +/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and + * output tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionIndicesSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output + * tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ +/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */ +/* The indices space is ignored for reduce ops other than min or max. 
*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReduceTensor(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* Set all values of a tensor to a given value : y[i] = value[0] */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr); + +/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha); + +/* Create an instance of FilterStruct */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[]); +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformFilter(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, + void *destData); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc); + +/* + * softmax algorithm + */ +typedef enum { + CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG = 2 +} cudnnSoftmaxAlgorithm_t; + +typedef enum { + CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ +} cudnnSoftmaxMode_t; + +/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxForward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const 
void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * pooling mode + */ +typedef enum { + CUDNN_POOLING_MAX = 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC = 3 +} cudnnPoolingMode_t CUDNN_DEPRECATED; + +/* Create an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w); + +/* Destroy an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc); + +/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingForward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef); /* ceiling for clipped RELU, alpha for ELU */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *coef); /* ceiling for clipped RELU, alpha for ELU */ + 
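Editorial aside (not part of the vendored header above): the legacy activation path declared in this hunk — cudnnCreateActivationDescriptor / cudnnSetActivationDescriptor / cudnnActivationForward — is marked CUDNN_DEPRECATED in the 9.x headers but remains callable, and it follows the "output = alpha * Op(inputs) + beta * output" convention stated in the comments. The sketch below shows one plausible way these entry points chain together for a ReLU over an NCHW float tensor; the tensor shape, buffer names, and the omission of status checking are illustrative assumptions, not anything taken from this diff.

#include <cudnn.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    const int n = 1, c = 3, h = 8, w = 8;              /* illustrative NCHW shape */
    const size_t bytes = (size_t)n * c * h * w * sizeof(float);

    cudnnHandle_t handle;
    cudnnCreate(&handle);

    /* Describe x and y with the same 4D descriptor (shapes match for activation). */
    cudnnTensorDescriptor_t xDesc;
    cudnnCreateTensorDescriptor(&xDesc);
    cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);

    /* Deprecated-but-working descriptor-based activation: ReLU, coef unused for ReLU. */
    cudnnActivationDescriptor_t actDesc;
    cudnnCreateActivationDescriptor(&actDesc);
    cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0);

    float *x = NULL, *y = NULL;                        /* device buffers, contents left uninitialized here */
    cudaMalloc((void **)&x, bytes);
    cudaMalloc((void **)&y, bytes);

    const float alpha = 1.0f, beta = 0.0f;             /* y = alpha * relu(x) + beta * y */
    cudnnStatus_t st = cudnnActivationForward(handle, actDesc, &alpha, xDesc, x, &beta, xDesc, y);
    printf("cudnnActivationForward: %s\n", cudnnGetErrorString(st));

    cudaFree(x);
    cudaFree(y);
    cudnnDestroyActivationDescriptor(actDesc);
    cudnnDestroyTensorDescriptor(xDesc);
    cudnnDestroy(handle);
    return 0;
}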
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc); + +/* Function to perform forward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationForward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * Create an instance of LRN (Local Response Normalization) descriptor + * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper + */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc); + +#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ +#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ +#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ +#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ + +/* LRN layer mode */ +typedef enum { + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */ +} cudnnLRNMode_t; + +/* + * Uses a window [center-lookBehind, center+lookAhead], where + * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. + * Values of double parameters cast to tensor data type. + */ +cudnnStatus_t CUDNNWINAPI +cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK); +/* + * Retrieve the settings currently stored in an LRN layer descriptor + * Any of the provided pointers can be NULL (no corresponding value will be returned) + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK); + +/* Destroy an instance of LRN descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc); + +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ + +/* LRN cross-channel forward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, +} cudnnDivNormMode_t; + +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. 
(one value per CHW...-slice, normalized over N slice) */ + CUDNN_BATCHNORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_BATCHNORM_SPATIAL = 1, + + /* + * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). + * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values + */ + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, +} cudnnBatchNormMode_t CUDNN_DEPRECATED; + +#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ + +/* + * Derives a tensor descriptor from layer data descriptor for BatchNormalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode); + +typedef enum { + CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */ + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */ + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */ +} cudnnBatchNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Batch Normalization during Inference: + * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] + * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed + * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_NORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_NORM_PER_CHANNEL = 1, +} cudnnNormMode_t CUDNN_DEPRECATED; + +typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED; + +/* + * Derives a tensor descriptor from layer data descriptor for Normalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions. 
+ */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, + cudnnTensorDescriptor_t derivedNormMeanVarDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnNormMode_t mode, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +typedef enum { + CUDNN_NORM_OPS_NORM = 0, /* do normalization only */ + CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */ + CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */ +} cudnnNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Normalization during Inference: + * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k] + * with normScale, normBias, runningMean, runningInvVariance tensors indexed + * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardInference(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + const cudnnTensorDescriptor_t normMeanVarDesc, + const void *estimatedMean, + const void *estimatedVariance, + const cudnnTensorDescriptor_t zDesc, + const void *z, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + double epsilon, + int groupCnt); /* Place hold for future work*/ + +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR = 0, +} cudnnSamplerType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerForward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t 
dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +/* Restores the dropout descriptor to a previously saved-off state */ +cudnnStatus_t CUDNNWINAPI +cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t CUDNNWINAPI +cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float *dropout, + void **states, + unsigned long long *seed); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void *x, + const cudnnTensorDescriptor_t ydesc, + void *y, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* TODO: move these enums out to the appropriate submodule */ +typedef enum { + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 +} cudnnConvolutionFwdAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 +} cudnnConvolutionBwdFilterAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 +} cudnnConvolutionBwdDataAlgo_t; + +typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t; + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. 
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnOpsVersionCheck(void); + +/* Function to perform backward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxBackward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingBackward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationBackward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* LRN cross-channel backward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans); /* output means differential, can be NULL */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const 
cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes); + +/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, + + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance); + +/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + + double exponentialAverageFactor, + void *resultRunningMean, + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + + cudnnActivationDescriptor_t activationDesc, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* Performs backward pass of Batch Normalization layer. 
Returns x gradient, + * bnScale gradient and bnBias gradient */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackward(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, + void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, + void *dBnBiasData, + double epsilon, /* Same epsilon as forward pass */ + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnActivationDescriptor_t activationDesc, + const 
cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +/* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardTraining(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + double exponentialAverageFactor, + const cudnnTensorDescriptor_t normMeanVarDesc, + void *resultRunningMean, + void *resultRunningVariance, + /* Has to be >= 0. Should be the same in forward and backward functions. */ + double epsilon, + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationBackward(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const void *normScaleData, + const void *normBiasData, /* needed if there is activation */ + void *dNormScaleData, + void *dNormBiasData, + double epsilon, /* Same epsilon as forward pass */ + const cudnnTensorDescriptor_t normMeanVarDesc, + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void *dy, + const cudnnTensorDescriptor_t dxdesc, + void *dx, + 
void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_OPS_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..471a0e59d67228ab8a74159517418f217ab86324 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h @@ -0,0 +1,1316 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * cudnn_ops : cuDNN's basic definitions and basic operations. + */ + +#if !defined(CUDNN_OPS_H_) +#define CUDNN_OPS_H_ + +#include + +#include "cudnn_version.h" +#include "cudnn_graph.h" + +/* These version numbers are autogenerated, do not edit manually. 
*/ +#define CUDNN_OPS_MAJOR 9 +#define CUDNN_OPS_MINOR 10 +#define CUDNN_OPS_PATCH 2 + +#if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL) +#error Version mismatch in cuDNN OPS INFER!!! +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Data structures to represent Image/Filter and the Neural Network Layer */ +typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t; +typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t; +typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED; +typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t; +typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED; +/* + * CUDNN Determinism + */ +typedef enum { + CUDNN_NON_DETERMINISTIC = 0, + CUDNN_DETERMINISTIC = 1, +} cudnnDeterminism_t; + +/* Create an instance of a generic Tensor descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w); /* width of input section */ + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, + int *cStride, + int *hStride, + int *wStride); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size); + +/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride + + 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) + input_stride : c x h x h_stride + feature_stride : h x h_stride + h_stride : >= w ( h_stride = w if no padding) + w_stride 
: 1 + + + 2)Example of all images in row major with features maps interleaved + input_stride : c x h x h_stride + feature_stride : 1 + h_stride : w x c + w_stride : c + + 3)Example of all images in column major order one batch of features after the other (with optional padding on column) + input_stride : c x w x w_stride + feature_stride : w x w_stride + h_stride : 1 + w_stride : >= h + +*/ + +/* Destroy an instance of Tensor4d descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc); + +/* Fold/unfold transforms */ +typedef enum { + CUDNN_TRANSFORM_FOLD = 0U, + CUDNN_TRANSFORM_UNFOLD = 1U, +} cudnnFoldingDirection_t; + +/** Create a destination descriptor for cudnnTransformTensor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t destDesc, + size_t *destSizeInBytes); + +/** Create an empty tensor transform descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc); + +/** Initialize a previously created tensor transform descriptor. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], + const int32_t padAfterA[], + const uint32_t foldA[], + const cudnnFoldingDirection_t direction); + +/** + * Retrieves the values stored in a previously initialized tensor transform + * descriptor. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, + int32_t padBeforeA[], + int32_t padAfterA[], + uint32_t foldA[], + cudnnFoldingDirection_t *direction); + +/** + * Destroys a previously created tensor transform descriptor. 
+ */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc); + +/* Tensor layout conversion helper (y = alpha * x + beta * y) */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformTensorEx(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData); + +/* Tensor Bias addition : C = alpha * A + beta * C */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN OpTensor op type + */ +typedef enum { + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, + CUDNN_OP_TENSOR_SQRT = 4, + CUDNN_OP_TENSOR_NOT = 5, +} cudnnOpTensorOp_t; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc); + +/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnOpTensor(cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* + * CUDNN ReduceTensor indices type + */ +typedef enum { + CUDNN_REDUCE_TENSOR_NO_INDICES = 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, +} cudnnReduceTensorIndices_t CUDNN_DEPRECATED; + +/* + * CUDNN tensor indices type size (all unsigned) + * Currently not supported, default is 32 bit unsigned. 
+ */ +typedef enum { + CUDNN_32BIT_INDICES = 0, + CUDNN_64BIT_INDICES = 1, + CUDNN_16BIT_INDICES = 2, + CUDNN_8BIT_INDICES = 3, +} cudnnIndicesType_t CUDNN_DEPRECATED; + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc); + +/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and + * output tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionIndicesSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output + * tensors */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes); + +/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ +/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */ +/* The indices space is ignored for reduce ops other than min or max. 
*/ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnReduceTensor(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C); + +/* Set all values of a tensor to a given value : y[i] = value[0] */ +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr); + +/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha); + +/* Create an instance of FilterStruct */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w); /* width of each input filter */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[]); +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnTransformFilter(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, + void *destData); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc); + +/* + * softmax algorithm + */ +typedef enum { + CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG = 2 +} cudnnSoftmaxAlgorithm_t; + +typedef enum { + CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ +} cudnnSoftmaxMode_t; + +/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxForward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const 
void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * pooling mode + */ +typedef enum { + CUDNN_POOLING_MAX = 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC = 3 +} cudnnPoolingMode_t CUDNN_DEPRECATED; + +/* Create an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w); + +/* Destroy an instance of pooling descriptor */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc); + +/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingForward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef); /* ceiling for clipped RELU, alpha for ELU */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *coef); /* ceiling for clipped RELU, alpha for ELU */ + 
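Editorial aside (an illustration, not part of the vendored file): cudnnSoftmaxForward, declared earlier in this hunk and not deprecated, computes output = alpha * softmax(x) + beta * output, where CUDNN_SOFTMAX_ACCURATE subtracts the per-slice maximum before exponentiating and CUDNN_SOFTMAX_MODE_CHANNEL normalizes over C for each (N, H, W) position. A minimal sketch follows; the logits shape, variable names, and absence of error handling are assumptions made for brevity.

#include <cudnn.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(void) {
    const int n = 2, c = 10, h = 1, w = 1;             /* e.g. a batch of 2 logit vectors of length 10 */
    const size_t bytes = (size_t)n * c * h * w * sizeof(float);

    cudnnHandle_t handle;
    cudnnCreate(&handle);

    /* One descriptor serves both input logits and output probabilities (same shape). */
    cudnnTensorDescriptor_t desc;
    cudnnCreateTensorDescriptor(&desc);
    cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);

    float *logits = NULL, *probs = NULL;               /* device buffers; fill logits before a real call */
    cudaMalloc((void **)&logits, bytes);
    cudaMalloc((void **)&probs, bytes);

    /* probs = alpha * softmax(logits) + beta * probs, softmax taken over the channel dimension. */
    const float alpha = 1.0f, beta = 0.0f;
    cudnnStatus_t st = cudnnSoftmaxForward(handle,
                                           CUDNN_SOFTMAX_ACCURATE,
                                           CUDNN_SOFTMAX_MODE_CHANNEL,
                                           &alpha, desc, logits,
                                           &beta, desc, probs);
    printf("cudnnSoftmaxForward: %s\n", cudnnGetErrorString(st));

    cudaFree(logits);
    cudaFree(probs);
    cudnnDestroyTensorDescriptor(desc);
    cudnnDestroy(handle);
    return 0;
}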
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc); + +/* Function to perform forward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationForward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +/* + * Create an instance of LRN (Local Response Normalization) descriptor + * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper + */ +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc); + +#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ +#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ +#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ +#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ + +/* LRN layer mode */ +typedef enum { + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */ +} cudnnLRNMode_t; + +/* + * Uses a window [center-lookBehind, center+lookAhead], where + * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. + * Values of double parameters cast to tensor data type. + */ +cudnnStatus_t CUDNNWINAPI +cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK); +/* + * Retrieve the settings currently stored in an LRN layer descriptor + * Any of the provided pointers can be NULL (no corresponding value will be returned) + */ +cudnnStatus_t CUDNNWINAPI +cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK); + +/* Destroy an instance of LRN descriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc); + +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ + +/* LRN cross-channel forward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, +} cudnnDivNormMode_t; + +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. 
(one value per CHW...-slice, normalized over N slice) */ + CUDNN_BATCHNORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_BATCHNORM_SPATIAL = 1, + + /* + * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). + * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values + */ + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, +} cudnnBatchNormMode_t CUDNN_DEPRECATED; + +#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ + +/* + * Derives a tensor descriptor from layer data descriptor for BatchNormalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode); + +typedef enum { + CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */ + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */ + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */ +} cudnnBatchNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Batch Normalization during Inference: + * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] + * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed + * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon); + +typedef enum { + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_NORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_NORM_PER_CHANNEL = 1, +} cudnnNormMode_t CUDNN_DEPRECATED; + +typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED; + +/* + * Derives a tensor descriptor from layer data descriptor for Normalization + * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for + * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions. 
+ */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc, + cudnnTensorDescriptor_t derivedNormMeanVarDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnNormMode_t mode, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +typedef enum { + CUDNN_NORM_OPS_NORM = 0, /* do normalization only */ + CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */ + CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */ +} cudnnNormOps_t CUDNN_DEPRECATED; + +/* + * Performs Normalization during Inference: + * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k] + * with normScale, normBias, runningMean, runningInvVariance tensors indexed + * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining + * above for notes on function arguments. + */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardInference(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + const cudnnTensorDescriptor_t normMeanVarDesc, + const void *estimatedMean, + const void *estimatedVariance, + const cudnnTensorDescriptor_t zDesc, + const void *z, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + double epsilon, + int groupCnt); /* Place hold for future work*/ + +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR = 0, +} cudnnSamplerType_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerForward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t; + +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc); + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI +cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t 
dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +/* Restores the dropout descriptor to a previously saved-off state */ +cudnnStatus_t CUDNNWINAPI +cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t CUDNNWINAPI +cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float *dropout, + void **states, + unsigned long long *seed); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void *x, + const cudnnTensorDescriptor_t ydesc, + void *y, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* TODO: move these enums out to the appropriate submodule */ +typedef enum { + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 +} cudnnConvolutionFwdAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 +} cudnnConvolutionBwdFilterAlgo_t; + +typedef enum { + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 +} cudnnConvolutionBwdDataAlgo_t; + +typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t; + +/* + * \brief Cross-library version checker. + * This function is implemented differently in each sub-library. Each sublib + * checks whether its own version matches that of its dependencies. + * \returns CUDNN_STATUS_SUCCESS if the version check passes, + * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent. 
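
The helper comments above imply a setup order for dropout: query the RNG state size, allocate that buffer on the device, then fill the descriptor. A hedged sketch of that sequence follows; error handling is omitted, the handle is assumed to already exist, and setup_dropout is only an illustrative wrapper name.

/* Sketch of the dropout-descriptor setup order suggested above.
 * Error checking omitted for brevity. */
#include <cudnn.h>
#include <cuda_runtime.h>

void setup_dropout(cudnnHandle_t handle, cudnnDropoutDescriptor_t *desc,
                   void **states, float dropout, unsigned long long seed) {
    size_t stateSize = 0;
    cudnnCreateDropoutDescriptor(desc);
    cudnnDropoutGetStatesSize(handle, &stateSize);   /* size of the RNG state buffer */
    cudaMalloc(states, stateSize);                   /* dropout states live in device memory */
    cudnnSetDropoutDescriptor(*desc, handle, dropout, *states, stateSize, seed);
}
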
+ */ +cudnnStatus_t CUDNNWINAPI +cudnnOpsVersionCheck(void); + +/* Function to perform backward softmax */ +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxBackward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward pooling */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnPoolingBackward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* Function to perform backward activation */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnActivationBackward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +/* LRN cross-channel backward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans); /* output means differential, can be NULL */ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const 
cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes); + +/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, + + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance); + +/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + + double exponentialAverageFactor, + void *resultRunningMean, + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + + cudnnActivationDescriptor_t activationDesc, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* Performs backward pass of Batch Normalization layer. 
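
The exponentialAverageFactor comment above derives that choosing factor = 1/(1+n) on the n-th call turns the running-mean update into a plain cumulative average. A small host-side check of that recurrence, with illustrative batch means:

/* runningMean = newMean*factor + runningMean*(1-factor) with factor = 1/(1+n)
 * reproduces the average of all batch means seen so far. */
#include <stdio.h>

int main(void) {
    double batchMean[3] = {2.0, 4.0, 9.0};   /* illustrative per-batch means */
    double runningMean = 0.0;
    for (int n = 0; n < 3; ++n) {
        double factor = 1.0 / (1.0 + n);     /* factor = 1 on the very first call */
        runningMean = batchMean[n] * factor + runningMean * (1.0 - factor);
    }
    printf("runningMean = %f\n", runningMean);   /* (2+4+9)/3 = 5 */
    return 0;
}
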
Returns x gradient, + * bnScale gradient and bnBias gradient */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackward(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, + void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, + void *dBnBiasData, + double epsilon, /* Same epsilon as forward pass */ + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t normMeanVarDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const cudnnActivationDescriptor_t activationDesc, + const 
cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +/* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */ +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationForwardTraining(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t normScaleBiasDesc, + const void *normScale, + const void *normBias, + double exponentialAverageFactor, + const cudnnTensorDescriptor_t normMeanVarDesc, + void *resultRunningMean, + void *resultRunningVariance, + /* Has to be >= 0. Should be the same in forward and backward functions. */ + double epsilon, + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI +cudnnNormalizationBackward(cudnnHandle_t handle, + cudnnNormMode_t mode, + cudnnNormOps_t normOps, + cudnnNormAlgo_t algo, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dNormScaleBiasDesc, + const void *normScaleData, + const void *normBiasData, /* needed if there is activation */ + void *dNormScaleData, + void *dNormBiasData, + double epsilon, /* Same epsilon as forward pass */ + const cudnnTensorDescriptor_t normMeanVarDesc, + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes, + int groupCnt); /* Place hold for future work, should be set to 1 now*/ + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta); + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid); + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void *dy, + const cudnnTensorDescriptor_t dxdesc, + void *dx, + 
void *reserveSpace, + size_t reserveSpaceSizeInBytes); + +#if defined(__cplusplus) +} +#endif + +#endif /* CUDNN_OPS_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..7e08847c95f1294bc99e96e737a53cc6ebb7a458 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h @@ -0,0 +1,68 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +/* cudnn : Neural Networks Library */ + +#if !defined(CUDNN_H_) +#define CUDNN_H_ +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "cudnn_version.h" +#include "cudnn_graph.h" +#include "cudnn_ops.h" +#include "cudnn_adv.h" +#include "cudnn_cnn.h" + +#if defined(__cplusplus) +} +#endif +#endif /* CUDNN_H_ */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h new file mode 100644 index 0000000000000000000000000000000000000000..1af101fac7672614e3af52cbc32c57bc2104f498 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h @@ -0,0 +1,70 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/** + * \file: The master cuDNN version file. 
+ */ + +#ifndef CUDNN_VERSION_H_ +#define CUDNN_VERSION_H_ + +#define CUDNN_MAJOR 9 +#define CUDNN_MINOR 10 +#define CUDNN_PATCHLEVEL 2 + +#define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) + +/* cannot use constexpr here since this is a C-only file */ +/* Below is the max SM version this cuDNN library is aware of and supports natively */ + +#define CUDNN_MAX_SM_MAJOR_NUMBER 12 +#define CUDNN_MAX_SM_MINOR_NUMBER 0 +#define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10) + +#endif /* CUDNN_VERSION_H */ diff --git a/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h new file mode 100644 index 0000000000000000000000000000000000000000..1af101fac7672614e3af52cbc32c57bc2104f498 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h @@ -0,0 +1,70 @@ +/* + * Copyright 2014-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
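
CUDNN_VERSION as defined above packs major, minor and patch as major*10000 + minor*100 + patch, so 9.10.2 becomes 91002 and plain integer comparisons work at compile time. A hedged sketch of the usual guard follows; it assumes the include path for cudnn_version.h is set up, and cudnnGetVersion() is the usual runtime counterpart reported by the loaded library.

/* 9*10000 + 10*100 + 2 = 91002 for the headers added in this diff. */
#include <stdio.h>
#include <cudnn_version.h>

int main(void) {
#if CUDNN_VERSION >= 90000
    printf("built against cuDNN 9.x or newer: %d\n", CUDNN_VERSION);
#else
    printf("built against a pre-9 cuDNN: %d\n", CUDNN_VERSION);
#endif
    return 0;
}
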
+ */ + +/** + * \file: The master cuDNN version file. + */ + +#ifndef CUDNN_VERSION_H_ +#define CUDNN_VERSION_H_ + +#define CUDNN_MAJOR 9 +#define CUDNN_MINOR 10 +#define CUDNN_PATCHLEVEL 2 + +#define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) + +/* cannot use constexpr here since this is a C-only file */ +/* Below is the max SM version this cuDNN library is aware of and supports natively */ + +#define CUDNN_MAX_SM_MAJOR_NUMBER 12 +#define CUDNN_MAX_SM_MINOR_NUMBER 0 +#define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10) + +#endif /* CUDNN_VERSION_H */ diff --git a/.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt b/.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcd1867a02a6a8c1e592b92e2e50f34e531f2d87 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt @@ -0,0 +1,39 @@ + + Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + The U.S. Department of Energy funded the development of this software + under subcontract 7078610 with Lawrence Berkeley National Laboratory. + + +This code also includes files from the NVIDIA Tools Extension SDK project. + +See: + + https://github.com/NVIDIA/NVTX + +for more information and license details. diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6e06d16bd4d506966ccc5a6ea42de1a608d8e99e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py @@ -0,0 +1,54 @@ +"""Module to give helpful messages to the user that did not +compile scikit-learn properly. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os + +INPLACE_MSG = """ +It appears that you are importing a local scikit-learn source tree. For +this, you need to have an inplace install. Maybe you are in the source +directory and you need to try from another location.""" + +STANDARD_MSG = """ +If you have used an installer, please check that it is suited for your +Python version, your operating system and your platform.""" + + +def raise_build_error(e): + # Raise a comprehensible error and list the contents of the + # directory to help debugging on the mailing list. + local_dir = os.path.split(__file__)[0] + msg = STANDARD_MSG + if local_dir == "sklearn/__check_build": + # Picking up the local install: this will work only if the + # install is an 'inplace build' + msg = INPLACE_MSG + dir_content = list() + for i, filename in enumerate(os.listdir(local_dir)): + if (i + 1) % 3: + dir_content.append(filename.ljust(26)) + else: + dir_content.append(filename + "\n") + raise ImportError( + """%s +___________________________________________________________________________ +Contents of %s: +%s +___________________________________________________________________________ +It seems that scikit-learn has not been built correctly. + +If you have installed scikit-learn from source, please do not forget +to build the package before using it. For detailed instructions, see: +https://scikit-learn.org/dev/developers/advanced_installation.html#building-from-source +%s""" + % (e, local_dir, "".join(dir_content).strip(), msg) + ) + + +try: + from ._check_build import check_build # noqa: F401 +except ImportError as e: + raise_build_error(e) diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..74e02aa76589b2223acfb9fdfbe8da3beb3dc778 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx new file mode 100644 index 0000000000000000000000000000000000000000..0409e73f5e96dc3a4c27889fa44eda8a17d36ef9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx @@ -0,0 +1,2 @@ +def check_build(): + return diff --git a/.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build b/.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..5f6115d9765499dc28f477a1506a8298492003f5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build @@ -0,0 +1,6 @@ +py.extension_module( + '_check_build', + cython_gen.process('_check_build.pyx'), + install: true, + subdir: 'sklearn/__check_build', +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..714515227428299e0390d3ad8ea743e9be703bf6 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc differ diff --git 
a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5f935f2e11c1cb4d83e397b390a97230473e223 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a68226bee1459be761629fc8d25cc26fc37b0f9b Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24f16857cc66b2e3c2635b23dd7124b99b802a52 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..261cfecb8d9d846d532dbe7549e5f46c0390c029 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4011fd3ad9eff11c7cf6a4c662b4c90b5df67583 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py new file mode 100644 index 0000000000000000000000000000000000000000..c8a7a35a62feeed47fbb10ace87411c9bdc16370 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import argparse +import os + +from Cython import Tempita as tempita + +# XXX: If this import ever fails (does it really?), vendor either +# cython.tempita or numpy/npy_tempita. + + +def process_tempita(fromfile, outfile=None): + """Process tempita templated file and write out the result. + + The template file is expected to end in `.c.tp` or `.pyx.tp`: + E.g. processing `template.c.in` generates `template.c`. 
+ + """ + with open(fromfile, "r", encoding="utf-8") as f: + template_content = f.read() + + template = tempita.Template(template_content) + content = template.substitute() + + with open(outfile, "w", encoding="utf-8") as f: + f.write(content) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("infile", type=str, help="Path to the input file") + parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory") + parser.add_argument( + "-i", + "--ignore", + type=str, + help=( + "An ignored input - may be useful to add a " + "dependency between custom targets" + ), + ) + args = parser.parse_args() + + if not args.infile.endswith(".tp"): + raise ValueError(f"Unexpected extension: {args.infile}") + + if not args.outdir: + raise ValueError("Missing `--outdir` argument to tempita.py") + + outdir_abs = os.path.join(os.getcwd(), args.outdir) + outfile = os.path.join( + outdir_abs, os.path.splitext(os.path.split(args.infile)[1])[0] + ) + + process_tempita(args.infile, outfile) + + +if __name__ == "__main__": + main() diff --git a/.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py new file mode 100644 index 0000000000000000000000000000000000000000..922a14917bf3fd2d395a4f5002a39c4d9d9c7ee2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +"""Extract version number from __init__.py""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os + +sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py") + +data = open(sklearn_init).readlines() +version_line = next(line for line in data if line.startswith("__version__")) + +version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "") + +print(version) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97fdd884e517c4a623e6fc180526bde227af0c21 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py @@ -0,0 +1,33 @@ +""" +The :mod:`sklearn._loss` module includes loss function classes suitable for +fitting classification and regression tasks. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .loss import ( + AbsoluteError, + HalfBinomialLoss, + HalfGammaLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, + HuberLoss, + PinballLoss, +) + +__all__ = [ + "AbsoluteError", + "HalfBinomialLoss", + "HalfGammaLoss", + "HalfMultinomialLoss", + "HalfPoissonLoss", + "HalfSquaredError", + "HalfTweedieLoss", + "HalfTweedieLossIdentity", + "HuberLoss", + "PinballLoss", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd new file mode 100644 index 0000000000000000000000000000000000000000..ac01b122a0941c35bc4d440543cf5c981943952a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd @@ -0,0 +1,101 @@ +# Fused types for input like y_true, raw_prediction, sample_weights. 
+ctypedef fused floating_in: + double + float + + +# Fused types for output like gradient and hessian +# We use a different fused types for input (floating_in) and output (floating_out), such +# that input and output can have different dtypes in the same function call. A single +# fused type can only take on one single value (type) for all arguments in one function +# call. +ctypedef fused floating_out: + double + float + + +# Struct to return 2 doubles +ctypedef struct double_pair: + double val1 + double val2 + + +# C base class for loss functions +cdef class CyLossFunction: + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfSquaredError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyAbsoluteError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyPinballLoss(CyLossFunction): + cdef readonly double quantile # readonly makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHuberLoss(CyLossFunction): + cdef public double delta # public makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfPoissonLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfGammaLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfTweedieLoss(CyLossFunction): + cdef readonly double power # readonly makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfTweedieLossIdentity(CyLossFunction): + cdef readonly double power # readonly makes it accessible from Python + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double 
raw_prediction) noexcept nogil + + +cdef class CyHalfBinomialLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyExponentialLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil + + +cdef class CyHalfMultinomialLoss(): + cdef void cy_gradient( + self, + const floating_in y_true, + const floating_in[::1] raw_prediction, + const floating_in sample_weight, + floating_out[::1] gradient_out, + ) noexcept nogil diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..44d5acd530a7f60ac6e08174c5e5197f3fb00735 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp @@ -0,0 +1,1505 @@ +{{py: + +""" +Template file to easily generate loops over samples using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: _loss.pyx + +Each loss class is generated by a cdef functions on single samples. +The keywords between double braces are substituted during the build. +""" + +doc_HalfSquaredError = ( + """Half Squared Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_AbsoluteError = ( + """Absolute Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_PinballLoss = ( + """Quantile Loss aka Pinball Loss with identity link. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError() + """ +) + +doc_HuberLoss = ( + """Huber Loss with identity link. + + Domain: + y_true and y_pred all real numbers + delta in positive real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_HalfPoissonLoss = ( + """Half Poisson deviance loss with log-link. + + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Poisson deviance with log-link is + y_true * log(y_true/y_pred) + y_pred - y_true + = y_true * log(y_true) - y_true * raw_prediction + + exp(raw_prediction) - y_true + + Dropping constant terms, this gives: + exp(raw_prediction) - y_true * raw_prediction + """ +) + +doc_HalfGammaLoss = ( + """Half Gamma deviance loss with log-link. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Gamma deviance with log-link is + log(y_pred/y_true) + y_true/y_pred - 1 + = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 + + Dropping constant terms, this gives: + raw_prediction + y_true * exp(-raw_prediction) + """ +) + +doc_HalfTweedieLoss = ( + """Half Tweedie deviance loss with log-link. 
+ + Domain: + y_true in real numbers if p <= 0 + y_true in non-negative real numbers if 0 < p < 2 + y_true in positive real numbers if p >= 2 + y_pred and power in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Tweedie deviance with log-link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + = max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + exp((2-p) * raw_prediction) / (2-p) + + Dropping constant terms, this gives: + exp((2-p) * raw_prediction) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + Notes: + - Poisson with p=1 and Gamma with p=2 have different terms dropped such + that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. + - While the Tweedie distribution only exists for p<=0 or p>=1, the range + 0= 2 + y_pred and power in positive real numbers, y_pred may be negative for p=0. + + Link: + y_pred = raw_prediction + + Half Tweedie deviance with identity link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + + Notes: + - Here, we do not drop constant terms in contrast to the version with log-link. + """ +) + +doc_HalfBinomialLoss = ( + """Half Binomial deviance loss with logit link. + + Domain: + y_true in [0, 1] + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(raw_prediction) + """ +) + +doc_ExponentialLoss = ( + """"Exponential loss with (half) logit link + + Domain: + y_true in [0, 1] + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(2 * raw_prediction) + """ +) + +# loss class name, docstring, param, +# cy_loss, cy_loss_grad, +# cy_grad, cy_grad_hess, +class_list = [ + ("CyHalfSquaredError", doc_HalfSquaredError, None, + "closs_half_squared_error", None, + "cgradient_half_squared_error", "cgrad_hess_half_squared_error"), + ("CyAbsoluteError", doc_AbsoluteError, None, + "closs_absolute_error", None, + "cgradient_absolute_error", "cgrad_hess_absolute_error"), + ("CyPinballLoss", doc_PinballLoss, "quantile", + "closs_pinball_loss", None, + "cgradient_pinball_loss", "cgrad_hess_pinball_loss"), + ("CyHuberLoss", doc_HuberLoss, "delta", + "closs_huber_loss", None, + "cgradient_huber_loss", "cgrad_hess_huber_loss"), + ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None, + "closs_half_poisson", "closs_grad_half_poisson", + "cgradient_half_poisson", "cgrad_hess_half_poisson"), + ("CyHalfGammaLoss", doc_HalfGammaLoss, None, + "closs_half_gamma", "closs_grad_half_gamma", + "cgradient_half_gamma", "cgrad_hess_half_gamma"), + ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power", + "closs_half_tweedie", "closs_grad_half_tweedie", + "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), + ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power", + "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity", + "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"), + ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None, + "closs_half_binomial", "closs_grad_half_binomial", + "cgradient_half_binomial", "cgrad_hess_half_binomial"), + ("CyExponentialLoss", doc_ExponentialLoss, None, + "closs_exponential", "closs_grad_exponential", + "cgradient_exponential", "cgrad_hess_exponential"), +] +}} + +# Design: +# See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons. +# a) Merge link functions into loss functions for speed and numerical +# stability, i.e. 
use raw_prediction instead of y_pred in signature. +# b) Pure C functions (nogil) calculate single points (single sample) +# c) Wrap C functions in a loop to get Python functions operating on ndarrays. +# - Write loops manually---use Tempita for this. +# Reason: There is still some performance overhead when using a wrapper +# function "wrap" that carries out the loop and gets as argument a function +# pointer to one of the C functions from b), e.g. +# wrap(closs_half_poisson, y_true, ...) +# - Pass n_threads as argument to prange and propagate option to all callers. +# d) Provide classes (Cython extension types) per loss (names start with Cy) in +# order to have semantical structured objects. +# - Member functions for single points just call the C function from b). +# These are used e.g. in SGD `_plain_sgd`. +# - Member functions operating on ndarrays, see c), looping over calls to C +# functions from b). +# e) Provide convenience Python classes that compose from these extension types +# elsewhere (see loss.py) +# - Example: loss.gradient calls CyLoss.gradient but does some input +# checking like None -> np.empty(). +# +# Note: We require 1-dim ndarrays to be contiguous. + +from cython.parallel import parallel, prange +import numpy as np + +from libc.math cimport exp, fabs, log, log1p, pow +from libc.stdlib cimport malloc, free + + +# ------------------------------------- +# Helper functions +# ------------------------------------- +# Numerically stable version of log(1 + exp(x)) for double precision, see Eq. (10) of +# https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf +# Note: The only important cutoff is at x = 18. All others are to save computation +# time. Compared to the reference, we add the additional case distinction x <= -2 in +# order to use log instead of log1p for improved performance. As with the other +# cutoffs, this is accurate within machine precision of double. +cdef inline double log1pexp(double x) noexcept nogil: + if x <= -37: + return exp(x) + elif x <= -2: + return log1p(exp(x)) + elif x <= 18: + return log(1. + exp(x)) + elif x <= 33.3: + return x + exp(-x) + else: + return x + + +cdef inline double_pair sum_exp_minus_max( + const int i, + const floating_in[:, :] raw_prediction, # IN + floating_out *p # OUT +) noexcept nogil: + # Thread local buffers are used to store part of the results via p. + # The results are stored as follows: + # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 + # return.val1 = max_value = max(raw_prediction_i_k, k = 0 to n_classes-1) + # return.val2 = sum_exps = sum(p[k], k = 0 to n_classes-1) = sum of exponentials + # len(p) must be n_classes + # Notes: + # - We return the max value and sum of exps (stored in p) as a double_pair. + # - i needs to be passed (and stays constant) because otherwise Cython does + # not generate optimal code, see + # https://github.com/scikit-learn/scikit-learn/issues/17299 + # - We do not normalize p by calculating p[k] = p[k] / sum_exps. + # This helps to save one loop over k. 
+ cdef: + int k + int n_classes = raw_prediction.shape[1] + double_pair max_value_and_sum_exps # val1 = max_value, val2 = sum_exps + + max_value_and_sum_exps.val1 = raw_prediction[i, 0] + max_value_and_sum_exps.val2 = 0 + for k in range(1, n_classes): + # Compute max value of array for numerical stability + if max_value_and_sum_exps.val1 < raw_prediction[i, k]: + max_value_and_sum_exps.val1 = raw_prediction[i, k] + + for k in range(n_classes): + p[k] = exp(raw_prediction[i, k] - max_value_and_sum_exps.val1) + max_value_and_sum_exps.val2 += p[k] + + return max_value_and_sum_exps + + +# ------------------------------------- +# Single point inline C functions +# ------------------------------------- +# Half Squared Error +cdef inline double closs_half_squared_error( + double y_true, + double raw_prediction +) noexcept nogil: + return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) + + +cdef inline double cgradient_half_squared_error( + double y_true, + double raw_prediction +) noexcept nogil: + return raw_prediction - y_true + + +cdef inline double_pair cgrad_hess_half_squared_error( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + gh.val1 = raw_prediction - y_true # gradient + gh.val2 = 1. # hessian + return gh + + +# Absolute Error +cdef inline double closs_absolute_error( + double y_true, + double raw_prediction +) noexcept nogil: + return fabs(raw_prediction - y_true) + + +cdef inline double cgradient_absolute_error( + double y_true, + double raw_prediction +) noexcept nogil: + return 1. if raw_prediction > y_true else -1. + + +cdef inline double_pair cgrad_hess_absolute_error( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = 1. if raw_prediction > y_true else -1. # gradient + gh.val2 = 1. # hessian + return gh + + +# Quantile Loss / Pinball Loss +cdef inline double closs_pinball_loss( + double y_true, + double raw_prediction, + double quantile +) noexcept nogil: + return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction + else (1. - quantile) * (raw_prediction - y_true)) + + +cdef inline double cgradient_pinball_loss( + double y_true, + double raw_prediction, + double quantile +) noexcept nogil: + return -quantile if y_true >=raw_prediction else 1. - quantile + + +cdef inline double_pair cgrad_hess_pinball_loss( + double y_true, + double raw_prediction, + double quantile +) noexcept nogil: + cdef double_pair gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient + gh.val2 = 1. 
# hessian + return gh + + +# Huber Loss +cdef inline double closs_huber_loss( + double y_true, + double raw_prediction, + double delta, +) noexcept nogil: + cdef double abserr = fabs(y_true - raw_prediction) + if abserr <= delta: + return 0.5 * abserr**2 + else: + return delta * (abserr - 0.5 * delta) + + +cdef inline double cgradient_huber_loss( + double y_true, + double raw_prediction, + double delta, +) noexcept nogil: + cdef double res = raw_prediction - y_true + if fabs(res) <= delta: + return res + else: + return delta if res >=0 else -delta + + +cdef inline double_pair cgrad_hess_huber_loss( + double y_true, + double raw_prediction, + double delta, +) noexcept nogil: + cdef double_pair gh + gh.val2 = raw_prediction - y_true # used as temporary + if fabs(gh.val2) <= delta: + gh.val1 = gh.val2 # gradient + gh.val2 = 1 # hessian + else: + gh.val1 = delta if gh.val2 >=0 else -delta # gradient + gh.val2 = 0 # hessian + return gh + + +# Half Poisson Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + return exp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + # y_pred - y_true + return exp(raw_prediction) - y_true + + +cdef inline double_pair closs_grad_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + lg.val2 = exp(raw_prediction) # used as temporary + lg.val1 = lg.val2 - y_true * raw_prediction # loss + lg.val2 -= y_true # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_poisson( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + gh.val2 = exp(raw_prediction) # hessian + gh.val1 = gh.val2 - y_true # gradient + return gh + + +# Half Gamma Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + return raw_prediction + y_true * exp(-raw_prediction) + + +cdef inline double cgradient_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + return 1. - y_true * exp(-raw_prediction) + + +cdef inline double_pair closs_grad_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + lg.val2 = exp(-raw_prediction) # used as temporary + lg.val1 = raw_prediction + y_true * lg.val2 # loss + lg.val2 = 1. - y_true * lg.val2 # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_gamma( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair gh + gh.val2 = exp(-raw_prediction) # used as temporary + gh.val1 = 1. - y_true * gh.val2 # gradient + gh.val2 *= y_true # hessian + return gh + + +# Half Tweedie Deviance with Log-Link, dropping constant terms +# Note that by dropping constants this is no longer continuous in parameter power. +cdef inline double closs_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + if power == 0.: + return closs_half_squared_error(y_true, exp(raw_prediction)) + elif power == 1.: + return closs_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) / (2. - power) + - y_true * exp((1. - power) * raw_prediction) / (1. 
- power)) + + +cdef inline double cgradient_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double exp1 + if power == 0.: + exp1 = exp(raw_prediction) + return exp1 * (exp1 - y_true) + elif power == 1.: + return cgradient_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgradient_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) + - y_true * exp((1. - power) * raw_prediction)) + + +cdef inline double_pair closs_grad_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair lg + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + lg.val1 = closs_half_squared_error(y_true, exp1) # loss + lg.val2 = exp1 * (exp1 - y_true) # gradient + elif power == 1.: + return closs_grad_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_grad_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss + lg.val2 = exp2 - y_true * exp1 # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_tweedie( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair gh + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + gh.val1 = exp1 * (exp1 - y_true) # gradient + gh.val2 = exp1 * (2 * exp1 - y_true) # hessian + elif power == 1.: + return cgrad_hess_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgrad_hess_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + gh.val1 = exp2 - y_true * exp1 # gradient + gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian + return gh + + +# Half Tweedie Deviance with identity link, without dropping constant terms! +# Therefore, best loss value is zero. +cdef inline double closs_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double tmp + if power == 0.: + return closs_half_squared_error(y_true, raw_prediction) + elif power == 1.: + if y_true == 0: + return raw_prediction + else: + return y_true * log(y_true/raw_prediction) + raw_prediction - y_true + elif power == 2.: + return log(raw_prediction/y_true) + y_true/raw_prediction - 1. + else: + tmp = pow(raw_prediction, 1. - power) + tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power) + if y_true > 0: + tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power)) + return tmp + + +cdef inline double cgradient_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + if power == 0.: + return raw_prediction - y_true + elif power == 1.: + return 1. - y_true / raw_prediction + elif power == 2.: + return (raw_prediction - y_true) / (raw_prediction * raw_prediction) + else: + return pow(raw_prediction, -power) * (raw_prediction - y_true) + + +cdef inline double_pair closs_grad_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair lg + cdef double tmp + if power == 0.: + lg.val2 = raw_prediction - y_true # gradient + lg.val1 = 0.5 * lg.val2 * lg.val2 # loss + elif power == 1.: + if y_true == 0: + lg.val1 = raw_prediction + else: + lg.val1 = (y_true * log(y_true/raw_prediction) # loss + + raw_prediction - y_true) + lg.val2 = 1. 
- y_true / raw_prediction # gradient + elif power == 2.: + lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. # loss + tmp = raw_prediction * raw_prediction + lg.val2 = (raw_prediction - y_true) / tmp # gradient + else: + tmp = pow(raw_prediction, 1. - power) + lg.val1 = (raw_prediction * tmp / (2. - power) # loss + - y_true * tmp / (1. - power)) + if y_true > 0: + lg.val1 += (pow(y_true, 2. - power) + / ((1. - power) * (2. - power))) + lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_tweedie_identity( + double y_true, + double raw_prediction, + double power +) noexcept nogil: + cdef double_pair gh + cdef double tmp + if power == 0.: + gh.val1 = raw_prediction - y_true # gradient + gh.val2 = 1. # hessian + elif power == 1.: + gh.val1 = 1. - y_true / raw_prediction # gradient + gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian + elif power == 2.: + tmp = raw_prediction * raw_prediction + gh.val1 = (raw_prediction - y_true) / tmp # gradient + gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian + else: + tmp = pow(raw_prediction, -power) + gh.val1 = tmp * (raw_prediction - y_true) # gradient + gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian + return gh + + +# Half Binomial deviance with logit-link, aka log-loss or binary cross entropy +cdef inline double closs_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + # log1p(exp(raw_prediction)) - y_true * raw_prediction + return log1pexp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + # gradient = y_pred - y_true = expit(raw_prediction) - y_true + # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/ + # if raw_prediction < 0: + # exp_tmp = exp(raw_prediction) + # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp) + # else: + # exp_tmp = exp(-raw_prediction) + # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + # Note that optimal speed would be achieved, at the cost of precision, by + # return expit(raw_prediction) - y_true + # i.e. no "if else" and an own inline implementation of expit instead of + # from scipy.special.cython_special cimport expit + # The case distinction raw_prediction < 0 in the stable implementation does not + # provide significant better precision apart from protecting overflow of exp(..). + # The branch (if else), however, can incur runtime costs of up to 30%. + # Instead, we help branch prediction by almost always ending in the first if clause + # and making the second branch (else) a bit simpler. This has the exact same + # precision but is faster than the stable implementation. + # As branching criteria, we use the same cutoff as in log1pexp. Note that the + # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731 + # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365. + cdef double exp_tmp + if raw_prediction > -37: + exp_tmp = exp(-raw_prediction) + return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + else: + # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37 + return exp(raw_prediction) - y_true + + +cdef inline double_pair closs_grad_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + # Same if else conditions as in log1pexp. 
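For comparison, an unbranched reference for the same loss and gradient can be written with NumPy/SciPy primitives; this sketch trades the branch tuning discussed above for brevity and is not the code used here:

import numpy as np
from scipy.special import expit  # logistic sigmoid

def half_binomial_loss_gradient_reference(y_true, raw_prediction):
    # log1pexp(x) == np.logaddexp(0, x); the gradient is y_pred - y_true.
    loss = np.logaddexp(0.0, raw_prediction) - y_true * raw_prediction
    gradient = expit(raw_prediction) - y_true
    return loss, gradient

y = np.array([0.0, 1.0, 1.0])
raw = np.array([-40.0, 0.0, 25.0])
loss, grad = half_binomial_loss_gradient_reference(y, raw)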
+ if raw_prediction <= -37: + lg.val2 = exp(raw_prediction) # used as temporary + lg.val1 = lg.val2 - y_true * raw_prediction # loss + lg.val2 -= y_true # gradient + elif raw_prediction <= -2: + lg.val2 = exp(raw_prediction) # used as temporary + lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss + lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient + elif raw_prediction <= 18: + lg.val2 = exp(-raw_prediction) # used as temporary + # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) + lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss + lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient + else: + lg.val2 = exp(-raw_prediction) # used as temporary + lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss + lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient + return lg + + +cdef inline double_pair cgrad_hess_half_binomial( + double y_true, + double raw_prediction +) noexcept nogil: + # with y_pred = expit(raw) + # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2 + # = exp(-raw) / (1 + exp(-raw))**2 + cdef double_pair gh + # See comment in cgradient_half_binomial. + if raw_prediction > -37: + gh.val2 = exp(-raw_prediction) # used as temporary + gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient + gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian + else: + gh.val2 = exp(raw_prediction) # = 1. order Taylor in exp(raw_prediction) + gh.val1 = gh.val2 - y_true + return gh + + +# Exponential loss with (half) logit-link, aka boosting loss +cdef inline double closs_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double tmp = exp(raw_prediction) + return y_true / tmp + (1 - y_true) * tmp + + +cdef inline double cgradient_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double tmp = exp(raw_prediction) + return -y_true / tmp + (1 - y_true) * tmp + + +cdef inline double_pair closs_grad_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + cdef double_pair lg + lg.val2 = exp(raw_prediction) # used as temporary + + lg.val1 = y_true / lg.val2 + (1 - y_true) * lg.val2 # loss + lg.val2 = -y_true / lg.val2 + (1 - y_true) * lg.val2 # gradient + return lg + + +cdef inline double_pair cgrad_hess_exponential( + double y_true, + double raw_prediction +) noexcept nogil: + # Note that hessian = loss + cdef double_pair gh + gh.val2 = exp(raw_prediction) # used as temporary + + gh.val1 = -y_true / gh.val2 + (1 - y_true) * gh.val2 # gradient + gh.val2 = y_true / gh.val2 + (1 - y_true) * gh.val2 # hessian + return gh + + +# --------------------------------------------------- +# Extension Types for Loss Functions of 1-dim targets +# --------------------------------------------------- +cdef class CyLossFunction: + """Base class for convex loss functions.""" + + def __reduce__(self): + return (self.__class__, ()) + + cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: + """Compute the loss for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The loss evaluated at `y_true` and `raw_prediction`. + """ + pass + + cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: + """Compute gradient of loss w.r.t. raw_prediction for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. 
+ raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The derivative of the loss function w.r.t. `raw_prediction`. + """ + pass + + cdef double_pair cy_grad_hess( + self, double y_true, double raw_prediction + ) noexcept nogil: + """Compute gradient and hessian. + + Gradient and hessian of loss w.r.t. raw_prediction for a single sample. + + This is usually diagonal in raw_prediction_i and raw_prediction_j. + Therefore, we return the diagonal element i=j. + + For a loss with a non-canonical link, this might implement the diagonal + of the Fisher matrix (=expected hessian) instead of the hessian. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double_pair + Gradient and hessian of the loss function w.r.t. `raw_prediction`. + """ + pass + + def loss( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + int n_threads=1 + ): + """Compute the point-wise loss value for each input. + + The point-wise loss is written to `loss_out` and no array is returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss_out : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Number of threads used by OpenMP (if any). + """ + pass + + def gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + """Compute gradient of loss w.r.t raw_prediction for each input. + + The gradient is written to `gradient_out` and no array is returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient_out : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Number of threads used by OpenMP (if any). + """ + pass + + def loss_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + """Compute loss and gradient of loss w.r.t raw_prediction. + + The loss and gradient are written to `loss_out` and `gradient_out` and no arrays + are returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss_out : array of shape (n_samples,) or None + A location into which the element-wise loss is stored. + gradient_out : array of shape (n_samples,) + A location into which the gradient is stored. + n_threads : int + Number of threads used by OpenMP (if any). 
+ """ + self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads) + self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads) + + def gradient_hessian( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + floating_out[::1] hessian_out, # OUT + int n_threads=1 + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + The gradient and hessian are written to `gradient_out` and `hessian_out` and no + arrays are returned. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient_out : array of shape (n_samples,) + A location into which the gradient is stored. + hessian_out : array of shape (n_samples,) + A location into which the hessian is stored. + n_threads : int + Number of threads used by OpenMP (if any). + """ + pass + + +{{for name, docstring, param, closs, closs_grad, cgrad, cgrad_hess, in class_list}} +{{py: +if param is None: + with_param = "" +else: + with_param = ", self." + param +}} + +cdef class {{name}}(CyLossFunction): + """{{docstring}}""" + + {{if param is not None}} + def __init__(self, {{param}}): + self.{{param}} = {{param}} + {{endif}} + + {{if param is not None}} + def __reduce__(self): + return (self.__class__, (self.{{param}},)) + {{endif}} + + cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil: + return {{closs}}(y_true, raw_prediction{{with_param}}) + + cdef inline double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil: + return {{cgrad}}(y_true, raw_prediction{{with_param}}) + + cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil: + return {{cgrad_hess}}(y_true, raw_prediction{{with_param}}) + + def loss( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) + + {{if closs_grad is not None}} + def loss_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double_pair dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) + loss_out[i] = dbl2.val1 + gradient_out[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) + loss_out[i] = sample_weight[i] * dbl2.val1 + gradient_out[i] = sample_weight[i] * dbl2.val2 + + {{endif}} + + def 
gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) + + def gradient_hessian( + self, + const floating_in[::1] y_true, # IN + const floating_in[::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] gradient_out, # OUT + floating_out[::1] hessian_out, # OUT + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double_pair dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) + gradient_out[i] = dbl2.val1 + hessian_out[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) + gradient_out[i] = sample_weight[i] * dbl2.val1 + hessian_out[i] = sample_weight[i] * dbl2.val2 + +{{endfor}} + + +# The multinomial deviance loss is also known as categorical cross-entropy or +# multinomial log-likelihood. +# Here, we do not inherit from CyLossFunction as its cy_gradient method deviates +# from the API. +cdef class CyHalfMultinomialLoss(): + """Half Multinomial deviance loss with multinomial logit link. + + Domain: + y_true in {0, 1, 2, 3, .., n_classes - 1} + y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded + + Link: + y_pred = softmax(raw_prediction) + + Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is + mapped to (y_true == k) for k = 0 .. n_classes - 1 which is either 0 or 1. + """ + + # Here we deviate from the CyLossFunction API. SAG/SAGA needs direct access to + # sample-wise gradients which we provide here. + cdef inline void cy_gradient( + self, + const floating_in y_true, + const floating_in[::1] raw_prediction, # IN + const floating_in sample_weight, + floating_out[::1] gradient_out, # OUT + ) noexcept nogil: + """Compute gradient of loss w.r.t. `raw_prediction` for a single sample. + + The gradient of the multinomial logistic loss with respect to a class k, + and for one sample is: + grad_k = - sw * (p[k] - (y==k)) + + where: + p[k] = proba[k] = exp(raw_prediction[k] - logsumexp(raw_prediction)) + sw = sample_weight + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : array of shape (n_classes,) + Raw prediction values (in link space). + sample_weight : double + Sample weight. + gradient_out : array of shape (n_classs,) + A location into which the gradient is stored. + + Returns + ------- + gradient : double + The derivative of the loss function w.r.t. `raw_prediction`. 
+ """ + cdef: + int k + int n_classes = raw_prediction.shape[0] + double_pair max_value_and_sum_exps + const floating_in[:, :] raw = raw_prediction[None, :] + + max_value_and_sum_exps = sum_exp_minus_max(0, raw, &gradient_out[0]) + for k in range(n_classes): + # gradient_out[k] = p_k = y_pred_k = prob of class k + gradient_out[k] /= max_value_and_sum_exps.val2 + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[k] = (gradient_out[k] - (y_true == k)) * sample_weight + + def _test_cy_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, ::1] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + ): + """For testing only.""" + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in [:, ::1] gradient_out + gradient = np.empty((n_samples, n_classes), dtype=np.float64) + gradient_out = gradient + + for i in range(n_samples): + self.cy_gradient( + y_true=y_true[i], + raw_prediction=raw_prediction[i, :], + sample_weight=1.0 if sample_weight is None else sample_weight[i], + gradient_out=gradient_out[i, :], + ) + return gradient + + # Note that we do not assume memory alignment/contiguity of 2d arrays. + # There seems to be little benefit in doing so. Benchmarks proofing the + # opposite are welcome. + def loss( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in max_value, sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + # We assume n_samples > n_classes. In this case having the inner loop + # over n_classes is a good default. + # TODO: If every memoryview is contiguous and raw_prediction is + # f-contiguous, can we write a better algo (loops) to improve + # performance? + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
+ p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + # label encoded y_true + k = int(y_true[i]) + loss_out[i] -= raw_prediction[i, k] + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + # label encoded y_true + k = int(y_true[i]) + loss_out[i] -= raw_prediction[i, k] + + loss_out[i] *= sample_weight[i] + + free(p) + + def loss_gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[::1] loss_out, # OUT + floating_out[:, :] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in max_value, sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss_out[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = p_k - (y_true == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + max_value = max_value_and_sum_exps.val1 + sum_exps = max_value_and_sum_exps.val2 + loss_out[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss_out[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + loss_out[i] *= sample_weight[i] + + free(p) + + def gradient( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
+ p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + free(p) + + def gradient_hessian( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + floating_out[:, :] hessian_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # hessian_k = p_k * (1 - p_k) + # gradient_k = p_k - (y_true == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) + hessian_out[i, k] = p[k] * (1. - p[k]) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + # hessian_k = p_k * (1 - p_k) * sw + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + hessian_out[i, k] = (p[k] * (1. - p[k])) * sample_weight[i] + + free(p) + + # This method simplifies the implementation of hessp in linear models, + # i.e. the matrix-vector product of the full hessian, not only of the + # diagonal (in the classes) approximation as implemented above. + def gradient_proba( + self, + const floating_in[::1] y_true, # IN + const floating_in[:, :] raw_prediction, # IN + const floating_in[::1] sample_weight, # IN + floating_out[:, :] gradient_out, # OUT + floating_out[:, :] proba_out, # OUT + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + floating_in sum_exps + floating_in* p # temporary buffer + double_pair max_value_and_sum_exps + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
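The per-class hessian p_k * (1 - p_k) stored above is the diagonal of the full per-sample multinomial hessian diag(p) - p p^T mentioned in the comment on gradient_proba; a small NumPy check, for illustration only:

import numpy as np
from scipy.special import softmax

p = softmax(np.array([[0.1, -0.3, 0.2]]), axis=1)[0]
full_hessian = np.diag(p) - np.outer(p, p)  # full per-sample hessian over classes
diag_approx = p * (1.0 - p)                 # what gradient_hessian stores per class
assert np.allclose(np.diag(full_hessian), diag_approx)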
+ p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(floating_in) * (n_classes)) + + for i in prange(n_samples, schedule='static'): + max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p) + sum_exps = max_value_and_sum_exps.val2 + + for k in range(n_classes): + proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i] + + free(p) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/link.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/link.py new file mode 100644 index 0000000000000000000000000000000000000000..53dff6c2e928541ce58bb71c484e59622143104d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/link.py @@ -0,0 +1,282 @@ +""" +Module contains classes for invertible (and differentiable) link functions. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABC, abstractmethod +from dataclasses import dataclass + +import numpy as np +from scipy.special import expit, logit +from scipy.stats import gmean + +from ..utils.extmath import softmax + + +@dataclass +class Interval: + low: float + high: float + low_inclusive: bool + high_inclusive: bool + + def __post_init__(self): + """Check that low <= high""" + if self.low > self.high: + raise ValueError( + f"One must have low <= high; got low={self.low}, high={self.high}." + ) + + def includes(self, x): + """Test whether all values of x are in interval range. + + Parameters + ---------- + x : ndarray + Array whose elements are tested to be in interval range. + + Returns + ------- + result : bool + """ + if self.low_inclusive: + low = np.greater_equal(x, self.low) + else: + low = np.greater(x, self.low) + + if not np.all(low): + return False + + if self.high_inclusive: + high = np.less_equal(x, self.high) + else: + high = np.less(x, self.high) + + # Note: np.all returns numpy.bool_ + return bool(np.all(high)) + + +def _inclusive_low_high(interval, dtype=np.float64): + """Generate values low and high to be within the interval range. + + This is used in tests only. + + Returns + ------- + low, high : tuple + The returned values low and high lie within the interval. + """ + eps = 10 * np.finfo(dtype).eps + if interval.low == -np.inf: + low = -1e10 + elif interval.low < 0: + low = interval.low * (1 - eps) + eps + else: + low = interval.low * (1 + eps) + eps + + if interval.high == np.inf: + high = 1e10 + elif interval.high < 0: + high = interval.high * (1 + eps) - eps + else: + high = interval.high * (1 - eps) - eps + + return low, high + + +class BaseLink(ABC): + """Abstract base class for differentiable, invertible link functions. + + Convention: + - link function g: raw_prediction = g(y_pred) + - inverse link h: y_pred = h(raw_prediction) + + For (generalized) linear models, `raw_prediction = X @ coef` is the so + called linear predictor, and `y_pred = h(raw_prediction)` is the predicted + conditional (on X) expected value of the target `y_true`. 
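A quick illustration of this convention using one of the concrete link classes defined below (a private scikit-learn module, not public API; the example values are arbitrary):

import numpy as np
from sklearn._loss.link import LogitLink

link = LogitLink()                       # g = logit, h = expit
y_pred = np.array([0.1, 0.5, 0.9])
raw_prediction = link.link(y_pred)       # raw_prediction = g(y_pred)
assert np.allclose(link.inverse(raw_prediction), y_pred)
assert link.interval_y_pred.includes(y_pred)  # y_pred must lie in (0, 1)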
+ + The methods are not implemented as staticmethods in case a link function needs + parameters. + """ + + is_multiclass = False # used for testing only + + # Usually, raw_prediction may be any real number and y_pred is an open + # interval. + # interval_raw_prediction = Interval(-np.inf, np.inf, False, False) + interval_y_pred = Interval(-np.inf, np.inf, False, False) + + @abstractmethod + def link(self, y_pred, out=None): + """Compute the link function g(y_pred). + + The link function maps (predicted) target values to raw predictions, + i.e. `g(y_pred) = raw_prediction`. + + Parameters + ---------- + y_pred : array + Predicted target values. + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise link function. + """ + + @abstractmethod + def inverse(self, raw_prediction, out=None): + """Compute the inverse link function h(raw_prediction). + + The inverse link function maps raw predictions to predicted target + values, i.e. `h(raw_prediction) = y_pred`. + + Parameters + ---------- + raw_prediction : array + Raw prediction values (in link space). + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise inverse link function. + """ + + +class IdentityLink(BaseLink): + """The identity link function g(x)=x.""" + + def link(self, y_pred, out=None): + if out is not None: + np.copyto(out, y_pred) + return out + else: + return y_pred + + inverse = link + + +class LogLink(BaseLink): + """The log link function g(x)=log(x).""" + + interval_y_pred = Interval(0, np.inf, False, False) + + def link(self, y_pred, out=None): + return np.log(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return np.exp(raw_prediction, out=out) + + +class LogitLink(BaseLink): + """The logit link function g(x)=logit(x).""" + + interval_y_pred = Interval(0, 1, False, False) + + def link(self, y_pred, out=None): + return logit(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return expit(raw_prediction, out=out) + + +class HalfLogitLink(BaseLink): + """Half the logit link function g(x)=1/2 * logit(x). + + Used for the exponential loss. + """ + + interval_y_pred = Interval(0, 1, False, False) + + def link(self, y_pred, out=None): + out = logit(y_pred, out=out) + out *= 0.5 + return out + + def inverse(self, raw_prediction, out=None): + return expit(2 * raw_prediction, out) + + +class MultinomialLogit(BaseLink): + """The symmetric multinomial logit function. + + Convention: + - y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + + Notes: + - The inverse link h is the softmax function. + - The sum is over the second axis, i.e. axis=1 (n_classes). + + We have to choose additional constraints in order to make + + y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1) + + for n_classes classes identifiable and invertible. + We choose the symmetric side constraint where the geometric mean response + is set as reference category, see [2]: + + The symmetric multinomial logit link function for a single data point is + then defined as + + raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred)) + = log(y_pred[k]) - mean(log(y_pred)). 
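The same link written out in NumPy/SciPy terms, as a sketch checking that it is inverted by the softmax (array values are arbitrary):

import numpy as np
from scipy.special import softmax
from scipy.stats import gmean

y_pred = np.array([[0.2, 0.3, 0.5]])
raw = np.log(y_pred / gmean(y_pred, axis=1)[:, np.newaxis])  # symmetric multinomial logit
assert np.allclose(raw.sum(axis=1), 0.0)          # raw predictions are mean centered
assert np.allclose(softmax(raw, axis=1), y_pred)  # softmax inverts the link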
+ + Note that this is equivalent to the definition in [1] and implies mean + centered raw predictions: + + sum(raw_prediction[k], k=0..n_classes-1) = 0. + + For linear models with raw_prediction = X @ coef, this corresponds to + sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every + feature is zero. + + Reference + --------- + .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive + logistic regression: a statistical view of boosting" Ann. Statist. + 28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223. + https://projecteuclid.org/euclid.aos/1016218223 + + .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for + multinomial logit models with symmetric side constraints." + Computational Statistics 28 (2013): 1017-1034. + http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf + """ + + is_multiclass = True + interval_y_pred = Interval(0, 1, False, False) + + def symmetrize_raw_prediction(self, raw_prediction): + return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis] + + def link(self, y_pred, out=None): + # geometric mean as reference category + gm = gmean(y_pred, axis=1) + return np.log(y_pred / gm[:, np.newaxis], out=out) + + def inverse(self, raw_prediction, out=None): + if out is None: + return softmax(raw_prediction, copy=True) + else: + np.copyto(out, raw_prediction) + softmax(out, copy=False) + return out + + +_LINKS = { + "identity": IdentityLink, + "log": LogLink, + "logit": LogitLink, + "half_logit": HalfLogitLink, + "multinomial_logit": MultinomialLogit, +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b45ff3322699aa26533d504be6407f9d5acbb5b8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py @@ -0,0 +1,1181 @@ +""" +This module contains loss classes suitable for fitting. + +It is not part of the public API. +Specific losses are used for regression, binary classification or multiclass +classification. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Goals: +# - Provide a common private module for loss functions/classes. +# - To be used in: +# - LogisticRegression +# - PoissonRegressor, GammaRegressor, TweedieRegressor +# - HistGradientBoostingRegressor, HistGradientBoostingClassifier +# - GradientBoostingRegressor, GradientBoostingClassifier +# - SGDRegressor, SGDClassifier +# - Replace link module of GLMs. + +import numbers + +import numpy as np +from scipy.special import xlogy + +from ..utils import check_scalar +from ..utils.stats import _weighted_percentile +from ._loss import ( + CyAbsoluteError, + CyExponentialLoss, + CyHalfBinomialLoss, + CyHalfGammaLoss, + CyHalfMultinomialLoss, + CyHalfPoissonLoss, + CyHalfSquaredError, + CyHalfTweedieLoss, + CyHalfTweedieLossIdentity, + CyHuberLoss, + CyPinballLoss, +) +from .link import ( + HalfLogitLink, + IdentityLink, + Interval, + LogitLink, + LogLink, + MultinomialLogit, +) + + +# Note: The shape of raw_prediction for multiclass classifications are +# - GradientBoostingClassifier: (n_samples, n_classes) +# - HistGradientBoostingClassifier: (n_classes, n_samples) +# +# Note: Instead of inheritance like +# +# class BaseLoss(BaseLink, CyLossFunction): +# ... 
+# +# # Note: Naturally, we would inherit in the following order +# # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) +# # But because of https://github.com/cython/cython/issues/4350 we set BaseLoss as +# # the last one. This, of course, changes the MRO. +# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): +# +# we use composition. This way we improve maintainability by avoiding the above +# mentioned Cython edge case and have easier to understand code (which method calls +# which code). +class BaseLoss: + """Base class for a loss function of 1-dimensional targets. + + Conventions: + + - y_true.shape = sample_weight.shape = (n_samples,) + - y_pred.shape = raw_prediction.shape = (n_samples,) + - If is_multiclass is true (multiclass classification), then + y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + Note that this corresponds to the return value of decision_function. + + y_true, y_pred, sample_weight and raw_prediction must either be all float64 + or all float32. + gradient and hessian must be either both float64 or both float32. + + Note that y_pred = link.inverse(raw_prediction). + + Specific loss classes can inherit specific link classes to satisfy + BaseLink's abstractmethods. + + Parameters + ---------- + sample_weight : {None, ndarray} + If sample_weight is None, the hessian might be constant. + n_classes : {None, int} + The number of classes for classification, else None. + + Attributes + ---------- + closs: CyLossFunction + link : BaseLink + interval_y_true : Interval + Valid interval for y_true + interval_y_pred : Interval + Valid Interval for y_pred + differentiable : bool + Indicates whether or not loss function is differentiable in + raw_prediction everywhere. + need_update_leaves_values : bool + Indicates whether decision trees in gradient boosting need to uptade + leave values after having been fit to the (negative) gradients. + approx_hessian : bool + Indicates whether the hessian is approximated or exact. If, + approximated, it should be larger or equal to the exact one. + constant_hessian : bool + Indicates whether the hessian is one for this loss. + is_multiclass : bool + Indicates whether n_classes > 2 is allowed. + """ + + # For gradient boosted decision trees: + # This variable indicates whether the loss requires the leaves values to + # be updated once the tree has been trained. The trees are trained to + # predict a Newton-Raphson step (see grower._finalize_leaf()). But for + # some losses (e.g. least absolute deviation) we need to adjust the tree + # values to account for the "line search" of the gradient descent + # procedure. See the original paper Greedy Function Approximation: A + # Gradient Boosting Machine by Friedman + # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. + differentiable = True + need_update_leaves_values = False + is_multiclass = False + + def __init__(self, closs, link, n_classes=None): + self.closs = closs + self.link = link + self.approx_hessian = False + self.constant_hessian = False + self.n_classes = n_classes + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + self.interval_y_pred = self.link.interval_y_pred + + def in_y_true_range(self, y): + """Return True if y is in the valid range of y_true. + + Parameters + ---------- + y : ndarray + """ + return self.interval_y_true.includes(y) + + def in_y_pred_range(self, y): + """Return True if y is in the valid range of y_pred. 
+ + Parameters + ---------- + y : ndarray + """ + return self.interval_y_pred.includes(y) + + def loss( + self, + y_true, + raw_prediction, + sample_weight=None, + loss_out=None, + n_threads=1, + ): + """Compute the pointwise loss value for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss_out : None or C-contiguous array of shape (n_samples,) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + """ + if loss_out is None: + loss_out = np.empty_like(y_true) + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + + self.closs.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=loss_out, + n_threads=n_threads, + ) + return loss_out + + def loss_gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + loss_out=None, + gradient_out=None, + n_threads=1, + ): + """Compute loss and gradient w.r.t. raw_prediction for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss_out : None or C-contiguous array of shape (n_samples,) + A location into which the loss is stored. If None, a new array + might be created. + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if loss_out is None: + if gradient_out is None: + loss_out = np.empty_like(y_true) + gradient_out = np.empty_like(raw_prediction) + else: + loss_out = np.empty_like(y_true, dtype=gradient_out.dtype) + elif gradient_out is None: + gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) + + self.closs.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=loss_out, + gradient_out=gradient_out, + n_threads=n_threads, + ) + return loss_out, gradient_out + + def gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient_out=None, + n_threads=1, + ): + """Compute gradient of loss w.r.t raw_prediction for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. 
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if gradient_out is None: + gradient_out = np.empty_like(raw_prediction) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) + + self.closs.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=gradient_out, + n_threads=n_threads, + ) + return gradient_out + + def gradient_hessian( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient_out=None, + hessian_out=None, + n_threads=1, + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + hessian_out : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the hessian is stored. If None, a new array + might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + gradient : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + + hessian : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise hessians. + """ + if gradient_out is None: + if hessian_out is None: + gradient_out = np.empty_like(raw_prediction) + hessian_out = np.empty_like(raw_prediction) + else: + gradient_out = np.empty_like(hessian_out) + elif hessian_out is None: + hessian_out = np.empty_like(gradient_out) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) + if hessian_out.ndim == 2 and hessian_out.shape[1] == 1: + hessian_out = hessian_out.squeeze(1) + + self.closs.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=gradient_out, + hessian_out=hessian_out, + n_threads=n_threads, + ) + return gradient_out, hessian_out + + def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1): + """Compute the weighted average loss. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. 
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + loss : float + Mean or averaged loss function. + """ + return np.average( + self.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + loss_out=None, + n_threads=n_threads, + ), + weights=sample_weight, + ) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This can be used as initial estimates of predictions, i.e. before the + first iteration in fit. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Observed, true target values. + sample_weight : None or array of shape (n_samples,) + Sample weights. + + Returns + ------- + raw_prediction : numpy scalar or array of shape (n_classes,) + Raw predictions of an intercept-only model. + """ + # As default, take weighted average of the target over the samples + # axis=0 and then transform into link-scale (raw_prediction). + y_pred = np.average(y_true, weights=sample_weight, axis=0) + eps = 10 * np.finfo(y_pred.dtype).eps + + if self.interval_y_pred.low == -np.inf: + a_min = None + elif self.interval_y_pred.low_inclusive: + a_min = self.interval_y_pred.low + else: + a_min = self.interval_y_pred.low + eps + + if self.interval_y_pred.high == np.inf: + a_max = None + elif self.interval_y_pred.high_inclusive: + a_max = self.interval_y_pred.high + else: + a_max = self.interval_y_pred.high - eps + + if a_min is None and a_max is None: + return self.link.link(y_pred) + else: + return self.link.link(np.clip(y_pred, a_min, a_max)) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + """Calculate term dropped in loss. + + With this term added, the loss of perfect predictions is zero. + """ + return np.zeros_like(y_true) + + def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"): + """Initialize arrays for gradients and hessians. + + Unless hessians are constant, arrays are initialized with undefined values. + + Parameters + ---------- + n_samples : int + The number of samples, usually passed to `fit()`. + dtype : {np.float64, np.float32}, default=np.float64 + The dtype of the arrays gradient and hessian. + order : {'C', 'F'}, default='F' + Order of the arrays gradient and hessian. The default 'F' makes the arrays + contiguous along samples. + + Returns + ------- + gradient : C-contiguous array of shape (n_samples,) or array of shape \ + (n_samples, n_classes) + Empty array (allocated but not initialized) to be used as argument + gradient_out. + hessian : C-contiguous array of shape (n_samples,), array of shape + (n_samples, n_classes) or shape (1,) + Empty (allocated but not initialized) array to be used as argument + hessian_out. + If constant_hessian is True (e.g. `HalfSquaredError`), the array is + initialized to ``1``. + """ + if dtype not in (np.float32, np.float64): + raise ValueError( + "Valid options for 'dtype' are np.float32 and np.float64. " + f"Got dtype={dtype} instead." + ) + + if self.is_multiclass: + shape = (n_samples, self.n_classes) + else: + shape = (n_samples,) + gradient = np.empty(shape=shape, dtype=dtype, order=order) + + if self.constant_hessian: + # If the hessians are constant, we consider them equal to 1. 
+ # - This is correct for HalfSquaredError + # - For AbsoluteError, hessians are actually 0, but they are + # always ignored anyway. + hessian = np.ones(shape=(1,), dtype=dtype) + else: + hessian = np.empty(shape=shape, dtype=dtype, order=order) + + return gradient, hessian + + +# Note: Naturally, we would inherit in the following order +# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) +# But because of https://github.com/cython/cython/issues/4350 we +# set BaseLoss as the last one. This, of course, changes the MRO. +class HalfSquaredError(BaseLoss): + """Half squared error with identity link, for regression. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, half squared error is defined as:: + + loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2 + + The factor of 0.5 simplifies the computation of gradients and results in a + unit hessian (and is consistent with what is done in LightGBM). It is also + half the Normal distribution deviance. + """ + + def __init__(self, sample_weight=None): + super().__init__(closs=CyHalfSquaredError(), link=IdentityLink()) + self.constant_hessian = sample_weight is None + + +class AbsoluteError(BaseLoss): + """Absolute error with identity link, for regression. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, the absolute error is defined as:: + + loss(x_i) = |y_true_i - raw_prediction_i| + + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None): + super().__init__(closs=CyAbsoluteError(), link=IdentityLink()) + self.approx_hessian = True + self.constant_hessian = sample_weight is None + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.median(y_true, axis=0) + else: + return _weighted_percentile(y_true, sample_weight, 50) + + +class PinballLoss(BaseLoss): + """Quantile loss aka pinball loss, for regression. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + For a given sample x_i, the pinball loss is defined as:: + + loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) + + rho_{quantile}(u) = u * (quantile - 1_{u<0}) + = -u *(1 - quantile) if u < 0 + u * quantile if u >= 0 + + Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). + + Note that the exact hessian = 0 almost everywhere (except at one point, therefore + differentiable = False). Optimization routines like in HGBT, however, need a + hessian > 0. Therefore, we assign 1. + + Additional Attributes + --------------------- + quantile : float + The quantile level of the quantile to be estimated. Must be in range (0, 1). 
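As an editor-added aside, the two ways of writing rho_{quantile} in the PinballLoss docstring above can be checked to agree; `u` below stands for the residual y_true - raw_prediction, and the snippet is illustrative only.

import numpy as np

quantile = 0.25
u = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])          # u = y_true - raw_prediction

rho_compact = u * (quantile - (u < 0))              # u * (quantile - 1_{u<0})
rho_branch = np.where(u < 0, -u * (1 - quantile), u * quantile)
np.testing.assert_allclose(rho_compact, rho_branch)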
+ """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None, quantile=0.5): + check_scalar( + quantile, + "quantile", + target_type=numbers.Real, + min_val=0, + max_val=1, + include_boundaries="neither", + ) + super().__init__( + closs=CyPinballLoss(quantile=float(quantile)), + link=IdentityLink(), + ) + self.approx_hessian = True + self.constant_hessian = sample_weight is None + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.percentile(y_true, 100 * self.closs.quantile, axis=0) + else: + return _weighted_percentile( + y_true, sample_weight, 100 * self.closs.quantile + ) + + +class HuberLoss(BaseLoss): + """Huber loss, for regression. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + For a given sample x_i, the Huber loss is defined as:: + + loss(x_i) = 1/2 * abserr**2 if abserr <= delta + delta * (abserr - delta/2) if abserr > delta + + abserr = |y_true_i - raw_prediction_i| + delta = quantile(abserr, self.quantile) + + Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0) + equals delta * (AbsoluteError() - delta/2). + + Additional Attributes + --------------------- + quantile : float + The quantile level which defines the breaking point `delta` to distinguish + between absolute error and squared error. Must be in range (0, 1). + + Reference + --------- + .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient + boosting machine <10.1214/aos/1013203451>`. + Annals of Statistics, 29, 1189-1232. + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None, quantile=0.9, delta=0.5): + check_scalar( + quantile, + "quantile", + target_type=numbers.Real, + min_val=0, + max_val=1, + include_boundaries="neither", + ) + self.quantile = quantile # This is better stored outside of Cython. + super().__init__( + closs=CyHuberLoss(delta=float(delta)), + link=IdentityLink(), + ) + self.approx_hessian = True + self.constant_hessian = False + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + # See formula before algo 4 in Friedman (2001), but we apply it to y_true, + # not to the residual y_true - raw_prediction. An estimator like + # HistGradientBoostingRegressor might then call it on the residual, e.g. + # fit_intercept_only(y_true - raw_prediction). + if sample_weight is None: + median = np.percentile(y_true, 50, axis=0) + else: + median = _weighted_percentile(y_true, sample_weight, 50) + diff = y_true - median + term = np.sign(diff) * np.minimum(self.closs.delta, np.abs(diff)) + return median + np.average(term, weights=sample_weight) + + +class HalfPoissonLoss(BaseLoss): + """Half Poisson deviance loss with log-link, for regression. 
+ + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half the Poisson deviance is defined as:: + + loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i)) + - y_true_i + exp(raw_prediction_i) + + Half the Poisson deviance is actually the negative log-likelihood up to + constant terms (not involving raw_prediction) and simplifies the + computation of the gradients. + We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`. + """ + + def __init__(self, sample_weight=None): + super().__init__(closs=CyHalfPoissonLoss(), link=LogLink()) + self.interval_y_true = Interval(0, np.inf, True, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = xlogy(y_true, y_true) - y_true + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfGammaLoss(BaseLoss): + """Half Gamma deviance loss with log-link, for regression. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Gamma deviance loss is defined as:: + + loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + + y_true/exp(raw_prediction_i) - 1 + + Half the Gamma deviance is actually proportional to the negative log- + likelihood up to constant terms (not involving raw_prediction) and + simplifies the computation of the gradients. + We also skip the constant term `-log(y_true_i) - 1`. + """ + + def __init__(self, sample_weight=None): + super().__init__(closs=CyHalfGammaLoss(), link=LogLink()) + self.interval_y_true = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = -np.log(y_true) - 1 + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfTweedieLoss(BaseLoss): + """Half Tweedie deviance loss with log-link, for regression. + + Domain: + y_true in real numbers for power <= 0 + y_true in non-negative real numbers for 0 < power < 2 + y_true in positive real numbers for 2 <= power + y_pred in positive real numbers + power in real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Tweedie deviance loss with p=power is defined + as:: + + loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p) + - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p) + + exp(raw_prediction_i)**(2-p) / (2-p) + + Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link, + HalfPoissonLoss and HalfGammaLoss. + + We also skip constant terms, but those are different for p=0, 1, 2. + Therefore, the loss is not continuous in `power`. + + Note furthermore that although no Tweedie distribution exists for + 0 < power < 1, it still gives a strictly consistent scoring function for + the expectation. 
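Illustrative sketch, not from the sources: the half Poisson deviance in link space as described above, where adding back the dropped constant y*log(y) - y (what `constant_to_optimal_zero` returns) recovers half the full Poisson deviance.

import numpy as np
from scipy.special import xlogy

y_true = np.array([0.0, 1.0, 3.0])
raw_prediction = np.array([0.5, 0.0, 1.0])     # log link: y_pred = exp(raw_prediction)
y_pred = np.exp(raw_prediction)

# Per-sample loss with the constant y*log(y) - y dropped, as the docstring notes.
loss = y_pred - y_true * raw_prediction
half_deviance = xlogy(y_true, y_true / y_pred) - y_true + y_pred
np.testing.assert_allclose(loss + xlogy(y_true, y_true) - y_true, half_deviance)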
+ """ + + def __init__(self, sample_weight=None, power=1.5): + super().__init__( + closs=CyHalfTweedieLoss(power=float(power)), + link=LogLink(), + ) + if self.closs.power <= 0: + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + elif self.closs.power < 2: + self.interval_y_true = Interval(0, np.inf, True, False) + else: + self.interval_y_true = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + if self.closs.power == 0: + return HalfSquaredError().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.closs.power == 1: + return HalfPoissonLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.closs.power == 2: + return HalfGammaLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + else: + p = self.closs.power + term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p) + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfTweedieLossIdentity(BaseLoss): + """Half Tweedie deviance loss with identity link, for regression. + + Domain: + y_true in real numbers for power <= 0 + y_true in non-negative real numbers for 0 < power < 2 + y_true in positive real numbers for 2 <= power + y_pred in positive real numbers for power != 0 + y_pred in real numbers for power = 0 + power in real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, half Tweedie deviance loss with p=power is defined + as:: + + loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p) + - y_true_i * raw_prediction_i**(1-p) / (1-p) + + raw_prediction_i**(2-p) / (2-p) + + Note that the minimum value of this loss is 0. + + Note furthermore that although no Tweedie distribution exists for + 0 < power < 1, it still gives a strictly consistent scoring function for + the expectation. + """ + + def __init__(self, sample_weight=None, power=1.5): + super().__init__( + closs=CyHalfTweedieLossIdentity(power=float(power)), + link=IdentityLink(), + ) + if self.closs.power <= 0: + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + elif self.closs.power < 2: + self.interval_y_true = Interval(0, np.inf, True, False) + else: + self.interval_y_true = Interval(0, np.inf, False, False) + + if self.closs.power == 0: + self.interval_y_pred = Interval(-np.inf, np.inf, False, False) + else: + self.interval_y_pred = Interval(0, np.inf, False, False) + + +class HalfBinomialLoss(BaseLoss): + """Half Binomial deviance loss with logit link, for binary classification. + + This is also know as binary cross entropy, log-loss and logistic loss. + + Domain: + y_true in [0, 1], i.e. regression on the unit interval + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(raw_prediction) + + For a given sample x_i, half Binomial deviance is defined as the negative + log-likelihood of the Binomial/Bernoulli distribution and can be expressed + as:: + + loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i + + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, + section 4.4.1 (about logistic regression). + + Note that the formulation works for classification, y = {0, 1}, as well as + logistic regression, y = [0, 1]. + If you add `constant_to_optimal_zero` to the loss, you get half the + Bernoulli/binomial deviance. 
+ + More details: Inserting the predicted probability y_pred = expit(raw_prediction) + in the loss gives the well known:: + + loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i) + """ + + def __init__(self, sample_weight=None): + super().__init__( + closs=CyHalfBinomialLoss(), + link=LogitLink(), + n_classes=2, + ) + self.interval_y_true = Interval(0, 1, True, True) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + # This is non-zero only if y_true is neither 0 nor 1. + term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true) + if sample_weight is not None: + term *= sample_weight + return term + + def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples,) or (n_samples, 1) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, 2) + Element-wise class probabilities. + """ + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype) + proba[:, 1] = self.link.inverse(raw_prediction) + proba[:, 0] = 1 - proba[:, 1] + return proba + + +class HalfMultinomialLoss(BaseLoss): + """Categorical cross-entropy loss, for multiclass classification. + + Domain: + y_true in {0, 1, 2, 3, .., n_classes - 1} + y_pred has n_classes elements, each element in (0, 1) + + Link: + y_pred = softmax(raw_prediction) + + Note: We assume y_true to be already label encoded. The inverse link is + softmax. But the full link function is the symmetric multinomial logit + function. + + For a given sample x_i, the categorical cross-entropy loss is defined as + the negative log-likelihood of the multinomial distribution, it + generalizes the binary cross-entropy to more than 2 classes:: + + loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1)) + - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1) + + See [1]. + + Note that for the hessian, we calculate only the diagonal part in the + classes: If the full hessian for classes k and l and sample i is H_i_k_l, + we calculate H_i_k_k, i.e. k=l. + + Reference + --------- + .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie. + "A Blockwise Descent Algorithm for Group-penalized Multiresponse and + Multinomial Regression". + <1311.6529>` + """ + + is_multiclass = True + + def __init__(self, sample_weight=None, n_classes=3): + super().__init__( + closs=CyHalfMultinomialLoss(), + link=MultinomialLogit(), + n_classes=n_classes, + ) + self.interval_y_true = Interval(0, np.inf, True, False) + self.interval_y_pred = Interval(0, 1, False, False) + + def in_y_true_range(self, y): + """Return True if y is in the valid range of y_true. + + Parameters + ---------- + y : ndarray + """ + return self.interval_y_true.includes(y) and np.all(y.astype(int) == y) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the softmax of the weighted average of the target, i.e. over + the samples axis=0. + """ + out = np.zeros(self.n_classes, dtype=y_true.dtype) + eps = np.finfo(y_true.dtype).eps + for k in range(self.n_classes): + out[k] = np.average(y_true == k, weights=sample_weight, axis=0) + out[k] = np.clip(out[k], eps, 1 - eps) + return self.link.link(out[None, :]).reshape(-1) + + def predict_proba(self, raw_prediction): + """Predict probabilities. 
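A short, editor-added check of the equivalence stated above: the link-space form log(1 + exp(raw)) - y*raw matches the probability-space log loss once y_pred = expit(raw_prediction).

import numpy as np
from scipy.special import expit

y_true = np.array([0.0, 1.0, 1.0, 0.25])
raw_prediction = np.array([-2.0, 0.5, 3.0, 0.0])

# log(1 + exp(raw)) - y * raw, evaluated in a numerically stable way.
loss_link = np.logaddexp(0.0, raw_prediction) - y_true * raw_prediction
y_pred = expit(raw_prediction)
loss_proba = -y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred)
np.testing.assert_allclose(loss_link, loss_proba)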
+ + Parameters + ---------- + raw_prediction : array of shape (n_samples, n_classes) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, n_classes) + Element-wise class probabilities. + """ + return self.link.inverse(raw_prediction) + + def gradient_proba( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient_out=None, + proba_out=None, + n_threads=1, + ): + """Compute gradient and class probabilities fow raw_prediction. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient_out : None or array of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + proba_out : None or array of shape (n_samples, n_classes) + A location into which the class probabilities are stored. If None, + a new array might be created. + n_threads : int, default=1 + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples, n_classes) + Element-wise gradients. + + proba : array of shape (n_samples, n_classes) + Element-wise class probabilities. + """ + if gradient_out is None: + if proba_out is None: + gradient_out = np.empty_like(raw_prediction) + proba_out = np.empty_like(raw_prediction) + else: + gradient_out = np.empty_like(proba_out) + elif proba_out is None: + proba_out = np.empty_like(gradient_out) + + self.closs.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=gradient_out, + proba_out=proba_out, + n_threads=n_threads, + ) + return gradient_out, proba_out + + +class ExponentialLoss(BaseLoss): + """Exponential loss with (half) logit link, for binary classification. + + This is also know as boosting loss. + + Domain: + y_true in [0, 1], i.e. regression on the unit interval + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(2 * raw_prediction) + + For a given sample x_i, the exponential loss is defined as:: + + loss(x_i) = y_true_i * exp(-raw_pred_i)) + (1 - y_true_i) * exp(raw_pred_i) + + See: + - J. Friedman, T. Hastie, R. Tibshirani. + "Additive logistic regression: a statistical view of boosting (With discussion + and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000. + https://doi.org/10.1214/aos/1016218223 + - A. Buja, W. Stuetzle, Y. Shen. (2005). + "Loss Functions for Binary Class Probability Estimation and Classification: + Structure and Applications." + + Note that the formulation works for classification, y = {0, 1}, as well as + "exponential logistic" regression, y = [0, 1]. + Note that this is a proper scoring rule, but without it's canonical link. + + More details: Inserting the predicted probability + y_pred = expit(2 * raw_prediction) in the loss gives:: + + loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i) + + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i)) + """ + + def __init__(self, sample_weight=None): + super().__init__( + closs=CyExponentialLoss(), + link=HalfLogitLink(), + n_classes=2, + ) + self.interval_y_true = Interval(0, 1, True, True) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + # This is non-zero only if y_true is neither 0 nor 1. 
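Editorial sketch of the categorical cross-entropy formula quoted for HalfMultinomialLoss: loss_i = logsumexp(raw_i) - raw_i[y_i], which equals the negative log of the softmax probability of the true class (the softmax being the inverse link used by predict_proba).

import numpy as np
from scipy.special import logsumexp, softmax

raw_prediction = np.array([[0.2, 0.5, 0.3],
                           [2.0, -1.0, 0.0]])
y_true = np.array([1, 0])                        # label-encoded classes

rows = np.arange(y_true.shape[0])
loss = logsumexp(raw_prediction, axis=1) - raw_prediction[rows, y_true]
proba = softmax(raw_prediction, axis=1)          # inverse link
np.testing.assert_allclose(loss, -np.log(proba[rows, y_true]))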
+ term = -2 * np.sqrt(y_true * (1 - y_true)) + if sample_weight is not None: + term *= sample_weight + return term + + def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples,) or (n_samples, 1) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, 2) + Element-wise class probabilities. + """ + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype) + proba[:, 1] = self.link.inverse(raw_prediction) + proba[:, 0] = 1 - proba[:, 1] + return proba + + +_LOSSES = { + "squared_error": HalfSquaredError, + "absolute_error": AbsoluteError, + "pinball_loss": PinballLoss, + "huber_loss": HuberLoss, + "poisson_loss": HalfPoissonLoss, + "gamma_loss": HalfGammaLoss, + "tweedie_loss": HalfTweedieLoss, + "binomial_loss": HalfBinomialLoss, + "multinomial_loss": HalfMultinomialLoss, + "exponential_loss": ExponentialLoss, +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build b/.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..a4b3425a21cd21b6dfa69d28ac688ede94ef2bea --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build @@ -0,0 +1,23 @@ +# .pyx is generated, so this is needed to make Cython compilation work +_loss_cython_tree = [ + fs.copyfile('_loss.pxd') +] + +_loss_pyx = custom_target( + '_loss_pyx', + output: '_loss.pyx', + input: '_loss.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: _loss_cython_tree, +) + +py.extension_module( + '_loss', + cython_gen.process(_loss_pyx), + dependencies: [openmp_dep], + install: true, + subdir: 'sklearn/_loss', +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_link.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_link.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a665f8d48ac9e356971346774a125b18d234d9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_link.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn._loss.link import ( + _LINKS, + HalfLogitLink, + Interval, + MultinomialLogit, + _inclusive_low_high, +) + +LINK_FUNCTIONS = list(_LINKS.values()) + + +def test_interval_raises(): + """Test that interval with low > high raises ValueError.""" + with pytest.raises( + ValueError, match="One must have low <= high; got low=1, high=0." 
+ ): + Interval(1, 0, False, False) + + +@pytest.mark.parametrize( + "interval", + [ + Interval(0, 1, False, False), + Interval(0, 1, False, True), + Interval(0, 1, True, False), + Interval(0, 1, True, True), + Interval(-np.inf, np.inf, False, False), + Interval(-np.inf, np.inf, False, True), + Interval(-np.inf, np.inf, True, False), + Interval(-np.inf, np.inf, True, True), + Interval(-10, -1, False, False), + Interval(-10, -1, False, True), + Interval(-10, -1, True, False), + Interval(-10, -1, True, True), + ], +) +def test_is_in_range(interval): + # make sure low and high are always within the interval, used for linspace + low, high = _inclusive_low_high(interval) + + x = np.linspace(low, high, num=10) + assert interval.includes(x) + + # x contains lower bound + assert interval.includes(np.r_[x, interval.low]) == interval.low_inclusive + + # x contains upper bound + assert interval.includes(np.r_[x, interval.high]) == interval.high_inclusive + + # x contains upper and lower bound + assert interval.includes(np.r_[x, interval.low, interval.high]) == ( + interval.low_inclusive and interval.high_inclusive + ) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_inverse_identity(link, global_random_seed): + # Test that link of inverse gives identity. + rng = np.random.RandomState(global_random_seed) + link = link() + n_samples, n_classes = 100, None + # The values for `raw_prediction` are limited from -20 to 20 because in the + # class `LogitLink` the term `expit(x)` comes very close to 1 for large + # positive x and therefore loses precision. + if link.is_multiclass: + n_classes = 10 + raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples, n_classes)) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + elif isinstance(link, HalfLogitLink): + raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples)) + else: + raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples)) + + assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction) + y_pred = link.inverse(raw_prediction) + assert_allclose(link.inverse(link.link(y_pred)), y_pred) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_out_argument(link): + # Test that out argument gets assigned the result. + rng = np.random.RandomState(42) + link = link() + n_samples, n_classes = 100, None + if link.is_multiclass: + n_classes = 10 + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + else: + # So far, the valid interval of raw_prediction is (-inf, inf) and + # we do not need to distinguish. 
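For illustration only: for the logit link used by HalfBinomialLoss, the round trip that `test_link_inverse_identity` above checks boils down to the expit/logit pair; raw values are kept within +/-20 because expit saturates beyond that, as the comment above explains.

import numpy as np
from scipy.special import expit, logit

raw_prediction = np.linspace(-20, 20, num=11)
y_pred = expit(raw_prediction)                   # inverse link: probabilities in (0, 1)
np.testing.assert_allclose(logit(y_pred), raw_prediction)   # link(inverse(x)) == x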
+ raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples)) + + y_pred = link.inverse(raw_prediction, out=None) + out = np.empty_like(raw_prediction) + y_pred_2 = link.inverse(raw_prediction, out=out) + assert_allclose(y_pred, out) + assert_array_equal(out, y_pred_2) + assert np.shares_memory(out, y_pred_2) + + out = np.empty_like(y_pred) + raw_prediction_2 = link.link(y_pred, out=out) + assert_allclose(raw_prediction, out) + assert_array_equal(out, raw_prediction_2) + assert np.shares_memory(out, raw_prediction_2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4fea32572902366ed70490d67431cab1d1a29f80 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py @@ -0,0 +1,1358 @@ +import pickle + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal +from pytest import approx +from scipy.optimize import ( + LinearConstraint, + minimize, + minimize_scalar, + newton, +) +from scipy.special import logsumexp + +from sklearn._loss.link import IdentityLink, _inclusive_low_high +from sklearn._loss.loss import ( + _LOSSES, + AbsoluteError, + BaseLoss, + HalfBinomialLoss, + HalfGammaLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, + HuberLoss, + PinballLoss, +) +from sklearn.utils import assert_all_finite +from sklearn.utils._testing import create_memmap_backed_data, skip_if_32bit + +ALL_LOSSES = list(_LOSSES.values()) + +LOSS_INSTANCES = [loss() for loss in ALL_LOSSES] +# HalfTweedieLoss(power=1.5) is already there as default +LOSS_INSTANCES += [ + PinballLoss(quantile=0.25), + HuberLoss(quantile=0.75), + HalfTweedieLoss(power=-1.5), + HalfTweedieLoss(power=0), + HalfTweedieLoss(power=1), + HalfTweedieLoss(power=2), + HalfTweedieLoss(power=3.0), + HalfTweedieLossIdentity(power=0), + HalfTweedieLossIdentity(power=1), + HalfTweedieLossIdentity(power=2), + HalfTweedieLossIdentity(power=3.0), +] + + +def loss_instance_name(param): + if isinstance(param, BaseLoss): + loss = param + name = loss.__class__.__name__ + if isinstance(loss, PinballLoss): + name += f"(quantile={loss.closs.quantile})" + elif isinstance(loss, HuberLoss): + name += f"(quantile={loss.quantile}" + elif hasattr(loss, "closs") and hasattr(loss.closs, "power"): + name += f"(power={loss.closs.power})" + return name + else: + return str(param) + + +def random_y_true_raw_prediction( + loss, n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=42 +): + """Random generate y_true and raw_prediction in valid range.""" + rng = np.random.RandomState(seed) + if loss.is_multiclass: + raw_prediction = np.empty((n_samples, loss.n_classes)) + raw_prediction.flat[:] = rng.uniform( + low=raw_bound[0], + high=raw_bound[1], + size=n_samples * loss.n_classes, + ) + y_true = np.arange(n_samples).astype(float) % loss.n_classes + else: + # If link is identity, we must respect the interval of y_pred: + if isinstance(loss.link, IdentityLink): + low, high = _inclusive_low_high(loss.interval_y_pred) + low = np.amax([low, raw_bound[0]]) + high = np.amin([high, raw_bound[1]]) + raw_bound = (low, high) + raw_prediction = rng.uniform( + low=raw_bound[0], high=raw_bound[1], size=n_samples + ) + # generate a y_true in valid range + low, high = _inclusive_low_high(loss.interval_y_true) + low = max(low, y_bound[0]) + high = min(high, y_bound[1]) + y_true 
= rng.uniform(low, high, size=n_samples) + # set some values at special boundaries + if loss.interval_y_true.low == 0 and loss.interval_y_true.low_inclusive: + y_true[:: (n_samples // 3)] = 0 + if loss.interval_y_true.high == 1 and loss.interval_y_true.high_inclusive: + y_true[1 :: (n_samples // 3)] = 1 + + return y_true, raw_prediction + + +def numerical_derivative(func, x, eps): + """Helper function for numerical (first) derivatives.""" + # For numerical derivatives, see + # https://en.wikipedia.org/wiki/Numerical_differentiation + # https://en.wikipedia.org/wiki/Finite_difference_coefficient + # We use central finite differences of accuracy 4. + h = np.full_like(x, fill_value=eps) + f_minus_2h = func(x - 2 * h) + f_minus_1h = func(x - h) + f_plus_1h = func(x + h) + f_plus_2h = func(x + 2 * h) + return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / (12.0 * eps) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_boundary(loss): + """Test interval ranges of y_true and y_pred in losses.""" + # make sure low and high are always within the interval, used for linspace + if loss.is_multiclass: + n_classes = 3 # default value + y_true = np.tile(np.linspace(0, n_classes - 1, num=n_classes), 3) + else: + low, high = _inclusive_low_high(loss.interval_y_true) + y_true = np.linspace(low, high, num=10) + + # add boundaries if they are included + if loss.interval_y_true.low_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.low] + if loss.interval_y_true.high_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.high] + + assert loss.in_y_true_range(y_true) + + n = y_true.shape[0] + low, high = _inclusive_low_high(loss.interval_y_pred) + if loss.is_multiclass: + y_pred = np.empty((n, n_classes)) + y_pred[:, 0] = np.linspace(low, high, num=n) + y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) + y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) + else: + y_pred = np.linspace(low, high, num=n) + + assert loss.in_y_pred_range(y_pred) + + # calculating losses should not fail + raw_prediction = loss.link.link(y_pred) + loss.loss(y_true=y_true, raw_prediction=raw_prediction) + + +# Fixture to test valid value ranges. +Y_COMMON_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HuberLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=-3), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLossIdentity(power=-3), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLossIdentity(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfTweedieLossIdentity(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLossIdentity(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfBinomialLoss(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]), + (HalfMultinomialLoss(), [], [-np.inf, -1, 1.1, np.inf]), +] +# y_pred and y_true do not always have the same domain (valid value range). 
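Editor's sketch of the fourth-order central-difference scheme used by `numerical_derivative` above, restated standalone (the helper name `central_diff` is mine) and checked against a function with a known derivative.

import numpy as np

def central_diff(func, x, eps):
    # Central finite differences of accuracy 4, as in numerical_derivative above.
    h = np.full_like(x, fill_value=eps)
    return (
        -func(x + 2 * h) + 8 * func(x + h) - 8 * func(x - h) + func(x - 2 * h)
    ) / (12.0 * eps)

x = np.array([0.0, 0.5, 1.0])
# d/dx exp(x) = exp(x), recovered to ~1e-13 relative accuracy.
np.testing.assert_allclose(central_diff(np.exp, x, eps=1e-3), np.exp(x), rtol=1e-10)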
+# Hence, we define extra sets of parameters for each of them. +Y_TRUE_PARAMS = [ # type: ignore[var-annotated] + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [0], []), + (HuberLoss(), [0], []), + (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), + (HalfTweedieLoss(power=0), [-100, 0], []), + (HalfTweedieLoss(power=1.5), [0], []), + (HalfTweedieLossIdentity(power=-3), [-100, -0.1, 0], []), + (HalfTweedieLossIdentity(power=0), [-100, 0], []), + (HalfTweedieLossIdentity(power=1.5), [0], []), + (HalfBinomialLoss(), [0, 1], []), + (HalfMultinomialLoss(), [0.0, 1.0, 2], []), +] +Y_PRED_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [], [0]), + (HalfTweedieLoss(power=-3), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=0), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=1.5), [], [0]), + (HalfTweedieLossIdentity(power=-3), [], [-3, -0.1, 0]), + (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0], []), + (HalfTweedieLossIdentity(power=1.5), [], [0]), + (HalfBinomialLoss(), [], [0, 1]), + (HalfMultinomialLoss(), [0.1, 0.5], [0, 1]), +] + + +@pytest.mark.parametrize( + "loss, y_true_success, y_true_fail", + Y_COMMON_PARAMS + Y_TRUE_PARAMS, # type: ignore[operator] +) +def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): + """Test boundaries of y_true for loss functions.""" + for y in y_true_success: + assert loss.in_y_true_range(np.array([y])) + for y in y_true_fail: + assert not loss.in_y_true_range(np.array([y])) + + +@pytest.mark.parametrize( + "loss, y_pred_success, y_pred_fail", + Y_COMMON_PARAMS + Y_PRED_PARAMS, # type: ignore[operator] +) +def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): + """Test boundaries of y_pred for loss functions.""" + for y in y_pred_success: + assert loss.in_y_pred_range(np.array([y])) + for y in y_pred_fail: + assert not loss.in_y_pred_range(np.array([y])) + + +@pytest.mark.parametrize( + "loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true", + [ + (HalfSquaredError(), 1.0, 5.0, 8, 4, 1), + (AbsoluteError(), 1.0, 5.0, 4.0, 1.0, None), + (PinballLoss(quantile=0.5), 1.0, 5.0, 2, 0.5, None), + (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25), 1 - 0.25, None), + (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25, -0.25, None), + (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2), None, None), + (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2, None, None), + (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4), 4 - 2, 4), + (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4, 1 - 2 / 4, 2 / 4), + (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2, None, None), + (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2), None, None), + (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2, None, None), + ( + HalfTweedieLossIdentity(power=3), + 2.0, + 4.0, + -1 / 4 + 1 / 4**2 + 1 / 2 / 2, + None, + None, + ), + ( + HalfBinomialLoss(), + 0.25, + np.log(4), + np.log1p(4) - 0.25 * np.log(4), + None, + None, + ), + # Extreme log loss cases, checked with mpmath: + # import mpmath as mp + # + # # Stolen from scipy + # def mpf2float(x): + # return float(mp.nstr(x, 17, min_fixed=0, max_fixed=0)) + # + # def mp_logloss(y_true, raw): + # with mp.workdps(100): + # y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw)) + # out = mp.log1p(mp.exp(raw)) - y_true * raw + # return mpf2float(out) + # + # def mp_gradient(y_true, raw): + # with mp.workdps(100): + # y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw)) + # out = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) - y_true + # 
return mpf2float(out) + # + # def mp_hessian(y_true, raw): + # with mp.workdps(100): + # y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw)) + # p = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) + # out = p * (mp.mpf(1) - p) + # return mpf2float(out) + # + # y, raw = 0.0, 37. + # mp_logloss(y, raw), mp_gradient(y, raw), mp_hessian(y, raw) + (HalfBinomialLoss(), 0.0, -1e20, 0, 0, 0), + (HalfBinomialLoss(), 1.0, -1e20, 1e20, -1, 0), + (HalfBinomialLoss(), 0.0, -1e3, 0, 0, 0), + (HalfBinomialLoss(), 1.0, -1e3, 1e3, -1, 0), + (HalfBinomialLoss(), 1.0, -37.5, 37.5, -1, 0), + (HalfBinomialLoss(), 1.0, -37.0, 37, 1e-16 - 1, 8.533047625744065e-17), + (HalfBinomialLoss(), 0.0, -37.0, *[8.533047625744065e-17] * 3), + (HalfBinomialLoss(), 1.0, -36.9, 36.9, 1e-16 - 1, 9.430476078526806e-17), + (HalfBinomialLoss(), 0.0, -36.9, *[9.430476078526806e-17] * 3), + (HalfBinomialLoss(), 0.0, 37.0, 37, 1 - 1e-16, 8.533047625744065e-17), + (HalfBinomialLoss(), 1.0, 37.0, *[8.533047625744066e-17] * 3), + (HalfBinomialLoss(), 0.0, 37.5, 37.5, 1, 5.175555005801868e-17), + (HalfBinomialLoss(), 0.0, 232.8, 232.8, 1, 1.4287342391028437e-101), + (HalfBinomialLoss(), 1.0, 1e20, 0, 0, 0), + (HalfBinomialLoss(), 0.0, 1e20, 1e20, 1, 0), + ( + HalfBinomialLoss(), + 1.0, + 232.8, + 0, + -1.4287342391028437e-101, + 1.4287342391028437e-101, + ), + (HalfBinomialLoss(), 1.0, 232.9, 0, 0, 0), + (HalfBinomialLoss(), 1.0, 1e3, 0, 0, 0), + (HalfBinomialLoss(), 0.0, 1e3, 1e3, 1, 0), + ( + HalfMultinomialLoss(n_classes=3), + 0.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.2, + None, + None, + ), + ( + HalfMultinomialLoss(n_classes=3), + 1.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.5, + None, + None, + ), + ( + HalfMultinomialLoss(n_classes=3), + 2.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.3, + None, + None, + ), + ( + HalfMultinomialLoss(n_classes=3), + 2.0, + [1e4, 0, 7e-7], + logsumexp([1e4, 0, 7e-7]) - (7e-7), + None, + None, + ), + ], + ids=loss_instance_name, +) +def test_loss_on_specific_values( + loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true +): + """Test losses, gradients and hessians at specific values.""" + loss1 = loss(y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])) + grad1 = loss.gradient( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) + loss2, grad2 = loss.loss_gradient( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) + grad3, hess = loss.gradient_hessian( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) + + assert loss1 == approx(loss_true, rel=1e-15, abs=1e-15) + assert loss2 == approx(loss_true, rel=1e-15, abs=1e-15) + + if gradient_true is not None: + assert grad1 == approx(gradient_true, rel=1e-15, abs=1e-15) + assert grad2 == approx(gradient_true, rel=1e-15, abs=1e-15) + assert grad3 == approx(gradient_true, rel=1e-15, abs=1e-15) + + if hessian_true is not None: + assert hess == approx(hessian_true, rel=1e-15, abs=1e-15) + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("readonly_memmap", [False, True]) +@pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) +@pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) +@pytest.mark.parametrize("sample_weight", [None, 1]) +@pytest.mark.parametrize("out1", [None, 1]) +@pytest.mark.parametrize("out2", [None, 1]) +@pytest.mark.parametrize("n_threads", [1, 2]) +def test_loss_dtype( + loss, readonly_memmap, dtype_in, dtype_out, sample_weight, out1, out2, n_threads +): + """Test 
acceptance of dtypes, readonly and writeable arrays in loss functions. + + Check that loss accepts if all input arrays are either all float32 or all + float64, and all output arrays are either all float32 or all float64. + + Also check that input arrays can be readonly, e.g. memory mapped. + """ + loss = loss() + # generate a y_true and raw_prediction in valid range + n_samples = 5 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + y_true = y_true.astype(dtype_in) + raw_prediction = raw_prediction.astype(dtype_in) + + if sample_weight is not None: + sample_weight = np.array([2.0] * n_samples, dtype=dtype_in) + if out1 is not None: + out1 = np.empty_like(y_true, dtype=dtype_out) + if out2 is not None: + out2 = np.empty_like(raw_prediction, dtype=dtype_out) + + if readonly_memmap: + y_true = create_memmap_backed_data(y_true) + raw_prediction = create_memmap_backed_data(raw_prediction) + if sample_weight is not None: + sample_weight = create_memmap_backed_data(sample_weight) + + l = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out1, + n_threads=n_threads, + ) + assert l is out1 if out1 is not None else True + g = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out2, + n_threads=n_threads, + ) + assert g is out2 if out2 is not None else True + l, g = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out1, + gradient_out=out2, + n_threads=n_threads, + ) + assert l is out1 if out1 is not None else True + assert g is out2 if out2 is not None else True + if out1 is not None and loss.is_multiclass: + out1 = np.empty_like(raw_prediction, dtype=dtype_out) + g, h = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out1, + hessian_out=out2, + n_threads=n_threads, + ) + assert g is out1 if out1 is not None else True + assert h is out2 if out2 is not None else True + loss(y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight) + loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + loss.constant_to_optimal_zero(y_true=y_true, sample_weight=sample_weight) + if hasattr(loss, "predict_proba"): + loss.predict_proba(raw_prediction=raw_prediction) + if hasattr(loss, "gradient_proba"): + g, p = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out1, + proba_out=out2, + n_threads=n_threads, + ) + assert g is out1 if out1 is not None else True + assert p is out2 if out2 is not None else True + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_same_as_C_functions(loss, sample_weight): + """Test that Python and Cython functions return same results.""" + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_h1 = np.empty_like(raw_prediction) + out_h2 = np.empty_like(raw_prediction) + loss.loss( + y_true=y_true, + 
raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l1, + ) + loss.closs.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l2, + ) + assert_allclose(out_l1, out_l2) + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g1, + ) + loss.closs.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g2, + ) + assert_allclose(out_g1, out_g2) + loss.closs.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l1, + gradient_out=out_g1, + ) + loss.closs.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l2, + gradient_out=out_g2, + ) + assert_allclose(out_l1, out_l2) + assert_allclose(out_g1, out_g2) + loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g1, + hessian_out=out_h1, + ) + loss.closs.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g2, + hessian_out=out_h2, + ) + assert_allclose(out_g1, out_g2) + assert_allclose(out_h1, out_h2) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_gradients_are_the_same(loss, sample_weight, global_random_seed): + """Test that loss and gradient are the same across different functions. + + Also test that output arguments contain correct results. + """ + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=global_random_seed, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_g3 = np.empty_like(raw_prediction) + out_h3 = np.empty_like(raw_prediction) + + l1 = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l1, + ) + g1 = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g1, + ) + l2, g2 = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss_out=out_l2, + gradient_out=out_g2, + ) + g3, h3 = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g3, + hessian_out=out_h3, + ) + assert_allclose(l1, l2) + assert_array_equal(l1, out_l1) + assert np.shares_memory(l1, out_l1) + assert_array_equal(l2, out_l2) + assert np.shares_memory(l2, out_l2) + assert_allclose(g1, g2) + assert_allclose(g1, g3) + assert_array_equal(g1, out_g1) + assert np.shares_memory(g1, out_g1) + assert_array_equal(g2, out_g2) + assert np.shares_memory(g2, out_g2) + assert_array_equal(g3, out_g3) + assert np.shares_memory(g3, out_g3) + + if hasattr(loss, "gradient_proba"): + assert loss.is_multiclass # only for HalfMultinomialLoss + out_g4 = np.empty_like(raw_prediction) + out_proba = np.empty_like(raw_prediction) + g4, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient_out=out_g4, + proba_out=out_proba, + ) + assert_allclose(g1, out_g4) + 
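Aside from the editor, nothing sklearn-specific assumed: the `np.shares_memory` assertions above rely on the usual NumPy `out=` convention, namely that the returned array is the provided output buffer, as this toy ufunc call shows.

import numpy as np

x = np.array([1.0, 2.0, 3.0])
out = np.empty_like(x)
result = np.add(x, 1.0, out=out)      # the returned array is the out buffer itself
assert result is out
assert np.shares_memory(result, out)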
assert_allclose(g1, g4) + assert_allclose(proba, out_proba) + assert_allclose(np.sum(proba, axis=1), 1, rtol=1e-11) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", ["ones", "random"]) +def test_sample_weight_multiplies(loss, sample_weight, global_random_seed): + """Test sample weights in loss, gradients and hessians. + + Make sure that passing sample weights to loss, gradient and hessian + computation methods is equivalent to multiplying by the weights. + """ + n_samples = 100 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=global_random_seed, + ) + + if sample_weight == "ones": + sample_weight = np.ones(shape=n_samples, dtype=np.float64) + else: + rng = np.random.RandomState(global_random_seed) + sample_weight = rng.normal(size=n_samples).astype(np.float64) + + assert_allclose( + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ), + sample_weight + * loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ), + ) + + losses, gradient = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ) + losses_sw, gradient_sw = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + assert_allclose(losses * sample_weight, losses_sw) + if not loss.is_multiclass: + assert_allclose(gradient * sample_weight, gradient_sw) + else: + assert_allclose(gradient * sample_weight[:, None], gradient_sw) + + gradient, hessian = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ) + gradient_sw, hessian_sw = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + if not loss.is_multiclass: + assert_allclose(gradient * sample_weight, gradient_sw) + assert_allclose(hessian * sample_weight, hessian_sw) + else: + assert_allclose(gradient * sample_weight[:, None], gradient_sw) + assert_allclose(hessian * sample_weight[:, None], hessian_sw) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_graceful_squeezing(loss): + """Test that reshaped raw_prediction gives same results.""" + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + + if raw_prediction.ndim == 1: + raw_prediction_2d = raw_prediction[:, None] + assert_allclose( + loss.loss(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction), + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_of_perfect_prediction(loss, sample_weight): + """Test value of perfect predictions. + + Loss of y_pred = y_true plus constant_to_optimal_zero should sums up to + zero. 
+ """ + if not loss.is_multiclass: + # Use small values such that exp(value) is not nan. + raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) + # If link is identity, we must respect the interval of y_pred: + if isinstance(loss.link, IdentityLink): + eps = 1e-10 + low = loss.interval_y_pred.low + if not loss.interval_y_pred.low_inclusive: + low = low + eps + high = loss.interval_y_pred.high + if not loss.interval_y_pred.high_inclusive: + high = high - eps + raw_prediction = np.clip(raw_prediction, low, high) + y_true = loss.link.inverse(raw_prediction) + else: + # HalfMultinomialLoss + y_true = np.arange(loss.n_classes).astype(float) + # raw_prediction with entries -exp(10), but +exp(10) on the diagonal + # this is close enough to np.inf which would produce nan + raw_prediction = np.full( + shape=(loss.n_classes, loss.n_classes), + fill_value=-np.exp(10), + dtype=float, + ) + raw_prediction.flat[:: loss.n_classes + 1] = np.exp(10) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + loss_value = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + constant_term = loss.constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + # Comparing loss_value + constant_term to zero would result in large + # round-off errors. + assert_allclose(loss_value, -constant_term, atol=1e-14, rtol=1e-15) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_gradients_hessians_numerically(loss, sample_weight, global_random_seed): + """Test gradients and hessians with numerical derivatives. + + Gradient should equal the numerical derivatives of the loss function. + Hessians should equal the numerical derivatives of gradients. + """ + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=global_random_seed, + ) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + g, h = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + assert g.shape == raw_prediction.shape + assert h.shape == raw_prediction.shape + + if not loss.is_multiclass: + + def loss_func(x): + return loss.loss( + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative(loss_func, raw_prediction, eps=1e-6) + assert_allclose(g, g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + return loss.gradient( + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, + ) + + h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) + if loss.approx_hessian: + # TODO: What could we test if loss.approx_hessian? + pass + else: + assert_allclose(h, h_numeric, rtol=5e-6, atol=1e-10) + else: + # For multiclass loss, we should only change the predictions of the + # class for which the derivative is taken for, e.g. offset[:, k] = eps + # for class k. + # As a softmax is computed, offsetting the whole array by a constant + # would have no effect on the probabilities, and thus on the loss. 
+ for k in range(loss.n_classes): + + def loss_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.loss( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative(loss_func, raw_prediction[:, k], eps=1e-5) + assert_allclose(g[:, k], g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.gradient( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + )[:, k] + + h_numeric = numerical_derivative(grad_func, raw_prediction[:, k], eps=1e-6) + if loss.approx_hessian: + # TODO: What could we test if loss.approx_hessian? + pass + else: + assert_allclose(h[:, k], h_numeric, rtol=5e-6, atol=1e-10) + + +@pytest.mark.parametrize( + "loss, x0, y_true", + [ + ("squared_error", -2.0, 42), + ("squared_error", 117.0, 1.05), + ("squared_error", 0.0, 0.0), + # The argmin of binomial_loss for y_true=0 and y_true=1 is resp. + # -inf and +inf due to logit, cf. "complete separation". Therefore, we + # use 0 < y_true < 1. + ("binomial_loss", 0.3, 0.1), + ("binomial_loss", -12, 0.2), + ("binomial_loss", 30, 0.9), + ("poisson_loss", 12.0, 1.0), + ("poisson_loss", 0.0, 2.0), + ("poisson_loss", -22.0, 10.0), + ], +) +@skip_if_32bit +def test_derivatives(loss, x0, y_true): + """Test that gradients are zero at the minimum of the loss. + + We check this on a single value/sample using Halley's method with the + first and second order derivatives computed by the Loss instance. + Note that methods of Loss instances operate on arrays while the newton + root finder expects a scalar or a one-element array for this purpose. + """ + loss = _LOSSES[loss](sample_weight=None) + y_true = np.array([y_true], dtype=np.float64) + x0 = np.array([x0], dtype=np.float64) + + def func(x: np.ndarray) -> np.ndarray: + """Compute loss plus constant term. + + The constant term is such that the minimum function value is zero, + which is required by the Newton method. + """ + return loss.loss( + y_true=y_true, raw_prediction=x + ) + loss.constant_to_optimal_zero(y_true=y_true) + + def fprime(x: np.ndarray) -> np.ndarray: + return loss.gradient(y_true=y_true, raw_prediction=x) + + def fprime2(x: np.ndarray) -> np.ndarray: + return loss.gradient_hessian(y_true=y_true, raw_prediction=x)[1] + + optimum = newton( + func, + x0=x0, + fprime=fprime, + fprime2=fprime2, + maxiter=100, + tol=5e-8, + ) + + # Need to ravel arrays because assert_allclose requires matching + # dimensions. + y_true = y_true.ravel() + optimum = optimum.ravel() + assert_allclose(loss.link.inverse(optimum), y_true) + assert_allclose(func(optimum), 0, atol=1e-14) + assert_allclose(loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_intercept_only(loss, sample_weight): + """Test that fit_intercept_only returns the argmin of the loss. + + Also test that the gradient is zero at the minimum. 
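Editorial sketch of the root-finding pattern used by `test_derivatives` above: `scipy.optimize.newton` with `fprime` and `fprime2` (Halley's method) applied to a toy function whose minimum value is zero, f(x) = (x - 2)**2, instead of an actual loss.

import numpy as np
from scipy.optimize import newton

optimum = newton(
    func=lambda x: (x - 2.0) ** 2,       # plays the role of loss + constant term
    x0=0.0,
    fprime=lambda x: 2.0 * (x - 2.0),    # first derivative (the "gradient")
    fprime2=lambda x: 2.0,               # second derivative (the "hessian")
    maxiter=100,
    tol=5e-8,
)
assert np.isclose(optimum, 2.0, atol=1e-6)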
+ """ + n_samples = 50 + if not loss.is_multiclass: + y_true = loss.link.inverse(np.linspace(-4, 4, num=n_samples)) + else: + y_true = np.arange(n_samples).astype(np.float64) % loss.n_classes + y_true[::5] = 0 # exceedance of class 0 + + if sample_weight == "range": + sample_weight = np.linspace(0.1, 2, num=n_samples) + + a = loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + + # find minimum by optimization + def fun(x): + if not loss.is_multiclass: + raw_prediction = np.full(shape=(n_samples), fill_value=x) + else: + raw_prediction = np.ascontiguousarray( + np.broadcast_to(x, shape=(n_samples, loss.n_classes)) + ) + return loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + if not loss.is_multiclass: + opt = minimize_scalar(fun, tol=1e-7, options={"maxiter": 100}) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.full_like(y_true, a), + sample_weight=sample_weight, + ) + assert a.shape == tuple() # scalar + assert a.dtype == y_true.dtype + assert_all_finite(a) + a == approx(opt.x, rel=1e-7) + grad.sum() == approx(0, abs=1e-12) + else: + # The constraint corresponds to sum(raw_prediction) = 0. Without it, we would + # need to apply loss.symmetrize_raw_prediction to opt.x before comparing. + opt = minimize( + fun, + np.zeros((loss.n_classes)), + tol=1e-13, + options={"maxiter": 100}, + method="SLSQP", + constraints=LinearConstraint(np.ones((1, loss.n_classes)), 0, 0), + ) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.tile(a, (n_samples, 1)), + sample_weight=sample_weight, + ) + assert a.dtype == y_true.dtype + assert_all_finite(a) + assert_allclose(a, opt.x, rtol=5e-6, atol=1e-12) + assert_allclose(grad.sum(axis=0), 0, atol=1e-12) + + +@pytest.mark.parametrize( + "loss, func, random_dist", + [ + (HalfSquaredError(), np.mean, "normal"), + (AbsoluteError(), np.median, "normal"), + (PinballLoss(quantile=0.25), lambda x: np.percentile(x, q=25), "normal"), + (HalfPoissonLoss(), np.mean, "poisson"), + (HalfGammaLoss(), np.mean, "exponential"), + (HalfTweedieLoss(), np.mean, "exponential"), + (HalfBinomialLoss(), np.mean, "binomial"), + ], +) +def test_specific_fit_intercept_only(loss, func, random_dist, global_random_seed): + """Test that fit_intercept_only returns the correct functional. + + We test the functional for specific, meaningful distributions, e.g. + squared error estimates the expectation of a probability distribution. + """ + rng = np.random.RandomState(global_random_seed) + if random_dist == "binomial": + y_train = rng.binomial(1, 0.5, size=100) + else: + y_train = getattr(rng, random_dist)(size=100) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + # Make sure baseline prediction is the expected functional=func, e.g. mean + # or median. 
+    assert_all_finite(baseline_prediction)
+    assert baseline_prediction == approx(loss.link.link(func(y_train)))
+    assert loss.link.inverse(baseline_prediction) == approx(func(y_train))
+    if isinstance(loss.link, IdentityLink):
+        assert_allclose(loss.link.inverse(baseline_prediction), baseline_prediction)
+
+    # Test baseline at boundary
+    if loss.interval_y_true.low_inclusive:
+        y_train.fill(loss.interval_y_true.low)
+        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+        assert_all_finite(baseline_prediction)
+    if loss.interval_y_true.high_inclusive:
+        y_train.fill(loss.interval_y_true.high)
+        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+        assert_all_finite(baseline_prediction)
+
+
+def test_multinomial_loss_fit_intercept_only():
+    """Test that fit_intercept_only returns the mean functional for CCE."""
+    rng = np.random.RandomState(0)
+    n_classes = 4
+    loss = HalfMultinomialLoss(n_classes=n_classes)
+    # Same logic as test_specific_fit_intercept_only. Here inverse link
+    # function = softmax and link function = log - symmetry term.
+    y_train = rng.randint(0, n_classes, size=100).astype(np.float64)
+    baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+    assert baseline_prediction.shape == (n_classes,)
+    p = np.zeros(n_classes, dtype=y_train.dtype)
+    for k in range(n_classes):
+        p[k] = (y_train == k).mean()
+    assert_allclose(baseline_prediction, np.log(p) - np.mean(np.log(p)))
+    assert_allclose(baseline_prediction[None, :], loss.link.link(p[None, :]))
+
+    for y_train in (np.zeros(shape=10), np.ones(shape=10)):
+        y_train = y_train.astype(np.float64)
+        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
+        assert baseline_prediction.dtype == y_train.dtype
+        assert_all_finite(baseline_prediction)
+
+
+def test_multinomial_cy_gradient(global_random_seed):
+    """Test that Multinomial cy_gradient gives the same result as gradient.
+
+    CyHalfMultinomialLoss does not inherit from CyLossFunction and has a different API.
+    As a consequence, the functions like `loss` and `gradient` do not rely on `cy_loss`
+    and `cy_gradient`.
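+
+    For reference, the quantity both code paths are expected to produce is the
+    standard softmax cross-entropy gradient, per sample ``i`` and class ``k``
+    (sketch, with ``p = softmax(raw_prediction[i])``)::
+
+        gradient[i, k] = sample_weight[i] * (p[k] - (y_true[i] == k))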
+ """ + n_samples = 100 + n_classes = 5 + loss = HalfMultinomialLoss(n_classes=n_classes) + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + seed=global_random_seed, + ) + sample_weight = np.linspace(0.1, 2, num=n_samples) + + grad1 = loss.closs._test_cy_gradient( + y_true=y_true, + raw_prediction=raw_prediction, # needs to be C-contiguous + sample_weight=sample_weight, + ) + grad2 = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + assert_allclose(grad1, grad2) + + +def test_binomial_and_multinomial_loss(global_random_seed): + """Test that multinomial loss with n_classes = 2 is the same as binomial loss.""" + rng = np.random.RandomState(global_random_seed) + n_samples = 20 + binom = HalfBinomialLoss() + multinom = HalfMultinomialLoss(n_classes=2) + y_train = rng.randint(0, 2, size=n_samples).astype(np.float64) + raw_prediction = rng.normal(size=n_samples) + raw_multinom = np.empty((n_samples, 2)) + raw_multinom[:, 0] = -0.5 * raw_prediction + raw_multinom[:, 1] = 0.5 * raw_prediction + assert_allclose( + binom.loss(y_true=y_train, raw_prediction=raw_prediction), + multinom.loss(y_true=y_train, raw_prediction=raw_multinom), + ) + + +@pytest.mark.parametrize("y_true", (np.array([0.0, 0, 0]), np.array([1.0, 1, 1]))) +@pytest.mark.parametrize("y_pred", (np.array([-5.0, -5, -5]), np.array([3.0, 3, 3]))) +def test_binomial_vs_alternative_formulation(y_true, y_pred, global_dtype): + """Test that both formulations of the binomial deviance agree. + + Often, the binomial deviance or log loss is written in terms of a variable + z in {-1, +1}, but we use y in {0, 1}, hence z = 2 * y - 1. + ESL II Eq. (10.18): + + -loglike(z, f) = log(1 + exp(-2 * z * f)) + + Note: + - ESL 2*f = raw_prediction, hence the factor 2 of ESL disappears. + - Deviance = -2*loglike + .., but HalfBinomialLoss is half of the + deviance, hence the factor of 2 cancels in the comparison. 
+ """ + + def alt_loss(y, raw_pred): + z = 2 * y - 1 + return np.mean(np.log(1 + np.exp(-z * raw_pred))) + + def alt_gradient(y, raw_pred): + # alternative gradient formula according to ESL + z = 2 * y - 1 + return -z / (1 + np.exp(z * raw_pred)) + + bin_loss = HalfBinomialLoss() + + y_true = y_true.astype(global_dtype) + y_pred = y_pred.astype(global_dtype) + datum = (y_true, y_pred) + + assert bin_loss(*datum) == approx(alt_loss(*datum)) + assert_allclose(bin_loss.gradient(*datum), alt_gradient(*datum)) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_predict_proba(loss, global_random_seed): + """Test that predict_proba and gradient_proba work as expected.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=global_random_seed, + ) + + if hasattr(loss, "predict_proba"): + proba = loss.predict_proba(raw_prediction) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) + + if hasattr(loss, "gradient_proba"): + for grad, proba in ( + (None, None), + (None, np.empty_like(raw_prediction)), + (np.empty_like(raw_prediction), None), + (np.empty_like(raw_prediction), np.empty_like(raw_prediction)), + ): + grad, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient_out=grad, + proba_out=proba, + ) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) + assert_allclose( + grad, + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient_out=None, + ), + ) + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +@pytest.mark.parametrize("order", ("C", "F")) +def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): + """Test that init_gradient_and_hessian works as expected. + + passing sample_weight to a loss correctly influences the constant_hessian + attribute, and consequently the shape of the hessian array. 
+ """ + n_samples = 5 + if sample_weight == "range": + sample_weight = np.ones(n_samples) + loss = loss(sample_weight=sample_weight) + gradient, hessian = loss.init_gradient_and_hessian( + n_samples=n_samples, + dtype=dtype, + order=order, + ) + if loss.constant_hessian: + assert gradient.shape == (n_samples,) + assert hessian.shape == (1,) + elif loss.is_multiclass: + assert gradient.shape == (n_samples, loss.n_classes) + assert hessian.shape == (n_samples, loss.n_classes) + else: + assert hessian.shape == (n_samples,) + assert hessian.shape == (n_samples,) + + assert gradient.dtype == dtype + assert hessian.dtype == dtype + + if order == "C": + assert gradient.flags.c_contiguous + assert hessian.flags.c_contiguous + else: + assert gradient.flags.f_contiguous + assert hessian.flags.f_contiguous + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + {"dtype": np.int64}, + f"Valid options for 'dtype' are .* Got dtype={np.int64} instead.", + ), + ], +) +def test_init_gradient_and_hessian_raises(loss, params, err_msg): + """Test that init_gradient_and_hessian raises errors for invalid input.""" + loss = loss() + with pytest.raises((ValueError, TypeError), match=err_msg): + gradient, hessian = loss.init_gradient_and_hessian(n_samples=5, **params) + + +@pytest.mark.parametrize( + "loss, params, err_type, err_msg", + [ + ( + PinballLoss, + {"quantile": None}, + TypeError, + "quantile must be an instance of float, not NoneType.", + ), + ( + PinballLoss, + {"quantile": 0}, + ValueError, + "quantile == 0, must be > 0.", + ), + (PinballLoss, {"quantile": 1.1}, ValueError, "quantile == 1.1, must be < 1."), + ( + HuberLoss, + {"quantile": None}, + TypeError, + "quantile must be an instance of float, not NoneType.", + ), + ( + HuberLoss, + {"quantile": 0}, + ValueError, + "quantile == 0, must be > 0.", + ), + (HuberLoss, {"quantile": 1.1}, ValueError, "quantile == 1.1, must be < 1."), + ], +) +def test_loss_init_parameter_validation(loss, params, err_type, err_msg): + """Test that loss raises errors for invalid input.""" + with pytest.raises(err_type, match=err_msg): + loss(**params) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_pickle(loss): + """Test that losses can be pickled.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + pickled_loss = pickle.dumps(loss) + unpickled_loss = pickle.loads(pickled_loss) + assert loss(y_true=y_true, raw_prediction=raw_prediction) == approx( + unpickled_loss(y_true=y_true, raw_prediction=raw_prediction) + ) + + +@pytest.mark.parametrize("p", [-1.5, 0, 1, 1.5, 2, 3]) +def test_tweedie_log_identity_consistency(p): + """Test for identical losses when only the link function is different.""" + half_tweedie_log = HalfTweedieLoss(power=p) + half_tweedie_identity = HalfTweedieLossIdentity(power=p) + n_samples = 10 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=half_tweedie_log, n_samples=n_samples, seed=42 + ) + y_pred = half_tweedie_log.link.inverse(raw_prediction) # exp(raw_prediction) + + # Let's compare the loss values, up to some constant term that is dropped + # in HalfTweedieLoss but not in HalfTweedieLossIdentity. 
+ loss_log = half_tweedie_log.loss( + y_true=y_true, raw_prediction=raw_prediction + ) + half_tweedie_log.constant_to_optimal_zero(y_true) + loss_identity = half_tweedie_identity.loss( + y_true=y_true, raw_prediction=y_pred + ) + half_tweedie_identity.constant_to_optimal_zero(y_true) + # Note that HalfTweedieLoss ignores different constant terms than + # HalfTweedieLossIdentity. Constant terms means terms not depending on + # raw_prediction. By adding these terms, `constant_to_optimal_zero`, both losses + # give the same values. + assert_allclose(loss_log, loss_identity) + + # For gradients and hessians, the constant terms do not matter. We have, however, + # to account for the chain rule, i.e. with x=raw_prediction + # gradient_log(x) = d/dx loss_log(x) + # = d/dx loss_identity(exp(x)) + # = exp(x) * gradient_identity(exp(x)) + # Similarly, + # hessian_log(x) = exp(x) * gradient_identity(exp(x)) + # + exp(x)**2 * hessian_identity(x) + gradient_log, hessian_log = half_tweedie_log.gradient_hessian( + y_true=y_true, raw_prediction=raw_prediction + ) + gradient_identity, hessian_identity = half_tweedie_identity.gradient_hessian( + y_true=y_true, raw_prediction=y_pred + ) + assert_allclose(gradient_log, y_pred * gradient_identity) + assert_allclose( + hessian_log, y_pred * gradient_identity + y_pred**2 * hessian_identity + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..de86a59e07113dcc7f9c656e65c7708ee230afa6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/__init__.py @@ -0,0 +1,56 @@ +"""Popular unsupervised clustering algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._affinity_propagation import AffinityPropagation, affinity_propagation +from ._agglomerative import ( + AgglomerativeClustering, + FeatureAgglomeration, + linkage_tree, + ward_tree, +) +from ._bicluster import SpectralBiclustering, SpectralCoclustering +from ._birch import Birch +from ._bisect_k_means import BisectingKMeans +from ._dbscan import DBSCAN, dbscan +from ._hdbscan.hdbscan import HDBSCAN +from ._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus +from ._mean_shift import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift +from ._optics import ( + OPTICS, + cluster_optics_dbscan, + cluster_optics_xi, + compute_optics_graph, +) +from ._spectral import SpectralClustering, spectral_clustering + +__all__ = [ + "DBSCAN", + "HDBSCAN", + "OPTICS", + "AffinityPropagation", + "AgglomerativeClustering", + "Birch", + "BisectingKMeans", + "FeatureAgglomeration", + "KMeans", + "MeanShift", + "MiniBatchKMeans", + "SpectralBiclustering", + "SpectralClustering", + "SpectralCoclustering", + "affinity_propagation", + "cluster_optics_dbscan", + "cluster_optics_xi", + "compute_optics_graph", + "dbscan", + "estimate_bandwidth", + "get_bin_seeds", + "k_means", + "kmeans_plusplus", + "linkage_tree", + "mean_shift", + "spectral_clustering", + "ward_tree", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_affinity_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_affinity_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..c7ae6ed63580d60eb2d889c11cfe84875380c55c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_affinity_propagation.py @@ -0,0 +1,607 @@ +"""Affinity Propagation clustering 
algorithm.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np + +from .._config import config_context +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..metrics import euclidean_distances, pairwise_distances_argmin +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import check_is_fitted, validate_data + + +def _equal_similarities_and_preferences(S, preference): + def all_equal_preferences(): + return np.all(preference == preference.flat[0]) + + def all_equal_similarities(): + # Create mask to ignore diagonal of S + mask = np.ones(S.shape, dtype=bool) + np.fill_diagonal(mask, 0) + + return np.all(S[mask].flat == S[mask].flat[0]) + + return all_equal_preferences() and all_equal_similarities() + + +def _affinity_propagation( + S, + *, + preference, + convergence_iter, + max_iter, + damping, + verbose, + return_n_iter, + random_state, +): + """Main affinity propagation algorithm.""" + n_samples = S.shape[0] + if n_samples == 1 or _equal_similarities_and_preferences(S, preference): + # It makes no sense to run the algorithm in this case, so return 1 or + # n_samples clusters, depending on preferences + warnings.warn( + "All samples have mutually equal similarities. " + "Returning arbitrary cluster center(s)." + ) + if preference.flat[0] > S.flat[n_samples - 1]: + return ( + (np.arange(n_samples), np.arange(n_samples), 0) + if return_n_iter + else (np.arange(n_samples), np.arange(n_samples)) + ) + else: + return ( + (np.array([0]), np.array([0] * n_samples), 0) + if return_n_iter + else (np.array([0]), np.array([0] * n_samples)) + ) + + # Place preference on the diagonal of S + S.flat[:: (n_samples + 1)] = preference + + A = np.zeros((n_samples, n_samples)) + R = np.zeros((n_samples, n_samples)) # Initialize messages + # Intermediate results + tmp = np.zeros((n_samples, n_samples)) + + # Remove degeneracies + S += ( + np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100 + ) * random_state.standard_normal(size=(n_samples, n_samples)) + + # Execute parallel affinity propagation updates + e = np.zeros((n_samples, convergence_iter)) + + ind = np.arange(n_samples) + + for it in range(max_iter): + # tmp = A + S; compute responsibilities + np.add(A, S, tmp) + I = np.argmax(tmp, axis=1) + Y = tmp[ind, I] # np.max(A + S, axis=1) + tmp[ind, I] = -np.inf + Y2 = np.max(tmp, axis=1) + + # tmp = Rnew + np.subtract(S, Y[:, None], tmp) + tmp[ind, I] = S[ind, I] - Y2 + + # Damping + tmp *= 1 - damping + R *= damping + R += tmp + + # tmp = Rp; compute availabilities + np.maximum(R, 0, tmp) + tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1] + + # tmp = -Anew + tmp -= np.sum(tmp, axis=0) + dA = np.diag(tmp).copy() + tmp.clip(0, np.inf, tmp) + tmp.flat[:: n_samples + 1] = dA + + # Damping + tmp *= 1 - damping + A *= damping + A -= tmp + + # Check for convergence + E = (np.diag(A) + np.diag(R)) > 0 + e[:, it % convergence_iter] = E + K = np.sum(E, axis=0) + + if it >= convergence_iter: + se = np.sum(e, axis=1) + unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples + if (not unconverged and (K > 0)) or (it == max_iter): + never_converged = False + if verbose: + print("Converged after %d iterations." 
% it) + break + else: + never_converged = True + if verbose: + print("Did not converge") + + I = np.flatnonzero(E) + K = I.size # Identify exemplars + + if K > 0: + if never_converged: + warnings.warn( + ( + "Affinity propagation did not converge, this model " + "may return degenerate cluster centers and labels." + ), + ConvergenceWarning, + ) + c = np.argmax(S[:, I], axis=1) + c[I] = np.arange(K) # Identify clusters + # Refine the final set of exemplars and clusters and return results + for k in range(K): + ii = np.asarray(c == k).nonzero()[0] + j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0)) + I[k] = ii[j] + + c = np.argmax(S[:, I], axis=1) + c[I] = np.arange(K) + labels = I[c] + # Reduce labels to a sorted, gapless, list + cluster_centers_indices = np.unique(labels) + labels = np.searchsorted(cluster_centers_indices, labels) + else: + warnings.warn( + ( + "Affinity propagation did not converge and this model " + "will not have any cluster centers." + ), + ConvergenceWarning, + ) + labels = np.array([-1] * n_samples) + cluster_centers_indices = [] + + if return_n_iter: + return cluster_centers_indices, labels, it + 1 + else: + return cluster_centers_indices, labels + + +############################################################################### +# Public API + + +@validate_params( + { + "S": ["array-like"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def affinity_propagation( + S, + *, + preference=None, + convergence_iter=15, + max_iter=200, + damping=0.5, + copy=True, + verbose=False, + return_n_iter=False, + random_state=None, +): + """Perform Affinity Propagation Clustering of data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + S : array-like of shape (n_samples, n_samples) + Matrix of similarities between points. + + preference : array-like of shape (n_samples,) or float, default=None + Preferences for each point - points with larger values of + preferences are more likely to be chosen as exemplars. The number of + exemplars, i.e. of clusters, is influenced by the input preferences + value. If the preferences are not passed as arguments, they will be + set to the median of the input similarities (resulting in a moderate + number of clusters). For a smaller amount of clusters, this can be set + to the minimum value of the similarities. + + convergence_iter : int, default=15 + Number of iterations with no change in the number + of estimated clusters that stops the convergence. + + max_iter : int, default=200 + Maximum number of iterations. + + damping : float, default=0.5 + Damping factor between 0.5 and 1. + + copy : bool, default=True + If copy is False, the affinity matrix is modified inplace by the + algorithm, for memory efficiency. + + verbose : bool, default=False + The verbosity level. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0. + + Returns + ------- + cluster_centers_indices : ndarray of shape (n_clusters,) + Index of clusters centers. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to True. 
+ + Notes + ----- + For an example usage, + see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`. + You may also check out, + :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py` + + When the algorithm does not converge, it will still return a arrays of + ``cluster_center_indices`` and labels if there are any exemplars/clusters, + however they may be degenerate and should be used with caution. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, a single cluster center + and label ``0`` for every sample will be returned. Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + + References + ---------- + Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages + Between Data Points", Science Feb. 2007 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import affinity_propagation + >>> from sklearn.metrics.pairwise import euclidean_distances + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> S = -euclidean_distances(X, squared=True) + >>> cluster_centers_indices, labels = affinity_propagation(S, random_state=0) + >>> cluster_centers_indices + array([0, 3]) + >>> labels + array([0, 0, 0, 1, 1, 1]) + """ + estimator = AffinityPropagation( + damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + copy=copy, + preference=preference, + affinity="precomputed", + verbose=verbose, + random_state=random_state, + ).fit(S) + + if return_n_iter: + return estimator.cluster_centers_indices_, estimator.labels_, estimator.n_iter_ + return estimator.cluster_centers_indices_, estimator.labels_ + + +class AffinityPropagation(ClusterMixin, BaseEstimator): + """Perform Affinity Propagation Clustering of data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + damping : float, default=0.5 + Damping factor in the range `[0.5, 1.0)` is the extent to + which the current value is maintained relative to + incoming values (weighted 1 - damping). This in order + to avoid numerical oscillations when updating these + values (messages). + + max_iter : int, default=200 + Maximum number of iterations. + + convergence_iter : int, default=15 + Number of iterations with no change in the number + of estimated clusters that stops the convergence. + + copy : bool, default=True + Make a copy of input data. + + preference : array-like of shape (n_samples,) or float, default=None + Preferences for each point - points with larger values of + preferences are more likely to be chosen as exemplars. The number + of exemplars, ie of clusters, is influenced by the input + preferences value. If the preferences are not passed as arguments, + they will be set to the median of the input similarities. + + affinity : {'euclidean', 'precomputed'}, default='euclidean' + Which affinity to use. At the moment 'precomputed' and + ``euclidean`` are supported. 'euclidean' uses the + negative squared euclidean distance between points. + + verbose : bool, default=False + Whether to be verbose. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0. 
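+
+    .. note::
+        The damped message-passing updates applied during ``fit`` have the form
+        ``message = damping * message_old + (1 - damping) * message_new`` for
+        both the responsibility and the availability matrices; this is the
+        quantity controlled by the ``damping`` parameter above.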
+ + Attributes + ---------- + cluster_centers_indices_ : ndarray of shape (n_clusters,) + Indices of cluster centers. + + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Cluster centers (if affinity != ``precomputed``). + + labels_ : ndarray of shape (n_samples,) + Labels of each point. + + affinity_matrix_ : ndarray of shape (n_samples, n_samples) + Stores the affinity matrix used in ``fit``. + + n_iter_ : int + Number of iterations taken to converge. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + AgglomerativeClustering : Recursively merges the pair of + clusters that minimally increases a given linkage distance. + FeatureAgglomeration : Similar to AgglomerativeClustering, + but recursively merges features instead of samples. + KMeans : K-Means clustering. + MiniBatchKMeans : Mini-Batch K-Means clustering. + MeanShift : Mean shift clustering using a flat kernel. + SpectralClustering : Apply clustering to a projection + of the normalized Laplacian. + + Notes + ----- + The algorithmic complexity of affinity propagation is quadratic + in the number of points. + + When the algorithm does not converge, it will still return a arrays of + ``cluster_center_indices`` and labels if there are any exemplars/clusters, + however they may be degenerate and should be used with caution. + + When ``fit`` does not converge, ``cluster_centers_`` is still populated + however it may be degenerate. In such a case, proceed with caution. + If ``fit`` does not converge and fails to produce any ``cluster_centers_`` + then ``predict`` will label every sample as ``-1``. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, ``fit`` will result in + a single cluster center and label ``0`` for every sample. Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + + References + ---------- + + Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages + Between Data Points", Science Feb. 2007 + + Examples + -------- + >>> from sklearn.cluster import AffinityPropagation + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> clustering = AffinityPropagation(random_state=5).fit(X) + >>> clustering + AffinityPropagation(random_state=5) + >>> clustering.labels_ + array([0, 0, 0, 1, 1, 1]) + >>> clustering.predict([[0, 0], [4, 4]]) + array([0, 1]) + >>> clustering.cluster_centers_ + array([[1, 2], + [4, 2]]) + + For an example usage, + see :ref:`sphx_glr_auto_examples_cluster_plot_affinity_propagation.py`. 
+ + For a comparison of Affinity Propagation with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "damping": [Interval(Real, 0.5, 1.0, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "convergence_iter": [Interval(Integral, 1, None, closed="left")], + "copy": ["boolean"], + "preference": [ + "array-like", + Interval(Real, None, None, closed="neither"), + None, + ], + "affinity": [StrOptions({"euclidean", "precomputed"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + *, + damping=0.5, + max_iter=200, + convergence_iter=15, + copy=True, + preference=None, + affinity="euclidean", + verbose=False, + random_state=None, + ): + self.damping = damping + self.max_iter = max_iter + self.convergence_iter = convergence_iter + self.copy = copy + self.verbose = verbose + self.preference = preference + self.affinity = affinity + self.random_state = random_state + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.affinity == "precomputed" + tags.input_tags.sparse = self.affinity != "precomputed" + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the clustering from features, or affinity matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + array-like of shape (n_samples, n_samples) + Training instances to cluster, or similarities / affinities between + instances if ``affinity='precomputed'``. If a sparse feature matrix + is provided, it will be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Returns the instance itself. + """ + if self.affinity == "precomputed": + X = validate_data(self, X, copy=self.copy, force_writeable=True) + self.affinity_matrix_ = X + else: # self.affinity == "euclidean" + X = validate_data(self, X, accept_sparse="csr") + self.affinity_matrix_ = -euclidean_distances(X, squared=True) + + if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]: + raise ValueError( + "The matrix of similarities must be a square array. " + f"Got {self.affinity_matrix_.shape} instead." + ) + + if self.preference is None: + preference = np.median(self.affinity_matrix_) + else: + preference = self.preference + preference = np.asarray(preference) + + random_state = check_random_state(self.random_state) + + ( + self.cluster_centers_indices_, + self.labels_, + self.n_iter_, + ) = _affinity_propagation( + self.affinity_matrix_, + max_iter=self.max_iter, + convergence_iter=self.convergence_iter, + preference=preference, + damping=self.damping, + verbose=self.verbose, + return_n_iter=True, + random_state=random_state, + ) + + if self.affinity != "precomputed": + self.cluster_centers_ = X[self.cluster_centers_indices_].copy() + + return self + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. 
+ """ + check_is_fitted(self) + X = validate_data(self, X, reset=False, accept_sparse="csr") + if not hasattr(self, "cluster_centers_"): + raise ValueError( + "Predict method is not supported when affinity='precomputed'." + ) + + if self.cluster_centers_.shape[0] > 0: + with config_context(assume_finite=True): + return pairwise_distances_argmin(X, self.cluster_centers_) + else: + warnings.warn( + ( + "This model does not have any cluster centers " + "because affinity propagation did not converge. " + "Labeling every sample as '-1'." + ), + ConvergenceWarning, + ) + return np.array([-1] * X.shape[0]) + + def fit_predict(self, X, y=None): + """Fit clustering from features/affinity matrix; return cluster labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + array-like of shape (n_samples, n_samples) + Training instances to cluster, or similarities / affinities between + instances if ``affinity='precomputed'``. If a sparse feature matrix + is provided, it will be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. + """ + return super().fit_predict(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py new file mode 100644 index 0000000000000000000000000000000000000000..f068dc934151d0f4a03f32000fb79e2d657f45a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py @@ -0,0 +1,1333 @@ +"""Hierarchical Agglomerative Clustering + +These routines perform some hierarchical agglomerative clustering of some +input data. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from heapq import heapify, heappop, heappush, heappushpop +from numbers import Integral, Real + +import numpy as np +from scipy import sparse +from scipy.sparse.csgraph import connected_components + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + _fit_context, +) +from ..metrics import DistanceMetric +from ..metrics._dist_metrics import METRIC_MAPPING64 +from ..metrics.pairwise import _VALID_METRICS, paired_distances +from ..utils import check_array +from ..utils._fast_dict import IntFloatDict +from ..utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from ..utils.graph import _fix_connected_components +from ..utils.validation import check_memory, validate_data + +# mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' +from . import _hierarchical_fast as _hierarchical # type: ignore[attr-defined] +from ._feature_agglomeration import AgglomerationTransform + +############################################################################### +# For non fully-connected graphs + + +def _fix_connectivity(X, connectivity, affinity): + """ + Fixes the connectivity matrix. + + The different steps are: + + - copies it + - makes it symmetric + - converts it to LIL if necessary + - completes it if necessary. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Feature matrix representing `n_samples` samples to be clustered. + + connectivity : sparse matrix, default=None + Connectivity matrix. Defines for each sample the neighboring samples + following a given structure of the data. 
The matrix is assumed to + be symmetric and only the upper triangular half is used. + Default is `None`, i.e, the Ward algorithm is unstructured. + + affinity : {"euclidean", "precomputed"}, default="euclidean" + Which affinity to use. At the moment `precomputed` and + ``euclidean`` are supported. `euclidean` uses the + negative squared Euclidean distance between points. + + Returns + ------- + connectivity : sparse matrix + The fixed connectivity matrix. + + n_connected_components : int + The number of connected components in the graph. + """ + n_samples = X.shape[0] + if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples: + raise ValueError( + "Wrong shape for connectivity matrix: %s when X is %s" + % (connectivity.shape, X.shape) + ) + + # Make the connectivity matrix symmetric: + connectivity = connectivity + connectivity.T + + # Convert connectivity matrix to LIL + if not sparse.issparse(connectivity): + connectivity = sparse.lil_matrix(connectivity) + + # `connectivity` is a sparse matrix at this point + if connectivity.format != "lil": + connectivity = connectivity.tolil() + + # Compute the number of nodes + n_connected_components, labels = connected_components(connectivity) + + if n_connected_components > 1: + warnings.warn( + "the number of connected components of the " + "connectivity matrix is %d > 1. Completing it to avoid " + "stopping the tree early." % n_connected_components, + stacklevel=2, + ) + # XXX: Can we do without completing the matrix? + connectivity = _fix_connected_components( + X=X, + graph=connectivity, + n_connected_components=n_connected_components, + component_labels=labels, + metric=affinity, + mode="connectivity", + ) + + return connectivity, n_connected_components + + +def _single_linkage_tree( + connectivity, + n_samples, + n_nodes, + n_clusters, + n_connected_components, + return_distance, +): + """ + Perform single linkage clustering on sparse data via the minimum + spanning tree from scipy.sparse.csgraph, then using union-find to label. + The parent array is then generated by walking through the tree. 
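+
+    Sketch of the idea (not an exact reproduction of the code below): single
+    linkage always merges across the smallest remaining inter-cluster edge, so
+    processing the edges of the minimum spanning tree in increasing order of
+    weight yields exactly the single-linkage dendrogram::
+
+        from scipy.sparse.csgraph import minimum_spanning_tree
+
+        mst = minimum_spanning_tree(connectivity.tocsr()).tocoo()
+        edges = sorted(zip(mst.data, mst.row, mst.col))  # merge in this order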
+ """ + from scipy.sparse.csgraph import minimum_spanning_tree + + # explicitly cast connectivity to ensure safety + connectivity = connectivity.astype(np.float64, copy=False) + + # Ensure zero distances aren't ignored by setting them to "epsilon" + epsilon_value = np.finfo(dtype=connectivity.data.dtype).eps + connectivity.data[connectivity.data == 0] = epsilon_value + + # Use scipy.sparse.csgraph to generate a minimum spanning tree + mst = minimum_spanning_tree(connectivity.tocsr()) + + # Convert the graph to scipy.cluster.hierarchy array format + mst = mst.tocoo() + + # Undo the epsilon values + mst.data[mst.data == epsilon_value] = 0 + + mst_array = np.vstack([mst.row, mst.col, mst.data]).T + + # Sort edges of the min_spanning_tree by weight + mst_array = mst_array[np.argsort(mst_array.T[2], kind="mergesort"), :] + + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = _hierarchical._single_linkage_label(mst_array) + children_ = single_linkage_tree[:, :2].astype(int) + + # Compute parents + parent = np.arange(n_nodes, dtype=np.intp) + for i, (left, right) in enumerate(children_, n_samples): + if n_clusters is not None and i >= n_nodes: + break + if left < n_nodes: + parent[left] = i + if right < n_nodes: + parent[right] = i + + if return_distance: + distances = single_linkage_tree[:, 2] + return children_, n_connected_components, n_samples, parent, distances + return children_, n_connected_components, n_samples, parent + + +############################################################################### +# Hierarchical tree building functions + + +@validate_params( + { + "X": ["array-like"], + "connectivity": ["array-like", "sparse matrix", None], + "n_clusters": [Interval(Integral, 1, None, closed="left"), None], + "return_distance": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False): + """Ward clustering based on a Feature matrix. + + Recursively merges the pair of clusters that minimally increases + within-cluster variance. + + The inertia matrix uses a Heapq-based representation. + + This is the structured version, that takes into account some topological + structure between samples. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Feature matrix representing `n_samples` samples to be clustered. + + connectivity : {array-like, sparse matrix}, default=None + Connectivity matrix. Defines for each sample the neighboring samples + following a given structure of the data. The matrix is assumed to + be symmetric and only the upper triangular half is used. + Default is None, i.e, the Ward algorithm is unstructured. + + n_clusters : int, default=None + `n_clusters` should be less than `n_samples`. Stop early the + construction of the tree at `n_clusters.` This is useful to decrease + computation time if the number of clusters is not small compared to the + number of samples. In this case, the complete tree is not computed, thus + the 'children' output is of limited use, and the 'parents' output should + rather be used. This option is valid only when specifying a connectivity + matrix. + + return_distance : bool, default=False + If `True`, return the distance between the clusters. + + Returns + ------- + children : ndarray of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. 
+ A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + n_connected_components : int + The number of connected components in the graph. + + n_leaves : int + The number of leaves in the tree. + + parents : ndarray of shape (n_nodes,) or None + The parent of each node. Only returned when a connectivity matrix + is specified, elsewhere 'None' is returned. + + distances : ndarray of shape (n_nodes-1,) + Only returned if `return_distance` is set to `True` (for compatibility). + The distances between the centers of the nodes. `distances[i]` + corresponds to a weighted Euclidean distance between + the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to + leaves of the tree, then `distances[i]` is their unweighted Euclidean + distance. Distances are updated in the following way + (from scipy.hierarchy.linkage): + + The new entry :math:`d(u,v)` is computed as follows, + + .. math:: + + d(u,v) = \\sqrt{\\frac{|v|+|s|} + {T}d(v,s)^2 + + \\frac{|v|+|t|} + {T}d(v,t)^2 + - \\frac{|v|} + {T}d(s,t)^2} + + where :math:`u` is the newly joined cluster consisting of + clusters :math:`s` and :math:`t`, :math:`v` is an unused + cluster in the forest, :math:`T=|v|+|s|+|t|`, and + :math:`|*|` is the cardinality of its argument. This is also + known as the incremental algorithm. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import ward_tree + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 4], [4, 0]]) + >>> children, n_connected_components, n_leaves, parents = ward_tree(X) + >>> children + array([[0, 1], + [3, 5], + [2, 6], + [4, 7], + [8, 9]]) + >>> n_connected_components + 1 + >>> n_leaves + 6 + """ + X = np.asarray(X) + if X.ndim == 1: + X = np.reshape(X, (-1, 1)) + n_samples, n_features = X.shape + + if connectivity is None: + from scipy.cluster import hierarchy # imports PIL + + if n_clusters is not None: + warnings.warn( + ( + "Partial build of the tree is implemented " + "only for structured clustering (i.e. with " + "explicit connectivity). The algorithm " + "will build the full tree and only " + "retain the lower branches required " + "for the specified number of clusters" + ), + stacklevel=2, + ) + X = np.require(X, requirements="W") + out = hierarchy.ward(X) + children_ = out[:, :2].astype(np.intp) + + if return_distance: + distances = out[:, 2] + return children_, 1, n_samples, None, distances + else: + return children_, 1, n_samples, None + + connectivity, n_connected_components = _fix_connectivity( + X, connectivity, affinity="euclidean" + ) + if n_clusters is None: + n_nodes = 2 * n_samples - 1 + else: + if n_clusters > n_samples: + raise ValueError( + "Cannot provide more clusters than samples. " + "%i n_clusters was asked, and there are %i " + "samples." 
% (n_clusters, n_samples) + ) + n_nodes = 2 * n_samples - n_clusters + + # create inertia matrix + coord_row = [] + coord_col = [] + A = [] + for ind, row in enumerate(connectivity.rows): + A.append(row) + # We keep only the upper triangular for the moments + # Generator expressions are faster than arrays on the following + row = [i for i in row if i < ind] + coord_row.extend( + len(row) + * [ + ind, + ] + ) + coord_col.extend(row) + + coord_row = np.array(coord_row, dtype=np.intp, order="C") + coord_col = np.array(coord_col, dtype=np.intp, order="C") + + # build moments as a list + moments_1 = np.zeros(n_nodes, order="C") + moments_1[:n_samples] = 1 + moments_2 = np.zeros((n_nodes, n_features), order="C") + moments_2[:n_samples] = X + inertia = np.empty(len(coord_row), dtype=np.float64, order="C") + _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia) + inertia = list(zip(inertia, coord_row, coord_col)) + heapify(inertia) + + # prepare the main fields + parent = np.arange(n_nodes, dtype=np.intp) + used_node = np.ones(n_nodes, dtype=bool) + children = [] + if return_distance: + distances = np.empty(n_nodes - n_samples) + + not_visited = np.empty(n_nodes, dtype=bool, order="C") + + # recursive merge loop + for k in range(n_samples, n_nodes): + # identify the merge + while True: + inert, i, j = heappop(inertia) + if used_node[i] and used_node[j]: + break + parent[i], parent[j] = k, k + children.append((i, j)) + used_node[i] = used_node[j] = False + if return_distance: # store inertia value + distances[k - n_samples] = inert + + # update the moments + moments_1[k] = moments_1[i] + moments_1[j] + moments_2[k] = moments_2[i] + moments_2[j] + + # update the structure matrix A and the inertia matrix + coord_col = [] + not_visited.fill(1) + not_visited[k] = 0 + _hierarchical._get_parents(A[i], coord_col, parent, not_visited) + _hierarchical._get_parents(A[j], coord_col, parent, not_visited) + # List comprehension is faster than a for loop + [A[col].append(k) for col in coord_col] + A.append(coord_col) + coord_col = np.array(coord_col, dtype=np.intp, order="C") + coord_row = np.empty(coord_col.shape, dtype=np.intp, order="C") + coord_row.fill(k) + n_additions = len(coord_row) + ini = np.empty(n_additions, dtype=np.float64, order="C") + + _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini) + + # List comprehension is faster than a for loop + [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)] + + # Separate leaves in children (empty lists up to now) + n_leaves = n_samples + # sort children to get consistent output with unstructured version + children = [c[::-1] for c in children] + children = np.array(children) # return numpy array for efficient caching + + if return_distance: + # 2 is scaling factor to compare w/ unstructured version + distances = np.sqrt(2.0 * distances) + return children, n_connected_components, n_leaves, parent, distances + else: + return children, n_connected_components, n_leaves, parent + + +# single average and complete linkage +def linkage_tree( + X, + connectivity=None, + n_clusters=None, + linkage="complete", + affinity="euclidean", + return_distance=False, +): + """Linkage agglomerative clustering based on a Feature matrix. + + The inertia matrix uses a Heapq-based representation. + + This is the structured version, that takes into account some topological + structure between samples. + + Read more in the :ref:`User Guide `. 
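+
+    As a small illustration of the criteria handled here (see ``linkage``
+    below): if the pairwise distances between the members of two clusters are
+    {1, 3, 5}, then "single" linkage scores the merge at 1, "average" at 3 and
+    "complete" at 5.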
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Feature matrix representing `n_samples` samples to be clustered. + + connectivity : sparse matrix, default=None + Connectivity matrix. Defines for each sample the neighboring samples + following a given structure of the data. The matrix is assumed to + be symmetric and only the upper triangular half is used. + Default is `None`, i.e, the Ward algorithm is unstructured. + + n_clusters : int, default=None + Stop early the construction of the tree at `n_clusters`. This is + useful to decrease computation time if the number of clusters is + not small compared to the number of samples. In this case, the + complete tree is not computed, thus the 'children' output is of + limited use, and the 'parents' output should rather be used. + This option is valid only when specifying a connectivity matrix. + + linkage : {"average", "complete", "single"}, default="complete" + Which linkage criteria to use. The linkage criterion determines which + distance to use between sets of observation. + - "average" uses the average of the distances of each observation of + the two sets. + - "complete" or maximum linkage uses the maximum distances between + all observations of the two sets. + - "single" uses the minimum of the distances between all + observations of the two sets. + + affinity : str or callable, default='euclidean' + Which metric to use. Can be 'euclidean', 'manhattan', or any + distance known to paired distance (see metric.pairwise). + + return_distance : bool, default=False + Whether or not to return the distances between the clusters. + + Returns + ------- + children : ndarray of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + n_connected_components : int + The number of connected components in the graph. + + n_leaves : int + The number of leaves in the tree. + + parents : ndarray of shape (n_nodes, ) or None + The parent of each node. Only returned when a connectivity matrix + is specified, elsewhere 'None' is returned. + + distances : ndarray of shape (n_nodes-1,) + Returned when `return_distance` is set to `True`. + + distances[i] refers to the distance between children[i][0] and + children[i][1] when they are merged. + + See Also + -------- + ward_tree : Hierarchical clustering with ward linkage. + """ + X = np.asarray(X) + if X.ndim == 1: + X = np.reshape(X, (-1, 1)) + n_samples, n_features = X.shape + + linkage_choices = { + "complete": _hierarchical.max_merge, + "average": _hierarchical.average_merge, + "single": None, + } # Single linkage is handled differently + try: + join_func = linkage_choices[linkage] + except KeyError as e: + raise ValueError( + "Unknown linkage option, linkage should be one of %s, but %s was given" + % (linkage_choices.keys(), linkage) + ) from e + + if affinity == "cosine" and np.any(~np.any(X, axis=1)): + raise ValueError("Cosine affinity cannot be used when X contains zero vectors") + + if connectivity is None: + from scipy.cluster import hierarchy # imports PIL + + if n_clusters is not None: + warnings.warn( + ( + "Partial build of the tree is implemented " + "only for structured clustering (i.e. with " + "explicit connectivity). 
The algorithm " + "will build the full tree and only " + "retain the lower branches required " + "for the specified number of clusters" + ), + stacklevel=2, + ) + + if affinity == "precomputed": + # for the linkage function of hierarchy to work on precomputed + # data, provide as first argument an ndarray of the shape returned + # by sklearn.metrics.pairwise_distances. + if X.shape[0] != X.shape[1]: + raise ValueError( + f"Distance matrix should be square, got matrix of shape {X.shape}" + ) + i, j = np.triu_indices(X.shape[0], k=1) + X = X[i, j] + elif affinity == "l2": + # Translate to something understood by scipy + affinity = "euclidean" + elif affinity in ("l1", "manhattan"): + affinity = "cityblock" + elif callable(affinity): + X = affinity(X) + i, j = np.triu_indices(X.shape[0], k=1) + X = X[i, j] + if ( + linkage == "single" + and affinity != "precomputed" + and not callable(affinity) + and affinity in METRIC_MAPPING64 + ): + # We need the fast cythonized metric from neighbors + dist_metric = DistanceMetric.get_metric(affinity) + + # The Cython routines used require contiguous arrays + X = np.ascontiguousarray(X, dtype=np.double) + + mst = _hierarchical.mst_linkage_core(X, dist_metric) + # Sort edges of the min_spanning_tree by weight + mst = mst[np.argsort(mst.T[2], kind="mergesort"), :] + + # Convert edge list into standard hierarchical clustering format + out = _hierarchical.single_linkage_label(mst) + else: + out = hierarchy.linkage(X, method=linkage, metric=affinity) + children_ = out[:, :2].astype(int, copy=False) + + if return_distance: + distances = out[:, 2] + return children_, 1, n_samples, None, distances + return children_, 1, n_samples, None + + connectivity, n_connected_components = _fix_connectivity( + X, connectivity, affinity=affinity + ) + connectivity = connectivity.tocoo() + # Put the diagonal to zero + diag_mask = connectivity.row != connectivity.col + connectivity.row = connectivity.row[diag_mask] + connectivity.col = connectivity.col[diag_mask] + connectivity.data = connectivity.data[diag_mask] + del diag_mask + + if affinity == "precomputed": + distances = X[connectivity.row, connectivity.col].astype(np.float64, copy=False) + else: + # FIXME We compute all the distances, while we could have only computed + # the "interesting" distances + distances = paired_distances( + X[connectivity.row], X[connectivity.col], metric=affinity + ) + connectivity.data = distances + + if n_clusters is None: + n_nodes = 2 * n_samples - 1 + else: + assert n_clusters <= n_samples + n_nodes = 2 * n_samples - n_clusters + + if linkage == "single": + return _single_linkage_tree( + connectivity, + n_samples, + n_nodes, + n_clusters, + n_connected_components, + return_distance, + ) + + if return_distance: + distances = np.empty(n_nodes - n_samples) + # create inertia heap and connection matrix + A = np.empty(n_nodes, dtype=object) + inertia = list() + + # LIL seems to the best format to access the rows quickly, + # without the numpy overhead of slicing CSR indices and data. 
+ connectivity = connectivity.tolil() + # We are storing the graph in a list of IntFloatDict + for ind, (data, row) in enumerate(zip(connectivity.data, connectivity.rows)): + A[ind] = IntFloatDict( + np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64) + ) + # We keep only the upper triangular for the heap + # Generator expressions are faster than arrays on the following + inertia.extend( + _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data) if r < ind + ) + del connectivity + + heapify(inertia) + + # prepare the main fields + parent = np.arange(n_nodes, dtype=np.intp) + used_node = np.ones(n_nodes, dtype=np.intp) + children = [] + + # recursive merge loop + for k in range(n_samples, n_nodes): + # identify the merge + while True: + edge = heappop(inertia) + if used_node[edge.a] and used_node[edge.b]: + break + i = edge.a + j = edge.b + + if return_distance: + # store distances + distances[k - n_samples] = edge.weight + + parent[i] = parent[j] = k + children.append((i, j)) + # Keep track of the number of elements per cluster + n_i = used_node[i] + n_j = used_node[j] + used_node[k] = n_i + n_j + used_node[i] = used_node[j] = False + + # update the structure matrix A and the inertia matrix + # a clever 'min', or 'max' operation between A[i] and A[j] + coord_col = join_func(A[i], A[j], used_node, n_i, n_j) + for col, d in coord_col: + A[col].append(k, d) + # Here we use the information from coord_col (containing the + # distances) to update the heap + heappush(inertia, _hierarchical.WeightedEdge(d, k, col)) + A[k] = coord_col + # Clear A[i] and A[j] to save memory + A[i] = A[j] = 0 + + # Separate leaves in children (empty lists up to now) + n_leaves = n_samples + + # # return numpy array for efficient caching + children = np.array(children)[:, ::-1] + + if return_distance: + return children, n_connected_components, n_leaves, parent, distances + return children, n_connected_components, n_leaves, parent + + +# Matching names to tree-building strategies +def _complete_linkage(*args, **kwargs): + kwargs["linkage"] = "complete" + return linkage_tree(*args, **kwargs) + + +def _average_linkage(*args, **kwargs): + kwargs["linkage"] = "average" + return linkage_tree(*args, **kwargs) + + +def _single_linkage(*args, **kwargs): + kwargs["linkage"] = "single" + return linkage_tree(*args, **kwargs) + + +_TREE_BUILDERS = dict( + ward=ward_tree, + complete=_complete_linkage, + average=_average_linkage, + single=_single_linkage, +) + +############################################################################### +# Functions for cutting hierarchical clustering tree + + +def _hc_cut(n_clusters, children, n_leaves): + """Function cutting the ward tree for a given number of clusters. + + Parameters + ---------- + n_clusters : int or ndarray + The number of clusters to form. + + children : ndarray of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + n_leaves : int + Number of leaves of the tree. + + Returns + ------- + labels : array [n_samples] + Cluster labels for each point. + """ + if n_clusters > n_leaves: + raise ValueError( + "Cannot extract more clusters than samples: " + f"{n_clusters} clusters were given for a tree with {n_leaves} leaves." 
+ ) + # In this function, we store nodes as a heap to avoid recomputing + # the max of the nodes: the first element is always the smallest + # We use negated indices as heaps work on smallest elements, and we + # are interested in largest elements + # children[-1] is the root of the tree + nodes = [-(max(children[-1]) + 1)] + for _ in range(n_clusters - 1): + # As we have a heap, nodes[0] is the smallest element + these_children = children[-nodes[0] - n_leaves] + # Insert the 2 children and remove the largest node + heappush(nodes, -these_children[0]) + heappushpop(nodes, -these_children[1]) + label = np.zeros(n_leaves, dtype=np.intp) + for i, node in enumerate(nodes): + label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i + return label + + +############################################################################### + + +class AgglomerativeClustering(ClusterMixin, BaseEstimator): + """ + Agglomerative Clustering. + + Recursively merges pair of clusters of sample data; uses linkage distance. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int or None, default=2 + The number of clusters to find. It must be ``None`` if + ``distance_threshold`` is not ``None``. + + metric : str or callable, default="euclidean" + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". If linkage is "ward", only + "euclidean" is accepted. If "precomputed", a distance matrix is needed + as input for the fit method. If connectivity is None, linkage is + "single" and affinity is not "precomputed" any valid pairwise distance + metric can be assigned. + + For an example of agglomerative clustering with different metrics, see + :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering_metrics.py`. + + .. versionadded:: 1.2 + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + connectivity : array-like, sparse matrix, or callable, default=None + Connectivity matrix. Defines for each sample the neighboring + samples following a given structure of the data. + This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + `kneighbors_graph`. Default is ``None``, i.e, the + hierarchical clustering algorithm is unstructured. + + For an example of connectivity matrix using + :class:`~sklearn.neighbors.kneighbors_graph`, see + :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py`. + + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at ``n_clusters``. This is + useful to decrease computation time if the number of clusters is not + small compared to the number of samples. This option is useful only + when specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. + + linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + Which linkage criterion to use. 
The linkage criterion determines which + distance to use between sets of observation. The algorithm will merge + the pairs of cluster that minimize this criterion. + + - 'ward' minimizes the variance of the clusters being merged. + - 'average' uses the average of the distances of each observation of + the two sets. + - 'complete' or 'maximum' linkage uses the maximum distances between + all observations of the two sets. + - 'single' uses the minimum of the distances between all observations + of the two sets. + + .. versionadded:: 0.20 + Added the 'single' option + + For examples comparing different `linkage` criteria, see + :ref:`sphx_glr_auto_examples_cluster_plot_linkage_comparison.py`. + + distance_threshold : float, default=None + The linkage distance threshold at or above which clusters will not be + merged. If not ``None``, ``n_clusters`` must be ``None`` and + ``compute_full_tree`` must be ``True``. + + .. versionadded:: 0.21 + + compute_distances : bool, default=False + Computes distances between clusters even if `distance_threshold` is not + used. This can be used to make dendrogram visualization, but introduces + a computational and memory overhead. + + .. versionadded:: 0.24 + + For an example of dendrogram visualization, see + :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py`. + + Attributes + ---------- + n_clusters_ : int + The number of clusters found by the algorithm. If + ``distance_threshold=None``, it will be equal to the given + ``n_clusters``. + + labels_ : ndarray of shape (n_samples) + Cluster labels for each point. + + n_leaves_ : int + Number of leaves in the hierarchical tree. + + n_connected_components_ : int + The estimated number of connected components in the graph. + + .. versionadded:: 0.21 + ``n_connected_components_`` was added to replace ``n_components_``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + children_ : array-like of shape (n_samples-1, 2) + The children of each non-leaf node. Values less than `n_samples` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_samples` is a non-leaf + node and has children `children_[i - n_samples]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_samples + i`. + + distances_ : array-like of shape (n_nodes-1,) + Distances between nodes in the corresponding place in `children_`. + Only computed if `distance_threshold` is used or `compute_distances` + is set to `True`. + + See Also + -------- + FeatureAgglomeration : Agglomerative clustering but for features instead of + samples. + ward_tree : Hierarchical clustering with ward linkage. + + Examples + -------- + >>> from sklearn.cluster import AgglomerativeClustering + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... 
[4, 2], [4, 4], [4, 0]]) + >>> clustering = AgglomerativeClustering().fit(X) + >>> clustering + AgglomerativeClustering() + >>> clustering.labels_ + array([1, 1, 1, 0, 0, 0]) + + For a comparison of Agglomerative clustering with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left"), None], + "metric": [ + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "memory": [str, HasMethods("cache"), None], + "connectivity": ["array-like", "sparse matrix", callable, None], + "compute_full_tree": [StrOptions({"auto"}), "boolean"], + "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))], + "distance_threshold": [Interval(Real, 0, None, closed="left"), None], + "compute_distances": ["boolean"], + } + + def __init__( + self, + n_clusters=2, + *, + metric="euclidean", + memory=None, + connectivity=None, + compute_full_tree="auto", + linkage="ward", + distance_threshold=None, + compute_distances=False, + ): + self.n_clusters = n_clusters + self.distance_threshold = distance_threshold + self.memory = memory + self.connectivity = connectivity + self.compute_full_tree = compute_full_tree + self.linkage = linkage + self.metric = metric + self.compute_distances = compute_distances + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the hierarchical clustering from features, or distance matrix. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the fitted instance. + """ + X = validate_data(self, X, ensure_min_samples=2) + return self._fit(X) + + def _fit(self, X): + """Fit without validation + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. + + Returns + ------- + self : object + Returns the fitted instance. + """ + memory = check_memory(self.memory) + + if not ((self.n_clusters is None) ^ (self.distance_threshold is None)): + raise ValueError( + "Exactly one of n_clusters and " + "distance_threshold has to be set, and the other " + "needs to be None." + ) + + if self.distance_threshold is not None and not self.compute_full_tree: + raise ValueError( + "compute_full_tree must be True if distance_threshold is set." + ) + + if self.linkage == "ward" and self.metric != "euclidean": + raise ValueError( + f"{self.metric} was provided as metric. Ward can only " + "work with euclidean distances." + ) + + tree_builder = _TREE_BUILDERS[self.linkage] + + connectivity = self.connectivity + if self.connectivity is not None: + if callable(self.connectivity): + connectivity = self.connectivity(X) + connectivity = check_array( + connectivity, accept_sparse=["csr", "coo", "lil"] + ) + + n_samples = len(X) + compute_full_tree = self.compute_full_tree + if self.connectivity is None: + compute_full_tree = True + if compute_full_tree == "auto": + if self.distance_threshold is not None: + compute_full_tree = True + else: + # Early stopping is likely to give a speed up only for + # a large number of clusters. 
The actual threshold + # implemented here is heuristic + compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples) + n_clusters = self.n_clusters + if compute_full_tree: + n_clusters = None + + # Construct the tree + kwargs = {} + if self.linkage != "ward": + kwargs["linkage"] = self.linkage + kwargs["affinity"] = self.metric + + distance_threshold = self.distance_threshold + + return_distance = (distance_threshold is not None) or self.compute_distances + + out = memory.cache(tree_builder)( + X, + connectivity=connectivity, + n_clusters=n_clusters, + return_distance=return_distance, + **kwargs, + ) + (self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[ + :4 + ] + + if return_distance: + self.distances_ = out[-1] + + if self.distance_threshold is not None: # distance_threshold is used + self.n_clusters_ = ( + np.count_nonzero(self.distances_ >= distance_threshold) + 1 + ) + else: # n_clusters is used + self.n_clusters_ = self.n_clusters + + # Cut the tree + if compute_full_tree: + self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_) + else: + labels = _hierarchical.hc_get_heads(parents, copy=False) + # copy to avoid holding a reference on the original array + labels = np.copy(labels[:n_samples]) + # Reassign cluster numbers + self.labels_ = np.searchsorted(np.unique(labels), labels) + return self + + def fit_predict(self, X, y=None): + """Fit and return the result of each sample's clustering assignment. + + In addition to fitting, this method also return the result of the + clustering assignment for each sample in the training set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``affinity='precomputed'``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. + """ + return super().fit_predict(X, y) + + +class FeatureAgglomeration( + ClassNamePrefixFeaturesOutMixin, AgglomerationTransform, AgglomerativeClustering +): + """Agglomerate features. + + Recursively merges pair of clusters of features. + + Refer to + :ref:`sphx_glr_auto_examples_cluster_plot_feature_agglomeration_vs_univariate_selection.py` + for an example comparison of :class:`FeatureAgglomeration` strategy with a + univariate feature selection strategy (based on ANOVA). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int or None, default=2 + The number of clusters to find. It must be ``None`` if + ``distance_threshold`` is not ``None``. + + metric : str or callable, default="euclidean" + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". If linkage is "ward", only + "euclidean" is accepted. If "precomputed", a distance matrix is needed + as input for the fit method. + + .. versionadded:: 1.2 + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + connectivity : array-like, sparse matrix, or callable, default=None + Connectivity matrix. Defines for each feature the neighboring + features following a given structure of the data. 
+ This can be a connectivity matrix itself or a callable that transforms + the data into a connectivity matrix, such as derived from + `kneighbors_graph`. Default is `None`, i.e, the + hierarchical clustering algorithm is unstructured. + + compute_full_tree : 'auto' or bool, default='auto' + Stop early the construction of the tree at `n_clusters`. This is useful + to decrease computation time if the number of clusters is not small + compared to the number of features. This option is useful only when + specifying a connectivity matrix. Note also that when varying the + number of clusters and using caching, it may be advantageous to compute + the full tree. It must be ``True`` if ``distance_threshold`` is not + ``None``. By default `compute_full_tree` is "auto", which is equivalent + to `True` when `distance_threshold` is not `None` or that `n_clusters` + is inferior to the maximum between 100 or `0.02 * n_samples`. + Otherwise, "auto" is equivalent to `False`. + + linkage : {"ward", "complete", "average", "single"}, default="ward" + Which linkage criterion to use. The linkage criterion determines which + distance to use between sets of features. The algorithm will merge + the pairs of cluster that minimize this criterion. + + - "ward" minimizes the variance of the clusters being merged. + - "complete" or maximum linkage uses the maximum distances between + all features of the two sets. + - "average" uses the average of the distances of each feature of + the two sets. + - "single" uses the minimum of the distances between all features + of the two sets. + + pooling_func : callable, default=np.mean + This combines the values of agglomerated features into a single + value, and should accept an array of shape [M, N] and the keyword + argument `axis=1`, and reduce it to an array of size [M]. + + distance_threshold : float, default=None + The linkage distance threshold at or above which clusters will not be + merged. If not ``None``, ``n_clusters`` must be ``None`` and + ``compute_full_tree`` must be ``True``. + + .. versionadded:: 0.21 + + compute_distances : bool, default=False + Computes distances between clusters even if `distance_threshold` is not + used. This can be used to make dendrogram visualization, but introduces + a computational and memory overhead. + + .. versionadded:: 0.24 + + Attributes + ---------- + n_clusters_ : int + The number of clusters found by the algorithm. If + ``distance_threshold=None``, it will be equal to the given + ``n_clusters``. + + labels_ : array-like of (n_features,) + Cluster labels for each feature. + + n_leaves_ : int + Number of leaves in the hierarchical tree. + + n_connected_components_ : int + The estimated number of connected components in the graph. + + .. versionadded:: 0.21 + ``n_connected_components_`` was added to replace ``n_components_``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + children_ : array-like of shape (n_nodes-1, 2) + The children of each non-leaf node. Values less than `n_features` + correspond to leaves of the tree which are the original samples. + A node `i` greater than or equal to `n_features` is a non-leaf + node and has children `children_[i - n_features]`. Alternatively + at the i-th iteration, children[i][0] and children[i][1] + are merged to form node `n_features + i`. 
+ + distances_ : array-like of shape (n_nodes-1,) + Distances between nodes in the corresponding place in `children_`. + Only computed if `distance_threshold` is used or `compute_distances` + is set to `True`. + + See Also + -------- + AgglomerativeClustering : Agglomerative clustering samples instead of + features. + ward_tree : Hierarchical clustering with ward linkage. + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets, cluster + >>> digits = datasets.load_digits() + >>> images = digits.images + >>> X = np.reshape(images, (len(images), -1)) + >>> agglo = cluster.FeatureAgglomeration(n_clusters=32) + >>> agglo.fit(X) + FeatureAgglomeration(n_clusters=32) + >>> X_reduced = agglo.transform(X) + >>> X_reduced.shape + (1797, 32) + """ + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left"), None], + "metric": [ + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "memory": [str, HasMethods("cache"), None], + "connectivity": ["array-like", "sparse matrix", callable, None], + "compute_full_tree": [StrOptions({"auto"}), "boolean"], + "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))], + "pooling_func": [callable], + "distance_threshold": [Interval(Real, 0, None, closed="left"), None], + "compute_distances": ["boolean"], + } + + def __init__( + self, + n_clusters=2, + *, + metric="euclidean", + memory=None, + connectivity=None, + compute_full_tree="auto", + linkage="ward", + pooling_func=np.mean, + distance_threshold=None, + compute_distances=False, + ): + super().__init__( + n_clusters=n_clusters, + memory=memory, + connectivity=connectivity, + compute_full_tree=compute_full_tree, + linkage=linkage, + metric=metric, + distance_threshold=distance_threshold, + compute_distances=compute_distances, + ) + self.pooling_func = pooling_func + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the hierarchical clustering on the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the transformer. 
+ """ + X = validate_data(self, X, ensure_min_features=2) + super()._fit(X.T) + self._n_features_out = self.n_clusters_ + return self + + @property + def fit_predict(self): + """Fit and return the result of each sample's clustering assignment.""" + raise AttributeError diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..04a4e68024d33350b9fdd844f6bc614e4c22f39a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py @@ -0,0 +1,621 @@ +"""Spectral biclustering algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod +from numbers import Integral + +import numpy as np +from scipy.linalg import norm +from scipy.sparse import dia_matrix, issparse +from scipy.sparse.linalg import eigsh, svds + +from ..base import BaseEstimator, BiclusterMixin, _fit_context +from ..utils import check_random_state, check_scalar +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_svd, make_nonnegative, safe_sparse_dot +from ..utils.validation import assert_all_finite, validate_data +from ._kmeans import KMeans, MiniBatchKMeans + +__all__ = ["SpectralBiclustering", "SpectralCoclustering"] + + +def _scale_normalize(X): + """Normalize ``X`` by scaling rows and columns independently. + + Returns the normalized matrix and the row and column scaling + factors. + """ + X = make_nonnegative(X) + row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze() + col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze() + row_diag = np.where(np.isnan(row_diag), 0, row_diag) + col_diag = np.where(np.isnan(col_diag), 0, col_diag) + if issparse(X): + n_rows, n_cols = X.shape + r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows)) + c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols)) + an = r @ X @ c + else: + an = row_diag[:, np.newaxis] * X * col_diag + return an, row_diag, col_diag + + +def _bistochastic_normalize(X, max_iter=1000, tol=1e-5): + """Normalize rows and columns of ``X`` simultaneously so that all + rows sum to one constant and all columns sum to a different + constant. + """ + # According to paper, this can also be done more efficiently with + # deviation reduction and balancing algorithms. + X = make_nonnegative(X) + X_scaled = X + for _ in range(max_iter): + X_new, _, _ = _scale_normalize(X_scaled) + if issparse(X): + dist = norm(X_scaled.data - X.data) + else: + dist = norm(X_scaled - X_new) + X_scaled = X_new + if dist is not None and dist < tol: + break + return X_scaled + + +def _log_normalize(X): + """Normalize ``X`` according to Kluger's log-interactions scheme.""" + X = make_nonnegative(X, min_value=1) + if issparse(X): + raise ValueError( + "Cannot compute log of a sparse matrix," + " because log(x) diverges to -infinity as x" + " goes to 0." 
+ ) + L = np.log(X) + row_avg = L.mean(axis=1)[:, np.newaxis] + col_avg = L.mean(axis=0) + avg = L.mean() + return L - row_avg - col_avg + avg + + +class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for spectral biclustering.""" + + _parameter_constraints: dict = { + "svd_method": [StrOptions({"randomized", "arpack"})], + "n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None], + "mini_batch": ["boolean"], + "init": [StrOptions({"k-means++", "random"}), np.ndarray], + "n_init": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + n_clusters=3, + svd_method="randomized", + n_svd_vecs=None, + mini_batch=False, + init="k-means++", + n_init=10, + random_state=None, + ): + self.n_clusters = n_clusters + self.svd_method = svd_method + self.n_svd_vecs = n_svd_vecs + self.mini_batch = mini_batch + self.init = init + self.n_init = n_init + self.random_state = random_state + + @abstractmethod + def _check_parameters(self, n_samples): + """Validate parameters depending on the input data.""" + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Create a biclustering for X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + SpectralBiclustering instance. + """ + X = validate_data(self, X, accept_sparse="csr", dtype=np.float64) + self._check_parameters(X.shape[0]) + self._fit(X) + return self + + def _svd(self, array, n_components, n_discard): + """Returns first `n_components` left and right singular + vectors u and v, discarding the first `n_discard`. + """ + if self.svd_method == "randomized": + kwargs = {} + if self.n_svd_vecs is not None: + kwargs["n_oversamples"] = self.n_svd_vecs + u, _, vt = _randomized_svd( + array, n_components, random_state=self.random_state, **kwargs + ) + + elif self.svd_method == "arpack": + u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs) + if np.any(np.isnan(vt)): + # some eigenvalues of A * A.T are negative, causing + # sqrt() to be np.nan. This causes some vectors in vt + # to be np.nan. + A = safe_sparse_dot(array.T, array) + random_state = check_random_state(self.random_state) + # initialize with [-1,1] as in ARPACK + v0 = random_state.uniform(-1, 1, A.shape[0]) + _, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0) + vt = v.T + if np.any(np.isnan(u)): + A = safe_sparse_dot(array, array.T) + random_state = check_random_state(self.random_state) + # initialize with [-1,1] as in ARPACK + v0 = random_state.uniform(-1, 1, A.shape[0]) + _, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0) + + assert_all_finite(u) + assert_all_finite(vt) + u = u[:, n_discard:] + vt = vt[n_discard:] + return u, vt.T + + def _k_means(self, data, n_clusters): + if self.mini_batch: + model = MiniBatchKMeans( + n_clusters, + init=self.init, + n_init=self.n_init, + random_state=self.random_state, + ) + else: + model = KMeans( + n_clusters, + init=self.init, + n_init=self.n_init, + random_state=self.random_state, + ) + model.fit(data) + centroid = model.cluster_centers_ + labels = model.labels_ + return centroid, labels + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SpectralCoclustering(BaseSpectral): + """Spectral Co-Clustering algorithm (Dhillon, 2001). 
+ + Clusters rows and columns of an array `X` to solve the relaxed + normalized cut of the bipartite graph created from `X` as follows: + the edge between row vertex `i` and column vertex `j` has weight + `X[i, j]`. + + The resulting bicluster structure is block-diagonal, since each + row and each column belongs to exactly one bicluster. + + Supports sparse matrices, as long as they are nonnegative. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int, default=3 + The number of biclusters to find. + + svd_method : {'randomized', 'arpack'}, default='randomized' + Selects the algorithm for finding singular vectors. May be + 'randomized' or 'arpack'. If 'randomized', use + :func:`sklearn.utils.extmath.randomized_svd`, which may be faster + for large matrices. If 'arpack', use + :func:`scipy.sparse.linalg.svds`, which is more accurate, but + possibly slower in some cases. + + n_svd_vecs : int, default=None + Number of vectors to use in calculating the SVD. Corresponds + to `ncv` when `svd_method=arpack` and `n_oversamples` when + `svd_method` is 'randomized`. + + mini_batch : bool, default=False + Whether to use mini-batch k-means, which is faster but may get + different results. + + init : {'k-means++', 'random'}, or ndarray of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization of k-means algorithm; defaults to + 'k-means++'. + + n_init : int, default=10 + Number of random initializations that are tried with the + k-means algorithm. + + If mini-batch k-means is used, the best initialization is + chosen and the algorithm runs once. Otherwise, the algorithm + is run for each initialization and the best solution chosen. + + random_state : int, RandomState instance, default=None + Used for randomizing the singular value decomposition and the k-means + initialization. Use an int to make the randomness deterministic. + See :term:`Glossary `. + + Attributes + ---------- + rows_ : array-like of shape (n_row_clusters, n_rows) + Results of the clustering. `rows[i, r]` is True if + cluster `i` contains row `r`. Available only after calling ``fit``. + + columns_ : array-like of shape (n_column_clusters, n_columns) + Results of the clustering, like `rows`. + + row_labels_ : array-like of shape (n_rows,) + The bicluster label of each row. + + column_labels_ : array-like of shape (n_cols,) + The bicluster label of each column. + + biclusters_ : tuple of two ndarrays + The tuple contains the `rows_` and `columns_` arrays. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SpectralBiclustering : Partitions rows and columns under the assumption + that the data has an underlying checkerboard structure. + + References + ---------- + * :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using + bipartite spectral graph partitioning. + <10.1145/502512.502550>` + + Examples + -------- + >>> from sklearn.cluster import SpectralCoclustering + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... 
[4, 7], [3, 5], [3, 6]]) + >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X) + >>> clustering.row_labels_ #doctest: +SKIP + array([0, 1, 1, 0, 0, 0], dtype=int32) + >>> clustering.column_labels_ #doctest: +SKIP + array([0, 0], dtype=int32) + >>> clustering + SpectralCoclustering(n_clusters=2, random_state=0) + + For a more detailed example, see the following: + :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_coclustering.py`. + """ + + _parameter_constraints: dict = { + **BaseSpectral._parameter_constraints, + "n_clusters": [Interval(Integral, 1, None, closed="left")], + } + + def __init__( + self, + n_clusters=3, + *, + svd_method="randomized", + n_svd_vecs=None, + mini_batch=False, + init="k-means++", + n_init=10, + random_state=None, + ): + super().__init__( + n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state + ) + + def _check_parameters(self, n_samples): + if self.n_clusters > n_samples: + raise ValueError( + f"n_clusters should be <= n_samples={n_samples}. Got" + f" {self.n_clusters} instead." + ) + + def _fit(self, X): + normalized_data, row_diag, col_diag = _scale_normalize(X) + n_sv = 1 + int(np.ceil(np.log2(self.n_clusters))) + u, v = self._svd(normalized_data, n_sv, n_discard=1) + z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v)) + + _, labels = self._k_means(z, self.n_clusters) + + n_rows = X.shape[0] + self.row_labels_ = labels[:n_rows] + self.column_labels_ = labels[n_rows:] + + self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)]) + self.columns_ = np.vstack( + [self.column_labels_ == c for c in range(self.n_clusters)] + ) + + +class SpectralBiclustering(BaseSpectral): + """Spectral biclustering (Kluger, 2003). + + Partitions rows and columns under the assumption that the data has + an underlying checkerboard structure. For instance, if there are + two row partitions and three column partitions, each row will + belong to three biclusters, and each column will belong to two + biclusters. The outer product of the corresponding row and column + label vectors gives this checkerboard structure. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3 + The number of row and column clusters in the checkerboard + structure. + + method : {'bistochastic', 'scale', 'log'}, default='bistochastic' + Method of normalizing and converting singular vectors into + biclusters. May be one of 'scale', 'bistochastic', or 'log'. + The authors recommend using 'log'. If the data is sparse, + however, log normalization will not work, which is why the + default is 'bistochastic'. + + .. warning:: + if `method='log'`, the data must not be sparse. + + n_components : int, default=6 + Number of singular vectors to check. + + n_best : int, default=3 + Number of best singular vectors to which to project the data + for clustering. + + svd_method : {'randomized', 'arpack'}, default='randomized' + Selects the algorithm for finding singular vectors. May be + 'randomized' or 'arpack'. If 'randomized', uses + :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster + for large matrices. If 'arpack', uses + `scipy.sparse.linalg.svds`, which is more accurate, but + possibly slower in some cases. + + n_svd_vecs : int, default=None + Number of vectors to use in calculating the SVD. Corresponds + to `ncv` when `svd_method=arpack` and `n_oversamples` when + `svd_method` is 'randomized`. 
+ + mini_batch : bool, default=False + Whether to use mini-batch k-means, which is faster but may get + different results. + + init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \ + default='k-means++' + Method for initialization of k-means algorithm; defaults to + 'k-means++'. + + n_init : int, default=10 + Number of random initializations that are tried with the + k-means algorithm. + + If mini-batch k-means is used, the best initialization is + chosen and the algorithm runs once. Otherwise, the algorithm + is run for each initialization and the best solution chosen. + + random_state : int, RandomState instance, default=None + Used for randomizing the singular value decomposition and the k-means + initialization. Use an int to make the randomness deterministic. + See :term:`Glossary `. + + Attributes + ---------- + rows_ : array-like of shape (n_row_clusters, n_rows) + Results of the clustering. `rows[i, r]` is True if + cluster `i` contains row `r`. Available only after calling ``fit``. + + columns_ : array-like of shape (n_column_clusters, n_columns) + Results of the clustering, like `rows`. + + row_labels_ : array-like of shape (n_rows,) + Row partition labels. + + column_labels_ : array-like of shape (n_cols,) + Column partition labels. + + biclusters_ : tuple of two ndarrays + The tuple contains the `rows_` and `columns_` arrays. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001). + + References + ---------- + + * :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray + data: coclustering genes and conditions. + <10.1101/gr.648603>` + + Examples + -------- + >>> from sklearn.cluster import SpectralBiclustering + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X) + >>> clustering.row_labels_ + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> clustering.column_labels_ + array([1, 0], dtype=int32) + >>> clustering + SpectralBiclustering(n_clusters=2, random_state=0) + + For a more detailed example, see + :ref:`sphx_glr_auto_examples_bicluster_plot_spectral_biclustering.py` + """ + + _parameter_constraints: dict = { + **BaseSpectral._parameter_constraints, + "n_clusters": [Interval(Integral, 1, None, closed="left"), tuple], + "method": [StrOptions({"bistochastic", "scale", "log"})], + "n_components": [Interval(Integral, 1, None, closed="left")], + "n_best": [Interval(Integral, 1, None, closed="left")], + } + + def __init__( + self, + n_clusters=3, + *, + method="bistochastic", + n_components=6, + n_best=3, + svd_method="randomized", + n_svd_vecs=None, + mini_batch=False, + init="k-means++", + n_init=10, + random_state=None, + ): + super().__init__( + n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state + ) + self.method = method + self.n_components = n_components + self.n_best = n_best + + def _check_parameters(self, n_samples): + if isinstance(self.n_clusters, Integral): + if self.n_clusters > n_samples: + raise ValueError( + f"n_clusters should be <= n_samples={n_samples}. Got" + f" {self.n_clusters} instead." 
+ ) + else: # tuple + try: + n_row_clusters, n_column_clusters = self.n_clusters + check_scalar( + n_row_clusters, + "n_row_clusters", + target_type=Integral, + min_val=1, + max_val=n_samples, + ) + check_scalar( + n_column_clusters, + "n_column_clusters", + target_type=Integral, + min_val=1, + max_val=n_samples, + ) + except (ValueError, TypeError) as e: + raise ValueError( + "Incorrect parameter n_clusters has value:" + f" {self.n_clusters}. It should either be a single integer" + " or an iterable with two integers:" + " (n_row_clusters, n_column_clusters)" + " And the values are should be in the" + " range: (1, n_samples)" + ) from e + + if self.n_best > self.n_components: + raise ValueError( + f"n_best={self.n_best} must be <= n_components={self.n_components}." + ) + + def _fit(self, X): + n_sv = self.n_components + if self.method == "bistochastic": + normalized_data = _bistochastic_normalize(X) + n_sv += 1 + elif self.method == "scale": + normalized_data, _, _ = _scale_normalize(X) + n_sv += 1 + elif self.method == "log": + normalized_data = _log_normalize(X) + n_discard = 0 if self.method == "log" else 1 + u, v = self._svd(normalized_data, n_sv, n_discard) + ut = u.T + vt = v.T + + try: + n_row_clusters, n_col_clusters = self.n_clusters + except TypeError: + n_row_clusters = n_col_clusters = self.n_clusters + + best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters) + + best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters) + + self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters) + + self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters) + + self.rows_ = np.vstack( + [ + self.row_labels_ == label + for label in range(n_row_clusters) + for _ in range(n_col_clusters) + ] + ) + self.columns_ = np.vstack( + [ + self.column_labels_ == label + for _ in range(n_row_clusters) + for label in range(n_col_clusters) + ] + ) + + def _fit_best_piecewise(self, vectors, n_best, n_clusters): + """Find the ``n_best`` vectors that are best approximated by piecewise + constant vectors. + + The piecewise vectors are found by k-means; the best is chosen + according to Euclidean distance. 
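+        Concretely, each row of ``vectors`` is quantized into ``n_clusters``
+        constant levels with k-means, and the rows whose piecewise-constant
+        approximation has the smallest Euclidean error are returned.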
+ + """ + + def make_piecewise(v): + centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters) + return centroid[labels].ravel() + + piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors) + dists = np.apply_along_axis(norm, axis=1, arr=(vectors - piecewise_vectors)) + result = vectors[np.argsort(dists)[:n_best]] + return result + + def _project_and_cluster(self, data, vectors, n_clusters): + """Project ``data`` to ``vectors`` and cluster the result.""" + projected = safe_sparse_dot(data, vectors) + _, labels = self._k_means(projected, n_clusters) + return labels diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py new file mode 100644 index 0000000000000000000000000000000000000000..4c894a644c8bc8b96b1c285358fd6a9cbf803a47 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py @@ -0,0 +1,749 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from .._config import config_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..metrics import pairwise_distances_argmin +from ..metrics.pairwise import euclidean_distances +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import check_is_fitted, validate_data +from . import AgglomerativeClustering + + +def _iterate_sparse_X(X): + """This little hack returns a densified row when iterating over a sparse + matrix, instead of constructing a sparse matrix for every row that is + expensive. + """ + n_samples = X.shape[0] + X_indices = X.indices + X_data = X.data + X_indptr = X.indptr + + for i in range(n_samples): + row = np.zeros(X.shape[1]) + startptr, endptr = X_indptr[i], X_indptr[i + 1] + nonzero_indices = X_indices[startptr:endptr] + row[nonzero_indices] = X_data[startptr:endptr] + yield row + + +def _split_node(node, threshold, branching_factor): + """The node has to be split if there is no place for a new subcluster + in the node. + 1. Two empty nodes and two empty subclusters are initialized. + 2. The pair of distant subclusters are found. + 3. The properties of the empty subclusters and nodes are updated + according to the nearest distance between the subclusters to the + pair of distant subclusters. + 4. The two nodes are set as children to the two subclusters. 
+ """ + new_subcluster1 = _CFSubcluster() + new_subcluster2 = _CFSubcluster() + new_node1 = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=node.is_leaf, + n_features=node.n_features, + dtype=node.init_centroids_.dtype, + ) + new_node2 = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=node.is_leaf, + n_features=node.n_features, + dtype=node.init_centroids_.dtype, + ) + new_subcluster1.child_ = new_node1 + new_subcluster2.child_ = new_node2 + + if node.is_leaf: + if node.prev_leaf_ is not None: + node.prev_leaf_.next_leaf_ = new_node1 + new_node1.prev_leaf_ = node.prev_leaf_ + new_node1.next_leaf_ = new_node2 + new_node2.prev_leaf_ = new_node1 + new_node2.next_leaf_ = node.next_leaf_ + if node.next_leaf_ is not None: + node.next_leaf_.prev_leaf_ = new_node2 + + dist = euclidean_distances( + node.centroids_, Y_norm_squared=node.squared_norm_, squared=True + ) + n_clusters = dist.shape[0] + + farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters)) + node1_dist, node2_dist = dist[(farthest_idx,)] + + node1_closer = node1_dist < node2_dist + # make sure node1 is closest to itself even if all distances are equal. + # This can only happen when all node.centroids_ are duplicates leading to all + # distances between centroids being zero. + node1_closer[farthest_idx[0]] = True + + for idx, subcluster in enumerate(node.subclusters_): + if node1_closer[idx]: + new_node1.append_subcluster(subcluster) + new_subcluster1.update(subcluster) + else: + new_node2.append_subcluster(subcluster) + new_subcluster2.update(subcluster) + return new_subcluster1, new_subcluster2 + + +class _CFNode: + """Each node in a CFTree is called a CFNode. + + The CFNode can have a maximum of branching_factor + number of CFSubclusters. + + Parameters + ---------- + threshold : float + Threshold needed for a new subcluster to enter a CFSubcluster. + + branching_factor : int + Maximum number of CF subclusters in each node. + + is_leaf : bool + We need to know if the CFNode is a leaf or not, in order to + retrieve the final subclusters. + + n_features : int + The number of features. + + Attributes + ---------- + subclusters_ : list + List of subclusters for a particular CFNode. + + prev_leaf_ : _CFNode + Useful only if is_leaf is True. + + next_leaf_ : _CFNode + next_leaf. Useful only if is_leaf is True. + the final subclusters. + + init_centroids_ : ndarray of shape (branching_factor + 1, n_features) + Manipulate ``init_centroids_`` throughout rather than centroids_ since + the centroids are just a view of the ``init_centroids_`` . + + init_sq_norm_ : ndarray of shape (branching_factor + 1,) + manipulate init_sq_norm_ throughout. similar to ``init_centroids_``. + + centroids_ : ndarray of shape (branching_factor + 1, n_features) + View of ``init_centroids_``. + + squared_norm_ : ndarray of shape (branching_factor + 1,) + View of ``init_sq_norm_``. + + """ + + def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype): + self.threshold = threshold + self.branching_factor = branching_factor + self.is_leaf = is_leaf + self.n_features = n_features + + # The list of subclusters, centroids and squared norms + # to manipulate throughout. 
+ self.subclusters_ = [] + self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype) + self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype) + self.squared_norm_ = [] + self.prev_leaf_ = None + self.next_leaf_ = None + + def append_subcluster(self, subcluster): + n_samples = len(self.subclusters_) + self.subclusters_.append(subcluster) + self.init_centroids_[n_samples] = subcluster.centroid_ + self.init_sq_norm_[n_samples] = subcluster.sq_norm_ + + # Keep centroids and squared norm as views. In this way + # if we change init_centroids and init_sq_norm_, it is + # sufficient, + self.centroids_ = self.init_centroids_[: n_samples + 1, :] + self.squared_norm_ = self.init_sq_norm_[: n_samples + 1] + + def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2): + """Remove a subcluster from a node and update it with the + split subclusters. + """ + ind = self.subclusters_.index(subcluster) + self.subclusters_[ind] = new_subcluster1 + self.init_centroids_[ind] = new_subcluster1.centroid_ + self.init_sq_norm_[ind] = new_subcluster1.sq_norm_ + self.append_subcluster(new_subcluster2) + + def insert_cf_subcluster(self, subcluster): + """Insert a new subcluster into the node.""" + if not self.subclusters_: + self.append_subcluster(subcluster) + return False + + threshold = self.threshold + branching_factor = self.branching_factor + # We need to find the closest subcluster among all the + # subclusters so that we can insert our new subcluster. + dist_matrix = np.dot(self.centroids_, subcluster.centroid_) + dist_matrix *= -2.0 + dist_matrix += self.squared_norm_ + closest_index = np.argmin(dist_matrix) + closest_subcluster = self.subclusters_[closest_index] + + # If the subcluster has a child, we need a recursive strategy. + if closest_subcluster.child_ is not None: + split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster) + + if not split_child: + # If it is determined that the child need not be split, we + # can just update the closest_subcluster + closest_subcluster.update(subcluster) + self.init_centroids_[closest_index] = self.subclusters_[ + closest_index + ].centroid_ + self.init_sq_norm_[closest_index] = self.subclusters_[ + closest_index + ].sq_norm_ + return False + + # things not too good. we need to redistribute the subclusters in + # our child node, and add a new subcluster in the parent + # subcluster to accommodate the new child. + else: + new_subcluster1, new_subcluster2 = _split_node( + closest_subcluster.child_, + threshold, + branching_factor, + ) + self.update_split_subclusters( + closest_subcluster, new_subcluster1, new_subcluster2 + ) + + if len(self.subclusters_) > self.branching_factor: + return True + return False + + # good to go! + else: + merged = closest_subcluster.merge_subcluster(subcluster, self.threshold) + if merged: + self.init_centroids_[closest_index] = closest_subcluster.centroid_ + self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_ + return False + + # not close to any other subclusters, and we still + # have space, so add. + elif len(self.subclusters_) < self.branching_factor: + self.append_subcluster(subcluster) + return False + + # We do not have enough space nor is it closer to an + # other subcluster. We need to split. + else: + self.append_subcluster(subcluster) + return True + + +class _CFSubcluster: + """Each subcluster in a CFNode is called a CFSubcluster. + + A CFSubcluster can have a CFNode has its child. 
+ + Parameters + ---------- + linear_sum : ndarray of shape (n_features,), default=None + Sample. This is kept optional to allow initialization of empty + subclusters. + + Attributes + ---------- + n_samples_ : int + Number of samples that belong to each subcluster. + + linear_sum_ : ndarray + Linear sum of all the samples in a subcluster. Prevents holding + all sample data in memory. + + squared_sum_ : float + Sum of the squared l2 norms of all samples belonging to a subcluster. + + centroid_ : ndarray of shape (branching_factor + 1, n_features) + Centroid of the subcluster. Prevent recomputing of centroids when + ``CFNode.centroids_`` is called. + + child_ : _CFNode + Child Node of the subcluster. Once a given _CFNode is set as the child + of the _CFNode, it is set to ``self.child_``. + + sq_norm_ : ndarray of shape (branching_factor + 1,) + Squared norm of the subcluster. Used to prevent recomputing when + pairwise minimum distances are computed. + """ + + def __init__(self, *, linear_sum=None): + if linear_sum is None: + self.n_samples_ = 0 + self.squared_sum_ = 0.0 + self.centroid_ = self.linear_sum_ = 0 + else: + self.n_samples_ = 1 + self.centroid_ = self.linear_sum_ = linear_sum + self.squared_sum_ = self.sq_norm_ = np.dot( + self.linear_sum_, self.linear_sum_ + ) + self.child_ = None + + def update(self, subcluster): + self.n_samples_ += subcluster.n_samples_ + self.linear_sum_ += subcluster.linear_sum_ + self.squared_sum_ += subcluster.squared_sum_ + self.centroid_ = self.linear_sum_ / self.n_samples_ + self.sq_norm_ = np.dot(self.centroid_, self.centroid_) + + def merge_subcluster(self, nominee_cluster, threshold): + """Check if a cluster is worthy enough to be merged. If + yes then merge. + """ + new_ss = self.squared_sum_ + nominee_cluster.squared_sum_ + new_ls = self.linear_sum_ + nominee_cluster.linear_sum_ + new_n = self.n_samples_ + nominee_cluster.n_samples_ + new_centroid = (1 / new_n) * new_ls + new_sq_norm = np.dot(new_centroid, new_centroid) + + # The squared radius of the cluster is defined: + # r^2 = sum_i ||x_i - c||^2 / n + # with x_i the n points assigned to the cluster and c its centroid: + # c = sum_i x_i / n + # This can be expanded to: + # r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n + # and therefore simplifies to: + # r^2 = sum_i ||x_i||^2 / n - ||c||^2 + sq_radius = new_ss / new_n - new_sq_norm + + if sq_radius <= threshold**2: + ( + self.n_samples_, + self.linear_sum_, + self.squared_sum_, + self.centroid_, + self.sq_norm_, + ) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm) + return True + return False + + @property + def radius(self): + """Return radius of the subcluster""" + # Because of numerical issues, this could become negative + sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_ + return sqrt(max(0, sq_radius)) + + +class Birch( + ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator +): + """Implements the BIRCH clustering algorithm. + + It is a memory-efficient, online-learning algorithm provided as an + alternative to :class:`MiniBatchKMeans`. It constructs a tree + data structure with the cluster centroids being read off the leaf. + These can be either the final cluster centroids or can be provided as input + to another clustering algorithm such as :class:`AgglomerativeClustering`. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 0.16 + + Parameters + ---------- + threshold : float, default=0.5 + The radius of the subcluster obtained by merging a new sample and the + closest subcluster should be lesser than the threshold. Otherwise a new + subcluster is started. Setting this value to be very low promotes + splitting and vice-versa. + + branching_factor : int, default=50 + Maximum number of CF subclusters in each node. If a new samples enters + such that the number of subclusters exceed the branching_factor then + that node is split into two nodes with the subclusters redistributed + in each. The parent subcluster of that node is removed and two new + subclusters are added as parents of the 2 split nodes. + + n_clusters : int, instance of sklearn.cluster model or None, default=3 + Number of clusters after the final clustering step, which treats the + subclusters from the leaves as new samples. + + - `None` : the final clustering step is not performed and the + subclusters are returned as they are. + + - :mod:`sklearn.cluster` Estimator : If a model is provided, the model + is fit treating the subclusters as new samples and the initial data + is mapped to the label of the closest subcluster. + + - `int` : the model fit is :class:`AgglomerativeClustering` with + `n_clusters` set to be equal to the int. + + compute_labels : bool, default=True + Whether or not to compute labels for each fit. + + copy : bool, default=True + Whether or not to make a copy of the given data. If set to False, + the initial data will be overwritten. + + .. deprecated:: 1.6 + `copy` was deprecated in 1.6 and will be removed in 1.8. It has no effect + as the estimator does not perform in-place operations on the input data. + + Attributes + ---------- + root_ : _CFNode + Root of the CFTree. + + dummy_leaf_ : _CFNode + Start pointer to all the leaves. + + subcluster_centers_ : ndarray + Centroids of all subclusters read directly from the leaves. + + subcluster_labels_ : ndarray + Labels assigned to the centroids of the subclusters after + they are clustered globally. + + labels_ : ndarray of shape (n_samples,) + Array of labels assigned to the input data. + if partial_fit is used instead of fit, they are assigned to the + last batch of data. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MiniBatchKMeans : Alternative implementation that does incremental updates + of the centers' positions using mini-batches. + + Notes + ----- + The tree data structure consists of nodes with each node consisting of + a number of subclusters. The maximum number of subclusters in a node + is determined by the branching factor. Each subcluster maintains a + linear sum, squared sum and the number of samples in that subcluster. + In addition, each subcluster can also have a node as its child, if the + subcluster is not a member of a leaf node. + + For a new point entering the root, it is merged with the subcluster closest + to it and the linear sum, squared sum and the number of samples of that + subcluster are updated. This is done recursively till the properties of + the leaf node are updated. + + See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a + comparison with :class:`~sklearn.cluster.MiniBatchKMeans`. 
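+
+    Calling ``partial_fit`` on successive batches of data updates the same
+    CF tree without rebuilding it from scratch; calling ``partial_fit()``
+    with no arguments afterwards re-runs only the global clustering step
+    on the subclusters collected so far.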
+ + References + ---------- + * Tian Zhang, Raghu Ramakrishnan, Maron Livny + BIRCH: An efficient data clustering method for large databases. + https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf + + * Roberto Perdisci + JBirch - Java implementation of BIRCH clustering algorithm + https://code.google.com/archive/p/jbirch + + Examples + -------- + >>> from sklearn.cluster import Birch + >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]] + >>> brc = Birch(n_clusters=None) + >>> brc.fit(X) + Birch(n_clusters=None) + >>> brc.predict(X) + array([0, 0, 0, 1, 1, 1]) + + For a comparison of the BIRCH clustering algorithm with other clustering algorithms, + see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "threshold": [Interval(Real, 0.0, None, closed="neither")], + "branching_factor": [Interval(Integral, 1, None, closed="neither")], + "n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")], + "compute_labels": ["boolean"], + "copy": ["boolean", Hidden(StrOptions({"deprecated"}))], + } + + def __init__( + self, + *, + threshold=0.5, + branching_factor=50, + n_clusters=3, + compute_labels=True, + copy="deprecated", + ): + self.threshold = threshold + self.branching_factor = branching_factor + self.n_clusters = n_clusters + self.compute_labels = compute_labels + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Build a CF Tree for the input data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Fitted estimator. + """ + return self._fit(X, partial=False) + + def _fit(self, X, partial): + has_root = getattr(self, "root_", None) + first_call = not (partial and has_root) + + if self.copy != "deprecated" and first_call: + warnings.warn( + "`copy` was deprecated in 1.6 and will be removed in 1.8 since it " + "has no effect internally. Simply leave this parameter to its default " + "value to avoid this warning.", + FutureWarning, + ) + + X = validate_data( + self, + X, + accept_sparse="csr", + reset=first_call, + dtype=[np.float64, np.float32], + ) + threshold = self.threshold + branching_factor = self.branching_factor + + n_samples, n_features = X.shape + + # If partial_fit is called for the first time or fit is called, we + # start a new tree. + if first_call: + # The first root is the leaf. Manipulate this object throughout. + self.root_ = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=True, + n_features=n_features, + dtype=X.dtype, + ) + + # To enable getting back subclusters. + self.dummy_leaf_ = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=True, + n_features=n_features, + dtype=X.dtype, + ) + self.dummy_leaf_.next_leaf_ = self.root_ + self.root_.prev_leaf_ = self.dummy_leaf_ + + # Cannot vectorize. Enough to convince to use cython. 
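+        # Insert samples one row at a time: dense input is iterated
+        # directly, while sparse (CSR) input goes through _iterate_sparse_X,
+        # which yields a densified row per step instead of building a
+        # sparse matrix for every row.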
+ if not sparse.issparse(X): + iter_func = iter + else: + iter_func = _iterate_sparse_X + + for sample in iter_func(X): + subcluster = _CFSubcluster(linear_sum=sample) + split = self.root_.insert_cf_subcluster(subcluster) + + if split: + new_subcluster1, new_subcluster2 = _split_node( + self.root_, threshold, branching_factor + ) + del self.root_ + self.root_ = _CFNode( + threshold=threshold, + branching_factor=branching_factor, + is_leaf=False, + n_features=n_features, + dtype=X.dtype, + ) + self.root_.append_subcluster(new_subcluster1) + self.root_.append_subcluster(new_subcluster2) + + centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()]) + self.subcluster_centers_ = centroids + self._n_features_out = self.subcluster_centers_.shape[0] + + self._global_clustering(X) + return self + + def _get_leaves(self): + """ + Retrieve the leaves of the CF Node. + + Returns + ------- + leaves : list of shape (n_leaves,) + List of the leaf nodes. + """ + leaf_ptr = self.dummy_leaf_.next_leaf_ + leaves = [] + while leaf_ptr is not None: + leaves.append(leaf_ptr) + leaf_ptr = leaf_ptr.next_leaf_ + return leaves + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X=None, y=None): + """ + Online learning. Prevents rebuilding of CFTree from scratch. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), \ + default=None + Input data. If X is not provided, only the global clustering + step is done. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Fitted estimator. + """ + if X is None: + # Perform just the final global clustering step. + self._global_clustering() + return self + else: + return self._fit(X, partial=True) + + def predict(self, X): + """ + Predict data using the ``centroids_`` of subclusters. + + Avoid computation of the row norms of X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + labels : ndarray of shape(n_samples,) + Labelled data. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse="csr", reset=False) + return self._predict(X) + + def _predict(self, X): + """Predict data using the ``centroids_`` of subclusters.""" + kwargs = {"Y_norm_squared": self._subcluster_norms} + + with config_context(assume_finite=True): + argmin = pairwise_distances_argmin( + X, self.subcluster_centers_, metric_kwargs=kwargs + ) + return self.subcluster_labels_[argmin] + + def transform(self, X): + """ + Transform X into subcluster centroids dimension. + + Each dimension represents the distance from the sample point to each + cluster centroid. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters) + Transformed data. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse="csr", reset=False) + with config_context(assume_finite=True): + return euclidean_distances(X, self.subcluster_centers_) + + def _global_clustering(self, X=None): + """ + Global clustering for the subclusters obtained after fitting + """ + clusterer = self.n_clusters + centroids = self.subcluster_centers_ + compute_labels = (X is not None) and self.compute_labels + + # Preprocessing for the global clustering. 
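+        # `n_clusters` may be None (keep the leaf subclusters as the final
+        # clusters), an integer (cluster the subcluster centroids with
+        # AgglomerativeClustering), or an already-constructed clustering
+        # estimator. The flag below guards the integer case when fewer
+        # subclusters than requested clusters were found.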
+ not_enough_centroids = False + if isinstance(clusterer, Integral): + clusterer = AgglomerativeClustering(n_clusters=self.n_clusters) + # There is no need to perform the global clustering step. + if len(centroids) < self.n_clusters: + not_enough_centroids = True + + # To use in predict to avoid recalculation. + self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) + + if clusterer is None or not_enough_centroids: + self.subcluster_labels_ = np.arange(len(centroids)) + if not_enough_centroids: + warnings.warn( + "Number of subclusters found (%d) by BIRCH is less " + "than (%d). Decrease the threshold." + % (len(centroids), self.n_clusters), + ConvergenceWarning, + ) + else: + # The global clustering step that clusters the subclusters of + # the leaves. It assumes the centroids of the subclusters as + # samples and finds the final centroids. + self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_) + + if compute_labels: + self.labels_ = self._predict(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_bisect_k_means.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bisect_k_means.py new file mode 100644 index 0000000000000000000000000000000000000000..77e24adbf80848b13f36adc1151686746024bf25 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_bisect_k_means.py @@ -0,0 +1,543 @@ +"""Bisecting K-means clustering.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np +import scipy.sparse as sp + +from ..base import _fit_context +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._param_validation import Integral, Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import ( + _check_sample_weight, + check_is_fitted, + check_random_state, + validate_data, +) +from ._k_means_common import _inertia_dense, _inertia_sparse +from ._kmeans import ( + _BaseKMeans, + _kmeans_single_elkan, + _kmeans_single_lloyd, + _labels_inertia_threadpool_limit, +) + + +class _BisectingTree: + """Tree structure representing the hierarchical clusters of BisectingKMeans.""" + + def __init__(self, center, indices, score): + """Create a new cluster node in the tree. + + The node holds the center of this cluster and the indices of the data points + that belong to it. + """ + self.center = center + self.indices = indices + self.score = score + + self.left = None + self.right = None + + def split(self, labels, centers, scores): + """Split the cluster node into two subclusters.""" + self.left = _BisectingTree( + indices=self.indices[labels == 0], center=centers[0], score=scores[0] + ) + self.right = _BisectingTree( + indices=self.indices[labels == 1], center=centers[1], score=scores[1] + ) + + # reset the indices attribute to save memory + self.indices = None + + def get_cluster_to_bisect(self): + """Return the cluster node to bisect next. + + It's based on the score of the cluster, which can be either the number of + data points assigned to that cluster or the inertia of that cluster + (see `bisecting_strategy` for details). 
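+
+ Among the current leaves, the one with the largest score is returned.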
+ """ + max_score = None + + for cluster_leaf in self.iter_leaves(): + if max_score is None or cluster_leaf.score > max_score: + max_score = cluster_leaf.score + best_cluster_leaf = cluster_leaf + + return best_cluster_leaf + + def iter_leaves(self): + """Iterate over all the cluster leaves in the tree.""" + if self.left is None: + yield self + else: + yield from self.left.iter_leaves() + yield from self.right.iter_leaves() + + +class BisectingKMeans(_BaseKMeans): + """Bisecting K-Means clustering. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.1 + + Parameters + ---------- + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. + + init : {'k-means++', 'random'} or callable, default='random' + Method for initialization: + + 'k-means++' : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + + 'random': choose `n_clusters` observations (rows) at random from data + for the initial centroids. + + If a callable is passed, it should take arguments X, n_clusters and a + random state and return an initialization. + + n_init : int, default=1 + Number of time the inner k-means algorithm will be run with different + centroid seeds in each bisection. + That will result producing for each bisection best output of n_init + consecutive runs in terms of inertia. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for centroid initialization + in inner K-Means. Use an int to make the randomness deterministic. + See :term:`Glossary `. + + max_iter : int, default=300 + Maximum number of iterations of the inner k-means algorithm at each + bisection. + + verbose : int, default=0 + Verbosity mode. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. Used in inner k-means algorithm at each bisection to pick + best possible clusters. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + copy_x is False. If the original data is sparse, but not in CSR format, + a copy will be made even if copy_x is False. + + algorithm : {"lloyd", "elkan"}, default="lloyd" + Inner K-means algorithm used in bisection. + The classical EM-style algorithm is `"lloyd"`. + The `"elkan"` variation can be more efficient on some datasets with + well-defined clusters, by using the triangle inequality. However it's + more memory intensive due to the allocation of an extra array of shape + `(n_samples, n_clusters)`. + + bisecting_strategy : {"biggest_inertia", "largest_cluster"},\ + default="biggest_inertia" + Defines how bisection should be performed: + + - "biggest_inertia" means that BisectingKMeans will always check + all calculated cluster for cluster with biggest SSE + (Sum of squared errors) and bisect it. This approach concentrates on + precision, but may be costly in terms of execution time (especially for + larger amount of data points). 
+ + - "largest_cluster" - BisectingKMeans will always split cluster with + largest amount of points assigned to it from all clusters + previously calculated. That should work faster than picking by SSE + ('biggest_inertia') and may produce similar results in most cases. + + Attributes + ---------- + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. If the algorithm stops before fully + converging (see ``tol`` and ``max_iter``), these will not be + consistent with ``labels_``. + + labels_ : ndarray of shape (n_samples,) + Labels of each point. + + inertia_ : float + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + See Also + -------- + KMeans : Original implementation of K-Means algorithm. + + Notes + ----- + It might be inefficient when n_cluster is less than 3, due to unnecessary + calculations for that case. + + Examples + -------- + >>> from sklearn.cluster import BisectingKMeans + >>> import numpy as np + >>> X = np.array([[1, 1], [10, 1], [3, 1], + ... [10, 0], [2, 1], [10, 2], + ... [10, 8], [10, 9], [10, 10]]) + >>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X) + >>> bisect_means.labels_ + array([0, 2, 0, 2, 0, 2, 1, 1, 1], dtype=int32) + >>> bisect_means.predict([[0, 0], [12, 3]]) + array([0, 2], dtype=int32) + >>> bisect_means.cluster_centers_ + array([[ 2., 1.], + [10., 9.], + [10., 1.]]) + + For a comparison between BisectingKMeans and K-Means refer to example + :ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`. + """ + + _parameter_constraints: dict = { + **_BaseKMeans._parameter_constraints, + "init": [StrOptions({"k-means++", "random"}), callable], + "n_init": [Interval(Integral, 1, None, closed="left")], + "copy_x": ["boolean"], + "algorithm": [StrOptions({"lloyd", "elkan"})], + "bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})], + } + + def __init__( + self, + n_clusters=8, + *, + init="random", + n_init=1, + random_state=None, + max_iter=300, + verbose=0, + tol=1e-4, + copy_x=True, + algorithm="lloyd", + bisecting_strategy="biggest_inertia", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + verbose=verbose, + random_state=random_state, + tol=tol, + n_init=n_init, + ) + + self.copy_x = copy_x + self.algorithm = algorithm + self.bisecting_strategy = bisecting_strategy + + def _warn_mkl_vcomp(self, n_active_threads): + """Warn when vcomp and mkl are both present""" + warnings.warn( + "BisectingKMeans is known to have a memory leak on Windows " + "with MKL, when there are less chunks than available " + "threads. You can avoid it by setting the environment" + f" variable OMP_NUM_THREADS={n_active_threads}." + ) + + def _inertia_per_cluster(self, X, centers, labels, sample_weight): + """Calculate the sum of squared errors (inertia) per cluster. + + Parameters + ---------- + X : {ndarray, csr_matrix} of shape (n_samples, n_features) + The input samples. + + centers : ndarray of shape (n_clusters=2, n_features) + The cluster centers. + + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. 
+ + Returns + ------- + inertia_per_cluster : ndarray of shape (n_clusters=2,) + Sum of squared errors (inertia) for each cluster. + """ + n_clusters = centers.shape[0] # = 2 since centers comes from a bisection + _inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense + + inertia_per_cluster = np.empty(n_clusters) + for label in range(n_clusters): + inertia_per_cluster[label] = _inertia( + X, sample_weight, centers, labels, self._n_threads, single_label=label + ) + + return inertia_per_cluster + + def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): + """Split a cluster into 2 subsclusters. + + Parameters + ---------- + X : {ndarray, csr_matrix} of shape (n_samples, n_features) + Training instances to cluster. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + cluster_to_bisect : _BisectingTree node object + The cluster node to split. + """ + X = X[cluster_to_bisect.indices] + x_squared_norms = x_squared_norms[cluster_to_bisect.indices] + sample_weight = sample_weight[cluster_to_bisect.indices] + + best_inertia = None + + # Split samples in X into 2 clusters. + # Repeating `n_init` times to obtain best clusters + for _ in range(self.n_init): + centers_init = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=self.init, + random_state=self._random_state, + n_centroids=2, + sample_weight=sample_weight, + ) + + labels, inertia, centers, _ = self._kmeans_single( + X, + sample_weight, + centers_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self.tol, + n_threads=self._n_threads, + ) + + # allow small tolerance on the inertia to accommodate for + # non-deterministic rounding errors due to parallel computation + if best_inertia is None or inertia < best_inertia * (1 - 1e-6): + best_labels = labels + best_centers = centers + best_inertia = inertia + + if self.verbose: + print(f"New centroids from bisection: {best_centers}") + + if self.bisecting_strategy == "biggest_inertia": + scores = self._inertia_per_cluster( + X, best_centers, best_labels, sample_weight + ) + else: # bisecting_strategy == "largest_cluster" + # Using minlength to make sure that we have the counts for both labels even + # if all samples are labelled 0. + scores = np.bincount(best_labels, minlength=2) + + cluster_to_bisect.split(best_labels, best_centers, scores) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute bisecting k-means clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + + Training instances to cluster. + + .. note:: The data will be converted to C ordering, + which will cause a memory copy + if the given data is not C-contiguous. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable. + + Returns + ------- + self + Fitted estimator. 
+ """ + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + copy=self.copy_x, + accept_large_sparse=False, + ) + + self._check_params_vs_input(X) + + self._random_state = check_random_state(self.random_state) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() + + if self.algorithm == "lloyd" or self.n_clusters == 1: + self._kmeans_single = _kmeans_single_lloyd + self._check_mkl_vcomp(X, X.shape[0]) + else: + self._kmeans_single = _kmeans_single_elkan + + # Subtract of mean of X for more accurate distance computations + if not sp.issparse(X): + self._X_mean = X.mean(axis=0) + X -= self._X_mean + + # Initialize the hierarchical clusters tree + self._bisecting_tree = _BisectingTree( + indices=np.arange(X.shape[0]), + center=X.mean(axis=0), + score=0, + ) + + x_squared_norms = row_norms(X, squared=True) + + for _ in range(self.n_clusters - 1): + # Chose cluster to bisect + cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect() + + # Split this cluster into 2 subclusters + self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect) + + # Aggregate final labels and centers from the bisecting tree + self.labels_ = np.full(X.shape[0], -1, dtype=np.int32) + self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype) + + for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()): + self.labels_[cluster_node.indices] = i + self.cluster_centers_[i] = cluster_node.center + cluster_node.label = i # label final clusters for future prediction + cluster_node.indices = None # release memory + + # Restore original data + if not sp.issparse(X): + X += self._X_mean + self.cluster_centers_ += self._X_mean + + _inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense + self.inertia_ = _inertia( + X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads + ) + + self._n_features_out = self.cluster_centers_.shape[0] + + return self + + def predict(self, X): + """Predict which cluster each sample in X belongs to. + + Prediction is made by going down the hierarchical tree + in searching of closest leaf cluster. + + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + x_squared_norms = row_norms(X, squared=True) + + # sample weights are unused but necessary in cython helpers + sample_weight = np.ones_like(x_squared_norms) + + labels = self._predict_recursive(X, sample_weight, self._bisecting_tree) + + return labels + + def _predict_recursive(self, X, sample_weight, cluster_node): + """Predict recursively by going down the hierarchical tree. + + Parameters + ---------- + X : {ndarray, csr_matrix} of shape (n_samples, n_features) + The data points, currently assigned to `cluster_node`, to predict between + the subclusters of this node. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + cluster_node : _BisectingTree node object + The cluster node of the hierarchical tree. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. 
+ """ + if cluster_node.left is None: + # This cluster has no subcluster. Labels are just the label of the cluster. + return np.full(X.shape[0], cluster_node.label, dtype=np.int32) + + # Determine if data points belong to the left or right subcluster + centers = np.vstack((cluster_node.left.center, cluster_node.right.center)) + if hasattr(self, "_X_mean"): + centers += self._X_mean + + cluster_labels = _labels_inertia_threadpool_limit( + X, + sample_weight, + centers, + self._n_threads, + return_inertia=False, + ) + mask = cluster_labels == 0 + + # Compute the labels for each subset of the data points. + labels = np.full(X.shape[0], -1, dtype=np.int32) + + labels[mask] = self._predict_recursive( + X[mask], sample_weight[mask], cluster_node.left + ) + + labels[~mask] = self._predict_recursive( + X[~mask], sample_weight[~mask], cluster_node.right + ) + + return labels + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..857a332cc2371a6cbbcc8b69c21cd7e432ccbcc6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py @@ -0,0 +1,480 @@ +""" +DBSCAN: Density-Based Spatial Clustering of Applications with Noise +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..metrics.pairwise import _VALID_METRICS +from ..neighbors import NearestNeighbors +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import _check_sample_weight, validate_data +from ._dbscan_inner import dbscan_inner + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=False, +) +def dbscan( + X, + eps=0.5, + *, + min_samples=5, + metric="minkowski", + metric_params=None, + algorithm="auto", + leaf_size=30, + p=2, + sample_weight=None, + n_jobs=None, +): + """Perform DBSCAN clustering from vector array or distance matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + A feature array, or array of distances between samples if + ``metric='precomputed'``. + + eps : float, default=0.5 + The maximum distance between two samples for one to be considered + as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most + important DBSCAN parameter to choose appropriately for your data set + and distance function. + + min_samples : int, default=5 + The number of samples (or total weight) in a neighborhood for a point + to be considered as a core point. This includes the point itself. + + metric : str or callable, default='minkowski' + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. 
+ If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. + X may be a :term:`sparse graph `, + in which case only "nonzero" elements may be considered neighbors. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.19 + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + The algorithm to be used by the NearestNeighbors module + to compute pointwise distances and find nearest neighbors. + See NearestNeighbors module documentation for details. + + leaf_size : int, default=30 + Leaf size passed to BallTree or cKDTree. This can affect the speed + of the construction and query, as well as the memory required + to store the tree. The optimal value depends + on the nature of the problem. + + p : float, default=2 + The power of the Minkowski metric to be used to calculate distance + between points. + + sample_weight : array-like of shape (n_samples,), default=None + Weight of each sample, such that a sample with a weight of at least + ``min_samples`` is by itself a core sample; a sample with negative + weight may inhibit its eps-neighbor from being core. + Note that weights are absolute, and default to 1. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. ``None`` means + 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means + using all processors. See :term:`Glossary ` for more details. + If precomputed distance are used, parallel execution is not available + and thus n_jobs will have no effect. + + Returns + ------- + core_samples : ndarray of shape (n_core_samples,) + Indices of core samples. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. Noisy samples are given the label -1. + + See Also + -------- + DBSCAN : An estimator interface for this clustering algorithm. + OPTICS : A similar estimator interface clustering at multiple values of + eps. Our implementation is optimized for memory usage. + + Notes + ----- + For an example, see :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`. + + This implementation bulk-computes all neighborhood queries, which increases + the memory complexity to O(n.d) where d is the average number of neighbors, + while original DBSCAN had memory complexity O(n). It may attract a higher + memory complexity when querying these nearest neighborhoods, depending + on the ``algorithm``. + + One way to avoid the query complexity is to pre-compute sparse + neighborhoods in chunks using + :func:`NearestNeighbors.radius_neighbors_graph + ` with + ``mode='distance'``, then using ``metric='precomputed'`` here. + + Another way to reduce memory and computation time is to remove + (near-)duplicate points and use ``sample_weight`` instead. + + :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower + memory usage. + + References + ---------- + Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based + Algorithm for Discovering Clusters in Large Spatial Databases with Noise" + `_. + In: Proceedings of the 2nd International Conference on Knowledge Discovery + and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 + + Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). + :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN." + <10.1145/3068335>` + ACM Transactions on Database Systems (TODS), 42(3), 19. 
+ + Examples + -------- + >>> from sklearn.cluster import dbscan + >>> X = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]] + >>> core_samples, labels = dbscan(X, eps=3, min_samples=2) + >>> core_samples + array([0, 1, 2, 3, 4]) + >>> labels + array([ 0, 0, 0, 1, 1, -1]) + """ + + est = DBSCAN( + eps=eps, + min_samples=min_samples, + metric=metric, + metric_params=metric_params, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, + n_jobs=n_jobs, + ) + est.fit(X, sample_weight=sample_weight) + return est.core_sample_indices_, est.labels_ + + +class DBSCAN(ClusterMixin, BaseEstimator): + """Perform DBSCAN clustering from vector array or distance matrix. + + DBSCAN - Density-Based Spatial Clustering of Applications with Noise. + Finds core samples of high density and expands clusters from them. + Good for data which contains clusters of similar density. + + This implementation has a worst case memory complexity of :math:`O({n}^2)`, + which can occur when the `eps` param is large and `min_samples` is low, + while the original DBSCAN only uses linear memory. + For further details, see the Notes below. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + eps : float, default=0.5 + The maximum distance between two samples for one to be considered + as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most + important DBSCAN parameter to choose appropriately for your data set + and distance function. + + min_samples : int, default=5 + The number of samples (or total weight) in a neighborhood for a point to + be considered as a core point. This includes the point itself. If + `min_samples` is set to a higher value, DBSCAN will find denser clusters, + whereas if it is set to a lower value, the found clusters will be more + sparse. + + metric : str, or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors for DBSCAN. + + .. versionadded:: 0.17 + metric *precomputed* to accept precomputed sparse matrix. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.19 + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + The algorithm to be used by the NearestNeighbors module + to compute pointwise distances and find nearest neighbors. + See NearestNeighbors module documentation for details. + + leaf_size : int, default=30 + Leaf size passed to BallTree or cKDTree. This can affect the speed + of the construction and query, as well as the memory required + to store the tree. The optimal value depends + on the nature of the problem. + + p : float, default=None + The power of the Minkowski metric to be used to calculate distance + between points. If None, then ``p=2`` (equivalent to the Euclidean + distance). + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + core_sample_indices_ : ndarray of shape (n_core_samples,) + Indices of core samples. 
+ + components_ : ndarray of shape (n_core_samples, n_features) + Copy of each core sample found by training. + + labels_ : ndarray of shape (n_samples) + Cluster labels for each point in the dataset given to fit(). + Noisy samples are given the label -1. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + OPTICS : A similar clustering at multiple values of eps. Our implementation + is optimized for memory usage. + + Notes + ----- + This implementation bulk-computes all neighborhood queries, which increases + the memory complexity to O(n.d) where d is the average number of neighbors, + while original DBSCAN had memory complexity O(n). It may attract a higher + memory complexity when querying these nearest neighborhoods, depending + on the ``algorithm``. + + One way to avoid the query complexity is to pre-compute sparse + neighborhoods in chunks using + :func:`NearestNeighbors.radius_neighbors_graph + ` with + ``mode='distance'``, then using ``metric='precomputed'`` here. + + Another way to reduce memory and computation time is to remove + (near-)duplicate points and use ``sample_weight`` instead. + + :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory + usage. + + References + ---------- + Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based + Algorithm for Discovering Clusters in Large Spatial Databases with Noise" + `_. + In: Proceedings of the 2nd International Conference on Knowledge Discovery + and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996 + + Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). + :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN." + <10.1145/3068335>` + ACM Transactions on Database Systems (TODS), 42(3), 19. + + Examples + -------- + >>> from sklearn.cluster import DBSCAN + >>> import numpy as np + >>> X = np.array([[1, 2], [2, 2], [2, 3], + ... [8, 7], [8, 8], [25, 80]]) + >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X) + >>> clustering.labels_ + array([ 0, 0, 0, 1, 1, -1]) + >>> clustering + DBSCAN(eps=3, min_samples=2) + + For an example, see + :ref:`sphx_glr_auto_examples_cluster_plot_dbscan.py`. 
+ + For a comparison of DBSCAN with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "eps": [Interval(Real, 0.0, None, closed="neither")], + "min_samples": [Interval(Integral, 1, None, closed="left")], + "metric": [ + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "metric_params": [dict, None], + "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "p": [Interval(Real, 0.0, None, closed="left"), None], + "n_jobs": [Integral, None], + } + + def __init__( + self, + eps=0.5, + *, + min_samples=5, + metric="euclidean", + metric_params=None, + algorithm="auto", + leaf_size=30, + p=None, + n_jobs=None, + ): + self.eps = eps + self.min_samples = min_samples + self.metric = metric + self.metric_params = metric_params + self.algorithm = algorithm + self.leaf_size = leaf_size + self.p = p + self.n_jobs = n_jobs + + @_fit_context( + # DBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, sample_weight=None): + """Perform DBSCAN clustering from features, or distance matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. If a sparse matrix is provided, it will + be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Weight of each sample, such that a sample with a weight of at least + ``min_samples`` is by itself a core sample; a sample with a + negative weight may inhibit its eps-neighbor from being core. + Note that weights are absolute, and default to 1. + + Returns + ------- + self : object + Returns a fitted instance of self. + """ + X = validate_data(self, X, accept_sparse="csr") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + # Calculate neighborhood for all samples. This leaves the original + # point in, which needs to be considered later (i.e. point i is in the + # neighborhood of point i. While True, its useless information) + if self.metric == "precomputed" and sparse.issparse(X): + # set the diagonal to explicit values, as a point is its own + # neighbor + X = X.copy() # copy to avoid in-place modification + with warnings.catch_warnings(): + warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning) + X.setdiag(X.diagonal()) + + neighbors_model = NearestNeighbors( + radius=self.eps, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + metric=self.metric, + metric_params=self.metric_params, + p=self.p, + n_jobs=self.n_jobs, + ) + neighbors_model.fit(X) + # This has worst case O(n^2) memory complexity + neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False) + + if sample_weight is None: + n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods]) + else: + n_neighbors = np.array( + [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods] + ) + + # Initially, all samples are noise. + labels = np.full(X.shape[0], -1, dtype=np.intp) + + # A list of all core samples found. 
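+ # A sample is a core sample when its (weighted) neighborhood count reaches
+ # ``min_samples``; ``dbscan_inner`` then expands clusters outward from the
+ # core samples, filling ``labels`` in place. Points still labelled -1
+ # afterwards are noise.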
+ core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8) + dbscan_inner(core_samples, neighborhoods, labels) + + self.core_sample_indices_ = np.where(core_samples)[0] + self.labels_ = labels + + if len(self.core_sample_indices_): + # fix for scipy sparse indexing issue + self.components_ = X[self.core_sample_indices_].copy() + else: + # no core samples + self.components_ = np.empty((0, X.shape[1])) + return self + + def fit_predict(self, X, y=None, sample_weight=None): + """Compute clusters from a data or distance matrix and predict labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``metric='precomputed'``. If a sparse matrix is provided, it will + be converted into a sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Weight of each sample, such that a sample with a weight of at least + ``min_samples`` is by itself a core sample; a sample with a + negative weight may inhibit its eps-neighbor from being core. + Note that weights are absolute, and default to 1. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. Noisy samples are given the label -1. + """ + self.fit(X, sample_weight=sample_weight) + return self.labels_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.metric == "precomputed" + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..d1a0dd9aec1c17fb677016ce2a6c95872a80bf6e Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.pyx new file mode 100644 index 0000000000000000000000000000000000000000..266b214bb269a717fd2eea300fe7445b96bd7cba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan_inner.pyx @@ -0,0 +1,41 @@ +# Fast inner loop for DBSCAN. + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libcpp.vector cimport vector + +from ..utils._typedefs cimport uint8_t, intp_t + + +def dbscan_inner(const uint8_t[::1] is_core, + object[:] neighborhoods, + intp_t[::1] labels): + cdef intp_t i, label_num = 0, v + cdef intp_t[:] neighb + cdef vector[intp_t] stack + + for i in range(labels.shape[0]): + if labels[i] != -1 or not is_core[i]: + continue + + # Depth-first search starting from i, ending at the non-core points. + # This is very similar to the classic algorithm for computing connected + # components, the difference being that we label non-core points as + # part of a cluster (component), but don't expand their neighborhoods. 
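+ # The explicit stack replaces recursion: unlabelled neighbors of a core
+ # point are pushed, popped points that are still unlabelled receive the
+ # current label, and only core points push their own neighbors in turn.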
+ while True: + if labels[i] == -1: + labels[i] = label_num + if is_core[i]: + neighb = neighborhoods[i] + for i in range(neighb.shape[0]): + v = neighb[i] + if labels[v] == -1: + stack.push_back(v) + + if stack.size() == 0: + break + i = stack.back() + stack.pop_back() + + label_num += 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_feature_agglomeration.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_feature_agglomeration.py new file mode 100644 index 0000000000000000000000000000000000000000..32fcb85625f354bf0dcece88453e7e8f931e03cb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_feature_agglomeration.py @@ -0,0 +1,76 @@ +""" +Feature agglomeration. Base classes and functions for performing feature +agglomeration. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy.sparse import issparse + +from ..base import TransformerMixin +from ..utils.validation import check_is_fitted, validate_data + +############################################################################### +# Mixin class for feature agglomeration. + + +class AgglomerationTransform(TransformerMixin): + """ + A class for feature agglomeration via the transform interface. + """ + + def transform(self, X): + """ + Transform a new matrix using the built clustering. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + A M by N array of M observations in N dimensions or a length + M array of M one-dimensional observations. + + Returns + ------- + Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,) + The pooled values for each feature cluster. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + if self.pooling_func == np.mean and not issparse(X): + size = np.bincount(self.labels_) + n_samples = X.shape[0] + # a fast way to compute the mean of grouped features + nX = np.array( + [np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)] + ) + else: + nX = [ + self.pooling_func(X[:, self.labels_ == l], axis=1) + for l in np.unique(self.labels_) + ] + nX = np.array(nX).T + return nX + + def inverse_transform(self, X): + """ + Inverse the transformation and return a vector of size `n_features`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_clusters) or (n_clusters,) + The values to be assigned to each cluster of samples. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) or (n_features,) + A vector of size `n_samples` with the values of `X` assigned to + each of the cluster of samples. 
+ """ + check_is_fitted(self) + + unil, inverse = np.unique(self.labels_, return_inverse=True) + return X[..., inverse] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67dd18fb94b593f0a3125c1f5833f3b9597614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/__init__.py @@ -0,0 +1,2 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_linkage.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_linkage.pyx new file mode 100644 index 0000000000000000000000000000000000000000..5684193a13d40ed68cabe9b8502a4b59b18d4e1b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_linkage.pyx @@ -0,0 +1,274 @@ +# Minimum spanning tree single linkage implementation for hdbscan + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +cimport numpy as cnp +from libc.float cimport DBL_MAX + +import numpy as np +from ...metrics._dist_metrics cimport DistanceMetric64 +from ...cluster._hierarchical_fast cimport UnionFind +from ...cluster._hdbscan._tree cimport HIERARCHY_t +from ...cluster._hdbscan._tree import HIERARCHY_dtype +from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t + +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + intp_t * PyArray_SHAPE(cnp.PyArrayObject *) + +# Numpy structured dtype representing a single ordered edge in Prim's algorithm +MST_edge_dtype = np.dtype([ + ("current_node", np.int64), + ("next_node", np.int64), + ("distance", np.float64), +]) + +# Packed shouldn't make a difference since they're all 8-byte quantities, +# but it's included just to be safe. 
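+ # The packed struct below mirrors MST_edge_dtype so Prim's algorithm can
+ # fill the edge list without Python overhead: each entry stores the node
+ # already in the tree, the node being attached, and the mutual-reachability
+ # distance of the connecting edge.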
+ctypedef packed struct MST_edge_t: + int64_t current_node + int64_t next_node + float64_t distance + +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability( + cnp.ndarray[float64_t, ndim=2] mutual_reachability +): + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph using Prim's algorithm. + + Parameters + ---------- + mutual_reachability : ndarray of shape (n_samples, n_samples) + Array of mutual-reachabilities between samples. + + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + """ + cdef: + # Note: we utilize ndarray's over memory-views to make use of numpy + # binary indexing and sub-selection below. + cnp.ndarray[int64_t, ndim=1, mode='c'] current_labels + cnp.ndarray[float64_t, ndim=1, mode='c'] min_reachability, left, right + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst + + cnp.ndarray[uint8_t, mode='c'] label_filter + + int64_t n_samples = PyArray_SHAPE( mutual_reachability)[0] + int64_t current_node, new_node_index, new_node, i + + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) + current_labels = np.arange(n_samples, dtype=np.int64) + current_node = 0 + min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64) + for i in range(0, n_samples - 1): + label_filter = current_labels != current_node + current_labels = current_labels[label_filter] + left = min_reachability[label_filter] + right = mutual_reachability[current_node][current_labels] + min_reachability = np.minimum(left, right) + + new_node_index = np.argmin(min_reachability) + new_node = current_labels[new_node_index] + mst[i].current_node = current_node + mst[i].next_node = new_node + mst[i].distance = min_reachability[new_node_index] + current_node = new_node + + return mst + + +cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix( + const float64_t[:, ::1] raw_data, + const float64_t[::1] core_distances, + DistanceMetric64 dist_metric, + float64_t alpha=1.0 +): + """Compute the Minimum Spanning Tree (MST) representation of the mutual- + reachability graph generated from the provided `raw_data` and + `core_distances` using Prim's algorithm. + + Parameters + ---------- + raw_data : ndarray of shape (n_samples, n_features) + Input array of data samples. + + core_distances : ndarray of shape (n_samples,) + An array containing the core-distance calculated for each corresponding + sample. + + dist_metric : DistanceMetric + The distance metric to use when calculating pairwise distances for + determining mutual-reachability. + + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. 
+ """ + + cdef: + uint8_t[::1] in_tree + float64_t[::1] min_reachability + int64_t[::1] current_sources + cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst + + int64_t current_node, source_node, new_node, next_node_source + int64_t i, j, n_samples, num_features + + float64_t current_node_core_dist, new_reachability, mutual_reachability_distance + float64_t next_node_min_reach, pair_distance, next_node_core_dist + + n_samples = raw_data.shape[0] + num_features = raw_data.shape[1] + + mst = np.empty(n_samples - 1, dtype=MST_edge_dtype) + + in_tree = np.zeros(n_samples, dtype=np.uint8) + min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64) + current_sources = np.ones(n_samples, dtype=np.int64) + + current_node = 0 + + # The following loop dynamically updates minimum reachability node-by-node, + # avoiding unnecessary computation where possible. + for i in range(0, n_samples - 1): + + in_tree[current_node] = 1 + + current_node_core_dist = core_distances[current_node] + + new_reachability = DBL_MAX + source_node = 0 + new_node = 0 + + for j in range(n_samples): + if in_tree[j]: + continue + + next_node_min_reach = min_reachability[j] + next_node_source = current_sources[j] + + pair_distance = dist_metric.dist( + &raw_data[current_node, 0], + &raw_data[j, 0], + num_features + ) + + pair_distance /= alpha + + next_node_core_dist = core_distances[j] + mutual_reachability_distance = max( + current_node_core_dist, + next_node_core_dist, + pair_distance + ) + + # If MRD(i, j) is smaller than node j's min_reachability, we update + # node j's min_reachability for future reference. + if mutual_reachability_distance < next_node_min_reach: + min_reachability[j] = mutual_reachability_distance + current_sources[j] = current_node + + # If MRD(i, j) is also smaller than node i's current + # min_reachability, we update and set their edge as the current + # MST edge candidate. + if mutual_reachability_distance < new_reachability: + new_reachability = mutual_reachability_distance + source_node = current_node + new_node = j + + # If the node j is closer to another node already in the tree, we + # make their edge the current MST candidate edge. + elif next_node_min_reach < new_reachability: + new_reachability = next_node_min_reach + source_node = next_node_source + new_node = j + + mst[i].current_node = source_node + mst[i].next_node = new_node + mst[i].distance = new_reachability + current_node = new_node + + return mst + +cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst): + """Construct a single-linkage tree from an MST. + + Parameters + ---------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. 
Each + of the array represents the following: + + - left node/cluster + - right node/cluster + - distance + - new cluster size + """ + cdef: + cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage + + # Note mst.shape[0] is one fewer than the number of samples + int64_t n_samples = mst.shape[0] + 1 + intp_t current_node_cluster, next_node_cluster + int64_t current_node, next_node, i + float64_t distance + UnionFind U = UnionFind(n_samples) + + single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype) + + for i in range(n_samples - 1): + + current_node = mst[i].current_node + next_node = mst[i].next_node + distance = mst[i].distance + + current_node_cluster = U.fast_find(current_node) + next_node_cluster = U.fast_find(next_node) + + single_linkage[i].left_node = current_node_cluster + single_linkage[i].right_node = next_node_cluster + single_linkage[i].value = distance + single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster] + + U.union(current_node_cluster, next_node_cluster) + + return single_linkage diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_reachability.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_reachability.pyx new file mode 100644 index 0000000000000000000000000000000000000000..bff686ae0a6369a7891525433a3fd79341dd2022 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_reachability.pyx @@ -0,0 +1,210 @@ +# mutual reachability distance computations + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +cimport numpy as cnp + +import numpy as np +from scipy.sparse import issparse +from cython cimport floating, integral +from libc.math cimport isfinite, INFINITY +from ...utils._typedefs cimport intp_t +cnp.import_array() + + +def mutual_reachability_graph( + distance_matrix, min_samples=5, max_distance=0.0 +): + """Compute the weighted adjacency matrix of the mutual reachability graph. 
+ + The mutual reachability distance used to build the graph is defined as:: + + max(d_core(x_p), d_core(x_q), d(x_p, x_q)) + + and the core distance `d_core` is defined as the distance between a point + `x_p` and its k-th nearest neighbor. + + Note that all computations are done in-place. + + Parameters + ---------- + distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples) + Array of distances between samples. If sparse, the array must be in + `CSR` format. + + min_samples : int, default=5 + The parameter `k` used to calculate the distance between a point + `x_p` and its k-th nearest neighbor. + + max_distance : float, default=0.0 + The distance which `np.inf` is replaced with. When the true mutual- + reachability distance is measured to be infinite, it is instead + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. + + Returns + ------- + mututal_reachability_graph: {ndarray, sparse matrix} of shape \ + (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. + + References + ---------- + .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). + Density-based clustering based on hierarchical density estimates. + In Pacific-Asia Conference on Knowledge Discovery and Data Mining + (pp. 160-172). Springer Berlin Heidelberg. + """ + further_neighbor_idx = min_samples - 1 + if issparse(distance_matrix): + if distance_matrix.format != "csr": + raise ValueError( + "Only sparse CSR matrices are supported for `distance_matrix`." + ) + _sparse_mutual_reachability_graph( + distance_matrix.data, + distance_matrix.indices, + distance_matrix.indptr, + distance_matrix.shape[0], + further_neighbor_idx=further_neighbor_idx, + max_distance=max_distance, + ) + else: + _dense_mutual_reachability_graph( + distance_matrix, further_neighbor_idx=further_neighbor_idx + ) + return distance_matrix + + +def _dense_mutual_reachability_graph( + floating[:, :] distance_matrix, + intp_t further_neighbor_idx, +): + """Dense implementation of mutual reachability graph. + + The computation is done in-place, i.e. the distance matrix is modified + directly. + + Parameters + ---------- + distance_matrix : ndarray of shape (n_samples, n_samples) + Array of distances between samples. + + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. + """ + cdef: + intp_t i, j, n_samples = distance_matrix.shape[0] + floating mutual_reachability_distance + floating[::1] core_distances + + # We assume that the distance matrix is symmetric. We choose to sort every + # row to have the same implementation than the sparse case that requires + # CSR matrix. + core_distances = np.ascontiguousarray( + np.partition( + distance_matrix, further_neighbor_idx, axis=1 + )[:, further_neighbor_idx] + ) + + with nogil: + # TODO: Update w/ prange with thread count based on + # _openmp_effective_n_threads + for i in range(n_samples): + for j in range(n_samples): + mutual_reachability_distance = max( + core_distances[i], + core_distances[j], + distance_matrix[i, j], + ) + distance_matrix[i, j] = mutual_reachability_distance + + +def _sparse_mutual_reachability_graph( + cnp.ndarray[floating, ndim=1, mode="c"] data, + cnp.ndarray[integral, ndim=1, mode="c"] indices, + cnp.ndarray[integral, ndim=1, mode="c"] indptr, + intp_t n_samples, + intp_t further_neighbor_idx, + floating max_distance, +): + """Sparse implementation of mutual reachability graph. + + The computation is done in-place, i.e. 
the distance matrix is modified + directly. This implementation only accepts `CSR` format sparse matrices. + + Parameters + ---------- + distance_matrix : sparse matrix of shape (n_samples, n_samples) + Sparse matrix of distances between samples. The sparse format should + be `CSR`. + + further_neighbor_idx : int + The index of the furthest neighbor to use to define the core distances. + + max_distance : float + The distance which `np.inf` is replaced with. When the true mutual- + reachability distance is measured to be infinite, it is instead + truncated to `max_dist`. Only used when `distance_matrix` is a sparse + matrix. + """ + cdef: + integral i, col_ind, row_ind + floating mutual_reachability_distance + floating[:] core_distances + floating[:] row_data + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + core_distances = np.empty(n_samples, dtype=dtype) + + for i in range(n_samples): + row_data = data[indptr[i]:indptr[i + 1]] + if further_neighbor_idx < row_data.size: + core_distances[i] = np.partition( + row_data, further_neighbor_idx + )[further_neighbor_idx] + else: + core_distances[i] = INFINITY + + with nogil: + for row_ind in range(n_samples): + for i in range(indptr[row_ind], indptr[row_ind + 1]): + col_ind = indices[i] + mutual_reachability_distance = max( + core_distances[row_ind], core_distances[col_ind], data[i] + ) + if isfinite(mutual_reachability_distance): + data[i] = mutual_reachability_distance + elif max_distance > 0: + data[i] = max_distance diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pxd b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pxd new file mode 100644 index 0000000000000000000000000000000000000000..23708b9a38d07884c035b88e260821146075f861 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pxd @@ -0,0 +1,49 @@ +# Copyright (c) 2015, Leland McInnes +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
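+
+ # Shared struct declarations for the HDBSCAN tree routines: HIERARCHY_t
+ # mirrors the scipy.cluster.hierarchy linkage-row layout and CONDENSED_t
+ # stores one parent/child edge of the condensed tree.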
+ +from ...utils._typedefs cimport intp_t, float64_t, uint8_t +cimport numpy as cnp + +# This corresponds to the scipy.cluster.hierarchy format +ctypedef packed struct HIERARCHY_t: + intp_t left_node + intp_t right_node + float64_t value + intp_t cluster_size + +# Effectively an edgelist encoding a parent/child pair, along with a value and +# the corresponding cluster_size in each row providing a tree structure. +ctypedef packed struct CONDENSED_t: + intp_t parent + intp_t child + float64_t value + intp_t cluster_size + +cdef extern from "numpy/arrayobject.h": + intp_t * PyArray_SHAPE(cnp.PyArrayObject *) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pyx new file mode 100644 index 0000000000000000000000000000000000000000..161092033b915bd9bb51f87750fb156c6a598833 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/_tree.pyx @@ -0,0 +1,799 @@ +# Tree handling (condensing, finding stable clusters) for hdbscan + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
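For orientation, the `HIERARCHY_t` struct declared above follows the `scipy.cluster.hierarchy` linkage layout: two child node ids, the merge distance, and the size of the merged cluster. Below is a purely illustrative sketch of that correspondence on a toy dataset; the variable names are made up for the example and the dtype literal simply repeats the one `_tree.pyx` defines next.

import numpy as np
from scipy.cluster.hierarchy import linkage

HIERARCHY_dtype = np.dtype([
    ("left_node", np.intp),
    ("right_node", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])

X = np.random.RandomState(0).normal(size=(6, 2))
Z = linkage(X, method="single")  # plain (n_samples - 1, 4) float array
hierarchy = np.rec.fromarrays(
    [
        Z[:, 0].astype(np.intp),   # left child node id
        Z[:, 1].astype(np.intp),   # right child node id
        Z[:, 2],                   # merge distance ("value")
        Z[:, 3].astype(np.intp),   # samples in the merged node ("cluster_size")
    ],
    dtype=HIERARCHY_dtype,
)
print(hierarchy[0])  # first merge: two leaf ids, their distance, cluster_size 2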
+ + +cimport numpy as cnp +from libc.math cimport isinf +import cython + +import numpy as np + +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + intp_t * PyArray_SHAPE(cnp.PyArrayObject *) + +cdef cnp.float64_t INFTY = np.inf +cdef cnp.intp_t NOISE = -1 + +HIERARCHY_dtype = np.dtype([ + ("left_node", np.intp), + ("right_node", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) + +CONDENSED_dtype = np.dtype([ + ("parent", np.intp), + ("child", np.intp), + ("value", np.float64), + ("cluster_size", np.intp), +]) + +cpdef tuple tree_to_labels( + const HIERARCHY_t[::1] single_linkage_tree, + cnp.intp_t min_cluster_size=10, + cluster_selection_method="eom", + bint allow_single_cluster=False, + cnp.float64_t cluster_selection_epsilon=0.0, + max_cluster_size=None, +): + cdef: + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities + + condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size) + labels, probabilities = _get_clusters( + condensed_tree, + _compute_stability(condensed_tree), + cluster_selection_method, + allow_single_cluster, + cluster_selection_epsilon, + max_cluster_size, + ) + + return (labels, probabilities) + +cdef list bfs_from_hierarchy( + const HIERARCHY_t[::1] hierarchy, + cnp.intp_t bfs_root +): + """ + Perform a breadth first search on a tree in scipy hclust format. + """ + + cdef list process_queue, next_queue, result + cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1 + cdef cnp.intp_t node + process_queue = [bfs_root] + result = [] + + while process_queue: + result.extend(process_queue) + # By construction, node i is formed by the union of nodes + # hierarchy[i - n_samples, 0] and hierarchy[i - n_samples, 1] + process_queue = [ + x - n_samples + for x in process_queue + if x >= n_samples + ] + if process_queue: + next_queue = [] + for node in process_queue: + next_queue.extend( + [ + hierarchy[node].left_node, + hierarchy[node].right_node, + ] + ) + process_queue = next_queue + return result + + +cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree( + const HIERARCHY_t[::1] hierarchy, + cnp.intp_t min_cluster_size=10 +): + """Condense a tree according to a minimum cluster size. This is akin + to the runt pruning procedure of Stuetzle. The result is a much simpler + tree that is easier to visualize. We include extra information on the + lambda value at which individual points depart clusters for later + analysis and computation. + + Parameters + ---------- + hierarchy : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype + A single linkage hierarchy in scipy.cluster.hierarchy format. + + min_cluster_size : int, optional (default 10) + The minimum size of clusters to consider. Clusters smaller than this + are pruned from the tree. + + Returns + ------- + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. 
+ """ + + cdef: + cnp.intp_t root = 2 * hierarchy.shape[0] + cnp.intp_t n_samples = hierarchy.shape[0] + 1 + cnp.intp_t next_label = n_samples + 1 + list result_list, node_list = bfs_from_hierarchy(hierarchy, root) + + cnp.intp_t[::1] relabel + cnp.uint8_t[::1] ignore + + cnp.intp_t node, sub_node, left, right + cnp.float64_t lambda_value, distance + cnp.intp_t left_count, right_count + HIERARCHY_t children + + relabel = np.empty(root + 1, dtype=np.intp) + relabel[root] = n_samples + result_list = [] + ignore = np.zeros(len(node_list), dtype=bool) + + for node in node_list: + if ignore[node] or node < n_samples: + continue + + children = hierarchy[node - n_samples] + left = children.left_node + right = children.right_node + distance = children.value + if distance > 0.0: + lambda_value = 1.0 / distance + else: + lambda_value = INFTY + + if left >= n_samples: + left_count = hierarchy[left - n_samples].cluster_size + else: + left_count = 1 + + if right >= n_samples: + right_count = hierarchy[right - n_samples].cluster_size + else: + right_count = 1 + + if left_count >= min_cluster_size and right_count >= min_cluster_size: + relabel[left] = next_label + next_label += 1 + result_list.append( + (relabel[node], relabel[left], lambda_value, left_count) + ) + + relabel[right] = next_label + next_label += 1 + result_list.append( + (relabel[node], relabel[right], lambda_value, right_count) + ) + + elif left_count < min_cluster_size and right_count < min_cluster_size: + for sub_node in bfs_from_hierarchy(hierarchy, left): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + for sub_node in bfs_from_hierarchy(hierarchy, right): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + elif left_count < min_cluster_size: + relabel[right] = relabel[node] + for sub_node in bfs_from_hierarchy(hierarchy, left): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + else: + relabel[left] = relabel[node] + for sub_node in bfs_from_hierarchy(hierarchy, right): + if sub_node < n_samples: + result_list.append( + (relabel[node], sub_node, lambda_value, 1) + ) + ignore[sub_node] = True + + return np.array(result_list, dtype=CONDENSED_dtype) + + +cdef dict _compute_stability( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree +): + + cdef: + cnp.float64_t[::1] result, births + cnp.intp_t[:] parents = condensed_tree['parent'] + + cnp.intp_t parent, cluster_size, result_index, idx + cnp.float64_t lambda_val + CONDENSED_t condensed_node + cnp.intp_t largest_child = condensed_tree['child'].max() + cnp.intp_t smallest_cluster = np.min(parents) + cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1 + dict stability_dict = {} + + largest_child = max(largest_child, smallest_cluster) + births = np.full(largest_child + 1, np.nan, dtype=np.float64) + + for idx in range(PyArray_SHAPE( condensed_tree)[0]): + condensed_node = condensed_tree[idx] + births[condensed_node.child] = condensed_node.value + + births[smallest_cluster] = 0.0 + + result = np.zeros(num_clusters, dtype=np.float64) + for idx in range(PyArray_SHAPE( condensed_tree)[0]): + condensed_node = condensed_tree[idx] + parent = condensed_node.parent + lambda_val = condensed_node.value + cluster_size = condensed_node.cluster_size + + result_index = parent - smallest_cluster + result[result_index] += (lambda_val - births[parent]) * 
cluster_size + + for idx in range(num_clusters): + stability_dict[idx + smallest_cluster] = result[idx] + + return stability_dict + + +cdef list bfs_from_cluster_tree( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + cnp.intp_t bfs_root +): + + cdef: + list result = [] + cnp.ndarray[cnp.intp_t, ndim=1] process_queue = ( + np.array([bfs_root], dtype=np.intp) + ) + cnp.ndarray[cnp.intp_t, ndim=1] children = condensed_tree['child'] + cnp.intp_t[:] parents = condensed_tree['parent'] + + while len(process_queue) > 0: + result.extend(process_queue.tolist()) + process_queue = children[np.isin(parents, process_queue)] + + return result + + +cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree): + + cdef: + cnp.intp_t parent, current_parent, idx + cnp.float64_t lambda_val, max_lambda + cnp.float64_t[::1] deaths + cnp.intp_t largest_parent = condensed_tree['parent'].max() + + deaths = np.zeros(largest_parent + 1, dtype=np.float64) + current_parent = condensed_tree[0].parent + max_lambda = condensed_tree[0].value + + for idx in range(1, PyArray_SHAPE( condensed_tree)[0]): + parent = condensed_tree[idx].parent + lambda_val = condensed_tree[idx].value + + if parent == current_parent: + max_lambda = max(max_lambda, lambda_val) + else: + deaths[current_parent] = max_lambda + current_parent = parent + max_lambda = lambda_val + + deaths[current_parent] = max_lambda # value for last parent + return deaths + + +@cython.final +cdef class TreeUnionFind: + + cdef cnp.intp_t[:, ::1] data + cdef cnp.uint8_t[::1] is_component + + def __init__(self, size): + cdef cnp.intp_t idx + self.data = np.zeros((size, 2), dtype=np.intp) + for idx in range(size): + self.data[idx, 0] = idx + self.is_component = np.ones(size, dtype=np.uint8) + + cdef void union(self, cnp.intp_t x, cnp.intp_t y): + cdef cnp.intp_t x_root = self.find(x) + cdef cnp.intp_t y_root = self.find(y) + + if self.data[x_root, 1] < self.data[y_root, 1]: + self.data[x_root, 0] = y_root + elif self.data[x_root, 1] > self.data[y_root, 1]: + self.data[y_root, 0] = x_root + else: + self.data[y_root, 0] = x_root + self.data[x_root, 1] += 1 + return + + cdef cnp.intp_t find(self, cnp.intp_t x): + if self.data[x, 0] != x: + self.data[x, 0] = self.find(self.data[x, 0]) + self.is_component[x] = False + return self.data[x, 0] + + +cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut( + const HIERARCHY_t[::1] linkage, + cnp.float64_t cut, + cnp.intp_t min_cluster_size +): + """Given a single linkage tree and a cut value, return the + vector of cluster labels at that cut value. This is useful + for Robust Single Linkage, and extracting DBSCAN results + from a single HDBSCAN run. + + Parameters + ---------- + linkage : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype + The single linkage tree in scipy.cluster.hierarchy format. + + cut : double + The cut value at which to find clusters. + + min_cluster_size : int + The minimum cluster size; clusters below this size at + the cut will be considered noise. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The cluster labels for each point in the data set; + a label of -1 denotes a noise assignment. 
+ """ + + cdef: + cnp.intp_t n, cluster, root, n_samples, cluster_label + cnp.intp_t[::1] unique_labels, cluster_size + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result + TreeUnionFind union_find + dict cluster_label_map + HIERARCHY_t node + + root = 2 * linkage.shape[0] + n_samples = root // 2 + 1 + result = np.empty(n_samples, dtype=np.intp) + union_find = TreeUnionFind(root + 1) + + cluster = n_samples + for node in linkage: + if node.value < cut: + union_find.union(node.left_node, cluster) + union_find.union(node.right_node, cluster) + cluster += 1 + + cluster_size = np.zeros(cluster, dtype=np.intp) + for n in range(n_samples): + cluster = union_find.find(n) + cluster_size[cluster] += 1 + result[n] = cluster + + cluster_label_map = {-1: NOISE} + cluster_label = 0 + unique_labels = np.unique(result) + + for cluster in unique_labels: + if cluster_size[cluster] < min_cluster_size: + cluster_label_map[cluster] = NOISE + else: + cluster_label_map[cluster] = cluster_label + cluster_label += 1 + + for n in range(n_samples): + result[n] = cluster_label_map[result[n]] + + return result + + +cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + set clusters, + dict cluster_label_map, + cnp.intp_t allow_single_cluster, + cnp.float64_t cluster_selection_epsilon +): + """Given a condensed tree, clusters and a labeling map for the clusters, + return an array containing the labels of each point based on cluster + membership. Note that this is where points may be marked as noisy + outliers. The determination of some points as noise is in large, single- + cluster datasets is controlled by the `allow_single_cluster` and + `cluster_selection_epsilon` parameters. + + Parameters + ---------- + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. + + clusters : set + The set of nodes corresponding to identified clusters. These node + values should be the same as those present in `condensed_tree`. + + cluster_label_map : dict + A mapping from the node values present in `clusters` to the labels + which will be returned. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The cluster labels for each point in the data set; + a label of -1 denotes a noise assignment. + """ + + cdef: + cnp.intp_t root_cluster + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result + cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array + cnp.ndarray[cnp.float64_t, ndim=1] lambda_array + TreeUnionFind union_find + cnp.intp_t n, parent, child, cluster + cnp.float64_t threshold + + child_array = condensed_tree['child'] + parent_array = condensed_tree['parent'] + lambda_array = condensed_tree['value'] + + root_cluster = np.min(parent_array) + result = np.empty(root_cluster, dtype=np.intp) + union_find = TreeUnionFind(np.max(parent_array) + 1) + + for n in range(PyArray_SHAPE( condensed_tree)[0]): + child = child_array[n] + parent = parent_array[n] + if child not in clusters: + union_find.union(parent, child) + + for n in range(root_cluster): + cluster = union_find.find(n) + label = NOISE + if cluster != root_cluster: + label = cluster_label_map[cluster] + elif len(clusters) == 1 and allow_single_cluster: + # There can only be one edge with this particular child hence this + # expression extracts a unique, scalar lambda value. 
+ parent_lambda = lambda_array[child_array == n] + if cluster_selection_epsilon != 0.0: + threshold = 1 / cluster_selection_epsilon + else: + # The threshold should be calculated per-sample based on the + # largest lambda of any simbling node. + threshold = lambda_array[parent_array == cluster].max() + if parent_lambda >= threshold: + label = cluster_label_map[cluster] + + result[n] = label + + return result + + +cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + dict cluster_map, + cnp.intp_t[::1] labels +): + + cdef: + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result + cnp.float64_t[:] lambda_array + cnp.float64_t[::1] deaths + cnp.intp_t[:] child_array, parent_array + cnp.intp_t root_cluster, n, point, cluster_num, cluster + cnp.float64_t max_lambda, lambda_val + + child_array = condensed_tree['child'] + parent_array = condensed_tree['parent'] + lambda_array = condensed_tree['value'] + + result = np.zeros(labels.shape[0]) + deaths = max_lambdas(condensed_tree) + root_cluster = np.min(parent_array) + + for n in range(PyArray_SHAPE( condensed_tree)[0]): + point = child_array[n] + if point >= root_cluster: + continue + + cluster_num = labels[point] + if cluster_num == -1: + continue + + cluster = cluster_map[cluster_num] + max_lambda = deaths[cluster] + if max_lambda == 0.0 or isinf(lambda_array[n]): + result[point] = 1.0 + else: + lambda_val = min(lambda_array[n], max_lambda) + result[point] = lambda_val / max_lambda + + return result + + +cpdef list recurse_leaf_dfs( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, + cnp.intp_t current_node +): + cdef cnp.intp_t[:] children + cdef cnp.intp_t child + + children = cluster_tree[cluster_tree['parent'] == current_node]['child'] + if children.shape[0] == 0: + return [current_node,] + else: + return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], []) + + +cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree): + cdef cnp.intp_t root + if PyArray_SHAPE( cluster_tree)[0] == 0: + return [] + root = cluster_tree['parent'].min() + return recurse_leaf_dfs(cluster_tree, root) + +cdef cnp.intp_t traverse_upwards( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, + cnp.float64_t cluster_selection_epsilon, + cnp.intp_t leaf, + cnp.intp_t allow_single_cluster +): + cdef cnp.intp_t root, parent + cdef cnp.float64_t parent_eps + + root = cluster_tree['parent'].min() + parent = cluster_tree[cluster_tree['child'] == leaf]['parent'] + if parent == root: + if allow_single_cluster: + return parent + else: + return leaf # return node closest to root + + parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value'] + if parent_eps > cluster_selection_epsilon: + return parent + else: + return traverse_upwards( + cluster_tree, + cluster_selection_epsilon, + parent, + allow_single_cluster + ) + +cdef set epsilon_search( + set leaves, + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree, + cnp.float64_t cluster_selection_epsilon, + cnp.intp_t allow_single_cluster +): + cdef: + list selected_clusters = list() + list processed = list() + cnp.intp_t leaf, epsilon_child, sub_node + cnp.float64_t eps + cnp.uint8_t[:] leaf_nodes + cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child'] + cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value'] + + for leaf in leaves: + leaf_nodes = children == leaf + eps = 1 / distances[leaf_nodes][0] + if eps < cluster_selection_epsilon: + if leaf 
not in processed: + epsilon_child = traverse_upwards( + cluster_tree, + cluster_selection_epsilon, + leaf, + allow_single_cluster + ) + selected_clusters.append(epsilon_child) + + for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child): + if sub_node != epsilon_child: + processed.append(sub_node) + else: + selected_clusters.append(leaf) + + return set(selected_clusters) + + +@cython.wraparound(True) +cdef tuple _get_clusters( + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree, + dict stability, + cluster_selection_method='eom', + cnp.uint8_t allow_single_cluster=False, + cnp.float64_t cluster_selection_epsilon=0.0, + max_cluster_size=None +): + """Given a tree and stability dict, produce the cluster labels + (and probabilities) for a flat clustering based on the chosen + cluster selection method. + + Parameters + ---------- + condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype + Effectively an edgelist encoding a parent/child pair, along with a + value and the corresponding cluster_size in each row providing a tree + structure. + + stability : dict + A dictionary mapping cluster_ids to stability values + + cluster_selection_method : string, optional (default 'eom') + The method of selecting clusters. The default is the + Excess of Mass algorithm specified by 'eom'. The alternate + option is 'leaf'. + + allow_single_cluster : boolean, optional (default False) + Whether to allow a single cluster to be selected by the + Excess of Mass algorithm. + + cluster_selection_epsilon: double, optional (default 0.0) + A distance threshold for cluster splits. + + max_cluster_size: int, default=None + The maximum size for clusters located by the EOM clusterer. Can + be overridden by the cluster_selection_epsilon parameter in + rare cases. + + Returns + ------- + labels : ndarray of shape (n_samples,) + An integer array of cluster labels, with -1 denoting noise. + + probabilities : ndarray (n_samples,) + The cluster membership strength of each sample. + + stabilities : ndarray (n_clusters,) + The cluster coherence strengths of each cluster. + """ + cdef: + list node_list + cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree + cnp.uint8_t[::1] child_selection + cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels + dict is_cluster, cluster_sizes + cnp.float64_t subtree_stability + cnp.intp_t node, sub_node, cluster, n_samples + cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs + + # Assume clusters are ordered by numeric id equivalent to + # a topological sort of the tree; This is valid given the + # current implementation above, so don't change that ... or + # if you do, change this accordingly! 
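+ # The Excess-of-Mass selection below leans on that ordering: node_list is + # sorted in descending order, so child clusters (which carry larger ids) are + # visited before their parents. A node is kept as a cluster when its own + # stability is at least the summed stability of its children (and its size + # does not exceed max_cluster_size); its descendants are then deselected. + # Otherwise the node is rejected and its stability is replaced by the + # children's sum, so the comparison keeps propagating toward the root. For + # example, with stabilities {A: 2.5, B: 0.4, C: 0.3} where B and C are the + # children of A, the subtree sum 0.7 < 2.5 and A is selected while B and C + # are not.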
+ if allow_single_cluster: + node_list = sorted(stability.keys(), reverse=True) + else: + node_list = sorted(stability.keys(), reverse=True)[:-1] + # (exclude root) + + cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1] + is_cluster = {cluster: True for cluster in node_list} + n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1 + + if max_cluster_size is None: + max_cluster_size = n_samples + 1 # Set to a value that will never be triggered + cluster_sizes = { + child: cluster_size for child, cluster_size + in zip(cluster_tree['child'], cluster_tree['cluster_size']) + } + if allow_single_cluster: + # Compute cluster size for the root node + cluster_sizes[node_list[-1]] = np.sum( + cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size']) + + if cluster_selection_method == 'eom': + for node in node_list: + child_selection = (cluster_tree['parent'] == node) + subtree_stability = np.sum([ + stability[child] for + child in cluster_tree['child'][child_selection]]) + if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size: + is_cluster[node] = False + stability[node] = subtree_stability + else: + for sub_node in bfs_from_cluster_tree(cluster_tree, node): + if sub_node != node: + is_cluster[sub_node] = False + + if cluster_selection_epsilon != 0.0 and PyArray_SHAPE( cluster_tree)[0] > 0: + eom_clusters = [c for c in is_cluster if is_cluster[c]] + selected_clusters = [] + # first check if eom_clusters only has root node, which skips epsilon check. + if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()): + if allow_single_cluster: + selected_clusters = eom_clusters + else: + selected_clusters = epsilon_search( + set(eom_clusters), + cluster_tree, + cluster_selection_epsilon, + allow_single_cluster + ) + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + + elif cluster_selection_method == 'leaf': + leaves = set(get_cluster_tree_leaves(cluster_tree)) + if len(leaves) == 0: + for c in is_cluster: + is_cluster[c] = False + is_cluster[condensed_tree['parent'].min()] = True + + if cluster_selection_epsilon != 0.0: + selected_clusters = epsilon_search( + leaves, + cluster_tree, + cluster_selection_epsilon, + allow_single_cluster + ) + else: + selected_clusters = leaves + + for c in is_cluster: + if c in selected_clusters: + is_cluster[c] = True + else: + is_cluster[c] = False + + clusters = set([c for c in is_cluster if is_cluster[c]]) + cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))} + reverse_cluster_map = {n: c for c, n in cluster_map.items()} + + labels = _do_labelling( + condensed_tree, + clusters, + cluster_map, + allow_single_cluster, + cluster_selection_epsilon + ) + probs = get_probabilities(condensed_tree, reverse_cluster_map, labels) + + return (labels, probs) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/hdbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/hdbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..f292a1f65909b6a5a1a0287adbc2996a3dc36381 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/hdbscan.py @@ -0,0 +1,1000 @@ +""" +HDBSCAN: Hierarchical Density-Based Spatial Clustering + of Applications with Noise +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted 
provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.sparse import csgraph, issparse + +from ...base import BaseEstimator, ClusterMixin, _fit_context +from ...metrics import pairwise_distances +from ...metrics._dist_metrics import DistanceMetric +from ...metrics.pairwise import _VALID_METRICS +from ...neighbors import BallTree, KDTree, NearestNeighbors +from ...utils._param_validation import Interval, StrOptions +from ...utils.validation import ( + _allclose_dense_sparse, + _assert_all_finite, + validate_data, +) +from ._linkage import ( + MST_edge_dtype, + make_single_linkage, + mst_from_data_matrix, + mst_from_mutual_reachability, +) +from ._reachability import mutual_reachability_graph +from ._tree import HIERARCHY_dtype, labelling_at_cut, tree_to_labels + +FAST_METRICS = set(KDTree.valid_metrics + BallTree.valid_metrics) + +# Encodings are arbitrary but must be strictly negative. +# The current encodings are chosen as extensions to the -1 noise label. +# Avoided enums so that the end user only deals with simple labels. +_OUTLIER_ENCODING: dict = { + "infinite": { + "label": -2, + # The probability could also be 1, since infinite points are certainly + # infinite outliers, however 0 is convention from the HDBSCAN library + # implementation. + "prob": 0, + }, + "missing": { + "label": -3, + # A nan probability is chosen to emphasize the fact that the + # corresponding data was not considered in the clustering problem. + "prob": np.nan, + }, +} + + +def _brute_mst(mutual_reachability, min_samples): + """ + Builds a minimum spanning tree (MST) from the provided mutual-reachability + values. This function dispatches to a custom Cython implementation for + dense arrays, and `scipy.sparse.csgraph.minimum_spanning_tree` for sparse + arrays/matrices. + + Parameters + ---------- + mututal_reachability_graph: {ndarray, sparse matrix} of shape \ + (n_samples, n_samples) + Weighted adjacency matrix of the mutual reachability graph. + + min_samples : int, default=None + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. 
+ + Returns + ------- + mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + """ + if not issparse(mutual_reachability): + return mst_from_mutual_reachability(mutual_reachability) + + # Check if the mutual reachability matrix has any rows which have + # less than `min_samples` non-zero elements. + indptr = mutual_reachability.indptr + num_points = mutual_reachability.shape[0] + if any((indptr[i + 1] - indptr[i]) < min_samples for i in range(num_points)): + raise ValueError( + f"There exist points with fewer than {min_samples} neighbors. Ensure" + " your distance matrix has non-zero values for at least" + f" `min_samples`={min_samples} neighbors for each point (i.e. K-nn" + " graph), or specify a `max_distance` in `metric_params` to use when" + " distances are missing." + ) + # Check connected component on mutual reachability. + # If more than one connected component is present, + # it means that the graph is disconnected. + n_components = csgraph.connected_components( + mutual_reachability, directed=False, return_labels=False + ) + if n_components > 1: + raise ValueError( + f"Sparse mutual reachability matrix has {n_components} connected" + " components. HDBSCAN cannot be performed on a disconnected graph. Ensure" + " that the sparse distance matrix has only one connected component." + ) + + # Compute the minimum spanning tree for the sparse graph + sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability) + rows, cols = sparse_min_spanning_tree.nonzero() + mst = np.rec.fromarrays( + [rows, cols, sparse_min_spanning_tree.data], + dtype=MST_edge_dtype, + ) + return mst + + +def _process_mst(min_spanning_tree): + """ + Builds a single-linkage tree (SLT) from the provided minimum spanning tree + (MST). The MST is first sorted then processed by a custom Cython routine. + + Parameters + ---------- + min_spanning_tree : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype + The MST representation of the mutual-reachability graph. The MST is + represented as a collection of edges. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree (dendrogram) built from the MST. + """ + # Sort edges of the min_spanning_tree by weight + row_order = np.argsort(min_spanning_tree["distance"]) + min_spanning_tree = min_spanning_tree[row_order] + # Convert edge list into standard hierarchical clustering format + return make_single_linkage(min_spanning_tree) + + +def _hdbscan_brute( + X, + min_samples=5, + alpha=None, + metric="euclidean", + n_jobs=None, + copy=False, + **metric_params, +): + """ + Builds a single-linkage tree (SLT) from the input data `X`. If + `metric="precomputed"` then `X` must be a symmetric array of distances. + Otherwise, the pairwise distances are calculated directly and passed to + `mutual_reachability_graph`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + Either the raw data from which to compute the pairwise distances, + or the precomputed distances. + + min_samples : int, default=None + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. + + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage.
+ + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. + + - If metric is a string or callable, it must be one of + the options allowed by :func:`~sklearn.metrics.pairwise_distances` + for its metric parameter. + + - If metric is "precomputed", X is assumed to be a distance matrix and + must be square. + + n_jobs : int, default=None + The number of jobs to use for computing the pairwise distances. This + works by breaking down the pairwise matrix into n_jobs even slices and + computing them in parallel. This parameter is passed directly to + :func:`~sklearn.metrics.pairwise_distances`. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + copy : bool, default=False + If `copy=True` then any time an in-place modifications would be made + that would overwrite `X`, a copy will first be made, guaranteeing that + the original data will be unchanged. Currently, it only applies when + `metric="precomputed"`, when passing a dense array or a CSR sparse + array/matrix. + + metric_params : dict, default=None + Arguments passed to the distance metric. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. + """ + if metric == "precomputed": + if X.shape[0] != X.shape[1]: + raise ValueError( + "The precomputed distance matrix is expected to be symmetric, however" + f" it has shape {X.shape}. Please verify that the" + " distance matrix was constructed correctly." + ) + if not _allclose_dense_sparse(X, X.T): + raise ValueError( + "The precomputed distance matrix is expected to be symmetric, however" + " its values appear to be asymmetric. Please verify that the distance" + " matrix was constructed correctly." + ) + + distance_matrix = X.copy() if copy else X + else: + distance_matrix = pairwise_distances( + X, metric=metric, n_jobs=n_jobs, **metric_params + ) + distance_matrix /= alpha + + max_distance = metric_params.get("max_distance", 0.0) + if issparse(distance_matrix) and distance_matrix.format != "csr": + # we need CSR format to avoid a conversion in `_brute_mst` when calling + # `csgraph.connected_components` + distance_matrix = distance_matrix.tocsr() + + # Note that `distance_matrix` is manipulated in-place, however we do not + # need it for anything else past this point, hence the operation is safe. + mutual_reachability_ = mutual_reachability_graph( + distance_matrix, min_samples=min_samples, max_distance=max_distance + ) + min_spanning_tree = _brute_mst(mutual_reachability_, min_samples=min_samples) + # Warn if the MST couldn't be constructed around the missing distances + if np.isinf(min_spanning_tree["distance"]).any(): + warn( + ( + "The minimum spanning tree contains edge weights with value " + "infinity. Potentially, you are missing too many distances " + "in the initial distance matrix for the given neighborhood " + "size." + ), + UserWarning, + ) + return _process_mst(min_spanning_tree) + + +def _hdbscan_prims( + X, + algo, + min_samples=5, + alpha=1.0, + metric="euclidean", + leaf_size=40, + n_jobs=None, + **metric_params, +): + """ + Builds a single-linkage tree (SLT) from the input data `X`. If + `metric="precomputed"` then `X` must be a symmetric array of distances. + Otherwise, the pairwise distances are calculated directly and passed to + `mutual_reachability_graph`. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The raw data. + + min_samples : int, default=None + The number of samples in a neighborhood for a point + to be considered as a core point. This includes the point itself. + + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. `metric` must be one of the options allowed by + :func:`~sklearn.metrics.pairwise_distances` for its metric + parameter. + + n_jobs : int, default=None + The number of jobs to use for computing the pairwise distances. This + works by breaking down the pairwise matrix into n_jobs even slices and + computing them in parallel. This parameter is passed directly to + :func:`~sklearn.metrics.pairwise_distances`. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + copy : bool, default=False + If `copy=True` then any time an in-place modifications would be made + that would overwrite `X`, a copy will first be made, guaranteeing that + the original data will be unchanged. Currently, it only applies when + `metric="precomputed"`, when passing a dense array or a CSR sparse + array/matrix. + + metric_params : dict, default=None + Arguments passed to the distance metric. + + Returns + ------- + single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. + """ + # The Cython routines used require contiguous arrays + X = np.asarray(X, order="C") + + # Get distance to kth nearest neighbour + nbrs = NearestNeighbors( + n_neighbors=min_samples, + algorithm=algo, + leaf_size=leaf_size, + metric=metric, + metric_params=metric_params, + n_jobs=n_jobs, + p=None, + ).fit(X) + + neighbors_distances, _ = nbrs.kneighbors(X, min_samples, return_distance=True) + core_distances = np.ascontiguousarray(neighbors_distances[:, -1]) + dist_metric = DistanceMetric.get_metric(metric, **metric_params) + + # Mutual reachability distance is implicit in mst_from_data_matrix + min_spanning_tree = mst_from_data_matrix(X, core_distances, dist_metric, alpha) + return _process_mst(min_spanning_tree) + + +def remap_single_linkage_tree(tree, internal_to_raw, non_finite): + """ + Takes an internal single_linkage_tree structure and adds back in a set of points + that were initially detected as non-finite and returns that new tree. + These points will all be merged into the final node at np.inf distance and + considered noise points. + + Parameters + ---------- + tree : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype + The single-linkage tree tree (dendrogram) built from the MST. 
+ internal_to_raw: dict + A mapping from internal integer index to the raw integer index + non_finite : ndarray + Boolean array of which entries in the raw data are non-finite + """ + finite_count = len(internal_to_raw) + + outlier_count = len(non_finite) + for i, _ in enumerate(tree): + left = tree[i]["left_node"] + right = tree[i]["right_node"] + + if left < finite_count: + tree[i]["left_node"] = internal_to_raw[left] + else: + tree[i]["left_node"] = left + outlier_count + if right < finite_count: + tree[i]["right_node"] = internal_to_raw[right] + else: + tree[i]["right_node"] = right + outlier_count + + outlier_tree = np.zeros(len(non_finite), dtype=HIERARCHY_dtype) + last_cluster_id = max( + tree[tree.shape[0] - 1]["left_node"], tree[tree.shape[0] - 1]["right_node"] + ) + last_cluster_size = tree[tree.shape[0] - 1]["cluster_size"] + for i, outlier in enumerate(non_finite): + outlier_tree[i] = (outlier, last_cluster_id + 1, np.inf, last_cluster_size + 1) + last_cluster_id += 1 + last_cluster_size += 1 + tree = np.concatenate([tree, outlier_tree]) + return tree + + +def _get_finite_row_indices(matrix): + """ + Returns the indices of the purely finite rows of a + sparse matrix or dense ndarray + """ + if issparse(matrix): + row_indices = np.array( + [i for i, row in enumerate(matrix.tolil().data) if np.all(np.isfinite(row))] + ) + else: + (row_indices,) = np.isfinite(matrix.sum(axis=1)).nonzero() + return row_indices + + +class HDBSCAN(ClusterMixin, BaseEstimator): + """Cluster data using hierarchical density-based clustering. + + HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications + with Noise. Performs :class:`~sklearn.cluster.DBSCAN` over varying epsilon + values and integrates the result to find a clustering that gives the best + stability over epsilon. + This allows HDBSCAN to find clusters of varying densities (unlike + :class:`~sklearn.cluster.DBSCAN`), and be more robust to parameter selection. + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.3 + + Parameters + ---------- + min_cluster_size : int, default=5 + The minimum number of samples in a group for that group to be + considered a cluster; groupings smaller than this size will be left + as noise. + + min_samples : int, default=None + The parameter `k` used to calculate the distance between a point + `x_p` and its k-th nearest neighbor. + When `None`, defaults to `min_cluster_size`. + + cluster_selection_epsilon : float, default=0.0 + A distance threshold. Clusters below this value will be merged. + See [5]_ for more information. + + max_cluster_size : int, default=None + A limit to the size of clusters returned by the `"eom"` cluster + selection algorithm. There is no limit when `max_cluster_size=None`. + Has no effect if `cluster_selection_method="leaf"`. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. + + - If metric is a string or callable, it must be one of + the options allowed by :func:`~sklearn.metrics.pairwise_distances` + for its metric parameter. + + - If metric is "precomputed", X is assumed to be a distance matrix and + must be square. + + metric_params : dict, default=None + Arguments passed to the distance metric. + + alpha : float, default=1.0 + A distance scaling parameter as used in robust single linkage. + See [3]_ for more information. 
+ + algorithm : {"auto", "brute", "kd_tree", "ball_tree"}, default="auto" + Exactly which algorithm to use for computing core distances; By default + this is set to `"auto"` which attempts to use a + :class:`~sklearn.neighbors.KDTree` tree if possible, otherwise it uses + a :class:`~sklearn.neighbors.BallTree` tree. Both `"kd_tree"` and + `"ball_tree"` algorithms use the + :class:`~sklearn.neighbors.NearestNeighbors` estimator. + + If the `X` passed during `fit` is sparse or `metric` is invalid for + both :class:`~sklearn.neighbors.KDTree` and + :class:`~sklearn.neighbors.BallTree`, then it resolves to use the + `"brute"` algorithm. + + leaf_size : int, default=40 + Leaf size for trees responsible for fast nearest neighbour queries when + a KDTree or a BallTree are used as core-distance algorithms. A large + dataset size and small `leaf_size` may induce excessive memory usage. + If you are running out of memory consider increasing the `leaf_size` + parameter. Ignored for `algorithm="brute"`. + + n_jobs : int, default=None + Number of jobs to run in parallel to calculate distances. + `None` means 1 unless in a :obj:`joblib.parallel_backend` context. + `-1` means using all processors. See :term:`Glossary ` + for more details. + + cluster_selection_method : {"eom", "leaf"}, default="eom" + The method used to select clusters from the condensed tree. The + standard approach for HDBSCAN* is to use an Excess of Mass (`"eom"`) + algorithm to find the most persistent clusters. Alternatively you can + instead select the clusters at the leaves of the tree -- this provides + the most fine grained and homogeneous clusters. + + allow_single_cluster : bool, default=False + By default HDBSCAN* will not produce a single cluster, setting this + to True will override this and allow single cluster results in + the case that you feel this is a valid result for your dataset. + + store_centers : str, default=None + Which, if any, cluster centers to compute and store. The options are: + + - `None` which does not compute nor store any centers. + - `"centroid"` which calculates the center by taking the weighted + average of their positions. Note that the algorithm uses the + euclidean metric and does not guarantee that the output will be + an observed data point. + - `"medoid"` which calculates the center by taking the point in the + fitted data which minimizes the distance to all other points in + the cluster. This is slower than "centroid" since it requires + computing additional pairwise distances between points of the + same cluster but guarantees the output is an observed data point. + The medoid is also well-defined for arbitrary metrics, and does not + depend on a euclidean metric. + - `"both"` which computes and stores both forms of centers. + + copy : bool, default=False + If `copy=True` then any time an in-place modifications would be made + that would overwrite data passed to :term:`fit`, a copy will first be + made, guaranteeing that the original data will be unchanged. + Currently, it only applies when `metric="precomputed"`, when passing + a dense array or a CSR sparse matrix and when `algorithm="brute"`. + + Attributes + ---------- + labels_ : ndarray of shape (n_samples,) + Cluster labels for each point in the dataset given to :term:`fit`. + Outliers are labeled as follows: + + - Noisy samples are given the label -1. + - Samples with infinite elements (+/- np.inf) are given the label -2. + - Samples with missing data are given the label -3, even if they + also have infinite elements. 
+ + probabilities_ : ndarray of shape (n_samples,) + The strength with which each sample is a member of its assigned + cluster. + + - Clustered samples have probabilities proportional to the degree that + they persist as part of the cluster. + - Noisy samples have probability zero. + - Samples with infinite elements (+/- np.inf) have probability 0. + - Samples with missing data have probability `np.nan`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + centroids_ : ndarray of shape (n_clusters, n_features) + A collection containing the centroid of each cluster calculated under + the standard euclidean metric. The centroids may fall "outside" their + respective clusters if the clusters themselves are non-convex. + + Note that `n_clusters` only counts non-outlier clusters. That is to + say, the `-1, -2, -3` labels for the outlier clusters are excluded. + + medoids_ : ndarray of shape (n_clusters, n_features) + A collection containing the medoid of each cluster calculated under + the whichever metric was passed to the `metric` parameter. The + medoids are points in the original cluster which minimize the average + distance to all other points in that cluster under the chosen metric. + These can be thought of as the result of projecting the `metric`-based + centroid back onto the cluster. + + Note that `n_clusters` only counts non-outlier clusters. That is to + say, the `-1, -2, -3` labels for the outlier clusters are excluded. + + See Also + -------- + DBSCAN : Density-Based Spatial Clustering of Applications + with Noise. + OPTICS : Ordering Points To Identify the Clustering Structure. + Birch : Memory-efficient, online-learning algorithm. + + Notes + ----- + The `min_samples` parameter includes the point itself, whereas the implementation in + `scikit-learn-contrib/hdbscan `_ + does not. To get the same results in both versions, the value of `min_samples` here + must be 1 greater than the value used in `scikit-learn-contrib/hdbscan + `_. + + References + ---------- + + .. [1] :doi:`Campello, R. J., Moulavi, D., & Sander, J. Density-based clustering + based on hierarchical density estimates. + <10.1007/978-3-642-37456-2_14>` + .. [2] :doi:`Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. + Hierarchical density estimates for data clustering, visualization, + and outlier detection.<10.1145/2733381>` + + .. [3] `Chaudhuri, K., & Dasgupta, S. Rates of convergence for the + cluster tree. + `_ + + .. [4] `Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and + Sander, J. Density-Based Clustering Validation. + `_ + + .. [5] :arxiv:`Malzer, C., & Baum, M. "A Hybrid Approach To Hierarchical + Density-based Cluster Selection."<1911.02282>`. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import HDBSCAN + >>> from sklearn.datasets import load_digits + >>> X, _ = load_digits(return_X_y=True) + >>> hdb = HDBSCAN(min_cluster_size=20) + >>> hdb.fit(X) + HDBSCAN(min_cluster_size=20) + >>> hdb.labels_.shape == (X.shape[0],) + True + >>> np.unique(hdb.labels_).tolist() + [-1, 0, 1, 2, 3, 4, 5, 6, 7] + """ + + _parameter_constraints = { + "min_cluster_size": [Interval(Integral, left=2, right=None, closed="left")], + "min_samples": [Interval(Integral, left=1, right=None, closed="left"), None], + "cluster_selection_epsilon": [ + Interval(Real, left=0, right=None, closed="left") + ], + "max_cluster_size": [ + None, + Interval(Integral, left=1, right=None, closed="left"), + ], + "metric": [ + StrOptions(FAST_METRICS | set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "metric_params": [dict, None], + "alpha": [Interval(Real, left=0, right=None, closed="neither")], + "algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], + "leaf_size": [Interval(Integral, left=1, right=None, closed="left")], + "n_jobs": [Integral, None], + "cluster_selection_method": [StrOptions({"eom", "leaf"})], + "allow_single_cluster": ["boolean"], + "store_centers": [None, StrOptions({"centroid", "medoid", "both"})], + "copy": ["boolean"], + } + + def __init__( + self, + min_cluster_size=5, + min_samples=None, + cluster_selection_epsilon=0.0, + max_cluster_size=None, + metric="euclidean", + metric_params=None, + alpha=1.0, + algorithm="auto", + leaf_size=40, + n_jobs=None, + cluster_selection_method="eom", + allow_single_cluster=False, + store_centers=None, + copy=False, + ): + self.min_cluster_size = min_cluster_size + self.min_samples = min_samples + self.alpha = alpha + self.max_cluster_size = max_cluster_size + self.cluster_selection_epsilon = cluster_selection_epsilon + self.metric = metric + self.metric_params = metric_params + self.algorithm = algorithm + self.leaf_size = leaf_size + self.n_jobs = n_jobs + self.cluster_selection_method = cluster_selection_method + self.allow_single_cluster = allow_single_cluster + self.store_centers = store_centers + self.copy = copy + + @_fit_context( + # HDBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Find clusters based on hierarchical density-based clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + ndarray of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + `metric='precomputed'`. + + y : None + Ignored. + + Returns + ------- + self : object + Returns self. + """ + if self.metric == "precomputed" and self.store_centers is not None: + raise ValueError( + "Cannot store centers when using a precomputed distance matrix." + ) + + self._metric_params = self.metric_params or {} + if self.metric != "precomputed": + # Non-precomputed matrices may contain non-finite values. + X = validate_data( + self, + X, + accept_sparse=["csr", "lil"], + ensure_all_finite=False, + dtype=np.float64, + ) + self._raw_data = X + all_finite = True + try: + _assert_all_finite(X.data if issparse(X) else X) + except ValueError: + all_finite = False + + if not all_finite: + # Pass only the purely finite indices into hdbscan + # We will later assign all non-finite points their + # corresponding labels, as specified in `_OUTLIER_ENCODING` + + # Reduce X to make the checks for missing/outlier samples more + # convenient. 
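+ # Summing along axis=1 yields one value per sample that is NaN whenever the + # row contains a NaN and non-finite whenever it contains +/-inf, so both + # index arrays below can be read off a single reduced vector. Rows holding + # NaN as well as infinities reduce to NaN and are therefore flagged as + # missing, matching the precedence documented for `labels_`.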
+ reduced_X = X.sum(axis=1) + + # Samples with missing data are denoted by the presence of + # `np.nan` + missing_index = np.isnan(reduced_X).nonzero()[0] + + # Outlier samples are denoted by the presence of `np.inf` + infinite_index = np.isinf(reduced_X).nonzero()[0] + + # Continue with only finite samples + finite_index = _get_finite_row_indices(X) + internal_to_raw = {x: y for x, y in enumerate(finite_index)} + X = X[finite_index] + elif issparse(X): + # Handle sparse precomputed distance matrices separately + X = validate_data( + self, + X, + accept_sparse=["csr", "lil"], + dtype=np.float64, + force_writeable=True, + ) + else: + # Only non-sparse, precomputed distance matrices are handled here + # and thereby allowed to contain numpy.inf for missing distances + + # Perform data validation after removing infinite values (numpy.inf) + # from the given distance matrix. + X = validate_data( + self, X, ensure_all_finite=False, dtype=np.float64, force_writeable=True + ) + if np.isnan(X).any(): + # TODO: Support np.nan in Cython implementation for precomputed + # dense HDBSCAN + raise ValueError("np.nan values found in precomputed-dense") + if X.shape[0] == 1: + raise ValueError("n_samples=1 while HDBSCAN requires more than one sample") + self._min_samples = ( + self.min_cluster_size if self.min_samples is None else self.min_samples + ) + + if self._min_samples > X.shape[0]: + raise ValueError( + f"min_samples ({self._min_samples}) must be at most the number of" + f" samples in X ({X.shape[0]})" + ) + + mst_func = None + kwargs = dict( + X=X, + min_samples=self._min_samples, + alpha=self.alpha, + metric=self.metric, + n_jobs=self.n_jobs, + **self._metric_params, + ) + if self.algorithm == "kd_tree" and self.metric not in KDTree.valid_metrics: + raise ValueError( + f"{self.metric} is not a valid metric for a KDTree-based algorithm." + " Please select a different metric." + ) + elif ( + self.algorithm == "ball_tree" and self.metric not in BallTree.valid_metrics + ): + raise ValueError( + f"{self.metric} is not a valid metric for a BallTree-based algorithm." + " Please select a different metric." + ) + + if self.algorithm != "auto": + if ( + self.metric != "precomputed" + and issparse(X) + and self.algorithm != "brute" + ): + raise ValueError("Sparse data matrices only support algorithm `brute`.") + + if self.algorithm == "brute": + mst_func = _hdbscan_brute + kwargs["copy"] = self.copy + elif self.algorithm == "kd_tree": + mst_func = _hdbscan_prims + kwargs["algo"] = "kd_tree" + kwargs["leaf_size"] = self.leaf_size + else: + mst_func = _hdbscan_prims + kwargs["algo"] = "ball_tree" + kwargs["leaf_size"] = self.leaf_size + else: + if issparse(X) or self.metric not in FAST_METRICS: + # We can't do much with sparse matrices ... 
+ mst_func = _hdbscan_brute + kwargs["copy"] = self.copy + elif self.metric in KDTree.valid_metrics: + # TODO: Benchmark KD vs Ball Tree efficiency + mst_func = _hdbscan_prims + kwargs["algo"] = "kd_tree" + kwargs["leaf_size"] = self.leaf_size + else: + # Metric is a valid BallTree metric + mst_func = _hdbscan_prims + kwargs["algo"] = "ball_tree" + kwargs["leaf_size"] = self.leaf_size + + self._single_linkage_tree_ = mst_func(**kwargs) + + self.labels_, self.probabilities_ = tree_to_labels( + self._single_linkage_tree_, + self.min_cluster_size, + self.cluster_selection_method, + self.allow_single_cluster, + self.cluster_selection_epsilon, + self.max_cluster_size, + ) + if self.metric != "precomputed" and not all_finite: + # Remap indices to align with original data in the case of + # non-finite entries. Samples with np.inf are mapped to -2 and + # those with np.nan are mapped to -3. + self._single_linkage_tree_ = remap_single_linkage_tree( + self._single_linkage_tree_, + internal_to_raw, + # There may be overlap for points w/ both `np.inf` and `np.nan` + non_finite=set(np.hstack([infinite_index, missing_index])), + ) + new_labels = np.empty(self._raw_data.shape[0], dtype=np.int32) + new_labels[finite_index] = self.labels_ + new_labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] + new_labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] + self.labels_ = new_labels + + new_probabilities = np.zeros(self._raw_data.shape[0], dtype=np.float64) + new_probabilities[finite_index] = self.probabilities_ + # Infinite outliers have probability 0 by convention, though this + # is arbitrary. + new_probabilities[infinite_index] = _OUTLIER_ENCODING["infinite"]["prob"] + new_probabilities[missing_index] = _OUTLIER_ENCODING["missing"]["prob"] + self.probabilities_ = new_probabilities + + if self.store_centers: + self._weighted_cluster_center(X) + return self + + def fit_predict(self, X, y=None): + """Cluster X and return the associated cluster labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ + ndarray of shape (n_samples, n_samples) + A feature array, or array of distances between samples if + `metric='precomputed'`. + + y : None + Ignored. + + Returns + ------- + y : ndarray of shape (n_samples,) + Cluster labels. + """ + self.fit(X) + return self.labels_ + + def _weighted_cluster_center(self, X): + """Calculate and store the centroids/medoids of each cluster. + + This requires `X` to be a raw feature array, not precomputed + distances. Rather than return outputs directly, this helper method + instead stores them in the `self.{centroids, medoids}_` attributes. + The choice for which attributes are calculated and stored is mediated + by the value of `self.store_centers`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The feature array that the estimator was fit with. + + """ + # Number of non-noise clusters + n_clusters = len(set(self.labels_) - {-1, -2}) + mask = np.empty((X.shape[0],), dtype=np.bool_) + make_centroids = self.store_centers in ("centroid", "both") + make_medoids = self.store_centers in ("medoid", "both") + + if make_centroids: + self.centroids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) + if make_medoids: + self.medoids_ = np.empty((n_clusters, X.shape[1]), dtype=np.float64) + + # Need to handle iteratively since each cluster may have a different + # number of samples, hence we can't create a homogeneous 3D array.
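+ # In the loop below the centroid is the probability-weighted mean of the + # cluster members. For the medoid, each column of the intra-cluster distance + # matrix is scaled by that member's probability before the row sums are + # minimized, so distances to low-probability members count for less when + # picking the representative point.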
+        for idx in range(n_clusters):
+            mask = self.labels_ == idx
+            data = X[mask]
+            strength = self.probabilities_[mask]
+            if make_centroids:
+                self.centroids_[idx] = np.average(data, weights=strength, axis=0)
+            if make_medoids:
+                # TODO: Implement weighted argmin PWD backend
+                dist_mat = pairwise_distances(
+                    data, metric=self.metric, **self._metric_params
+                )
+                dist_mat = dist_mat * strength
+                medoid_index = np.argmin(dist_mat.sum(axis=1))
+                self.medoids_[idx] = data[medoid_index]
+        return
+
+    def dbscan_clustering(self, cut_distance, min_cluster_size=5):
+        """Return clustering given by DBSCAN without border points.
+
+        Return clustering that would be equivalent to running DBSCAN* for a
+        particular cut_distance (or epsilon). DBSCAN* can be thought of as
+        DBSCAN without the border points. As such, these results may differ
+        slightly from `cluster.DBSCAN` due to the difference in implementation
+        over the non-core points.
+
+        This can also be thought of as a flat clustering derived from a
+        constant-height cut through the single linkage tree.
+
+        This represents the result of selecting a cut value for robust single linkage
+        clustering. The `min_cluster_size` allows the flat clustering to declare noise
+        points (and clusters smaller than `min_cluster_size`).
+
+        Parameters
+        ----------
+        cut_distance : float
+            The mutual reachability distance cut value to use to generate a
+            flat clustering.
+
+        min_cluster_size : int, default=5
+            Clusters smaller than this value will be called 'noise' and remain
+            unclustered in the resulting flat clustering.
+
+        Returns
+        -------
+        labels : ndarray of shape (n_samples,)
+            An array of cluster labels, one per datapoint.
+            Outliers are labeled as follows:
+
+            - Noisy samples are given the label -1.
+            - Samples with infinite elements (+/- np.inf) are given the label -2.
+            - Samples with missing data are given the label -3, even if they
+              also have infinite elements.
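+
+        Examples
+        --------
+        A minimal usage sketch; the dataset, the cut distance and the resulting
+        labels are illustrative only (`make_blobs` is just a convenient data
+        source):
+
+        >>> from sklearn.cluster import HDBSCAN
+        >>> from sklearn.datasets import make_blobs
+        >>> X, _ = make_blobs(n_samples=50, centers=2, random_state=0)
+        >>> hdb = HDBSCAN(min_cluster_size=5).fit(X)
+        >>> flat_labels = hdb.dbscan_clustering(cut_distance=1.0)
+        >>> flat_labels.shape
+        (50,)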
+ """ + labels = labelling_at_cut( + self._single_linkage_tree_, cut_distance, min_cluster_size + ) + # Infer indices from labels generated during `fit` + infinite_index = self.labels_ == _OUTLIER_ENCODING["infinite"]["label"] + missing_index = self.labels_ == _OUTLIER_ENCODING["missing"]["label"] + + # Overwrite infinite/missing outlier samples (otherwise simple noise) + labels[infinite_index] = _OUTLIER_ENCODING["infinite"]["label"] + labels[missing_index] = _OUTLIER_ENCODING["missing"]["label"] + return labels + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.allow_nan = self.metric != "precomputed" + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/meson.build b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..8d880b39a4db58dffa1b282c3633c873755f5245 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/meson.build @@ -0,0 +1,15 @@ +cluster_hdbscan_extension_metadata = { + '_linkage': {'sources': [cython_gen.process('_linkage.pyx'), metrics_cython_tree]}, + '_reachability': {'sources': [cython_gen.process('_reachability.pyx')]}, + '_tree': {'sources': [cython_gen.process('_tree.pyx')]} +} + +foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: [np_dep], + subdir: 'sklearn/cluster/_hdbscan', + install: true + ) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/test_reachibility.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/test_reachibility.py new file mode 100644 index 0000000000000000000000000000000000000000..a336e6be6116d1345a1d4eb0448c2e2f58cd8ecd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hdbscan/tests/test_reachibility.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, +) + + +def test_mutual_reachability_graph_error_sparse_format(): + """Check that we raise an error if the sparse format is not CSR.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, "sparse_csc") + + err_msg = "Only sparse CSR matrices are supported" + with pytest.raises(ValueError, match=err_msg): + mutual_reachability_graph(X) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +def test_mutual_reachability_graph_inplace(array_type): + """Check that the operation is happening inplace.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = X.T @ X + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + mr_graph = mutual_reachability_graph(X) + + assert id(mr_graph) == id(X) + + +def test_mutual_reachability_graph_equivalence_dense_sparse(): + """Check that we get the same results for dense and sparse implementation.""" + rng = np.random.RandomState(0) + X = rng.randn(5, 5) + X_dense = X.T @ X + X_sparse = _convert_container(X_dense, "sparse_csr") + + 
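+    # Illustrative sketch (not used by the assertions below): mutual
+    # reachability is commonly defined as
+    #   mr(a, b) = max(core_k(a), core_k(b), d(a, b)),
+    # where core_k(x) is the distance from x to its k-th nearest neighbour.
+    # The helper assumes a symmetric dense distance matrix with a zero diagonal;
+    # the exact neighbour-counting convention of `mutual_reachability_graph`
+    # is an implementation detail that is not asserted here.
+    def _mutual_reachability_sketch(pairwise_dist, min_samples=3):
+        core = np.sort(pairwise_dist, axis=1)[:, min_samples - 1]
+        return np.maximum(np.maximum(core[:, None], core[None, :]), pairwise_dist)
+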
mr_graph_dense = mutual_reachability_graph(X_dense, min_samples=3) + mr_graph_sparse = mutual_reachability_graph(X_sparse, min_samples=3) + + assert_allclose(mr_graph_dense, mr_graph_sparse.toarray()) + + +@pytest.mark.parametrize("array_type", ["array", "sparse_csr"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_mutual_reachability_graph_preserves_dtype(array_type, dtype): + """Check that the computation preserve dtype thanks to fused types.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 10) + X = (X.T @ X).astype(dtype) + np.fill_diagonal(X, 0.0) + X = _convert_container(X, array_type) + + assert X.dtype == dtype + mr_graph = mutual_reachability_graph(X) + assert mr_graph.dtype == dtype diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pxd b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pxd new file mode 100644 index 0000000000000000000000000000000000000000..a10f8c12f34402c872ccc3bd7c14266dcc9b5e7a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pxd @@ -0,0 +1,9 @@ +from ..utils._typedefs cimport intp_t + +cdef class UnionFind: + cdef intp_t next_label + cdef intp_t[:] parent + cdef intp_t[:] size + + cdef void union(self, intp_t m, intp_t n) noexcept + cdef intp_t fast_find(self, intp_t n) noexcept diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..36ae0ab0d241432df9f5833901580dc88c30d925 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_hierarchical_fast.pyx @@ -0,0 +1,507 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +cimport cython + +from ..metrics._dist_metrics cimport DistanceMetric64 +from ..utils._fast_dict cimport IntFloatDict +from ..utils._typedefs cimport float64_t, intp_t, uint8_t + +# C++ +from cython.operator cimport dereference as deref, preincrement as inc +from libcpp.map cimport map as cpp_map +from libc.math cimport fmax, INFINITY + + +############################################################################### +# Utilities for computing the ward momentum + +def compute_ward_dist( + const float64_t[::1] m_1, + const float64_t[:, ::1] m_2, + const intp_t[::1] coord_row, + const intp_t[::1] coord_col, + float64_t[::1] res +): + cdef intp_t size_max = coord_row.shape[0] + cdef intp_t n_features = m_2.shape[1] + cdef intp_t i, j, row, col + cdef float64_t pa, n + + for i in range(size_max): + row = coord_row[i] + col = coord_col[i] + n = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col]) + pa = 0. + for j in range(n_features): + pa += (m_2[row, j] / m_1[row] - m_2[col, j] / m_1[col]) ** 2 + res[i] = pa * n + + +############################################################################### +# Utilities for cutting and exploring a hierarchical tree + +def _hc_get_descendent(intp_t node, children, intp_t n_leaves): + """ + Function returning all the descendent leaves of a set of nodes in the tree. + + Parameters + ---------- + node : integer + The node for which we want the descendents. + + children : list of pairs, length n_nodes + The children of each non-leaf node. Values less than `n_samples` refer + to leaves of the tree. A greater value `i` indicates a node with + children `children[i - n_samples]`. + + n_leaves : integer + Number of leaves. 
+ + Returns + ------- + descendent : list of int + """ + ind = [node] + if node < n_leaves: + return ind + descendent = [] + + # It is actually faster to do the accounting of the number of + # elements is the list ourselves: len is a lengthy operation on a + # chained list + cdef intp_t i, n_indices = 1 + + while n_indices: + i = ind.pop() + if i < n_leaves: + descendent.append(i) + n_indices -= 1 + else: + ind.extend(children[i - n_leaves]) + n_indices += 1 + return descendent + + +def hc_get_heads(intp_t[:] parents, copy=True): + """Returns the heads of the forest, as defined by parents. + + Parameters + ---------- + parents : array of integers + The parent structure defining the forest (ensemble of trees) + copy : boolean + If copy is False, the input 'parents' array is modified inplace + + Returns + ------- + heads : array of integers of same shape as parents + The indices in the 'parents' of the tree heads + + """ + cdef intp_t parent, node0, node, size + if copy: + parents = np.copy(parents) + size = parents.size + + # Start from the top of the tree and go down + for node0 in range(size - 1, -1, -1): + node = node0 + parent = parents[node] + while parent != node: + parents[node0] = parent + node = parent + parent = parents[node] + return parents + + +def _get_parents( + nodes, + heads, + const intp_t[:] parents, + uint8_t[::1] not_visited +): + """Returns the heads of the given nodes, as defined by parents. + + Modifies 'heads' and 'not_visited' in-place. + + Parameters + ---------- + nodes : list of integers + The nodes to start from + heads : list of integers + A list to hold the results (modified inplace) + parents : array of integers + The parent structure defining the tree + not_visited + The tree nodes to consider (modified inplace) + + """ + cdef intp_t parent, node + + for node in nodes: + parent = parents[node] + while parent != node: + node = parent + parent = parents[node] + if not_visited[node]: + not_visited[node] = 0 + heads.append(node) + + +############################################################################### +# merge strategies implemented on IntFloatDicts + +# These are used in the hierarchical clustering code, to implement +# merging between two clusters, defined as a dict containing node number +# as keys and edge weights as values. + + +def max_merge( + IntFloatDict a, + IntFloatDict b, + const intp_t[:] mask, + intp_t n_a, + intp_t n_b +): + """Merge two IntFloatDicts with the max strategy: when the same key is + present in the two dicts, the max of the two values is used. + + Parameters + ========== + a, b : IntFloatDict object + The IntFloatDicts to merge + mask : ndarray array of dtype integer and of dimension 1 + a mask for keys to ignore: if not mask[key] the corresponding key + is skipped in the output dictionary + n_a, n_b : float + n_a and n_b are weights for a and b for the merge strategy. + They are not used in the case of a max merge. 
+ + Returns + ======= + out : IntFloatDict object + The IntFloatDict resulting from the merge + """ + cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) + cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() + cdef intp_t key + cdef float64_t value + # First copy a into out + while a_it != a_end: + key = deref(a_it).first + if mask[key]: + out_obj.my_map[key] = deref(a_it).second + inc(a_it) + + # Then merge b into out + cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() + cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() + while b_it != b_end: + key = deref(b_it).first + value = deref(b_it).second + if mask[key]: + out_it = out_obj.my_map.find(key) + if out_it == out_end: + # Key not found + out_obj.my_map[key] = value + else: + deref(out_it).second = fmax(deref(out_it).second, value) + inc(b_it) + return out_obj + + +def average_merge( + IntFloatDict a, + IntFloatDict b, + const intp_t[:] mask, + intp_t n_a, + intp_t n_b +): + """Merge two IntFloatDicts with the average strategy: when the + same key is present in the two dicts, the weighted average of the two + values is used. + + Parameters + ========== + a, b : IntFloatDict object + The IntFloatDicts to merge + mask : ndarray array of dtype integer and of dimension 1 + a mask for keys to ignore: if not mask[key] the corresponding key + is skipped in the output dictionary + n_a, n_b : float + n_a and n_b are weights for a and b for the merge strategy. + They are used for a weighted mean. + + Returns + ======= + out : IntFloatDict object + The IntFloatDict resulting from the merge + """ + cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict) + cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end() + cdef intp_t key + cdef float64_t value + cdef float64_t n_out = (n_a + n_b) + # First copy a into out + while a_it != a_end: + key = deref(a_it).first + if mask[key]: + out_obj.my_map[key] = deref(a_it).second + inc(a_it) + + # Then merge b into out + cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end() + cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin() + cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end() + while b_it != b_end: + key = deref(b_it).first + value = deref(b_it).second + if mask[key]: + out_it = out_obj.my_map.find(key) + if out_it == out_end: + # Key not found + out_obj.my_map[key] = value + else: + deref(out_it).second = (n_a * deref(out_it).second + + n_b * value) / n_out + inc(b_it) + return out_obj + + +############################################################################### +# An edge object for fast comparisons + +cdef class WeightedEdge: + cdef public intp_t a + cdef public intp_t b + cdef public float64_t weight + + def __init__(self, float64_t weight, intp_t a, intp_t b): + self.weight = weight + self.a = a + self.b = b + + def __richcmp__(self, WeightedEdge other, int op): + """Cython-specific comparison method. 
+ + op is the comparison code:: + < 0 + == 2 + > 4 + <= 1 + != 3 + >= 5 + """ + if op == 0: + return self.weight < other.weight + elif op == 1: + return self.weight <= other.weight + elif op == 2: + return self.weight == other.weight + elif op == 3: + return self.weight != other.weight + elif op == 4: + return self.weight > other.weight + elif op == 5: + return self.weight >= other.weight + + def __repr__(self): + return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, + self.weight, + self.a, self.b) + + +################################################################################ +# Efficient labelling/conversion of MSTs to single linkage hierarchies + +cdef class UnionFind(object): + + def __init__(self, N): + self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C') + self.next_label = N + self.size = np.hstack((np.ones(N, dtype=np.intp), + np.zeros(N - 1, dtype=np.intp))) + + cdef void union(self, intp_t m, intp_t n) noexcept: + self.parent[m] = self.next_label + self.parent[n] = self.next_label + self.size[self.next_label] = self.size[m] + self.size[n] + self.next_label += 1 + return + + @cython.wraparound(True) + cdef intp_t fast_find(self, intp_t n) noexcept: + cdef intp_t p + p = n + # find the highest node in the linkage graph so far + while self.parent[n] != -1: + n = self.parent[n] + # provide a shortcut up to the highest node + while self.parent[p] != n: + p, self.parent[p] = self.parent[p], n + return n + + +def _single_linkage_label(const float64_t[:, :] L): + """ + Convert an linkage array or MST to a tree by labelling clusters at merges. + This is done by using a Union find structure to keep track of merges + efficiently. This is the private version of the function that assumes that + ``L`` has been properly validated. See ``single_linkage_label`` for the + user facing version of this function. + + Parameters + ---------- + L: array of shape (n_samples - 1, 3) + The linkage array or MST where each row specifies two samples + to be merged and a distance or weight at which the merge occurs. This + array is assumed to be sorted by the distance/weight. + + Returns + ------- + A tree in the format used by scipy.cluster.hierarchy. + """ + + cdef float64_t[:, ::1] result_arr + + cdef intp_t left, left_cluster, right, right_cluster, index + cdef float64_t delta + + result_arr = np.zeros((L.shape[0], 4), dtype=np.float64) + U = UnionFind(L.shape[0] + 1) + + for index in range(L.shape[0]): + + left = L[index, 0] + right = L[index, 1] + delta = L[index, 2] + + left_cluster = U.fast_find(left) + right_cluster = U.fast_find(right) + + result_arr[index][0] = left_cluster + result_arr[index][1] = right_cluster + result_arr[index][2] = delta + result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster] + + U.union(left_cluster, right_cluster) + + return np.asarray(result_arr) + + +@cython.wraparound(True) +def single_linkage_label(L): + """ + Convert an linkage array or MST to a tree by labelling clusters at merges. + This is done by using a Union find structure to keep track of merges + efficiently. + + Parameters + ---------- + L: array of shape (n_samples - 1, 3) + The linkage array or MST where each row specifies two samples + to be merged and a distance or weight at which the merge occurs. This + array is assumed to be sorted by the distance/weight. + + Returns + ------- + A tree in the format used by scipy.cluster.hierarchy. 
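+
+    Examples
+    --------
+    An illustrative call on a tiny, already sorted MST array describing three
+    merges over four samples (values chosen arbitrarily); each output row holds
+    the two merged cluster ids, the merge distance and the new cluster size:
+
+    >>> import numpy as np
+    >>> L = np.array([[0., 1., 0.5], [2., 3., 1.0], [0., 2., 2.0]])
+    >>> single_linkage_label(L).shape
+    (3, 4)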
+ """ + # Validate L + if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1: + raise ValueError("Input MST array is not a validly formatted MST array") + + is_sorted = lambda x: np.all(x[:-1] <= x[1:]) + if not is_sorted(L[:, 2]): + raise ValueError("Input MST array must be sorted by weight") + + return _single_linkage_label(L) + + +# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378 +def mst_linkage_core( + const float64_t [:, ::1] raw_data, + DistanceMetric64 dist_metric): + """ + Compute the necessary elements of a minimum spanning + tree for computation of single linkage clustering. This + represents the MST-LINKAGE-CORE algorithm (Figure 6) from + :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering + algorithms" <1109.2378>`. + + In contrast to the scipy implementation is never computes + a full distance matrix, generating distances only as they + are needed and releasing them when no longer needed. + + Parameters + ---------- + raw_data: array of shape (n_samples, n_features) + The array of feature data to be clustered. Must be C-aligned + + dist_metric: DistanceMetric64 + A DistanceMetric64 object conforming to the API from + ``sklearn.metrics._dist_metrics.pxd`` that will be + used to compute distances. + + Returns + ------- + mst_core_data: array of shape (n_samples, 3) + An array providing information from which one + can either compute an MST, or the linkage hierarchy + very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical, + agglomerative clustering algorithms" <1109.2378>` algorithm + MST-LINKAGE-CORE for more details. + """ + cdef: + intp_t n_samples = raw_data.shape[0] + uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool) + float64_t[:, ::1] result = np.zeros((n_samples - 1, 3)) + + intp_t current_node = 0 + intp_t new_node + intp_t i + intp_t j + intp_t num_features = raw_data.shape[1] + + float64_t right_value + float64_t left_value + float64_t new_distance + + float64_t[:] current_distances = np.full(n_samples, INFINITY) + + for i in range(n_samples - 1): + + in_tree[current_node] = 1 + + new_distance = INFINITY + new_node = 0 + + for j in range(n_samples): + if in_tree[j]: + continue + + right_value = current_distances[j] + left_value = dist_metric.dist(&raw_data[current_node, 0], + &raw_data[j, 0], + num_features) + + if left_value < right_value: + current_distances[j] = left_value + + if current_distances[j] < new_distance: + new_distance = current_distances[j] + new_node = j + + result[i, 0] = current_node + result[i, 1] = new_node + result[i, 2] = new_distance + current_node = new_node + + return np.array(result) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pxd b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pxd new file mode 100644 index 0000000000000000000000000000000000000000..9a41ea68d1bafc0cad55c028e0413e463ddb6d2e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pxd @@ -0,0 +1,48 @@ +from cython cimport floating + + +cdef floating _euclidean_dense_dense( + const floating*, + const floating*, + int, + bint +) noexcept nogil + +cdef floating _euclidean_sparse_dense( + const floating[::1], + const int[::1], + const floating[::1], + floating, + bint +) noexcept nogil + +cpdef void _relocate_empty_clusters_dense( + const floating[:, ::1], + const floating[::1], + const floating[:, ::1], + floating[:, ::1], + floating[::1], + const int[::1] +) + +cpdef void _relocate_empty_clusters_sparse( + const floating[::1], + 
const int[::1], + const int[::1], + const floating[::1], + const floating[:, ::1], + floating[:, ::1], + floating[::1], + const int[::1] +) + +cdef void _average_centers( + floating[:, ::1], + const floating[::1] +) + +cdef void _center_shift( + const floating[:, ::1], + const floating[:, ::1], + floating[::1] +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pyx new file mode 100644 index 0000000000000000000000000000000000000000..674d4026a67564f266ec709a9f47d77f8f912386 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_common.pyx @@ -0,0 +1,328 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from cython cimport floating +from cython.parallel cimport prange +from libc.math cimport sqrt + +from ..utils.extmath import row_norms + + +# Number of samples per data chunk defined as a global constant. +CHUNK_SIZE = 256 + + +cdef floating _euclidean_dense_dense( + const floating* a, # IN + const floating* b, # IN + int n_features, + bint squared +) noexcept nogil: + """Euclidean distance between a dense and b dense""" + cdef: + int i + int n = n_features // 4 + int rem = n_features % 4 + floating result = 0 + + # We manually unroll the loop for better cache optimization. + for i in range(n): + result += ( + (a[0] - b[0]) * (a[0] - b[0]) + + (a[1] - b[1]) * (a[1] - b[1]) + + (a[2] - b[2]) * (a[2] - b[2]) + + (a[3] - b[3]) * (a[3] - b[3]) + ) + a += 4 + b += 4 + + for i in range(rem): + result += (a[i] - b[i]) * (a[i] - b[i]) + + return result if squared else sqrt(result) + + +def _euclidean_dense_dense_wrapper( + const floating[::1] a, + const floating[::1] b, + bint squared +): + """Wrapper of _euclidean_dense_dense for testing purpose""" + return _euclidean_dense_dense(&a[0], &b[0], a.shape[0], squared) + + +cdef floating _euclidean_sparse_dense( + const floating[::1] a_data, # IN + const int[::1] a_indices, # IN + const floating[::1] b, # IN + floating b_squared_norm, + bint squared +) noexcept nogil: + """Euclidean distance between a sparse and b dense""" + cdef: + int nnz = a_indices.shape[0] + int i + floating tmp, bi + floating result = 0.0 + + for i in range(nnz): + bi = b[a_indices[i]] + tmp = a_data[i] - bi + result += tmp * tmp - bi * bi + + result += b_squared_norm + + if result < 0: + result = 0.0 + + return result if squared else sqrt(result) + + +def _euclidean_sparse_dense_wrapper( + const floating[::1] a_data, + const int[::1] a_indices, + const floating[::1] b, + floating b_squared_norm, + bint squared +): + """Wrapper of _euclidean_sparse_dense for testing purpose""" + return _euclidean_sparse_dense( + a_data, a_indices, b, b_squared_norm, squared) + + +cpdef floating _inertia_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers, # IN + const int[::1] labels, # IN + int n_threads, + int single_label=-1, +): + """Compute inertia for dense input data + + Sum of squared distance between each sample and its assigned center. + + If single_label is >= 0, the inertia is computed only for that label. 
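+
+    Notes
+    -----
+    Ignoring parallelism and the `single_label` filter, the computed quantity
+    corresponds to the following NumPy expression (illustrative sketch)::
+
+        np.sum(sample_weight * ((X - centers[labels]) ** 2).sum(axis=1))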
+ """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): + j = labels[i] + if single_label < 0 or single_label == j: + sq_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cpdef floating _inertia_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers, # IN + const int[::1] labels, # IN + int n_threads, + int single_label=-1, +): + """Compute inertia for sparse input data + + Sum of squared distance between each sample and its assigned center. + + If single_label is >= 0, the inertia is computed only for that label. + """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + int n_samples = X.shape[0] + int i, j + + floating sq_dist = 0.0 + floating inertia = 0.0 + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in prange(n_samples, nogil=True, num_threads=n_threads, + schedule='static'): + j = labels[i] + if single_label < 0 or single_label == j: + sq_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], True) + inertia += sq_dist * sample_weight[i] + + return inertia + + +cpdef void _relocate_empty_clusters_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] weight_in_clusters, # INOUT + const int[::1] labels # IN +): + """Relocate centers which have no sample assigned to them.""" + cdef: + int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32) + int n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return + + cdef: + int n_features = X.shape[1] + + floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1) + int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32) + + int new_cluster_id, old_cluster_id, far_idx, idx, k + floating weight + + if np.max(distances) == 0: + # Happens when there are more clusters than non-duplicate samples. Relocating + # is pointless in this case. 
+ return + + for idx in range(n_empty): + + new_cluster_id = empty_clusters[idx] + + far_idx = far_from_centers[idx] + weight = sample_weight[far_idx] + + old_cluster_id = labels[far_idx] + + for k in range(n_features): + centers_new[old_cluster_id, k] -= X[far_idx, k] * weight + centers_new[new_cluster_id, k] = X[far_idx, k] * weight + + weight_in_clusters[new_cluster_id] = weight + weight_in_clusters[old_cluster_id] -= weight + + +cpdef void _relocate_empty_clusters_sparse( + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # INOUT + floating[::1] weight_in_clusters, # INOUT + const int[::1] labels # IN +): + """Relocate centers which have no sample assigned to them.""" + cdef: + int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32) + int n_empty = empty_clusters.shape[0] + + if n_empty == 0: + return + + cdef: + int n_samples = X_indptr.shape[0] - 1 + int i, j, k + + floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype) + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + for i in range(n_samples): + j = labels[i] + distances[i] = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers_old[j], centers_squared_norms[j], True) + + if np.max(distances) == 0: + # Happens when there are more clusters than non-duplicate samples. Relocating + # is pointless in this case. + return + + cdef: + int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32) + + int new_cluster_id, old_cluster_id, far_idx, idx + floating weight + + for idx in range(n_empty): + + new_cluster_id = empty_clusters[idx] + + far_idx = far_from_centers[idx] + weight = sample_weight[far_idx] + + old_cluster_id = labels[far_idx] + + for k in range(X_indptr[far_idx], X_indptr[far_idx + 1]): + centers_new[old_cluster_id, X_indices[k]] -= X_data[k] * weight + centers_new[new_cluster_id, X_indices[k]] = X_data[k] * weight + + weight_in_clusters[new_cluster_id] = weight + weight_in_clusters[old_cluster_id] -= weight + + +cdef void _average_centers( + floating[:, ::1] centers, # INOUT + const floating[::1] weight_in_clusters # IN +): + """Average new centers wrt weights.""" + cdef: + int n_clusters = centers.shape[0] + int n_features = centers.shape[1] + int j, k + floating alpha + int argmax_weight = np.argmax(weight_in_clusters) + + for j in range(n_clusters): + if weight_in_clusters[j] > 0: + alpha = 1.0 / weight_in_clusters[j] + for k in range(n_features): + centers[j, k] *= alpha + else: + # For convenience, we avoid setting empty clusters at the origin but place + # them at the location of the biggest cluster. 
+ for k in range(n_features): + centers[j, k] = centers[argmax_weight, k] + + +cdef void _center_shift( + const floating[:, ::1] centers_old, # IN + const floating[:, ::1] centers_new, # IN + floating[::1] center_shift # OUT +): + """Compute shift between old and new centers.""" + cdef: + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + int j + + for j in range(n_clusters): + center_shift[j] = _euclidean_dense_dense( + ¢ers_new[j, 0], ¢ers_old[j, 0], n_features, False) + + +def _is_same_clustering( + const int[::1] labels1, + const int[::1] labels2, + n_clusters +): + """Check if two arrays of labels are the same up to a permutation of the labels""" + cdef int[::1] mapping = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32) + cdef int i + + for i in range(labels1.shape[0]): + if mapping[labels1[i]] == -1: + mapping[labels1[i]] = labels2[i] + elif mapping[labels1[i]] != labels2[i]: + return False + return True diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_elkan.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_elkan.pyx new file mode 100644 index 0000000000000000000000000000000000000000..564218a17f7018241d43dd33f55d3f516746a145 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_elkan.pyx @@ -0,0 +1,686 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython cimport floating +from cython.parallel import prange, parallel +from libc.stdlib cimport calloc, free +from libc.string cimport memset + +from ..utils._openmp_helpers cimport omp_lock_t +from ..utils._openmp_helpers cimport omp_init_lock +from ..utils._openmp_helpers cimport omp_destroy_lock +from ..utils._openmp_helpers cimport omp_set_lock +from ..utils._openmp_helpers cimport omp_unset_lock +from ..utils.extmath import row_norms +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _euclidean_dense_dense +from ._k_means_common cimport _euclidean_sparse_dense +from ._k_means_common cimport _average_centers +from ._k_means_common cimport _center_shift + + +def init_bounds_dense( + const floating[:, ::1] X, # IN + const floating[:, ::1] centers, # IN + const floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds, # OUT + int n_threads): + """Initialize upper and lower bounds for each sample for dense input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The input data. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. 
+ + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray, of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. + + n_threads : int + The number of threads to be used by openmp. + """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + int n_features = X.shape[1] + + floating min_dist, dist + int best_cluster, i, j + + for i in prange( + n_samples, num_threads=n_threads, schedule='static', nogil=True + ): + best_cluster = 0 + min_dist = _euclidean_dense_dense(&X[i, 0], ¢ers[0, 0], + n_features, False) + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_dense_dense(&X[i, 0], ¢ers[j, 0], + n_features, False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def init_bounds_sparse( + X, # IN + const floating[:, ::1] centers, # IN + const floating[:, ::1] center_half_distances, # IN + int[::1] labels, # OUT + floating[::1] upper_bounds, # OUT + floating[:, ::1] lower_bounds, # OUT + int n_threads): + """Initialize upper and lower bounds for each sample for sparse input data. + + Given X, centers and the pairwise distances divided by 2.0 between the + centers this calculates the upper bounds and lower bounds for each sample. + The upper bound for each sample is set to the distance between the sample + and the closest center. + + The lower bound for each sample is a one-dimensional array of n_clusters. + For each sample i assume that the previously assigned cluster is c1 and the + previous closest distance is dist, for a new cluster c2, the + lower_bound[i][c2] is set to distance between the sample and this new + cluster, if and only if dist > center_half_distances[c1][c2]. This prevents + computation of unnecessary distances for each sample to the clusters that + it is unlikely to be assigned to. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The input data. Must be in CSR format. + + centers : ndarray of shape (n_clusters, n_features), dtype=floating + The cluster centers. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + The half of the distance between any 2 clusters centers. + + labels : ndarray of shape(n_samples), dtype=int + The label for each sample. This array is modified in place. + + upper_bounds : ndarray of shape(n_samples,), dtype=floating + The upper bound on the distance between each sample and its closest + cluster center. This array is modified in place. + + lower_bounds : ndarray of shape(n_samples, n_clusters), dtype=floating + The lower bound on the distance between each sample and each cluster + center. This array is modified in place. + + n_threads : int + The number of threads to be used by openmp. 
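+
+    Notes
+    -----
+    Ignoring sparsity and parallelism, the bound initialisation is roughly
+    equivalent to the following dense NumPy sketch (illustrative only)::
+
+        import numpy as np
+
+        def init_bounds_sketch(X, centers, center_half_distances):
+            n_samples, n_clusters = X.shape[0], centers.shape[0]
+            labels = np.zeros(n_samples, dtype=np.int32)
+            upper = np.empty(n_samples)
+            lower = np.zeros((n_samples, n_clusters))
+            for i in range(n_samples):
+                best, min_dist = 0, np.linalg.norm(X[i] - centers[0])
+                lower[i, 0] = min_dist
+                for j in range(1, n_clusters):
+                    # A center at least twice the current best distance away
+                    # cannot be closer, so its distance is never computed.
+                    if min_dist > center_half_distances[best, j]:
+                        dist = np.linalg.norm(X[i] - centers[j])
+                        lower[i, j] = dist
+                        if dist < min_dist:
+                            min_dist, best = dist, j
+                labels[i], upper[i] = best, min_dist
+            return labels, upper, lower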
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers.shape[0] + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating min_dist, dist + int best_cluster, i, j + + floating[::1] centers_squared_norms = row_norms(centers, squared=True) + + for i in prange( + n_samples, num_threads=n_threads, schedule='static', nogil=True + ): + best_cluster = 0 + min_dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[0], centers_squared_norms[0], False) + + lower_bounds[i, 0] = min_dist + for j in range(1, n_clusters): + if min_dist > center_half_distances[best_cluster, j]: + dist = _euclidean_sparse_dense( + X_data[X_indptr[i]: X_indptr[i + 1]], + X_indices[X_indptr[i]: X_indptr[i + 1]], + centers[j], centers_squared_norms[j], False) + lower_bounds[i, j] = dist + if dist < min_dist: + min_dist = dist + best_cluster = j + labels[i] = best_cluster + upper_bounds[i] = min_dist + + +def elkan_iter_chunked_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # INOUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means Elkan algorithm with dense input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + Half pairwise distances between centers. + + distance_next_center : ndarray of shape (n_clusters,), dtype=floating + Distance between each center its closest center. + + upper_bounds : ndarray of shape (n_samples,), dtype=floating + Upper bound for the distance between each sample and its center, + updated inplace. + + lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating + Lower bound for the distance between each sample and each center, + updated inplace. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. + + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. 
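+
+    Notes
+    -----
+    After the centers move, the cached bounds are kept valid by loosening them
+    with the center movement; a NumPy sketch of the final update performed
+    below (``center_shift[j]`` is the distance moved by center ``j``)::
+
+        upper_bounds += center_shift[labels]
+        lower_bounds = np.maximum(lower_bounds - center_shift, 0)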
+ """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_new.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + # hard-coded number of samples per chunk. Splitting in chunks is + # necessary to get parallelism. Chunk size chosen to be same as lloyd's + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start, end + + int i, j, k + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_dense( + X[start: end], + sample_weight[start: end], + centers_old, + center_half_distances, + distance_next_center, + labels[start: end], + upper_bounds[start: end], + lower_bounds[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. + omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_dense(X, sample_weight, centers_old, + centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + # update lower and upper bounds + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 + + +cdef void _update_chunk_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one dense data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. 
+ """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if ( + j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j]) + ): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_dense_dense( + &X[i, 0], ¢ers_old[label, 0], n_features, False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. + if ( + upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j]) + ): + + distance = _euclidean_dense_dense( + &X[i, 0], ¢ers_old[j, 0], n_features, False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i, k] * sample_weight[i] + + +def elkan_iter_chunked_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + int[::1] labels, # INOUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means Elkan algorithm with sparse input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. + + center_half_distances : ndarray of shape (n_clusters, n_clusters), \ + dtype=floating + Half pairwise distances between centers. + + distance_next_center : ndarray of shape (n_clusters,), dtype=floating + Distance between each center its closest center. 
+ + upper_bounds : ndarray of shape (n_samples,), dtype=floating + Upper bound for the distance between each sample and its center, + updated inplace. + + lower_bounds : ndarray of shape (n_samples, n_clusters), dtype=floating + Lower bound for the distance between each sample and each center, + updated inplace. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. + + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_new.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + # hard-coded number of samples per chunk. Splitting in chunks is + # necessary to get parallelism. Chunk size chosen to be same as lloyd's + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start, end + + int i, j, k + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_sparse( + X_data[X_indptr[start]: X_indptr[end]], + X_indices[X_indptr[start]: X_indptr[end]], + X_indptr[start: end+1], + sample_weight[start: end], + centers_old, + centers_squared_norms, + center_half_distances, + distance_next_center, + labels[start: end], + upper_bounds[start: end], + lower_bounds[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. 
+ omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_sparse( + X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + # update lower and upper bounds + for i in range(n_samples): + upper_bounds[i] += center_shift[labels[i]] + + for j in range(n_clusters): + lower_bounds[i, j] -= center_shift[j] + if lower_bounds[i, j] < 0: + lower_bounds[i, j] = 0 + + +cdef void _update_chunk_sparse( + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[::1] centers_squared_norms, # IN + const floating[:, ::1] center_half_distances, # IN + const floating[::1] distance_next_center, # IN + int[::1] labels, # INOUT + floating[::1] upper_bounds, # INOUT + floating[:, ::1] lower_bounds, # INOUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating upper_bound, distance + int i, j, k, label + int s = X_indptr[0] + + for i in range(n_samples): + upper_bound = upper_bounds[i] + bounds_tight = 0 + label = labels[i] + + # Next center is not far away from the currently assigned center. + # Sample might need to be assigned to another center. + if not distance_next_center[label] >= upper_bound: + + for j in range(n_clusters): + + # If this holds, then center_index is a good candidate for the + # sample to be relabelled, and we need to confirm this by + # recomputing the upper and lower bounds. + if ( + j != label + and (upper_bound > lower_bounds[i, j]) + and (upper_bound > center_half_distances[label, j]) + ): + + # Recompute upper bound by calculating the actual distance + # between the sample and its current assigned center. + if not bounds_tight: + upper_bound = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[label], centers_squared_norms[label], False) + lower_bounds[i, label] = upper_bound + bounds_tight = 1 + + # If the condition still holds, then compute the actual + # distance between the sample and center. If this is less + # than the previous distance, reassign label. 
+ if ( + upper_bound > lower_bounds[i, j] + or (upper_bound > center_half_distances[label, j]) + ): + distance = _euclidean_sparse_dense( + X_data[X_indptr[i] - s: X_indptr[i + 1] - s], + X_indices[X_indptr[i] - s: X_indptr[i + 1] - s], + centers_old[j], centers_squared_norms[j], False) + lower_bounds[i, j] = distance + if distance < upper_bound: + label = j + upper_bound = distance + + labels[i] = label + upper_bounds[i] = upper_bound + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_lloyd.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_lloyd.pyx new file mode 100644 index 0000000000000000000000000000000000000000..a507a6239ab5f836e8c7d23ac0e3e2ab2f7f4d11 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_lloyd.pyx @@ -0,0 +1,420 @@ +# Licence: BSD 3 clause + +from cython cimport floating +from cython.parallel import prange, parallel +from libc.stdlib cimport malloc, calloc, free +from libc.string cimport memset +from libc.float cimport DBL_MAX, FLT_MAX + +from ..utils._openmp_helpers cimport omp_lock_t +from ..utils._openmp_helpers cimport omp_init_lock +from ..utils._openmp_helpers cimport omp_destroy_lock +from ..utils._openmp_helpers cimport omp_set_lock +from ..utils._openmp_helpers cimport omp_unset_lock +from ..utils.extmath import row_norms +from ..utils._cython_blas cimport _gemm +from ..utils._cython_blas cimport RowMajor, Trans, NoTrans +from ._k_means_common import CHUNK_SIZE +from ._k_means_common cimport _relocate_empty_clusters_dense +from ._k_means_common cimport _relocate_empty_clusters_sparse +from ._k_means_common cimport _average_centers, _center_shift + + +def lloyd_iter_chunked_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + int[::1] labels, # OUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means lloyd algorithm with dense input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. `centers_new` can be `None` if + `update_centers` is False. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. `weight_in_clusters` can be `None` if `update_centers` + is False. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. 
+ + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_old.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + # hard-coded number of samples per chunk. Appeared to be close to + # optimal in all situations. + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start, end + + int j, k + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + floating *pairwise_distances_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + pairwise_distances_chunk = malloc(n_samples_chunk * n_clusters * sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_dense( + X[start: end], + sample_weight[start: end], + centers_old, + centers_squared_norms, + labels[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + pairwise_distances_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. + omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + free(pairwise_distances_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_dense( + X, sample_weight, centers_old, centers_new, weight_in_clusters, labels + ) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +cdef void _update_chunk_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + floating *pairwise_distances, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one dense data chunk. 
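lloyd_iter_chunked_dense above splits the samples into CHUNK_SIZE blocks, lets each OpenMP thread accumulate partial center sums in its own calloc'd buffers, and merges them under a lock. A rough single-threaded Python equivalent of that accumulate-then-reduce pattern, assuming a hypothetical per-chunk helper `update_chunk` (sketch only):

    import numpy as np

    CHUNK_SIZE = 256  # plays the role of _k_means_common.CHUNK_SIZE; value illustrative

    def lloyd_iteration_chunked(X, sample_weight, centers, update_chunk):
        n_samples = X.shape[0]
        n_clusters = centers.shape[0]
        labels = np.full(n_samples, -1, dtype=np.int32)
        centers_new = np.zeros_like(centers)
        weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)

        for start in range(0, n_samples, CHUNK_SIZE):
            end = min(start + CHUNK_SIZE, n_samples)
            # Per-chunk buffers stand in for the thread-local arrays; in the
            # Cython version each OpenMP thread owns one pair of them.
            centers_chunk = np.zeros_like(centers)
            weight_chunk = np.zeros(n_clusters, dtype=X.dtype)
            labels[start:end] = update_chunk(
                X[start:end], sample_weight[start:end], centers,
                centers_chunk, weight_chunk)
            # Reduction; the omp lock makes the same step thread-safe above.
            centers_new += centers_chunk
            weight_in_clusters += weight_chunk

        # _average_centers divides by the accumulated weights (empty clusters
        # are handled separately by the relocation helper).
        nonzero = weight_in_clusters > 0
        centers_new[nonzero] /= weight_in_clusters[nonzero][:, np.newaxis]
        return labels, centers_new, weight_in_clusters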
+ + Compute the partial contribution of a single data chunk to the labels and + centers. + """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + + # Instead of computing the full pairwise squared distances matrix, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store + # the - 2 X.C^T + ||C||² term since the argmin for a given sample only + # depends on the centers. + # pairwise_distances = ||C||² + for i in range(n_samples): + for j in range(n_clusters): + pairwise_distances[i * n_clusters + j] = centers_squared_norms[j] + + # pairwise_distances += -2 * X.dot(C.T) + _gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features, + -2.0, &X[0, 0], n_features, ¢ers_old[0, 0], n_features, + 1.0, pairwise_distances, n_clusters) + + for i in range(n_samples): + min_sq_dist = pairwise_distances[i * n_clusters] + label = 0 + for j in range(1, n_clusters): + sq_dist = pairwise_distances[i * n_clusters + j] + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(n_features): + centers_new[label * n_features + k] += X[i, k] * sample_weight[i] + + +def lloyd_iter_chunked_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_in_clusters, # OUT + int[::1] labels, # OUT + floating[::1] center_shift, # OUT + int n_threads, + bint update_centers=True): + """Single iteration of K-means lloyd algorithm with sparse input. + + Update labels and centers (inplace), for one iteration, distributed + over data chunks. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. `centers_new` can be `None` if + `update_centers` is False. + + weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating + Placeholder for the sums of the weights of every observation assigned + to each center. `weight_in_clusters` can be `None` if `update_centers` + is False. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + center_shift : ndarray of shape (n_clusters,), dtype=floating + Distance between old and new centers. + + n_threads : int + The number of threads to be used by openmp. + + update_centers : bool + - If True, the labels and the new centers will be computed, i.e. runs + the E-step and the M-step of the algorithm. + - If False, only the labels will be computed, i.e runs the E-step of + the algorithm. This is useful especially when calling predict on a + fitted model. + """ + cdef: + int n_samples = X.shape[0] + int n_features = X.shape[1] + int n_clusters = centers_old.shape[0] + + if n_samples == 0: + # An empty array was passed, do nothing and return early (before + # attempting to compute n_chunks). 
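_update_chunk_dense above never materialises the full ||x - c||^2 matrix: because the argmin over centers does not depend on ||x||^2, a single GEMM producing ||c_j||^2 - 2<x_i, c_j> is enough to label a chunk. A NumPy sketch of the same shortcut (illustrative, not the library's API); if true squared distances are needed later, e.g. for inertia, the dropped ||x_i||^2 term has to be added back per row:

    import numpy as np

    def assign_labels_dense(X_chunk, centers):
        # pairwise[i, j] = ||c_j||^2 - 2 <x_i, c_j>; the missing ||x_i||^2 is
        # constant within row i, so it does not change the argmin over j.
        centers_sq_norms = np.einsum("ij,ij->i", centers, centers)
        pairwise = centers_sq_norms[np.newaxis, :] - 2.0 * (X_chunk @ centers.T)
        return np.argmin(pairwise, axis=1).astype(np.int32)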
This can typically happen when + # calling the prediction function of a bisecting k-means model with a + # large fraction of outliers. + return + + cdef: + # Choose same as for dense. Does not have the same impact since with + # sparse data the pairwise distances matrix is not precomputed. + # However, splitting in chunks is necessary to get parallelism. + int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples + int n_chunks = n_samples // n_samples_chunk + int n_samples_rem = n_samples % n_samples_chunk + int chunk_idx + int start = 0, end = 0 + + int j, k + + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + + floating[::1] centers_squared_norms = row_norms(centers_old, squared=True) + + floating *centers_new_chunk + floating *weight_in_clusters_chunk + + omp_lock_t lock + + # count remainder chunk in total number of chunks + n_chunks += n_samples != n_chunks * n_samples_chunk + + # number of threads should not be bigger than number of chunks + n_threads = min(n_threads, n_chunks) + + if update_centers: + memset(¢ers_new[0, 0], 0, n_clusters * n_features * sizeof(floating)) + memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating)) + omp_init_lock(&lock) + + with nogil, parallel(num_threads=n_threads): + # thread local buffers + centers_new_chunk = calloc(n_clusters * n_features, sizeof(floating)) + weight_in_clusters_chunk = calloc(n_clusters, sizeof(floating)) + + for chunk_idx in prange(n_chunks, schedule='static'): + start = chunk_idx * n_samples_chunk + if chunk_idx == n_chunks - 1 and n_samples_rem > 0: + end = start + n_samples_rem + else: + end = start + n_samples_chunk + + _update_chunk_sparse( + X_data[X_indptr[start]: X_indptr[end]], + X_indices[X_indptr[start]: X_indptr[end]], + X_indptr[start: end+1], + sample_weight[start: end], + centers_old, + centers_squared_norms, + labels[start: end], + centers_new_chunk, + weight_in_clusters_chunk, + update_centers) + + # reduction from local buffers. + if update_centers: + # The lock is necessary to avoid race conditions when aggregating + # info from different thread-local buffers. + omp_set_lock(&lock) + for j in range(n_clusters): + weight_in_clusters[j] += weight_in_clusters_chunk[j] + for k in range(n_features): + centers_new[j, k] += centers_new_chunk[j * n_features + k] + omp_unset_lock(&lock) + + free(centers_new_chunk) + free(weight_in_clusters_chunk) + + if update_centers: + omp_destroy_lock(&lock) + _relocate_empty_clusters_sparse( + X_data, X_indices, X_indptr, sample_weight, + centers_old, centers_new, weight_in_clusters, labels) + + _average_centers(centers_new, weight_in_clusters) + _center_shift(centers_old, centers_new, center_shift) + + +cdef void _update_chunk_sparse( + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + const floating[::1] centers_squared_norms, # IN + int[::1] labels, # OUT + floating *centers_new, # OUT + floating *weight_in_clusters, # OUT + bint update_centers) noexcept nogil: + """K-means combined EM step for one sparse data chunk. + + Compute the partial contribution of a single data chunk to the labels and + centers. 
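lloyd_iter_chunked_sparse above slices the CSR arrays per chunk and passes the chunk's own indptr slice along, which is why the chunk kernels re-base row offsets with `s = X_indptr[0]`. A small generator showing the equivalent slicing on a SciPy CSR matrix (sketch only):

    def iter_csr_chunks(X_csr, chunk_size):
        # X_csr: a scipy.sparse CSR matrix; yields (data, indices, indptr) views
        # for consecutive row chunks.
        indptr = X_csr.indptr
        n_rows = X_csr.shape[0]
        for start in range(0, n_rows, chunk_size):
            end = min(start + chunk_size, n_rows)
            yield (X_csr.data[indptr[start]:indptr[end]],
                   X_csr.indices[indptr[start]:indptr[end]],
                   indptr[start:end + 1])
            # Within a chunk, row k's entries live at
            # indptr[k] - indptr[start] .. indptr[k + 1] - indptr[start]
            # of the sliced data/indices, exactly the `- s` offset used above.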
+ """ + cdef: + int n_samples = labels.shape[0] + int n_clusters = centers_old.shape[0] + int n_features = centers_old.shape[1] + + floating sq_dist, min_sq_dist + int i, j, k, label + floating max_floating = FLT_MAX if floating is float else DBL_MAX + int s = X_indptr[0] + + # XXX Precompute the pairwise distances matrix is not worth for sparse + # currently. Should be tested when BLAS (sparse x dense) matrix + # multiplication is available. + for i in range(n_samples): + min_sq_dist = max_floating + label = 0 + + for j in range(n_clusters): + sq_dist = 0.0 + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + sq_dist += centers_old[j, X_indices[k]] * X_data[k] + + # Instead of computing the full squared distance with each cluster, + # ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute + # the - 2 X.C^T + ||C||² term since the argmin for a given sample + # only depends on the centers C. + sq_dist = centers_squared_norms[j] -2 * sq_dist + if sq_dist < min_sq_dist: + min_sq_dist = sq_dist + label = j + + labels[i] = label + + if update_centers: + weight_in_clusters[label] += sample_weight[i] + for k in range(X_indptr[i] - s, X_indptr[i + 1] - s): + centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_minibatch.pyx b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_minibatch.pyx new file mode 100644 index 0000000000000000000000000000000000000000..22ca5255e3889574d7155f1e077f84111832cf92 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_k_means_minibatch.pyx @@ -0,0 +1,218 @@ +from cython cimport floating +from cython.parallel cimport parallel, prange +from libc.stdlib cimport malloc, free + + +def _minibatch_update_dense( + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int n_threads): + """Update of the centers for dense MiniBatchKMeans. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features), dtype=floating + The observations to cluster. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_dense(cluster_idx, X, sample_weight, + centers_old, centers_new, weight_sums, labels, + indices) + + free(indices) + + +cdef void update_center_dense( + int cluster_idx, + const floating[:, ::1] X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int *indices) noexcept nogil: # TMP + """Update of a single center for dense MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] + + +def _minibatch_update_sparse( + X, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int n_threads): + """Update of the centers for sparse MiniBatchKMeans. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features), dtype=floating + The observations to cluster. Must be in CSR format. + + sample_weight : ndarray of shape (n_samples,), dtype=floating + The weights for each observation in X. + + centers_old : ndarray of shape (n_clusters, n_features), dtype=floating + Centers before previous iteration, placeholder for the centers after + previous iteration. + + centers_new : ndarray of shape (n_clusters, n_features), dtype=floating + Centers after previous iteration, placeholder for the new centers + computed during this iteration. + + weight_sums : ndarray of shape (n_clusters,), dtype=floating + Current sums of the accumulated weights for each center. + + labels : ndarray of shape (n_samples,), dtype=int + labels assignment. + + n_threads : int + The number of threads to be used by openmp. 
+ """ + cdef: + floating[::1] X_data = X.data + int[::1] X_indices = X.indices + int[::1] X_indptr = X.indptr + int n_samples = X.shape[0] + int n_clusters = centers_old.shape[0] + int cluster_idx + + int *indices + + with nogil, parallel(num_threads=n_threads): + indices = malloc(n_samples * sizeof(int)) + + for cluster_idx in prange(n_clusters, schedule="static"): + update_center_sparse(cluster_idx, X_data, X_indices, X_indptr, + sample_weight, centers_old, centers_new, + weight_sums, labels, indices) + + free(indices) + + +cdef void update_center_sparse( + int cluster_idx, + const floating[::1] X_data, # IN + const int[::1] X_indices, # IN + const int[::1] X_indptr, # IN + const floating[::1] sample_weight, # IN + const floating[:, ::1] centers_old, # IN + floating[:, ::1] centers_new, # OUT + floating[::1] weight_sums, # INOUT + const int[::1] labels, # IN + int *indices) noexcept nogil: # TMP + """Update of a single center for sparse MinibatchKMeans""" + cdef: + int n_samples = sample_weight.shape[0] + int n_features = centers_old.shape[1] + floating alpha + int n_indices + int k, sample_idx, feature_idx + + floating wsum = 0 + + # indices = np.where(labels == cluster_idx)[0] + k = 0 + for sample_idx in range(n_samples): + if labels[sample_idx] == cluster_idx: + indices[k] = sample_idx + wsum += sample_weight[sample_idx] + k += 1 + n_indices = k + + if wsum > 0: + # Undo the previous count-based scaling for this cluster center: + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx] + + # Update cluster with new point members + for k in range(n_indices): + sample_idx = indices[k] + for feature_idx in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]): + centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx] + + # Update the count statistics for this center + weight_sums[cluster_idx] += wsum + + # Rescale to compute mean of all points (old and new) + alpha = 1 / weight_sums[cluster_idx] + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] *= alpha + else: + # No sample was assigned to this cluster in this batch of data + for feature_idx in range(n_features): + centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_kmeans.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_kmeans.py new file mode 100644 index 0000000000000000000000000000000000000000..11c85610239ccae163137a1ced0f990325864390 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_kmeans.py @@ -0,0 +1,2303 @@ +"""K-means clustering.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABC, abstractmethod +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + ClusterMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import _euclidean_distances, euclidean_distances +from ..utils import check_array, check_random_state +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import row_norms, stable_cumsum +from ..utils.parallel import ( + _get_threadpool_controller, + _threadpool_controller_decorator, +) +from 
..utils.sparsefuncs import mean_variance_axis +from ..utils.sparsefuncs_fast import assign_rows_csr +from ..utils.validation import ( + _check_sample_weight, + _is_arraylike_not_scalar, + check_is_fitted, + validate_data, +) +from ._k_means_common import ( + CHUNK_SIZE, + _inertia_dense, + _inertia_sparse, + _is_same_clustering, +) +from ._k_means_elkan import ( + elkan_iter_chunked_dense, + elkan_iter_chunked_sparse, + init_bounds_dense, + init_bounds_sparse, +) +from ._k_means_lloyd import lloyd_iter_chunked_dense, lloyd_iter_chunked_sparse +from ._k_means_minibatch import _minibatch_update_dense, _minibatch_update_sparse + +############################################################################### +# Initialization heuristic + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "sample_weight": ["array-like", None], + "x_squared_norms": ["array-like", None], + "random_state": ["random_state"], + "n_local_trials": [Interval(Integral, 1, None, closed="left"), None], + }, + prefer_skip_nested_validation=True, +) +def kmeans_plusplus( + X, + n_clusters, + *, + sample_weight=None, + x_squared_norms=None, + random_state=None, + n_local_trials=None, +): + """Init n_clusters seeds according to k-means++. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds from. + + n_clusters : int + The number of centroids to initialize. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in `X`. If `None`, all observations + are assigned equal weight. `sample_weight` is ignored if `init` + is a callable or a user provided array. + + .. versionadded:: 1.3 + + x_squared_norms : array-like of shape (n_samples,), default=None + Squared Euclidean norm of each data point. + + random_state : int or RandomState instance, default=None + Determines random number generation for centroid initialization. Pass + an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)) which is the recommended setting. + Setting to 1 disables the greedy cluster selection and recovers the + vanilla k-means++ algorithm which was empirically shown to work less + well than its greedy variant. + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The initial centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + + Notes + ----- + Selects initial cluster centers for k-mean clustering in a smart way + to speed up convergence. see: Arthur, D. and Vassilvitskii, S. + "k-means++: the advantages of careful seeding". ACM-SIAM symposium + on Discrete algorithms. 2007 + + Examples + -------- + + >>> from sklearn.cluster import kmeans_plusplus + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... 
[10, 2], [10, 4], [10, 0]]) + >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0) + >>> centers + array([[10, 2], + [ 1, 0]]) + >>> indices + array([3, 2]) + """ + # Check data + check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if X.shape[0] < n_clusters: + raise ValueError( + f"n_samples={X.shape[0]} should be >= n_clusters={n_clusters}." + ) + + # Check parameters + if x_squared_norms is None: + x_squared_norms = row_norms(X, squared=True) + else: + x_squared_norms = check_array(x_squared_norms, dtype=X.dtype, ensure_2d=False) + + if x_squared_norms.shape[0] != X.shape[0]: + raise ValueError( + f"The length of x_squared_norms {x_squared_norms.shape[0]} should " + f"be equal to the length of n_samples {X.shape[0]}." + ) + + random_state = check_random_state(random_state) + + # Call private k-means++ + centers, indices = _kmeans_plusplus( + X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials + ) + + return centers, indices + + +def _kmeans_plusplus( + X, n_clusters, x_squared_norms, sample_weight, random_state, n_local_trials=None +): + """Computational component for initialization of n_clusters by + k-means++. Prior validation of data is assumed. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The data to pick seeds for. + + n_clusters : int + The number of seeds to choose. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in `X`. + + x_squared_norms : ndarray of shape (n_samples,) + Squared Euclidean norm of each data point. + + random_state : RandomState instance + The generator used to initialize the centers. + See :term:`Glossary `. + + n_local_trials : int, default=None + The number of seeding trials for each center (except the first), + of which the one reducing inertia the most is greedily chosen. + Set to None to make the number of trials depend logarithmically + on the number of seeds (2+log(k)); this is the default. + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + The initial centers for k-means. + + indices : ndarray of shape (n_clusters,) + The index location of the chosen centers in the data array X. For a + given index and center, X[index] = center. + """ + n_samples, n_features = X.shape + + centers = np.empty((n_clusters, n_features), dtype=X.dtype) + + # Set the number of local seeding trials if none is given + if n_local_trials is None: + # This is what Arthur/Vassilvitskii tried, but did not report + # specific results for other than mentioning in the conclusion + # that it helped. 
+ n_local_trials = 2 + int(np.log(n_clusters)) + + # Pick first center randomly and track index of point + center_id = random_state.choice(n_samples, p=sample_weight / sample_weight.sum()) + indices = np.full(n_clusters, -1, dtype=int) + if sp.issparse(X): + centers[0] = X[[center_id]].toarray() + else: + centers[0] = X[center_id] + indices[0] = center_id + + # Initialize list of closest distances and calculate current potential + closest_dist_sq = _euclidean_distances( + centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True + ) + current_pot = closest_dist_sq @ sample_weight + + # Pick the remaining n_clusters-1 points + for c in range(1, n_clusters): + # Choose center candidates by sampling with probability proportional + # to the squared distance to the closest existing center + rand_vals = random_state.uniform(size=n_local_trials) * current_pot + candidate_ids = np.searchsorted( + stable_cumsum(sample_weight * closest_dist_sq), rand_vals + ) + # XXX: numerical imprecision can result in a candidate_id out of range + np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids) + + # Compute distances to center candidates + distance_to_candidates = _euclidean_distances( + X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True + ) + + # update closest distances squared and potential for each candidate + np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates) + candidates_pot = distance_to_candidates @ sample_weight.reshape(-1, 1) + + # Decide which candidate is the best + best_candidate = np.argmin(candidates_pot) + current_pot = candidates_pot[best_candidate] + closest_dist_sq = distance_to_candidates[best_candidate] + best_candidate = candidate_ids[best_candidate] + + # Permanently add best center candidate found in local tries + if sp.issparse(X): + centers[c] = X[[best_candidate]].toarray() + else: + centers[c] = X[best_candidate] + indices[c] = best_candidate + + return centers, indices + + +############################################################################### +# K-means batch estimation by EM (expectation maximization) + + +def _tolerance(X, tol): + """Return a tolerance which is dependent on the dataset.""" + if tol == 0: + return 0 + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + else: + variances = np.var(X, axis=0) + return np.mean(variances) * tol + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + "return_n_iter": [bool], + }, + prefer_skip_nested_validation=False, +) +def k_means( + X, + n_clusters, + *, + sample_weight=None, + init="k-means++", + n_init="auto", + max_iter=300, + verbose=False, + tol=1e-4, + random_state=None, + copy_x=True, + algorithm="lloyd", + return_n_iter=False, +): + """Perform K-means clustering algorithm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + + n_clusters : int + The number of clusters to form as well as the number of + centroids to generate. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in `X`. If `None`, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. 
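The greedy k-means++ loop above draws each new seed with probability proportional to the weighted squared distance to the closest already-chosen seed, tries `n_local_trials` candidates (2 + log(k) by default) and keeps the one that lowers the potential the most. A toy NumPy version of one such step, assuming dense data and a NumPy random generator `rng` (sketch only):

    import numpy as np

    def kmeanspp_step(X, sample_weight, closest_dist_sq, current_pot, rng, n_local_trials):
        # Sample candidate indices with probability ~ sample_weight * D^2.
        rand_vals = rng.uniform(size=n_local_trials) * current_pot
        cdf = np.cumsum(sample_weight * closest_dist_sq)
        candidate_ids = np.minimum(np.searchsorted(cdf, rand_vals), len(cdf) - 1)

        # Potential left behind by each candidate if it became the next seed.
        dists = ((X[candidate_ids, None, :] - X[None, :, :]) ** 2).sum(axis=-1)
        new_closest = np.minimum(closest_dist_sq, dists)        # (n_trials, n_samples)
        candidates_pot = new_closest @ sample_weight

        best = int(np.argmin(candidates_pot))
        return candidate_ids[best], new_closest[best], candidates_pot[best]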
+ + init : {'k-means++', 'random'}, callable or array-like of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization: + + - `'k-means++'` : selects initial cluster centers for k-mean + clustering in a smart way to speed up convergence. See section + Notes in k_init for more details. + - `'random'`: choose `n_clusters` observations (rows) at random from data + for the initial centroids. + - If an array is passed, it should be of shape `(n_clusters, n_features)` + and gives the initial centers. + - If a callable is passed, it should take arguments `X`, `n_clusters` and a + random state and return an initialization. + + n_init : 'auto' or int, default="auto" + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of + n_init consecutive runs in terms of inertia. + + When `n_init='auto'`, the number of runs depends on the value of init: + 10 if using `init='random'` or `init` is a callable; + 1 if using `init='k-means++'` or `init` is an array-like. + + .. versionadded:: 1.2 + Added 'auto' option for `n_init`. + + .. versionchanged:: 1.4 + Default value for `n_init` changed to `'auto'`. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm to run. + + verbose : bool, default=False + Verbosity mode. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If `copy_x` is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + `copy_x` is False. If the original data is sparse, but not in CSR format, + a copy will be made even if `copy_x` is False. + + algorithm : {"lloyd", "elkan"}, default="lloyd" + K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. + The `"elkan"` variation can be more efficient on some datasets with + well-defined clusters, by using the triangle inequality. However it's + more memory intensive due to the allocation of an extra array of shape + `(n_samples, n_clusters)`. + + .. versionchanged:: 0.18 + Added Elkan algorithm + + .. versionchanged:: 1.1 + Renamed "full" to "lloyd", and deprecated "auto" and "full". + Changed "auto" to use "lloyd" instead of "elkan". + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + centroid : ndarray of shape (n_clusters, n_features) + Centroids found at the last iteration of k-means. + + label : ndarray of shape (n_samples,) + The `label[i]` is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + best_n_iter : int + Number of iterations corresponding to the best results. + Returned only if `return_n_iter` is set to True. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import k_means + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [10, 2], [10, 4], [10, 0]]) + >>> centroid, label, inertia = k_means( + ... X, n_clusters=2, n_init="auto", random_state=0 + ... ) + >>> centroid + array([[10., 2.], + [ 1., 2.]]) + >>> label + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> inertia + 16.0 + """ + est = KMeans( + n_clusters=n_clusters, + init=init, + n_init=n_init, + max_iter=max_iter, + verbose=verbose, + tol=tol, + random_state=random_state, + copy_x=copy_x, + algorithm=algorithm, + ).fit(X, sample_weight=sample_weight) + if return_n_iter: + return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_ + else: + return est.cluster_centers_, est.labels_, est.inertia_ + + +def _kmeans_single_elkan( + X, + sample_weight, + centers_init, + max_iter=300, + verbose=False, + tol=1e-4, + n_threads=1, +): + """A single run of k-means elkan, assumes preparation completed prior. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. If sparse matrix, must be in CSR format. + + sample_weight : array-like of shape (n_samples,) + The weights for each observation in X. + + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm to run. + + verbose : bool, default=False + Verbosity mode. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + + Returns + ------- + centroid : ndarray of shape (n_clusters, n_features) + Centroids found at the last iteration of k-means. + + label : ndarray of shape (n_samples,) + label[i] is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + n_iter : int + Number of iterations run. + """ + n_samples = X.shape[0] + n_clusters = centers_init.shape[0] + + # Buffers to avoid new allocations at each iteration. 
+ centers = centers_init + centers_new = np.zeros_like(centers) + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + labels = np.full(n_samples, -1, dtype=np.int32) + labels_old = labels.copy() + center_half_distances = euclidean_distances(centers) / 2 + distance_next_center = np.partition( + np.asarray(center_half_distances), kth=1, axis=0 + )[1] + upper_bounds = np.zeros(n_samples, dtype=X.dtype) + lower_bounds = np.zeros((n_samples, n_clusters), dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + init_bounds = init_bounds_sparse + elkan_iter = elkan_iter_chunked_sparse + _inertia = _inertia_sparse + else: + init_bounds = init_bounds_dense + elkan_iter = elkan_iter_chunked_dense + _inertia = _inertia_dense + + init_bounds( + X, + centers, + center_half_distances, + labels, + upper_bounds, + lower_bounds, + n_threads=n_threads, + ) + + strict_convergence = False + + for i in range(max_iter): + elkan_iter( + X, + sample_weight, + centers, + centers_new, + weight_in_clusters, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + n_threads, + ) + + # compute new pairwise distances between centers and closest other + # center of each center for next iterations + center_half_distances = euclidean_distances(centers_new) / 2 + distance_next_center = np.partition( + np.asarray(center_half_distances), kth=1, axis=0 + )[1] + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + print(f"Iteration {i}, inertia {inertia}") + + centers, centers_new = centers_new, centers + + if np.array_equal(labels, labels_old): + # First check the labels for strict convergence. + if verbose: + print(f"Converged at iteration {i}: strict convergence.") + strict_convergence = True + break + else: + # No strict convergence, check for tol based convergence. + center_shift_tot = (center_shift**2).sum() + if center_shift_tot <= tol: + if verbose: + print( + f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}." + ) + break + + labels_old[:] = labels + + if not strict_convergence: + # rerun E-step so that predicted labels match cluster centers + elkan_iter( + X, + sample_weight, + centers, + centers, + weight_in_clusters, + center_half_distances, + distance_next_center, + upper_bounds, + lower_bounds, + labels, + center_shift, + n_threads, + update_centers=False, + ) + + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + + return labels, inertia, centers, i + 1 + + +# Threadpoolctl context to limit the number of threads in second level of +# nested parallelism (i.e. BLAS) to avoid oversubscription. +@_threadpool_controller_decorator(limits=1, user_api="blas") +def _kmeans_single_lloyd( + X, + sample_weight, + centers_init, + max_iter=300, + verbose=False, + tol=1e-4, + n_threads=1, +): + """A single run of k-means lloyd, assumes preparation completed prior. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The observations to cluster. If sparse matrix, must be in CSR format. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + centers_init : ndarray of shape (n_clusters, n_features) + The initial centers. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm to run. 
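Both single-run drivers stop either on strict convergence (labels identical between two iterations) or when the summed squared center shift falls below the scaled tolerance; unless strict convergence was reached, one extra E-step is run so the returned labels match the returned centers. A condensed sketch of that control flow, with a hypothetical `one_iteration` callback standing in for the chunked Cython iteration (illustrative only):

    import numpy as np

    def run_until_converged(one_iteration, labels, max_iter, tol):
        labels_old = labels.copy()
        strict_convergence = False
        for i in range(max_iter):
            center_shift = one_iteration(labels)            # E-step + M-step; updates labels
            if np.array_equal(labels, labels_old):
                strict_convergence = True
                break
            if float((center_shift ** 2).sum()) <= tol:     # tol-based convergence
                break
            labels_old[:] = labels
        if not strict_convergence:
            one_iteration(labels, update_centers=False)     # final E-step only
        return i + 1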
+ + verbose : bool, default=False + Verbosity mode + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + It's not advised to set `tol=0` since convergence might never be + declared due to rounding errors. Use a very small number instead. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + + Returns + ------- + centroid : ndarray of shape (n_clusters, n_features) + Centroids found at the last iteration of k-means. + + label : ndarray of shape (n_samples,) + label[i] is the code or index of the centroid the + i'th observation is closest to. + + inertia : float + The final value of the inertia criterion (sum of squared distances to + the closest centroid for all observations in the training set). + + n_iter : int + Number of iterations run. + """ + n_clusters = centers_init.shape[0] + + # Buffers to avoid new allocations at each iteration. + centers = centers_init + centers_new = np.zeros_like(centers) + labels = np.full(X.shape[0], -1, dtype=np.int32) + labels_old = labels.copy() + weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype) + center_shift = np.zeros(n_clusters, dtype=X.dtype) + + if sp.issparse(X): + lloyd_iter = lloyd_iter_chunked_sparse + _inertia = _inertia_sparse + else: + lloyd_iter = lloyd_iter_chunked_dense + _inertia = _inertia_dense + + strict_convergence = False + + for i in range(max_iter): + lloyd_iter( + X, + sample_weight, + centers, + centers_new, + weight_in_clusters, + labels, + center_shift, + n_threads, + ) + + if verbose: + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + print(f"Iteration {i}, inertia {inertia}.") + + centers, centers_new = centers_new, centers + + if np.array_equal(labels, labels_old): + # First check the labels for strict convergence. + if verbose: + print(f"Converged at iteration {i}: strict convergence.") + strict_convergence = True + break + else: + # No strict convergence, check for tol based convergence. + center_shift_tot = (center_shift**2).sum() + if center_shift_tot <= tol: + if verbose: + print( + f"Converged at iteration {i}: center shift " + f"{center_shift_tot} within tolerance {tol}." + ) + break + + labels_old[:] = labels + + if not strict_convergence: + # rerun E-step so that predicted labels match cluster centers + lloyd_iter( + X, + sample_weight, + centers, + centers, + weight_in_clusters, + labels, + center_shift, + n_threads, + update_centers=False, + ) + + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + + return labels, inertia, centers, i + 1 + + +def _labels_inertia(X, sample_weight, centers, n_threads=1, return_inertia=True): + """E step of the K-means EM algorithm. + + Compute the labels and the inertia of the given samples and centers. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples to assign to the labels. If sparse matrix, must + be in CSR format. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. + + x_squared_norms : ndarray of shape (n_samples,) + Precomputed squared euclidean norm of each data point, to speed up + computations. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. 
Parallelism is + sample-wise on the main cython loop which assigns each sample to its + closest center. + + return_inertia : bool, default=True + Whether to compute and return the inertia. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The resulting assignment. + + inertia : float + Sum of squared distances of samples to their closest cluster center. + Inertia is only returned if return_inertia is True. + """ + n_samples = X.shape[0] + n_clusters = centers.shape[0] + + labels = np.full(n_samples, -1, dtype=np.int32) + center_shift = np.zeros(n_clusters, dtype=centers.dtype) + + if sp.issparse(X): + _labels = lloyd_iter_chunked_sparse + _inertia = _inertia_sparse + else: + _labels = lloyd_iter_chunked_dense + _inertia = _inertia_dense + + _labels( + X, + sample_weight, + centers, + centers_new=None, + weight_in_clusters=None, + labels=labels, + center_shift=center_shift, + n_threads=n_threads, + update_centers=False, + ) + + if return_inertia: + inertia = _inertia(X, sample_weight, centers, labels, n_threads) + return labels, inertia + + return labels + + +# Same as _labels_inertia but in a threadpool_limits context. +_labels_inertia_threadpool_limit = _threadpool_controller_decorator( + limits=1, user_api="blas" +)(_labels_inertia) + + +class _BaseKMeans( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, ClusterMixin, BaseEstimator, ABC +): + """Base class for KMeans and MiniBatchKMeans""" + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "init": [StrOptions({"k-means++", "random"}), callable, "array-like"], + "n_init": [ + StrOptions({"auto"}), + Interval(Integral, 1, None, closed="left"), + ], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_clusters, + *, + init, + n_init, + max_iter, + tol, + verbose, + random_state, + ): + self.n_clusters = n_clusters + self.init = init + self.max_iter = max_iter + self.tol = tol + self.n_init = n_init + self.verbose = verbose + self.random_state = random_state + + def _check_params_vs_input(self, X, default_n_init=None): + # n_clusters + if X.shape[0] < self.n_clusters: + raise ValueError( + f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." + ) + + # tol + self._tol = _tolerance(X, self.tol) + + # n-init + if self.n_init == "auto": + if isinstance(self.init, str) and self.init == "k-means++": + self._n_init = 1 + elif isinstance(self.init, str) and self.init == "random": + self._n_init = default_n_init + elif callable(self.init): + self._n_init = default_n_init + else: # array-like + self._n_init = 1 + else: + self._n_init = self.n_init + + if _is_arraylike_not_scalar(self.init) and self._n_init != 1: + warnings.warn( + ( + "Explicit initial center position passed: performing only" + f" one init in {self.__class__.__name__} instead of " + f"n_init={self._n_init}." + ), + RuntimeWarning, + stacklevel=2, + ) + self._n_init = 1 + + @abstractmethod + def _warn_mkl_vcomp(self, n_active_threads): + """Issue an estimator specific warning when vcomp and mkl are both present + + This method is called by `_check_mkl_vcomp`. + """ + + def _check_mkl_vcomp(self, X, n_samples): + """Check when vcomp and mkl are both present""" + # The BLAS call inside a prange in lloyd_iter_chunked_dense is known to + # cause a small memory leak when there are less chunks than the number + # of available threads. 
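_check_params_vs_input above resolves `n_init='auto'` from the init strategy: a single run for `'k-means++'` or an explicit array of centers, `default_n_init` runs (10 for KMeans) for `'random'` or a callable. The same mapping written as a plain helper (sketch only, hypothetical name):

    def resolve_n_init(n_init, init, default_n_init=10):
        if n_init != "auto":
            return n_init
        if callable(init) or (isinstance(init, str) and init == "random"):
            return default_n_init   # cheap/noisy inits benefit from several restarts
        return 1                    # 'k-means++' or a user-provided array of centers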
It only happens when the OpenMP library is + # vcomp (microsoft OpenMP) and the BLAS library is MKL. see #18653 + if sp.issparse(X): + return + + n_active_threads = int(np.ceil(n_samples / CHUNK_SIZE)) + if n_active_threads < self._n_threads: + modules = _get_threadpool_controller().info() + has_vcomp = "vcomp" in [module["prefix"] for module in modules] + has_mkl = ("mkl", "intel") in [ + (module["internal_api"], module.get("threading_layer", None)) + for module in modules + ] + if has_vcomp and has_mkl: + self._warn_mkl_vcomp(n_active_threads) + + def _validate_center_shape(self, X, centers): + """Check if centers is compatible with X and n_clusters.""" + if centers.shape[0] != self.n_clusters: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of clusters {self.n_clusters}." + ) + if centers.shape[1] != X.shape[1]: + raise ValueError( + f"The shape of the initial centers {centers.shape} does not " + f"match the number of features of the data {X.shape[1]}." + ) + + def _check_test_data(self, X): + X = validate_data( + self, + X, + accept_sparse="csr", + reset=False, + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + ) + return X + + def _init_centroids( + self, + X, + x_squared_norms, + init, + random_state, + sample_weight, + init_size=None, + n_centroids=None, + ): + """Compute the initial centroids. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. Pass it if you have it + at hands already to avoid it being recomputed here. + + init : {'k-means++', 'random'}, callable or ndarray of shape \ + (n_clusters, n_features) + Method for initialization. + + random_state : RandomState instance + Determines random number generation for centroid initialization. + See :term:`Glossary `. + + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in X. `sample_weight` is not used + during initialization if `init` is a callable or a user provided + array. + + init_size : int, default=None + Number of samples to randomly sample for speeding up the + initialization (sometimes at the expense of accuracy). + + n_centroids : int, default=None + Number of centroids to initialize. + If left to 'None' the number of centroids will be equal to + number of clusters to form (self.n_clusters). + + Returns + ------- + centers : ndarray of shape (n_clusters, n_features) + Initial centroids of clusters. 
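_init_centroids, whose body follows, can first restrict seeding to `init_size` randomly drawn samples, which MiniBatchKMeans uses to keep initialization cheap. A simplified sketch of that subsampling path, assuming `rng` is a NumPy RandomState (illustrative only):

    def subsample_for_init(X, x_squared_norms, sample_weight, init_size, rng):
        if init_size is None or init_size >= X.shape[0]:
            return X, x_squared_norms, sample_weight
        # Sampling with replacement, as in the code below.
        idx = rng.randint(0, X.shape[0], init_size)
        return X[idx], x_squared_norms[idx], sample_weight[idx]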
+ """ + n_samples = X.shape[0] + n_clusters = self.n_clusters if n_centroids is None else n_centroids + + if init_size is not None and init_size < n_samples: + init_indices = random_state.randint(0, n_samples, init_size) + X = X[init_indices] + x_squared_norms = x_squared_norms[init_indices] + n_samples = X.shape[0] + sample_weight = sample_weight[init_indices] + + if isinstance(init, str) and init == "k-means++": + centers, _ = _kmeans_plusplus( + X, + n_clusters, + random_state=random_state, + x_squared_norms=x_squared_norms, + sample_weight=sample_weight, + ) + elif isinstance(init, str) and init == "random": + seeds = random_state.choice( + n_samples, + size=n_clusters, + replace=False, + p=sample_weight / sample_weight.sum(), + ) + centers = X[seeds] + elif _is_arraylike_not_scalar(self.init): + centers = init + elif callable(init): + centers = init(X, n_clusters, random_state=random_state) + centers = check_array(centers, dtype=X.dtype, copy=False, order="C") + self._validate_center_shape(X, centers) + + if sp.issparse(centers): + centers = centers.toarray() + + return centers + + def fit_predict(self, X, y=None, sample_weight=None): + """Compute cluster centers and predict cluster index for each sample. + + Convenience method; equivalent to calling fit(X) followed by + predict(X). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + return self.fit(X, sample_weight=sample_weight).labels_ + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + + # sample weights are not used by predict but cython helpers expect an array + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + labels = _labels_inertia_threadpool_limit( + X, + sample_weight, + self.cluster_centers_, + n_threads=self._n_threads, + return_inertia=False, + ) + + return labels + + def fit_transform(self, X, y=None, sample_weight=None): + """Compute clustering and transform X to cluster-distance space. + + Equivalent to fit(X).transform(X), but more efficiently implemented. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_clusters) + X transformed in the new space. + """ + return self.fit(X, sample_weight=sample_weight)._transform(X) + + def transform(self, X): + """Transform X to a cluster-distance space. 
+ + In the new space, each dimension is the distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_clusters) + X transformed in the new space. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + return self._transform(X) + + def _transform(self, X): + """Guts of transform method; no input validation.""" + return euclidean_distances(X, self.cluster_centers_) + + def score(self, X, y=None, sample_weight=None): + """Opposite of the value of X on the K-means objective. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + score : float + Opposite of the value of X on the K-means objective. + """ + check_is_fitted(self) + + X = self._check_test_data(X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + _, scores = _labels_inertia_threadpool_limit( + X, sample_weight, self.cluster_centers_, self._n_threads + ) + return -scores + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class KMeans(_BaseKMeans): + """K-Means clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. + + For an example of how to choose an optimal value for `n_clusters` refer to + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`. + + init : {'k-means++', 'random'}, callable or array-like of shape \ + (n_clusters, n_features), default='k-means++' + Method for initialization: + + * 'k-means++' : selects initial cluster centroids using sampling \ + based on an empirical probability distribution of the points' \ + contribution to the overall inertia. This technique speeds up \ + convergence. The algorithm implemented is "greedy k-means++". It \ + differs from the vanilla k-means++ by making several trials at \ + each sampling step and choosing the best centroid among them. + + * 'random': choose `n_clusters` observations (rows) at random from \ + data for the initial centroids. + + * If an array is passed, it should be of shape (n_clusters, n_features)\ + and gives the initial centers. + + * If a callable is passed, it should take arguments X, n_clusters and a\ + random state and return an initialization. + + For an example of how to use the different `init` strategies, see + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_digits.py`. + + For an evaluation of the impact of initialization, see the example + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`. + + n_init : 'auto' or int, default='auto' + Number of times the k-means algorithm is run with different centroid + seeds. The final results is the best output of `n_init` consecutive runs + in terms of inertia. Several runs are recommended for sparse + high-dimensional problems (see :ref:`kmeans_sparse_high_dim`). 
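transform above maps each sample to its vector of Euclidean distances to the cluster centers, so the output has shape (n_samples, n_clusters), and score returns the negated inertia. A tiny dense NumPy equivalent of `_transform` (illustrative; sklearn's euclidean_distances adds numerical safeguards and sparse support):

    import numpy as np

    def cluster_distance_space(X, centers):
        diff = X[:, np.newaxis, :] - centers[np.newaxis, :, :]
        return np.sqrt((diff ** 2).sum(axis=-1))   # (n_samples, n_clusters)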
+ + When `n_init='auto'`, the number of runs depends on the value of init: + 10 if using `init='random'` or `init` is a callable; + 1 if using `init='k-means++'` or `init` is an array-like. + + .. versionadded:: 1.2 + Added 'auto' option for `n_init`. + + .. versionchanged:: 1.4 + Default value for `n_init` changed to `'auto'`. + + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm for a + single run. + + tol : float, default=1e-4 + Relative tolerance with regards to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + + verbose : int, default=0 + Verbosity mode. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for centroid initialization. Use + an int to make the randomness deterministic. + See :term:`Glossary `. + + copy_x : bool, default=True + When pre-computing distances it is more numerically accurate to center + the data first. If copy_x is True (default), then the original data is + not modified. If False, the original data is modified, and put back + before the function returns, but small numerical differences may be + introduced by subtracting and then adding the data mean. Note that if + the original data is not C-contiguous, a copy will be made even if + copy_x is False. If the original data is sparse, but not in CSR format, + a copy will be made even if copy_x is False. + + algorithm : {"lloyd", "elkan"}, default="lloyd" + K-means algorithm to use. The classical EM-style algorithm is `"lloyd"`. + The `"elkan"` variation can be more efficient on some datasets with + well-defined clusters, by using the triangle inequality. However it's + more memory intensive due to the allocation of an extra array of shape + `(n_samples, n_clusters)`. + + .. versionchanged:: 0.18 + Added Elkan algorithm + + .. versionchanged:: 1.1 + Renamed "full" to "lloyd", and deprecated "auto" and "full". + Changed "auto" to use "lloyd" instead of "elkan". + + Attributes + ---------- + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. If the algorithm stops before fully + converging (see ``tol`` and ``max_iter``), these will not be + consistent with ``labels_``. + + labels_ : ndarray of shape (n_samples,) + Labels of each point + + inertia_ : float + Sum of squared distances of samples to their closest cluster center, + weighted by the sample weights if provided. + + n_iter_ : int + Number of iterations run. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MiniBatchKMeans : Alternative online implementation that does incremental + updates of the centers positions using mini-batches. + For large scale learning (say n_samples > 10k) MiniBatchKMeans is + probably much faster than the default batch implementation. + + Notes + ----- + The k-means problem is solved using either Lloyd's or Elkan's algorithm. + + The average complexity is given by O(k n T), where n is the number of + samples and T is the number of iteration. + + The worst case complexity is given by O(n^(k+2/p)) with + n = n_samples, p = n_features. + Refer to :doi:`"How slow is the k-means method?" D. Arthur and S. Vassilvitskii - + SoCG2006.<10.1145/1137856.1137880>` for more details. 
+ + In practice, the k-means algorithm is very fast (one of the fastest + clustering algorithms available), but it falls in local minima. That's why + it can be useful to restart it several times. + + If the algorithm stops before fully converging (because of ``tol`` or + ``max_iter``), ``labels_`` and ``cluster_centers_`` will not be consistent, + i.e. the ``cluster_centers_`` will not be the means of the points in each + cluster. Also, the estimator will reassign ``labels_`` after the last + iteration to make ``labels_`` consistent with ``predict`` on the training + set. + + Examples + -------- + + >>> from sklearn.cluster import KMeans + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [10, 2], [10, 4], [10, 0]]) + >>> kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X) + >>> kmeans.labels_ + array([1, 1, 1, 0, 0, 0], dtype=int32) + >>> kmeans.predict([[0, 0], [12, 3]]) + array([1, 0], dtype=int32) + >>> kmeans.cluster_centers_ + array([[10., 2.], + [ 1., 2.]]) + + For examples of common problems with K-Means and how to address them see + :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_assumptions.py`. + + For a demonstration of how K-Means can be used to cluster text documents see + :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. + + For a comparison between K-Means and MiniBatchKMeans refer to example + :ref:`sphx_glr_auto_examples_cluster_plot_mini_batch_kmeans.py`. + + For a comparison between K-Means and BisectingKMeans refer to example + :ref:`sphx_glr_auto_examples_cluster_plot_bisect_kmeans.py`. + """ + + _parameter_constraints: dict = { + **_BaseKMeans._parameter_constraints, + "copy_x": ["boolean"], + "algorithm": [StrOptions({"lloyd", "elkan"})], + } + + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + n_init="auto", + max_iter=300, + tol=1e-4, + verbose=0, + random_state=None, + copy_x=True, + algorithm="lloyd", + ): + super().__init__( + n_clusters=n_clusters, + init=init, + n_init=n_init, + max_iter=max_iter, + tol=tol, + verbose=verbose, + random_state=random_state, + ) + + self.copy_x = copy_x + self.algorithm = algorithm + + def _check_params_vs_input(self, X): + super()._check_params_vs_input(X, default_n_init=10) + + self._algorithm = self.algorithm + if self._algorithm == "elkan" and self.n_clusters == 1: + warnings.warn( + ( + "algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'lloyd' instead." + ), + RuntimeWarning, + ) + self._algorithm = "lloyd" + + def _warn_mkl_vcomp(self, n_active_threads): + """Warn when vcomp and mkl are both present""" + warnings.warn( + "KMeans is known to have a memory leak on Windows " + "with MKL, when there are less chunks than available " + "threads. You can avoid it by setting the environment" + f" variable OMP_NUM_THREADS={n_active_threads}." + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute k-means clustering. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory + copy if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. 
If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Fitted estimator. + """ + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + copy=self.copy_x, + accept_large_sparse=False, + ) + + self._check_params_vs_input(X) + + random_state = check_random_state(self.random_state) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() + + # Validate init array + init = self.init + init_is_array_like = _is_arraylike_not_scalar(init) + if init_is_array_like: + init = check_array(init, dtype=X.dtype, copy=True, order="C") + self._validate_center_shape(X, init) + + # subtract of mean of x for more accurate distance computations + if not sp.issparse(X): + X_mean = X.mean(axis=0) + # The copy was already done above + X -= X_mean + + if init_is_array_like: + init -= X_mean + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if self._algorithm == "elkan": + kmeans_single = _kmeans_single_elkan + else: + kmeans_single = _kmeans_single_lloyd + self._check_mkl_vcomp(X, X.shape[0]) + + best_inertia, best_labels = None, None + + for i in range(self._n_init): + # Initialize centers + centers_init = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=init, + random_state=random_state, + sample_weight=sample_weight, + ) + if self.verbose: + print("Initialization complete") + + # run a k-means once + labels, inertia, centers, n_iter_ = kmeans_single( + X, + sample_weight, + centers_init, + max_iter=self.max_iter, + verbose=self.verbose, + tol=self._tol, + n_threads=self._n_threads, + ) + + # determine if these results are the best so far + # we chose a new run if it has a better inertia and the clustering is + # different from the best so far (it's possible that the inertia is + # slightly better even if the clustering is the same with potentially + # permuted labels, due to rounding errors) + if best_inertia is None or ( + inertia < best_inertia + and not _is_same_clustering(labels, best_labels, self.n_clusters) + ): + best_labels = labels + best_centers = centers + best_inertia = inertia + best_n_iter = n_iter_ + + if not sp.issparse(X): + if not self.copy_x: + X += X_mean + best_centers += X_mean + + distinct_clusters = len(set(best_labels)) + if distinct_clusters < self.n_clusters: + warnings.warn( + "Number of distinct clusters ({}) found smaller than " + "n_clusters ({}). Possibly due to duplicate points " + "in X.".format(distinct_clusters, self.n_clusters), + ConvergenceWarning, + stacklevel=2, + ) + + self.cluster_centers_ = best_centers + self._n_features_out = self.cluster_centers_.shape[0] + self.labels_ = best_labels + self.inertia_ = best_inertia + self.n_iter_ = best_n_iter + return self + + +def _mini_batch_step( + X, + sample_weight, + centers, + centers_new, + weight_sums, + random_state, + random_reassign=False, + reassignment_ratio=0.01, + verbose=False, + n_threads=1, +): + """Incremental update of the centers for the Minibatch K-Means algorithm. + + Parameters + ---------- + + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The original data array. If sparse, must be in CSR format. + + x_squared_norms : ndarray of shape (n_samples,) + Squared euclidean norm of each data point. 
+ + sample_weight : ndarray of shape (n_samples,) + The weights for each observation in `X`. + + centers : ndarray of shape (n_clusters, n_features) + The cluster centers before the current iteration + + centers_new : ndarray of shape (n_clusters, n_features) + The cluster centers after the current iteration. Modified in-place. + + weight_sums : ndarray of shape (n_clusters,) + The vector in which we keep track of the numbers of points in a + cluster. This array is modified in place. + + random_state : RandomState instance + Determines random number generation for low count centers reassignment. + See :term:`Glossary `. + + random_reassign : boolean, default=False + If True, centers with very low counts are randomly reassigned + to observations. + + reassignment_ratio : float, default=0.01 + Control the fraction of the maximum number of counts for a + center to be reassigned. A higher value means that low count + centers are more likely to be reassigned, which means that the + model will take longer to converge, but should converge in a + better clustering. + + verbose : bool, default=False + Controls the verbosity. + + n_threads : int, default=1 + The number of OpenMP threads to use for the computation. + + Returns + ------- + inertia : float + Sum of squared distances of samples to their closest cluster center. + The inertia is computed after finding the labels and before updating + the centers. + """ + # Perform label assignment to nearest centers + # For better efficiency, it's better to run _mini_batch_step in a + # threadpool_limit context than using _labels_inertia_threadpool_limit here + labels, inertia = _labels_inertia(X, sample_weight, centers, n_threads=n_threads) + + # Update centers according to the labels + if sp.issparse(X): + _minibatch_update_sparse( + X, sample_weight, centers, centers_new, weight_sums, labels, n_threads + ) + else: + _minibatch_update_dense( + X, + sample_weight, + centers, + centers_new, + weight_sums, + labels, + n_threads, + ) + + # Reassign clusters that have very low weight + if random_reassign and reassignment_ratio > 0: + to_reassign = weight_sums < reassignment_ratio * weight_sums.max() + + # pick at most .5 * batch_size samples as new centers + if to_reassign.sum() > 0.5 * X.shape[0]: + indices_dont_reassign = np.argsort(weight_sums)[int(0.5 * X.shape[0]) :] + to_reassign[indices_dont_reassign] = False + n_reassigns = to_reassign.sum() + + if n_reassigns: + # Pick new clusters amongst observations with uniform probability + new_centers = random_state.choice( + X.shape[0], replace=False, size=n_reassigns + ) + if verbose: + print(f"[MiniBatchKMeans] Reassigning {n_reassigns} cluster centers.") + + if sp.issparse(X): + assign_rows_csr( + X, + new_centers.astype(np.intp, copy=False), + np.where(to_reassign)[0].astype(np.intp, copy=False), + centers_new, + ) + else: + centers_new[to_reassign] = X[new_centers] + + # reset counts of reassigned centers, but don't reset them too small + # to avoid instant reassignment. This is a pretty dirty hack as it + # also modifies the learning rates. + weight_sums[to_reassign] = np.min(weight_sums[~to_reassign]) + + return inertia + + +class MiniBatchKMeans(_BaseKMeans): + """ + Mini-Batch K-Means clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + n_clusters : int, default=8 + The number of clusters to form as well as the number of + centroids to generate. 
+
+    init : {'k-means++', 'random'}, callable or array-like of shape \
+            (n_clusters, n_features), default='k-means++'
+        Method for initialization:
+
+        'k-means++' : selects initial cluster centroids using sampling based on
+        an empirical probability distribution of the points' contribution to the
+        overall inertia. This technique speeds up convergence. The algorithm
+        implemented is "greedy k-means++". It differs from the vanilla k-means++
+        by making several trials at each sampling step and choosing the best centroid
+        among them.
+
+        'random': choose `n_clusters` observations (rows) at random from data
+        for the initial centroids.
+
+        If an array is passed, it should be of shape (n_clusters, n_features)
+        and gives the initial centers.
+
+        If a callable is passed, it should take arguments X, n_clusters and a
+        random state and return an initialization.
+
+        For an evaluation of the impact of initialization, see the example
+        :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_stability_low_dim_dense.py`.
+
+    max_iter : int, default=100
+        Maximum number of iterations over the complete dataset before
+        stopping independently of any early stopping criterion heuristics.
+
+    batch_size : int, default=1024
+        Size of the mini batches.
+        For faster computations, you can set the ``batch_size`` greater than
+        256 * number of cores to enable parallelism on all cores.
+
+        .. versionchanged:: 1.0
+           `batch_size` default changed from 100 to 1024.
+
+    verbose : int, default=0
+        Verbosity mode.
+
+    compute_labels : bool, default=True
+        Compute label assignment and inertia for the complete dataset
+        once the minibatch optimization has converged in fit.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for centroid initialization and
+        random reassignment. Use an int to make the randomness deterministic.
+        See :term:`Glossary <random_state>`.
+
+    tol : float, default=0.0
+        Control early stopping based on the relative center changes as
+        measured by a smoothed, variance-normalized estimate of the mean
+        squared position changes of the centers. This early stopping
+        heuristic is closer to the one used for the batch variant of the
+        algorithm but induces a slight computational and memory overhead
+        over the inertia heuristic.
+
+        To disable convergence detection based on normalized center
+        change, set tol to 0.0 (default).
+
+    max_no_improvement : int, default=10
+        Control early stopping based on the consecutive number of mini
+        batches that do not yield an improvement on the smoothed inertia.
+
+        To disable convergence detection based on inertia, set
+        max_no_improvement to None.
+
+    init_size : int, default=None
+        Number of samples to randomly sample for speeding up the
+        initialization (sometimes at the expense of accuracy): the
+        algorithm is initialized by running a batch KMeans on a
+        random subset of the data. This needs to be larger than n_clusters.
+
+        If `None`, the heuristic is `init_size = 3 * batch_size`, unless
+        `3 * batch_size < n_clusters`, in which case
+        `init_size = 3 * n_clusters`.
+
+    n_init : 'auto' or int, default="auto"
+        Number of random initializations that are tried.
+        In contrast to KMeans, the algorithm is only run once, using the best of
+        the `n_init` initializations as measured by inertia. Several runs are
+        recommended for sparse high-dimensional problems (see
+        :ref:`kmeans_sparse_high_dim`).
+ + When `n_init='auto'`, the number of runs depends on the value of init: + 3 if using `init='random'` or `init` is a callable; + 1 if using `init='k-means++'` or `init` is an array-like. + + .. versionadded:: 1.2 + Added 'auto' option for `n_init`. + + .. versionchanged:: 1.4 + Default value for `n_init` changed to `'auto'` in version. + + reassignment_ratio : float, default=0.01 + Control the fraction of the maximum number of counts for a center to + be reassigned. A higher value means that low count centers are more + easily reassigned, which means that the model will take longer to + converge, but should converge in a better clustering. However, too high + a value may cause convergence issues, especially with a small batch + size. + + Attributes + ---------- + + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. + + labels_ : ndarray of shape (n_samples,) + Labels of each point (if compute_labels is set to True). + + inertia_ : float + The value of the inertia criterion associated with the chosen + partition if compute_labels is set to True. If compute_labels is set to + False, it's an approximation of the inertia based on an exponentially + weighted average of the batch inertiae. + The inertia is defined as the sum of square distances of samples to + their cluster center, weighted by the sample weights if provided. + + n_iter_ : int + Number of iterations over the full dataset. + + n_steps_ : int + Number of minibatches processed. + + .. versionadded:: 1.0 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + KMeans : The classic implementation of the clustering method based on the + Lloyd's algorithm. It consumes the whole set of input data at each + iteration. + + Notes + ----- + See https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf + + When there are too few points in the dataset, some centers may be + duplicated, which means that a proper clustering in terms of the number + of requesting clusters and the number of returned clusters will not + always match. One solution is to set `reassignment_ratio=0`, which + prevents reassignments of clusters that are too small. + + See :ref:`sphx_glr_auto_examples_cluster_plot_birch_vs_minibatchkmeans.py` for a + comparison with :class:`~sklearn.cluster.BIRCH`. + + Examples + -------- + >>> from sklearn.cluster import MiniBatchKMeans + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ... [4, 2], [4, 0], [4, 4], + ... [4, 5], [0, 1], [2, 2], + ... [3, 2], [5, 5], [1, -1]]) + >>> # manually fit on batches + >>> kmeans = MiniBatchKMeans(n_clusters=2, + ... random_state=0, + ... batch_size=6, + ... n_init="auto") + >>> kmeans = kmeans.partial_fit(X[0:6,:]) + >>> kmeans = kmeans.partial_fit(X[6:12,:]) + >>> kmeans.cluster_centers_ + array([[3.375, 3. ], + [0.75 , 0.5 ]]) + >>> kmeans.predict([[0, 0], [4, 4]]) + array([1, 0], dtype=int32) + >>> # fit on the whole data + >>> kmeans = MiniBatchKMeans(n_clusters=2, + ... random_state=0, + ... batch_size=6, + ... max_iter=10, + ... n_init="auto").fit(X) + >>> kmeans.cluster_centers_ + array([[3.55102041, 2.48979592], + [1.06896552, 1. 
]]) + >>> kmeans.predict([[0, 0], [4, 4]]) + array([1, 0], dtype=int32) + + For a comparison of Mini-Batch K-Means clustering with other clustering algorithms, + see :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + **_BaseKMeans._parameter_constraints, + "batch_size": [Interval(Integral, 1, None, closed="left")], + "compute_labels": ["boolean"], + "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], + "init_size": [Interval(Integral, 1, None, closed="left"), None], + "reassignment_ratio": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + n_clusters=8, + *, + init="k-means++", + max_iter=100, + batch_size=1024, + verbose=0, + compute_labels=True, + random_state=None, + tol=0.0, + max_no_improvement=10, + init_size=None, + n_init="auto", + reassignment_ratio=0.01, + ): + super().__init__( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + verbose=verbose, + random_state=random_state, + tol=tol, + n_init=n_init, + ) + + self.max_no_improvement = max_no_improvement + self.batch_size = batch_size + self.compute_labels = compute_labels + self.init_size = init_size + self.reassignment_ratio = reassignment_ratio + + def _check_params_vs_input(self, X): + super()._check_params_vs_input(X, default_n_init=3) + + self._batch_size = min(self.batch_size, X.shape[0]) + + # init_size + self._init_size = self.init_size + if self._init_size is None: + self._init_size = 3 * self._batch_size + if self._init_size < self.n_clusters: + self._init_size = 3 * self.n_clusters + elif self._init_size < self.n_clusters: + warnings.warn( + ( + f"init_size={self._init_size} should be larger than " + f"n_clusters={self.n_clusters}. Setting it to " + "min(3*n_clusters, n_samples)" + ), + RuntimeWarning, + stacklevel=2, + ) + self._init_size = 3 * self.n_clusters + self._init_size = min(self._init_size, X.shape[0]) + + # reassignment_ratio + if self.reassignment_ratio < 0: + raise ValueError( + "reassignment_ratio should be >= 0, got " + f"{self.reassignment_ratio} instead." + ) + + def _warn_mkl_vcomp(self, n_active_threads): + """Warn when vcomp and mkl are both present""" + warnings.warn( + "MiniBatchKMeans is known to have a memory leak on " + "Windows with MKL, when there are less chunks than " + "available threads. You can prevent it by setting " + f"batch_size >= {self._n_threads * CHUNK_SIZE} or by " + "setting the environment variable " + f"OMP_NUM_THREADS={n_active_threads}" + ) + + def _mini_batch_convergence( + self, step, n_steps, n_samples, centers_squared_diff, batch_inertia + ): + """Helper function to encapsulate the early stopping logic""" + # Normalize inertia to be able to compare values when + # batch_size changes + batch_inertia /= self._batch_size + + # count steps starting from 1 for user friendly verbose mode. + step = step + 1 + + # Ignore first iteration because it's inertia from initialization. 
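+        # Its value only reflects the quality of the initial centers, so it
+        # would bias the exponentially weighted average of the inertia that
+        # the early-stopping checks below rely on.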
+ if step == 1: + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch " + f"inertia: {batch_inertia}" + ) + return False + + # Compute an Exponentially Weighted Average of the inertia to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_inertia is None: + self._ewa_inertia = batch_inertia + else: + alpha = self._batch_size * 2.0 / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_inertia = self._ewa_inertia * (1 - alpha) + batch_inertia * alpha + + # Log progress to be able to monitor convergence + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch inertia: " + f"{batch_inertia}, ewa inertia: {self._ewa_inertia}" + ) + + # Early stopping based on absolute tolerance on squared change of + # centers position + if self._tol > 0.0 and centers_squared_diff <= self._tol: + if self.verbose: + print(f"Converged (small centers change) at step {step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # inertia + if self._ewa_inertia_min is None or self._ewa_inertia < self._ewa_inertia_min: + self._no_improvement = 0 + self._ewa_inertia_min = self._ewa_inertia + else: + self._no_improvement += 1 + + if ( + self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement + ): + if self.verbose: + print( + "Converged (lack of improvement in inertia) at step " + f"{step}/{n_steps}" + ) + return True + + return False + + def _random_reassign(self): + """Check if a random reassignment needs to be done. + + Do random reassignments each time 10 * n_clusters samples have been + processed. + + If there are empty clusters we always want to reassign. + """ + self._n_since_last_reassign += self._batch_size + if (self._counts == 0).any() or self._n_since_last_reassign >= ( + 10 * self.n_clusters + ): + self._n_since_last_reassign = 0 + return True + return False + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute the centroids on X by chunking it into mini-batches. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Fitted estimator. 
+ """ + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + ) + + self._check_params_vs_input(X) + random_state = check_random_state(self.random_state) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() + n_samples, n_features = X.shape + + # Validate init array + init = self.init + if _is_arraylike_not_scalar(init): + init = check_array(init, dtype=X.dtype, copy=True, order="C") + self._validate_center_shape(X, init) + + self._check_mkl_vcomp(X, self._batch_size) + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + # Validation set for the init + validation_indices = random_state.randint(0, n_samples, self._init_size) + X_valid = X[validation_indices] + sample_weight_valid = sample_weight[validation_indices] + + # perform several inits with random subsets + best_inertia = None + for init_idx in range(self._n_init): + if self.verbose: + print(f"Init {init_idx + 1}/{self._n_init} with method {init}") + + # Initialize the centers using only a fraction of the data as we + # expect n_samples to be very large when using MiniBatchKMeans. + cluster_centers = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=init, + random_state=random_state, + init_size=self._init_size, + sample_weight=sample_weight, + ) + + # Compute inertia on a validation set. + _, inertia = _labels_inertia_threadpool_limit( + X_valid, + sample_weight_valid, + cluster_centers, + n_threads=self._n_threads, + ) + + if self.verbose: + print(f"Inertia for init {init_idx + 1}/{self._n_init}: {inertia}") + if best_inertia is None or inertia < best_inertia: + init_centers = cluster_centers + best_inertia = inertia + + centers = init_centers + centers_new = np.empty_like(centers) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Attributes to monitor the convergence + self._ewa_inertia = None + self._ewa_inertia_min = None + self._no_improvement = 0 + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + n_steps = (self.max_iter * n_samples) // self._batch_size + + with _get_threadpool_controller().limit(limits=1, user_api="blas"): + # Perform the iterative optimization until convergence + for i in range(n_steps): + # Sample a minibatch from the full dataset + minibatch_indices = random_state.randint(0, n_samples, self._batch_size) + + # Perform the actual update step on the minibatch data + batch_inertia = _mini_batch_step( + X=X[minibatch_indices], + sample_weight=sample_weight[minibatch_indices], + centers=centers, + centers_new=centers_new, + weight_sums=self._counts, + random_state=random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads, + ) + + if self._tol > 0.0: + centers_squared_diff = np.sum((centers_new - centers) ** 2) + else: + centers_squared_diff = 0 + + centers, centers_new = centers_new, centers + + # Monitor convergence and do early stopping if necessary + if self._mini_batch_convergence( + i, n_steps, n_samples, centers_squared_diff, batch_inertia + ): + break + + self.cluster_centers_ = centers + self._n_features_out = self.cluster_centers_.shape[0] + + self.n_steps_ = i + 1 + self.n_iter_ = int(np.ceil(((i + 1) * self._batch_size) / n_samples)) + + if self.compute_labels: + self.labels_, self.inertia_ = 
_labels_inertia_threadpool_limit( + X, + sample_weight, + self.cluster_centers_, + n_threads=self._n_threads, + ) + else: + self.inertia_ = self._ewa_inertia * n_samples + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Update k means estimate on a single mini-batch X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training instances to cluster. It must be noted that the data + will be converted to C ordering, which will cause a memory copy + if the given data is not C-contiguous. + If a sparse matrix is passed, a copy will be made if it's not in + CSR format. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. `sample_weight` is not used during + initialization if `init` is a callable or a user provided array. + + Returns + ------- + self : object + Return updated estimator. + """ + has_centers = hasattr(self, "cluster_centers_") + + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + reset=not has_centers, + ) + + self._random_state = getattr( + self, "_random_state", check_random_state(self.random_state) + ) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self.n_steps_ = getattr(self, "n_steps_", 0) + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if not has_centers: + # this instance has not been fitted yet (fit or partial_fit) + self._check_params_vs_input(X) + self._n_threads = _openmp_effective_n_threads() + + # Validate init array + init = self.init + if _is_arraylike_not_scalar(init): + init = check_array(init, dtype=X.dtype, copy=True, order="C") + self._validate_center_shape(X, init) + + self._check_mkl_vcomp(X, X.shape[0]) + + # initialize the cluster centers + self.cluster_centers_ = self._init_centroids( + X, + x_squared_norms=x_squared_norms, + init=init, + random_state=self._random_state, + init_size=self._init_size, + sample_weight=sample_weight, + ) + + # Initialize counts + self._counts = np.zeros(self.n_clusters, dtype=X.dtype) + + # Initialize number of samples seen since last reassignment + self._n_since_last_reassign = 0 + + with _get_threadpool_controller().limit(limits=1, user_api="blas"): + _mini_batch_step( + X, + sample_weight=sample_weight, + centers=self.cluster_centers_, + centers_new=self.cluster_centers_, + weight_sums=self._counts, + random_state=self._random_state, + random_reassign=self._random_reassign(), + reassignment_ratio=self.reassignment_ratio, + verbose=self.verbose, + n_threads=self._n_threads, + ) + + if self.compute_labels: + self.labels_, self.inertia_ = _labels_inertia_threadpool_limit( + X, + sample_weight, + self.cluster_centers_, + n_threads=self._n_threads, + ) + + self.n_steps_ += 1 + self._n_features_out = self.cluster_centers_.shape[0] + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_mean_shift.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_mean_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba4409d14698b482a6854fd1558f014ea3d9f70 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_mean_shift.py @@ -0,0 +1,579 @@ +"""Mean shift clustering algorithm. 
+ +Mean shift clustering aims to discover *blobs* in a smooth density of +samples. It is a centroid based algorithm, which works by updating candidates +for centroids to be the mean of the points within a given region. These +candidates are then filtered in a post-processing stage to eliminate +near-duplicates to form the final set of centroids. + +Seeding is performed using a binning technique for scalability. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections import defaultdict +from numbers import Integral, Real + +import numpy as np + +from .._config import config_context +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..metrics.pairwise import pairwise_distances_argmin +from ..neighbors import NearestNeighbors +from ..utils import check_array, check_random_state, gen_batches +from ..utils._param_validation import Interval, validate_params +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted, validate_data + + +@validate_params( + { + "X": ["array-like"], + "quantile": [Interval(Real, 0, 1, closed="both")], + "n_samples": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None): + """Estimate the bandwidth to use with the mean-shift algorithm. + + This function takes time at least quadratic in `n_samples`. For large + datasets, it is wise to subsample by setting `n_samples`. Alternatively, + the parameter `bandwidth` can be set to a small value without estimating + it. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input points. + + quantile : float, default=0.3 + Should be between [0, 1] + 0.5 means that the median of all pairwise distances is used. + + n_samples : int, default=None + The number of samples to use. If not given, all samples are used. + + random_state : int, RandomState instance, default=None + The generator used to randomly select the samples from input points + for bandwidth estimation. Use an int to make the randomness + deterministic. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + bandwidth : float + The bandwidth parameter. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import estimate_bandwidth + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... 
[4, 7], [3, 5], [3, 6]]) + >>> estimate_bandwidth(X, quantile=0.5) + np.float64(1.61) + """ + X = check_array(X) + + random_state = check_random_state(random_state) + if n_samples is not None: + idx = random_state.permutation(X.shape[0])[:n_samples] + X = X[idx] + n_neighbors = int(X.shape[0] * quantile) + if n_neighbors < 1: # cannot fit NearestNeighbors with n_neighbors = 0 + n_neighbors = 1 + nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs) + nbrs.fit(X) + + bandwidth = 0.0 + for batch in gen_batches(len(X), 500): + d, _ = nbrs.kneighbors(X[batch, :], return_distance=True) + bandwidth += np.max(d, axis=1).sum() + + return bandwidth / X.shape[0] + + +# separate function for each seed's iterative loop +def _mean_shift_single_seed(my_mean, X, nbrs, max_iter): + # For each seed, climb gradient until convergence or max_iter + bandwidth = nbrs.get_params()["radius"] + stop_thresh = 1e-3 * bandwidth # when mean has converged + completed_iterations = 0 + while True: + # Find mean of points within bandwidth + i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0] + points_within = X[i_nbrs] + if len(points_within) == 0: + break # Depending on seeding strategy this condition may occur + my_old_mean = my_mean # save the old mean + my_mean = np.mean(points_within, axis=0) + # If converged or at max_iter, adds the cluster + if ( + np.linalg.norm(my_mean - my_old_mean) <= stop_thresh + or completed_iterations == max_iter + ): + break + completed_iterations += 1 + return tuple(my_mean), len(points_within), completed_iterations + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def mean_shift( + X, + *, + bandwidth=None, + seeds=None, + bin_seeding=False, + min_bin_freq=1, + cluster_all=True, + max_iter=300, + n_jobs=None, +): + """Perform mean shift clustering of data using a flat kernel. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) + Input data. + + bandwidth : float, default=None + Kernel bandwidth. If not None, must be in the range [0, +inf). + + If None, the bandwidth is determined using a heuristic based on + the median of all pairwise distances. This will take quadratic time in + the number of samples. The sklearn.cluster.estimate_bandwidth function + can be used to do this more efficiently. + + seeds : array-like of shape (n_seeds, n_features) or None + Point used as initial kernel locations. If None and bin_seeding=False, + each data point is used as a seed. If None and bin_seeding=True, + see bin_seeding. + + bin_seeding : bool, default=False + If true, initial kernel locations are not locations of all + points, but rather the location of the discretized version of + points, where points are binned onto a grid whose coarseness + corresponds to the bandwidth. Setting this option to True will speed + up the algorithm because fewer seeds will be initialized. + Ignored if seeds argument is not None. + + min_bin_freq : int, default=1 + To speed up the algorithm, accept only those bins with at least + min_bin_freq points as seeds. + + cluster_all : bool, default=True + If true, then all points are clustered, even those orphans that are + not within any kernel. Orphans are assigned to the nearest kernel. + If false, then orphans are given cluster label -1. + + max_iter : int, default=300 + Maximum number of iterations, per seed point before the clustering + operation terminates (for that seed point), if has not converged yet. 
+ + n_jobs : int, default=None + The number of jobs to use for the computation. The following tasks benefit + from the parallelization: + + - The search of nearest neighbors for bandwidth estimation and label + assignments. See the details in the docstring of the + ``NearestNeighbors`` class. + - Hill-climbing optimization for all seeds. + + See :term:`Glossary ` for more details. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.17 + Parallel Execution using *n_jobs*. + + Returns + ------- + + cluster_centers : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. + + labels : ndarray of shape (n_samples,) + Cluster labels for each point. + + Notes + ----- + For a usage example, see + :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import mean_shift + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> cluster_centers, labels = mean_shift(X, bandwidth=2) + >>> cluster_centers + array([[3.33, 6. ], + [1.33, 0.66]]) + >>> labels + array([1, 1, 1, 0, 0, 0]) + """ + model = MeanShift( + bandwidth=bandwidth, + seeds=seeds, + min_bin_freq=min_bin_freq, + bin_seeding=bin_seeding, + cluster_all=cluster_all, + n_jobs=n_jobs, + max_iter=max_iter, + ).fit(X) + return model.cluster_centers_, model.labels_ + + +def get_bin_seeds(X, bin_size, min_bin_freq=1): + """Find seeds for mean_shift. + + Finds seeds by first binning data onto a grid whose lines are + spaced bin_size apart, and then choosing those bins with at least + min_bin_freq points. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) + Input points, the same points that will be used in mean_shift. + + bin_size : float + Controls the coarseness of the binning. Smaller values lead + to more seeding (which is computationally more expensive). If you're + not sure how to set this, set it to the value of the bandwidth used + in clustering.mean_shift. + + min_bin_freq : int, default=1 + Only bins with at least min_bin_freq will be selected as seeds. + Raising this value decreases the number of seeds found, which + makes mean_shift computationally cheaper. + + Returns + ------- + bin_seeds : array-like of shape (n_samples, n_features) + Points used as initial kernel positions in clustering.mean_shift. + """ + if bin_size == 0: + return X + + # Bin points + bin_sizes = defaultdict(int) + for point in X: + binned_point = np.round(point / bin_size) + bin_sizes[tuple(binned_point)] += 1 + + # Select only those bins as seeds which have enough members + bin_seeds = np.array( + [point for point, freq in bin_sizes.items() if freq >= min_bin_freq], + dtype=np.float32, + ) + if len(bin_seeds) == len(X): + warnings.warn( + "Binning data failed with provided bin_size=%f, using data points as seeds." + % bin_size + ) + return X + bin_seeds = bin_seeds * bin_size + return bin_seeds + + +class MeanShift(ClusterMixin, BaseEstimator): + """Mean shift clustering using a flat kernel. + + Mean shift clustering aims to discover "blobs" in a smooth density of + samples. It is a centroid-based algorithm, which works by updating + candidates for centroids to be the mean of the points within a given + region. These candidates are then filtered in a post-processing stage to + eliminate near-duplicates to form the final set of centroids. 
+ + Seeding is performed using a binning technique for scalability. + + For an example of how to use MeanShift clustering, refer to: + :ref:`sphx_glr_auto_examples_cluster_plot_mean_shift.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + bandwidth : float, default=None + Bandwidth used in the flat kernel. + + If not given, the bandwidth is estimated using + sklearn.cluster.estimate_bandwidth; see the documentation for that + function for hints on scalability (see also the Notes, below). + + seeds : array-like of shape (n_samples, n_features), default=None + Seeds used to initialize kernels. If not set, + the seeds are calculated by clustering.get_bin_seeds + with bandwidth as the grid size and default values for + other parameters. + + bin_seeding : bool, default=False + If true, initial kernel locations are not locations of all + points, but rather the location of the discretized version of + points, where points are binned onto a grid whose coarseness + corresponds to the bandwidth. Setting this option to True will speed + up the algorithm because fewer seeds will be initialized. + The default value is False. + Ignored if seeds argument is not None. + + min_bin_freq : int, default=1 + To speed up the algorithm, accept only those bins with at least + min_bin_freq points as seeds. + + cluster_all : bool, default=True + If true, then all points are clustered, even those orphans that are + not within any kernel. Orphans are assigned to the nearest kernel. + If false, then orphans are given cluster label -1. + + n_jobs : int, default=None + The number of jobs to use for the computation. The following tasks benefit + from the parallelization: + + - The search of nearest neighbors for bandwidth estimation and label + assignments. See the details in the docstring of the + ``NearestNeighbors`` class. + - Hill-climbing optimization for all seeds. + + See :term:`Glossary ` for more details. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + max_iter : int, default=300 + Maximum number of iterations, per seed point before the clustering + operation terminates (for that seed point), if has not converged yet. + + .. versionadded:: 0.22 + + Attributes + ---------- + cluster_centers_ : ndarray of shape (n_clusters, n_features) + Coordinates of cluster centers. + + labels_ : ndarray of shape (n_samples,) + Labels of each point. + + n_iter_ : int + Maximum number of iterations performed on each seed. + + .. versionadded:: 0.22 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + KMeans : K-Means clustering. + + Notes + ----- + + Scalability: + + Because this implementation uses a flat kernel and + a Ball Tree to look up members of each kernel, the complexity will tend + towards O(T*n*log(n)) in lower dimensions, with n the number of samples + and T the number of points. In higher dimensions the complexity will + tend towards O(T*n^2). + + Scalability can be boosted by using fewer seeds, for example by using + a higher value of min_bin_freq in the get_bin_seeds function. + + Note that the estimate_bandwidth function is much less scalable than the + mean shift algorithm and will be the bottleneck if it is used. 
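+
+    As an illustrative workaround (the subsample size and quantile below are
+    arbitrary choices), the bandwidth can be estimated on a random subsample
+    and then passed explicitly::
+
+        import numpy as np
+        from sklearn.cluster import MeanShift, estimate_bandwidth
+
+        X = np.random.RandomState(42).normal(size=(2000, 2))
+        bandwidth = estimate_bandwidth(X, quantile=0.3, n_samples=500)
+        ms = MeanShift(bandwidth=bandwidth).fit(X)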
+ + References + ---------- + + Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward + feature space analysis". IEEE Transactions on Pattern Analysis and + Machine Intelligence. 2002. pp. 603-619. + + Examples + -------- + >>> from sklearn.cluster import MeanShift + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> clustering = MeanShift(bandwidth=2).fit(X) + >>> clustering.labels_ + array([1, 1, 1, 0, 0, 0]) + >>> clustering.predict([[0, 0], [5, 5]]) + array([1, 0]) + >>> clustering + MeanShift(bandwidth=2) + + For a comparison of Mean Shift clustering with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "bandwidth": [Interval(Real, 0, None, closed="neither"), None], + "seeds": ["array-like", None], + "bin_seeding": ["boolean"], + "min_bin_freq": [Interval(Integral, 1, None, closed="left")], + "cluster_all": ["boolean"], + "n_jobs": [Integral, None], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + *, + bandwidth=None, + seeds=None, + bin_seeding=False, + min_bin_freq=1, + cluster_all=True, + n_jobs=None, + max_iter=300, + ): + self.bandwidth = bandwidth + self.seeds = seeds + self.bin_seeding = bin_seeding + self.cluster_all = cluster_all + self.min_bin_freq = min_bin_freq + self.n_jobs = n_jobs + self.max_iter = max_iter + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Perform clustering. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples to cluster. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted instance. + """ + X = validate_data(self, X) + bandwidth = self.bandwidth + if bandwidth is None: + bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs) + + seeds = self.seeds + if seeds is None: + if self.bin_seeding: + seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq) + else: + seeds = X + n_samples, n_features = X.shape + center_intensity_dict = {} + + # We use n_jobs=1 because this will be used in nested calls under + # parallel calls to _mean_shift_single_seed so there is no need for + # for further parallelism. + nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X) + + # execute iterations on all seeds in parallel + all_res = Parallel(n_jobs=self.n_jobs)( + delayed(_mean_shift_single_seed)(seed, X, nbrs, self.max_iter) + for seed in seeds + ) + # copy results in a dictionary + for i in range(len(seeds)): + if all_res[i][1]: # i.e. len(points_within) > 0 + center_intensity_dict[all_res[i][0]] = all_res[i][1] + + self.n_iter_ = max([x[2] for x in all_res]) + + if not center_intensity_dict: + # nothing near seeds + raise ValueError( + "No point was within bandwidth=%f of any seed. Try a different seeding" + " strategy or increase the bandwidth." + % bandwidth + ) + + # POST PROCESSING: remove near duplicate points + # If the distance between two kernels is less than the bandwidth, + # then we have to remove one because it is a duplicate. Remove the + # one with fewer points. 
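+        # Sorting by (intensity, center coordinates) below makes the order,
+        # and hence the tie-breaking between kernels that attracted the same
+        # number of points, deterministic.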
+ + sorted_by_intensity = sorted( + center_intensity_dict.items(), + key=lambda tup: (tup[1], tup[0]), + reverse=True, + ) + sorted_centers = np.array([tup[0] for tup in sorted_by_intensity]) + unique = np.ones(len(sorted_centers), dtype=bool) + nbrs = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit( + sorted_centers + ) + for i, center in enumerate(sorted_centers): + if unique[i]: + neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[ + 0 + ] + unique[neighbor_idxs] = 0 + unique[i] = 1 # leave the current point as unique + cluster_centers = sorted_centers[unique] + + # ASSIGN LABELS: a point belongs to the cluster that it is closest to + nbrs = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(cluster_centers) + labels = np.zeros(n_samples, dtype=int) + distances, idxs = nbrs.kneighbors(X) + if self.cluster_all: + labels = idxs.flatten() + else: + labels.fill(-1) + bool_selector = distances.flatten() <= bandwidth + labels[bool_selector] = idxs.flatten()[bool_selector] + + self.cluster_centers_, self.labels_ = cluster_centers, labels + return self + + def predict(self, X): + """Predict the closest cluster each sample in X belongs to. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + New data to predict. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + with config_context(assume_finite=True): + return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_optics.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_optics.py new file mode 100644 index 0000000000000000000000000000000000000000..4a1a80c9065c2d1504a6c97a926b919374e0a1ee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_optics.py @@ -0,0 +1,1202 @@ +"""Ordering Points To Identify the Clustering Structure (OPTICS) + +These routines execute the OPTICS algorithm, and implement various +cluster extraction methods of the ordered list. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.sparse import SparseEfficiencyWarning, issparse + +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..exceptions import DataConversionWarning +from ..metrics import pairwise_distances +from ..metrics.pairwise import _VALID_METRICS, PAIRWISE_BOOLEAN_FUNCTIONS +from ..neighbors import NearestNeighbors +from ..utils import gen_batches +from ..utils._chunking import get_chunk_n_rows +from ..utils._param_validation import ( + HasMethods, + Interval, + RealNotInt, + StrOptions, + validate_params, +) +from ..utils.validation import check_memory, validate_data + + +class OPTICS(ClusterMixin, BaseEstimator): + """Estimate clustering structure from vector array. + + OPTICS (Ordering Points To Identify the Clustering Structure), closely + related to DBSCAN, finds core samples of high density and expands clusters + from them [1]_. Unlike DBSCAN, it keeps cluster hierarchy for a variable + neighborhood radius. Better suited for usage on large datasets than the + current scikit-learn implementation of DBSCAN. + + Clusters are then extracted from the cluster-order using a + DBSCAN-like method (cluster_method = 'dbscan') or an automatic + technique proposed in [1]_ (cluster_method = 'xi'). 
+ + This implementation deviates from the original OPTICS by first performing + k-nearest-neighborhood searches on all points to identify core sizes of + all points (instead of computing neighbors while looping through points). + Reachability distances to only unprocessed points are then computed, to + construct the cluster order, similar to the original OPTICS. + Note that we do not employ a heap to manage the expansion + candidates, so the time complexity will be O(n^2). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + min_samples : int > 1 or float between 0 and 1, default=5 + The number of samples in a neighborhood for a point to be considered as + a core point. Also, up and down steep regions can't have more than + ``min_samples`` consecutive non-steep points. Expressed as an absolute + number or a fraction of the number of samples (rounded to be at least + 2). + + max_eps : float, default=np.inf + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. Default value of ``np.inf`` will + identify clusters across all scales; reducing ``max_eps`` will result + in shorter run times. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Any metric from scikit-learn + or :mod:`scipy.spatial.distance` can be used. + + If `metric` is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. If metric is + "precomputed", `X` is assumed to be a distance matrix and must be + square. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + Sparse matrices are only supported by scikit-learn metrics. + See :mod:`scipy.spatial.distance` for details on these metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + p : float, default=2 + Parameter for the Minkowski metric from + :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + cluster_method : {'xi', 'dbscan'}, default='xi' + The extraction method used to extract clusters using the calculated + reachability and ordering. + + eps : float, default=None + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. By default it assumes the same value + as ``max_eps``. + Used only when ``cluster_method='dbscan'``. + + xi : float between 0 and 1, default=0.05 + Determines the minimum steepness on the reachability plot that + constitutes a cluster boundary. For example, an upwards point in the + reachability plot is defined by the ratio from one point to its + successor being at most 1-xi. + Used only when ``cluster_method='xi'``. 
+ + predecessor_correction : bool, default=True + Correct clusters according to the predecessors calculated by OPTICS + [2]_. This parameter has minimal effect on most datasets. + Used only when ``cluster_method='xi'``. + + min_cluster_size : int > 1 or float between 0 and 1, default=None + Minimum number of samples in an OPTICS cluster, expressed as an + absolute number or a fraction of the number of samples (rounded to be + at least 2). If ``None``, the value of ``min_samples`` is used instead. + Used only when ``cluster_method='xi'``. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`. + - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`. + - 'brute' will use a brute-force search. + - 'auto' (default) will attempt to decide the most appropriate + algorithm based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to :class:`~sklearn.neighbors.BallTree` or + :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the + construction and query, as well as the memory required to store the + tree. The optimal value depends on the nature of the problem. + + memory : str or object with the joblib.Memory interface, default=None + Used to cache the output of the computation of the tree. + By default, no caching is done. If a string is given, it is the + path to the caching directory. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + labels_ : ndarray of shape (n_samples,) + Cluster labels for each point in the dataset given to fit(). + Noisy samples and points which are not included in a leaf cluster + of ``cluster_hierarchy_`` are labeled as -1. + + reachability_ : ndarray of shape (n_samples,) + Reachability distances per sample, indexed by object order. Use + ``clust.reachability_[clust.ordering_]`` to access in cluster order. + + ordering_ : ndarray of shape (n_samples,) + The cluster ordered list of sample indices. + + core_distances_ : ndarray of shape (n_samples,) + Distance at which each sample becomes a core point, indexed by object + order. Points which will never be core have a distance of inf. Use + ``clust.core_distances_[clust.ordering_]`` to access in cluster order. + + predecessor_ : ndarray of shape (n_samples,) + Point that a sample was reached from, indexed by object order. + Seed points have a predecessor of -1. + + cluster_hierarchy_ : ndarray of shape (n_clusters, 2) + The list of clusters in the form of ``[start, end]`` in each row, with + all indices inclusive. The clusters are ordered according to + ``(end, -start)`` (ascending) so that larger clusters encompassing + smaller clusters come after those smaller ones. Since ``labels_`` does + not reflect the hierarchy, usually + ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also + note that these indices are of the ``ordering_``, i.e. + ``X[ordering_][start:end + 1]`` form a cluster. + Only available when ``cluster_method='xi'``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DBSCAN : A similar clustering for a specified neighborhood radius (eps). + Our implementation is optimized for runtime. + + References + ---------- + .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, + and Jörg Sander. "OPTICS: ordering points to identify the clustering + structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60. + + .. [2] Schubert, Erich, Michael Gertz. + "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of + the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. + + Examples + -------- + >>> from sklearn.cluster import OPTICS + >>> import numpy as np + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> clustering = OPTICS(min_samples=2).fit(X) + >>> clustering.labels_ + array([0, 0, 0, 1, 1, 1]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_cluster_plot_optics.py`. + + For a comparison of OPTICS with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "min_samples": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + ], + "max_eps": [Interval(Real, 0, None, closed="both")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "p": [Interval(Real, 1, None, closed="left")], + "metric_params": [dict, None], + "cluster_method": [StrOptions({"dbscan", "xi"})], + "eps": [Interval(Real, 0, None, closed="both"), None], + "xi": [Interval(Real, 0, 1, closed="both")], + "predecessor_correction": ["boolean"], + "min_cluster_size": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + None, + ], + "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "memory": [str, HasMethods("cache"), None], + "n_jobs": [Integral, None], + } + + def __init__( + self, + *, + min_samples=5, + max_eps=np.inf, + metric="minkowski", + p=2, + metric_params=None, + cluster_method="xi", + eps=None, + xi=0.05, + predecessor_correction=True, + min_cluster_size=None, + algorithm="auto", + leaf_size=30, + memory=None, + n_jobs=None, + ): + self.max_eps = max_eps + self.min_samples = min_samples + self.min_cluster_size = min_cluster_size + self.algorithm = algorithm + self.metric = metric + self.metric_params = metric_params + self.p = p + self.leaf_size = leaf_size + self.cluster_method = cluster_method + self.eps = eps + self.xi = xi + self.predecessor_correction = predecessor_correction + self.memory = memory + self.n_jobs = n_jobs + + @_fit_context( + # Optics.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Perform OPTICS clustering. + + Extracts an ordered list of points and reachability distances, and + performs initial clustering using ``max_eps`` distance specified at + OPTICS object instantiation. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) if metric='precomputed' + A feature array, or array of distances between samples if + metric='precomputed'. If a sparse matrix is provided, it will be + converted into CSR format. 
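A small usage sketch for the precomputed case described above (editor's illustration, not part of the source): any square distance matrix can be passed to ``fit`` when ``metric='precomputed'``.

import numpy as np
from sklearn.cluster import OPTICS
from sklearn.metrics import pairwise_distances

X = np.array([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]], dtype=float)
D = pairwise_distances(X)  # square (n_samples, n_samples) Euclidean distance matrix
labels = OPTICS(min_samples=2, metric="precomputed").fit(D).labels_
# should match the Euclidean example in the class docstring: [0 0 0 1 1 1]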
+ + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns a fitted instance of self. + """ + dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float + if dtype is bool and X.dtype != bool: + msg = ( + "Data will be converted to boolean for" + f" metric {self.metric}, to avoid this warning," + " you may convert the data prior to calling fit." + ) + warnings.warn(msg, DataConversionWarning) + + X = validate_data(self, X, dtype=dtype, accept_sparse="csr") + if self.metric == "precomputed" and issparse(X): + X = X.copy() # copy to avoid in-place modification + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SparseEfficiencyWarning) + # Set each diagonal to an explicit value so each point is its + # own neighbor + X.setdiag(X.diagonal()) + memory = check_memory(self.memory) + + ( + self.ordering_, + self.core_distances_, + self.reachability_, + self.predecessor_, + ) = memory.cache(compute_optics_graph)( + X=X, + min_samples=self.min_samples, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + metric=self.metric, + metric_params=self.metric_params, + p=self.p, + n_jobs=self.n_jobs, + max_eps=self.max_eps, + ) + + # Extract clusters from the calculated orders and reachability + if self.cluster_method == "xi": + labels_, clusters_ = cluster_optics_xi( + reachability=self.reachability_, + predecessor=self.predecessor_, + ordering=self.ordering_, + min_samples=self.min_samples, + min_cluster_size=self.min_cluster_size, + xi=self.xi, + predecessor_correction=self.predecessor_correction, + ) + self.cluster_hierarchy_ = clusters_ + elif self.cluster_method == "dbscan": + if self.eps is None: + eps = self.max_eps + else: + eps = self.eps + + if eps > self.max_eps: + raise ValueError( + "Specify an epsilon smaller than %s. Got %s." % (self.max_eps, eps) + ) + + labels_ = cluster_optics_dbscan( + reachability=self.reachability_, + core_distances=self.core_distances_, + ordering=self.ordering_, + eps=eps, + ) + + self.labels_ = labels_ + return self + + +def _validate_size(size, n_samples, param_name): + if size > n_samples: + raise ValueError( + "%s must be no greater than the number of samples (%d). Got %d" + % (param_name, n_samples, size) + ) + + +# OPTICS helper functions +def _compute_core_distances_(X, neighbors, min_samples, working_memory): + """Compute the k-th nearest neighbor of each sample. + + Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1] + but with more memory efficiency. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + neighbors : NearestNeighbors instance + The fitted nearest neighbors estimator. + working_memory : int, default=None + The sought maximum memory for temporary distance matrix chunks. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + Returns + ------- + core_distances : ndarray of shape (n_samples,) + Distance at which each sample becomes a core point. + Points which will never be core have a distance of inf. 
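The equivalence noted above can be sketched with the public ``NearestNeighbors`` API (editor's illustration on random data, not part of the source); the helper defined here only adds chunking for memory efficiency.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(50, 2)
min_samples = 5
nbrs = NearestNeighbors(n_neighbors=min_samples).fit(X)
# distance to the min_samples-th nearest neighbour (each point counts as its
# own first neighbour), i.e. the core distance before capping at max_eps
core_distances = nbrs.kneighbors(X, min_samples)[0][:, -1]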
+ """ + n_samples = X.shape[0] + core_distances = np.empty(n_samples) + core_distances.fill(np.nan) + + chunk_n_rows = get_chunk_n_rows( + row_bytes=16 * min_samples, max_n_rows=n_samples, working_memory=working_memory + ) + slices = gen_batches(n_samples, chunk_n_rows) + for sl in slices: + core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1] + return core_distances + + +@validate_params( + { + "X": [np.ndarray, "sparse matrix"], + "min_samples": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + ], + "max_eps": [Interval(Real, 0, None, closed="both")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def compute_optics_graph( + X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs +): + """Compute the OPTICS reachability graph. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features), or \ + (n_samples, n_samples) if metric='precomputed' + A feature array, or array of distances between samples if + metric='precomputed'. + + min_samples : int > 1 or float between 0 and 1 + The number of samples in a neighborhood for a point to be considered + as a core point. Expressed as an absolute number or a fraction of the + number of samples (rounded to be at least 2). + + max_eps : float, default=np.inf + The maximum distance between two samples for one to be considered as + in the neighborhood of the other. Default value of ``np.inf`` will + identify clusters across all scales; reducing ``max_eps`` will result + in shorter run times. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. If metric is + "precomputed", X is assumed to be a distance matrix and must be square. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + p : float, default=2 + Parameter for the Minkowski metric from + :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. 
+ + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`~sklearn.neighbors.BallTree`. + - 'kd_tree' will use :class:`~sklearn.neighbors.KDTree`. + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to `fit` method. (default) + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to :class:`~sklearn.neighbors.BallTree` or + :class:`~sklearn.neighbors.KDTree`. This can affect the speed of the + construction and query, as well as the memory required to store the + tree. The optimal value depends on the nature of the problem. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + ordering_ : array of shape (n_samples,) + The cluster ordered list of sample indices. + + core_distances_ : array of shape (n_samples,) + Distance at which each sample becomes a core point, indexed by object + order. Points which will never be core have a distance of inf. Use + ``clust.core_distances_[clust.ordering_]`` to access in cluster order. + + reachability_ : array of shape (n_samples,) + Reachability distances per sample, indexed by object order. Use + ``clust.reachability_[clust.ordering_]`` to access in cluster order. + + predecessor_ : array of shape (n_samples,) + Point that a sample was reached from, indexed by object order. + Seed points have a predecessor of -1. + + References + ---------- + .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, + and Jörg Sander. "OPTICS: ordering points to identify the clustering + structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import compute_optics_graph + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> ordering, core_distances, reachability, predecessor = compute_optics_graph( + ... X, + ... min_samples=2, + ... max_eps=np.inf, + ... metric="minkowski", + ... p=2, + ... metric_params=None, + ... algorithm="auto", + ... leaf_size=30, + ... n_jobs=None, + ... ) + >>> ordering + array([0, 1, 2, 5, 3, 4]) + >>> core_distances + array([3.16, 1.41, 1.41, 1. , 1. , + 4.12]) + >>> reachability + array([ inf, 3.16, 1.41, 4.12, 1. , + 5. ]) + >>> predecessor + array([-1, 0, 1, 5, 3, 2]) + """ + n_samples = X.shape[0] + _validate_size(min_samples, n_samples, "min_samples") + if min_samples <= 1: + min_samples = max(2, int(min_samples * n_samples)) + + # Start all points as 'unprocessed' ## + reachability_ = np.empty(n_samples) + reachability_.fill(np.inf) + predecessor_ = np.empty(n_samples, dtype=int) + predecessor_.fill(-1) + + nbrs = NearestNeighbors( + n_neighbors=min_samples, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + metric_params=metric_params, + p=p, + n_jobs=n_jobs, + ) + + nbrs.fit(X) + # Here we first do a kNN query for each point, this differs from + # the original OPTICS that only used epsilon range queries. + # TODO: handle working_memory somehow? + core_distances_ = _compute_core_distances_( + X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None + ) + # OPTICS puts an upper limit on these, use inf for undefined. 
+ core_distances_[core_distances_ > max_eps] = np.inf + np.around( + core_distances_, + decimals=np.finfo(core_distances_.dtype).precision, + out=core_distances_, + ) + + # Main OPTICS loop. Not parallelizable. The order that entries are + # written to the 'ordering_' list is important! + # Note that this implementation is O(n^2) theoretically, but + # supposedly with very low constant factors. + processed = np.zeros(X.shape[0], dtype=bool) + ordering = np.zeros(X.shape[0], dtype=int) + for ordering_idx in range(X.shape[0]): + # Choose next based on smallest reachability distance + # (And prefer smaller ids on ties, possibly np.inf!) + index = np.where(processed == 0)[0] + point = index[np.argmin(reachability_[index])] + + processed[point] = True + ordering[ordering_idx] = point + if core_distances_[point] != np.inf: + _set_reach_dist( + core_distances_=core_distances_, + reachability_=reachability_, + predecessor_=predecessor_, + point_index=point, + processed=processed, + X=X, + nbrs=nbrs, + metric=metric, + metric_params=metric_params, + p=p, + max_eps=max_eps, + ) + if np.all(np.isinf(reachability_)): + warnings.warn( + ( + "All reachability values are inf. Set a larger" + " max_eps or all data will be considered outliers." + ), + UserWarning, + ) + return ordering, core_distances_, reachability_, predecessor_ + + +def _set_reach_dist( + core_distances_, + reachability_, + predecessor_, + point_index, + processed, + X, + nbrs, + metric, + metric_params, + p, + max_eps, +): + P = X[point_index : point_index + 1] + # Assume that radius_neighbors is faster without distances + # and we don't need all distances, nevertheless, this means + # we may be doing some work twice. + indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0] + + # Getting indices of neighbors that have not been processed + unproc = np.compress(~np.take(processed, indices), indices) + # Neighbors of current point are already processed. + if not unproc.size: + return + + # Only compute distances to unprocessed neighbors: + if metric == "precomputed": + dists = X[[point_index], unproc] + if isinstance(dists, np.matrix): + dists = np.asarray(dists) + dists = dists.ravel() + else: + _params = dict() if metric_params is None else metric_params.copy() + if metric == "minkowski" and "p" not in _params: + # the same logic as neighbors, p is ignored if explicitly set + # in the dict params + _params["p"] = p + dists = pairwise_distances(P, X[unproc], metric, n_jobs=None, **_params).ravel() + + rdists = np.maximum(dists, core_distances_[point_index]) + np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists) + improved = np.where(rdists < np.take(reachability_, unproc)) + reachability_[unproc[improved]] = rdists[improved] + predecessor_[unproc[improved]] = point_index + + +@validate_params( + { + "reachability": [np.ndarray], + "core_distances": [np.ndarray], + "ordering": [np.ndarray], + "eps": [Interval(Real, 0, None, closed="both")], + }, + prefer_skip_nested_validation=True, +) +def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps): + """Perform DBSCAN extraction for an arbitrary epsilon. + + Extracting the clusters runs in linear time. Note that this results in + ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with + similar settings and ``eps``, only if ``eps`` is close to ``max_eps``. + + Parameters + ---------- + reachability : ndarray of shape (n_samples,) + Reachability distances calculated by OPTICS (``reachability_``). 
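Because extraction only reads the arrays already computed by OPTICS, one fitted estimator can be re-labelled at several ``eps`` values without re-fitting; a brief editor's sketch (not part of the source):

import numpy as np
from sklearn.cluster import OPTICS, cluster_optics_dbscan

X = np.array([[1, 2], [2, 5], [3, 6], [8, 7], [8, 8], [7, 3]], dtype=float)
clust = OPTICS(min_samples=2).fit(X)
labelings = {
    eps: cluster_optics_dbscan(
        reachability=clust.reachability_,
        core_distances=clust.core_distances_,
        ordering=clust.ordering_,
        eps=eps,
    )
    for eps in (0.5, 2.0, 5.0)  # arbitrary eps values chosen for illustration
}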
+ + core_distances : ndarray of shape (n_samples,) + Distances at which points become core (``core_distances_``). + + ordering : ndarray of shape (n_samples,) + OPTICS ordered point indices (``ordering_``). + + eps : float + DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results + will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close + to one another. + + Returns + ------- + labels_ : array of shape (n_samples,) + The estimated labels. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import cluster_optics_dbscan, compute_optics_graph + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> ordering, core_distances, reachability, predecessor = compute_optics_graph( + ... X, + ... min_samples=2, + ... max_eps=np.inf, + ... metric="minkowski", + ... p=2, + ... metric_params=None, + ... algorithm="auto", + ... leaf_size=30, + ... n_jobs=None, + ... ) + >>> eps = 4.5 + >>> labels = cluster_optics_dbscan( + ... reachability=reachability, + ... core_distances=core_distances, + ... ordering=ordering, + ... eps=eps, + ... ) + >>> labels + array([0, 0, 0, 1, 1, 1]) + """ + n_samples = len(core_distances) + labels = np.zeros(n_samples, dtype=int) + + far_reach = reachability > eps + near_core = core_distances <= eps + labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1 + labels[far_reach & ~near_core] = -1 + return labels + + +@validate_params( + { + "reachability": [np.ndarray], + "predecessor": [np.ndarray], + "ordering": [np.ndarray], + "min_samples": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + ], + "min_cluster_size": [ + Interval(Integral, 2, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + None, + ], + "xi": [Interval(Real, 0, 1, closed="both")], + "predecessor_correction": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def cluster_optics_xi( + *, + reachability, + predecessor, + ordering, + min_samples, + min_cluster_size=None, + xi=0.05, + predecessor_correction=True, +): + """Automatically extract clusters according to the Xi-steep method. + + Parameters + ---------- + reachability : ndarray of shape (n_samples,) + Reachability distances calculated by OPTICS (`reachability_`). + + predecessor : ndarray of shape (n_samples,) + Predecessors calculated by OPTICS. + + ordering : ndarray of shape (n_samples,) + OPTICS ordered point indices (`ordering_`). + + min_samples : int > 1 or float between 0 and 1 + The same as the min_samples given to OPTICS. Up and down steep regions + can't have more then ``min_samples`` consecutive non-steep points. + Expressed as an absolute number or a fraction of the number of samples + (rounded to be at least 2). + + min_cluster_size : int > 1 or float between 0 and 1, default=None + Minimum number of samples in an OPTICS cluster, expressed as an + absolute number or a fraction of the number of samples (rounded to be + at least 2). If ``None``, the value of ``min_samples`` is used instead. + + xi : float between 0 and 1, default=0.05 + Determines the minimum steepness on the reachability plot that + constitutes a cluster boundary. For example, an upwards point in the + reachability plot is defined by the ratio from one point to its + successor being at most 1-xi. + + predecessor_correction : bool, default=True + Correct clusters based on the calculated predecessors. + + Returns + ------- + labels : ndarray of shape (n_samples,) + The labels assigned to samples. 
Points which are not included + in any cluster are labeled as -1. + + clusters : ndarray of shape (n_clusters, 2) + The list of clusters in the form of ``[start, end]`` in each row, with + all indices inclusive. The clusters are ordered according to ``(end, + -start)`` (ascending) so that larger clusters encompassing smaller + clusters come after such nested smaller clusters. Since ``labels`` does + not reflect the hierarchy, usually ``len(clusters) > + np.unique(labels)``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cluster import cluster_optics_xi, compute_optics_graph + >>> X = np.array([[1, 2], [2, 5], [3, 6], + ... [8, 7], [8, 8], [7, 3]]) + >>> ordering, core_distances, reachability, predecessor = compute_optics_graph( + ... X, + ... min_samples=2, + ... max_eps=np.inf, + ... metric="minkowski", + ... p=2, + ... metric_params=None, + ... algorithm="auto", + ... leaf_size=30, + ... n_jobs=None + ... ) + >>> min_samples = 2 + >>> labels, clusters = cluster_optics_xi( + ... reachability=reachability, + ... predecessor=predecessor, + ... ordering=ordering, + ... min_samples=min_samples, + ... ) + >>> labels + array([0, 0, 0, 1, 1, 1]) + >>> clusters + array([[0, 2], + [3, 5], + [0, 5]]) + """ + n_samples = len(reachability) + _validate_size(min_samples, n_samples, "min_samples") + if min_samples <= 1: + min_samples = max(2, int(min_samples * n_samples)) + if min_cluster_size is None: + min_cluster_size = min_samples + _validate_size(min_cluster_size, n_samples, "min_cluster_size") + if min_cluster_size <= 1: + min_cluster_size = max(2, int(min_cluster_size * n_samples)) + + clusters = _xi_cluster( + reachability[ordering], + predecessor[ordering], + ordering, + xi, + min_samples, + min_cluster_size, + predecessor_correction, + ) + labels = _extract_xi_labels(ordering, clusters) + return labels, clusters + + +def _extend_region(steep_point, xward_point, start, min_samples): + """Extend the area until it's maximal. + + It's the same function for both upward and downward reagions, depending on + the given input parameters. Assuming: + + - steep_{upward/downward}: bool array indicating whether a point is a + steep {upward/downward}; + - upward/downward: bool array indicating whether a point is + upward/downward; + + To extend an upward reagion, ``steep_point=steep_upward`` and + ``xward_point=downward`` are expected, and to extend a downward region, + ``steep_point=steep_downward`` and ``xward_point=upward``. + + Parameters + ---------- + steep_point : ndarray of shape (n_samples,), dtype=bool + True if the point is steep downward (upward). + + xward_point : ndarray of shape (n_samples,), dtype=bool + True if the point is an upward (respectively downward) point. + + start : int + The start of the xward region. + + min_samples : int + The same as the min_samples given to OPTICS. Up and down steep + regions can't have more then ``min_samples`` consecutive non-steep + points. + + Returns + ------- + index : int + The current index iterating over all the samples, i.e. where we are up + to in our search. + + end : int + The end of the region, which can be behind the index. The region + includes the ``end`` index. + """ + n_samples = len(steep_point) + non_xward_points = 0 + index = start + end = start + # find a maximal area + while index < n_samples: + if steep_point[index]: + non_xward_points = 0 + end = index + elif not xward_point[index]: + # it's not a steep point, but still goes up. 
+ non_xward_points += 1 + # region should include no more than min_samples consecutive + # non steep xward points. + if non_xward_points > min_samples: + break + else: + return end + index += 1 + return end + + +def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot): + """Update steep down areas (SDAs) using the new maximum in between (mib) + value, and the given complement of xi, i.e. ``1 - xi``. + """ + if np.isinf(mib): + return [] + res = [ + sda for sda in sdas if mib <= reachability_plot[sda["start"]] * xi_complement + ] + for sda in res: + sda["mib"] = max(sda["mib"], mib) + return res + + +def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e): + """Correct for predecessors. + + Applies Algorithm 2 of [1]_. + + Input parameters are ordered by the computer OPTICS ordering. + + .. [1] Schubert, Erich, Michael Gertz. + "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of + the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. + """ + while s < e: + if reachability_plot[s] > reachability_plot[e]: + return s, e + p_e = predecessor_plot[e] + for i in range(s, e): + if p_e == ordering[i]: + return s, e + e -= 1 + return None, None + + +def _xi_cluster( + reachability_plot, + predecessor_plot, + ordering, + xi, + min_samples, + min_cluster_size, + predecessor_correction, +): + """Automatically extract clusters according to the Xi-steep method. + + This is rouphly an implementation of Figure 19 of the OPTICS paper. + + Parameters + ---------- + reachability_plot : array-like of shape (n_samples,) + The reachability plot, i.e. reachability ordered according to + the calculated ordering, all computed by OPTICS. + + predecessor_plot : array-like of shape (n_samples,) + Predecessors ordered according to the calculated ordering. + + xi : float, between 0 and 1 + Determines the minimum steepness on the reachability plot that + constitutes a cluster boundary. For example, an upwards point in the + reachability plot is defined by the ratio from one point to its + successor being at most 1-xi. + + min_samples : int > 1 + The same as the min_samples given to OPTICS. Up and down steep regions + can't have more then ``min_samples`` consecutive non-steep points. + + min_cluster_size : int > 1 + Minimum number of samples in an OPTICS cluster. + + predecessor_correction : bool + Correct clusters based on the calculated predecessors. + + Returns + ------- + clusters : ndarray of shape (n_clusters, 2) + The list of clusters in the form of [start, end] in each row, with all + indices inclusive. The clusters are ordered in a way that larger + clusters encompassing smaller clusters come after those smaller + clusters. + """ + + # Our implementation adds an inf to the end of reachability plot + # this helps to find potential clusters at the end of the + # reachability plot even if there's no upward region at the end of it. 
+ reachability_plot = np.hstack((reachability_plot, np.inf)) + + xi_complement = 1 - xi + sdas = [] # steep down areas, introduced in section 4.3.2 of the paper + clusters = [] + index = 0 + mib = 0.0 # maximum in between, section 4.3.2 + + # Our implementation corrects a mistake in the original + # paper, i.e., in Definition 9 steep downward point, + # r(p) * (1 - x1) <= r(p + 1) should be + # r(p) * (1 - x1) >= r(p + 1) + with np.errstate(invalid="ignore"): + ratio = reachability_plot[:-1] / reachability_plot[1:] + steep_upward = ratio <= xi_complement + steep_downward = ratio >= 1 / xi_complement + downward = ratio > 1 + upward = ratio < 1 + + # the following loop is almost exactly as Figure 19 of the paper. + # it jumps over the areas which are not either steep down or up areas + for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)): + # just continue if steep_index has been a part of a discovered xward + # area. + if steep_index < index: + continue + + mib = max(mib, np.max(reachability_plot[index : steep_index + 1])) + + # steep downward areas + if steep_downward[steep_index]: + sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot) + D_start = steep_index + D_end = _extend_region(steep_downward, upward, D_start, min_samples) + D = {"start": D_start, "end": D_end, "mib": 0.0} + sdas.append(D) + index = D_end + 1 + mib = reachability_plot[index] + + # steep upward areas + else: + sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot) + U_start = steep_index + U_end = _extend_region(steep_upward, downward, U_start, min_samples) + index = U_end + 1 + mib = reachability_plot[index] + + U_clusters = [] + for D in sdas: + c_start = D["start"] + c_end = U_end + + # line (**), sc2* + if reachability_plot[c_end + 1] * xi_complement < D["mib"]: + continue + + # Definition 11: criterion 4 + D_max = reachability_plot[D["start"]] + if D_max * xi_complement >= reachability_plot[c_end + 1]: + # Find the first index from the left side which is almost + # at the same level as the end of the detected cluster. + while ( + reachability_plot[c_start + 1] > reachability_plot[c_end + 1] + and c_start < D["end"] + ): + c_start += 1 + elif reachability_plot[c_end + 1] * xi_complement >= D_max: + # Find the first index from the right side which is almost + # at the same level as the beginning of the detected + # cluster. + # Our implementation corrects a mistake in the original + # paper, i.e., in Definition 11 4c, r(x) < r(sD) should be + # r(x) > r(sD). + while reachability_plot[c_end - 1] > D_max and c_end > U_start: + c_end -= 1 + + # predecessor correction + if predecessor_correction: + c_start, c_end = _correct_predecessor( + reachability_plot, predecessor_plot, ordering, c_start, c_end + ) + if c_start is None: + continue + + # Definition 11: criterion 3.a + if c_end - c_start + 1 < min_cluster_size: + continue + + # Definition 11: criterion 1 + if c_start > D["end"]: + continue + + # Definition 11: criterion 2 + if c_end < U_start: + continue + + U_clusters.append((c_start, c_end)) + + # add smaller clusters first. + U_clusters.reverse() + clusters.extend(U_clusters) + + return np.array(clusters) + + +def _extract_xi_labels(ordering, clusters): + """Extracts the labels from the clusters returned by `_xi_cluster`. + We rely on the fact that clusters are stored + with the smaller clusters coming before the larger ones. 
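An editor's illustration of the labelling rule just described (not part of the source), reusing the cluster array from the ``cluster_optics_xi`` doctest:

import numpy as np

clusters = np.array([[0, 2], [3, 5], [0, 5]])  # smaller clusters listed first
labels = np.full(6, -1)
label = 0
for start, end in clusters:
    if not np.any(labels[start : end + 1] != -1):
        labels[start : end + 1] = label
        label += 1
# labels -> [0 0 0 1 1 1]; the enclosing [0, 5] cluster is skipped because its
# points already belong to the smaller leaf clusters. The real helper then maps
# these order-based labels back to sample indices via ``ordering``.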
+ + Parameters + ---------- + ordering : array-like of shape (n_samples,) + The ordering of points calculated by OPTICS + + clusters : array-like of shape (n_clusters, 2) + List of clusters i.e. (start, end) tuples, + as returned by `_xi_cluster`. + + Returns + ------- + labels : ndarray of shape (n_samples,) + """ + + labels = np.full(len(ordering), -1, dtype=int) + label = 0 + for c in clusters: + if not np.any(labels[c[0] : (c[1] + 1)] != -1): + labels[c[0] : (c[1] + 1)] = label + label += 1 + labels[ordering] = labels.copy() + return labels diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/_spectral.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/_spectral.py new file mode 100644 index 0000000000000000000000000000000000000000..00d23437504e5ad019e49583972f244d85a5dae6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/_spectral.py @@ -0,0 +1,805 @@ +"""Algorithms for spectral clustering""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.linalg import LinAlgError, qr, svd +from scipy.sparse import csc_matrix + +from ..base import BaseEstimator, ClusterMixin, _fit_context +from ..manifold._spectral_embedding import _spectral_embedding +from ..metrics.pairwise import KERNEL_PARAMS, pairwise_kernels +from ..neighbors import NearestNeighbors, kneighbors_graph +from ..utils import as_float_array, check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import validate_data +from ._kmeans import k_means + + +def cluster_qr(vectors): + """Find the discrete partition closest to the eigenvector embedding. + + This implementation was proposed in [1]_. + + .. versionadded:: 1.1 + + Parameters + ---------- + vectors : array-like, shape: (n_samples, n_clusters) + The embedding space of the samples. + + Returns + ------- + labels : array of integers, shape: n_samples + The cluster labels of vectors. + + References + ---------- + .. [1] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <10.1093/imaiai/iay008>` + + """ + + k = vectors.shape[1] + _, _, piv = qr(vectors.T, pivoting=True) + ut, _, v = svd(vectors[piv[:k], :].T) + vectors = abs(np.dot(vectors, np.dot(ut, v.conj()))) + return vectors.argmax(axis=1) + + +def discretize( + vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None +): + """Search for a partition matrix which is closest to the eigenvector embedding. + + This implementation was proposed in [1]_. + + Parameters + ---------- + vectors : array-like of shape (n_samples, n_clusters) + The embedding space of the samples. + + copy : bool, default=True + Whether to copy vectors, or perform in-place normalization. + + max_svd_restarts : int, default=30 + Maximum number of attempts to restart SVD if convergence fails + + n_iter_max : int, default=30 + Maximum number of iterations to attempt in rotation and partition + matrix search if machine precision convergence is not reached + + random_state : int, RandomState instance, default=None + Determines random number generation for rotation matrix initialization. + Use an int to make the randomness deterministic. + See :term:`Glossary `. + + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. + + References + ---------- + + .. [1] `Multiclass spectral clustering, 2003 + Stella X. 
Yu, Jianbo Shi + `_ + + Notes + ----- + + The eigenvector embedding is used to iteratively search for the + closest discrete partition. First, the eigenvector embedding is + normalized to the space of partition matrices. An optimal discrete + partition matrix closest to this normalized embedding multiplied by + an initial rotation is calculated. Fixing this discrete partition + matrix, an optimal rotation matrix is calculated. These two + calculations are performed until convergence. The discrete partition + matrix is returned as the clustering solution. Used in spectral + clustering, this method tends to be faster and more robust to random + initialization than k-means. + + """ + + random_state = check_random_state(random_state) + + vectors = as_float_array(vectors, copy=copy) + + eps = np.finfo(float).eps + n_samples, n_components = vectors.shape + + # Normalize the eigenvectors to an equal length of a vector of ones. + # Reorient the eigenvectors to point in the negative direction with respect + # to the first element. This may have to do with constraining the + # eigenvectors to lie in a specific quadrant to make the discretization + # search easier. + norm_ones = np.sqrt(n_samples) + for i in range(vectors.shape[1]): + vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) * norm_ones + if vectors[0, i] != 0: + vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i]) + + # Normalize the rows of the eigenvectors. Samples should lie on the unit + # hypersphere centered at the origin. This transforms the samples in the + # embedding space to the space of partition matrices. + vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis] + + svd_restarts = 0 + has_converged = False + + # If there is an exception we try to randomize and rerun SVD again + # do this max_svd_restarts times. 
+ while (svd_restarts < max_svd_restarts) and not has_converged: + # Initialize first column of rotation matrix with a row of the + # eigenvectors + rotation = np.zeros((n_components, n_components)) + rotation[:, 0] = vectors[random_state.randint(n_samples), :].T + + # To initialize the rest of the rotation matrix, find the rows + # of the eigenvectors that are as orthogonal to each other as + # possible + c = np.zeros(n_samples) + for j in range(1, n_components): + # Accumulate c to ensure row is as orthogonal as possible to + # previous picks as well as current one + c += np.abs(np.dot(vectors, rotation[:, j - 1])) + rotation[:, j] = vectors[c.argmin(), :].T + + last_objective_value = 0.0 + n_iter = 0 + + while not has_converged: + n_iter += 1 + + t_discrete = np.dot(vectors, rotation) + + labels = t_discrete.argmax(axis=1) + vectors_discrete = csc_matrix( + (np.ones(len(labels)), (np.arange(0, n_samples), labels)), + shape=(n_samples, n_components), + ) + + t_svd = vectors_discrete.T @ vectors + + try: + U, S, Vh = np.linalg.svd(t_svd) + except LinAlgError: + svd_restarts += 1 + print("SVD did not converge, randomizing and trying again") + break + + ncut_value = 2.0 * (n_samples - S.sum()) + if (abs(ncut_value - last_objective_value) < eps) or (n_iter > n_iter_max): + has_converged = True + else: + # otherwise calculate rotation and continue + last_objective_value = ncut_value + rotation = np.dot(Vh.T, U.T) + + if not has_converged: + raise LinAlgError("SVD did not converge") + return labels + + +@validate_params( + {"affinity": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=False, +) +def spectral_clustering( + affinity, + *, + n_clusters=8, + n_components=None, + eigen_solver=None, + random_state=None, + n_init=10, + eigen_tol="auto", + assign_labels="kmeans", + verbose=False, +): + """Apply clustering to a projection of the normalized Laplacian. + + In practice Spectral Clustering is very useful when the structure of + the individual clusters is highly non-convex or more generally when + a measure of the center and spread of the cluster is not a suitable + description of the complete cluster. For instance, when clusters are + nested circles on the 2D plane. + + If affinity is the adjacency matrix of a graph, this method can be + used to find normalized graph cuts [1]_, [2]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + affinity : {array-like, sparse matrix} of shape (n_samples, n_samples) + The affinity matrix describing the relationship of the samples to + embed. **Must be symmetric**. + + Possible examples: + - adjacency matrix of a graph, + - heat kernel of the pairwise distance matrix of the samples, + - symmetric k-nearest neighbours connectivity matrix of the samples. + + n_clusters : int, default=None + Number of clusters to extract. + + n_components : int, default=n_clusters + Number of eigenvectors to use for the spectral embedding. + + eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} + The eigenvalue decomposition method. If None then ``'arpack'`` is used. + See [4]_ for more details regarding ``'lobpcg'``. + Eigensolver ``'amg'`` runs ``'lobpcg'`` with optional + Algebraic MultiGrid preconditioning and requires pyamg to be installed. + It can be faster on very large sparse problems [6]_ and [7]_. + + random_state : int, RandomState instance, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. 
Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + n_init : int, default=10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. + + eigen_tol : float, default="auto" + Stopping criterion for eigendecomposition of the Laplacian matrix. + If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"` + values of `tol<1e-5` may lead to convergence issues and should be + avoided. + + .. versionadded:: 1.2 + Added 'auto' option. + + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' + The strategy to use to assign labels in the embedding + space. There are three ways to assign labels after the Laplacian + embedding. k-means can be applied and is a popular choice. But it can + also be sensitive to initialization. Discretization is another + approach which is less sensitive to random initialization [3]_. + The cluster_qr method [5]_ directly extracts clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and is not an iterative method, yet may outperform + k-means and discretization in terms of both quality and speed. For a detailed + comparison of clustering strategies, refer to the following example: + :ref:`sphx_glr_auto_examples_cluster_plot_coin_segmentation.py`. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. + + verbose : bool, default=False + Verbosity mode. + + .. versionadded:: 0.24 + + Returns + ------- + labels : array of integers, shape: n_samples + The labels of the clusters. + + Notes + ----- + The graph should contain only one connected component, elsewhere + the results make little sense. + + This algorithm solves the normalized cut for `k=2`: it is a + normalized spectral clustering. + + References + ---------- + + .. [1] :doi:`Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik + <10.1109/34.868688>` + + .. [2] :doi:`A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg + <10.1007/s11222-007-9033-z>` + + .. [3] `Multiclass spectral clustering, 2003 + Stella X. Yu, Jianbo Shi + `_ + + .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver: + Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001 + A. V. Knyazev + SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. + <10.1137/S1064827500366124>` + + .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <10.1093/imaiai/iay008>` + + .. [6] :doi:`Multiscale Spectral Image Segmentation Multiscale preconditioning + for computing eigenvalues of graph Laplacians in image segmentation, 2006 + Andrew Knyazev + <10.13140/RG.2.2.35280.02565>` + + .. 
[7] :doi:`Preconditioned spectral clustering for stochastic block partition + streaming graph challenge (Preliminary version at arXiv.) + David Zhuzhunashvili, Andrew Knyazev + <10.1109/HPEC.2017.8091045>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics.pairwise import pairwise_kernels + >>> from sklearn.cluster import spectral_clustering + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> affinity = pairwise_kernels(X, metric='rbf') + >>> spectral_clustering( + ... affinity=affinity, n_clusters=2, assign_labels="discretize", random_state=0 + ... ) + array([1, 1, 1, 0, 0, 0]) + """ + + clusterer = SpectralClustering( + n_clusters=n_clusters, + n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + n_init=n_init, + affinity="precomputed", + eigen_tol=eigen_tol, + assign_labels=assign_labels, + verbose=verbose, + ).fit(affinity) + + return clusterer.labels_ + + +class SpectralClustering(ClusterMixin, BaseEstimator): + """Apply clustering to a projection of the normalized Laplacian. + + In practice Spectral Clustering is very useful when the structure of + the individual clusters is highly non-convex, or more generally when + a measure of the center and spread of the cluster is not a suitable + description of the complete cluster, such as when clusters are + nested circles on the 2D plane. + + If the affinity matrix is the adjacency matrix of a graph, this method + can be used to find normalized graph cuts [1]_, [2]_. + + When calling ``fit``, an affinity matrix is constructed using either + a kernel function such the Gaussian (aka RBF) kernel with Euclidean + distance ``d(X, X)``:: + + np.exp(-gamma * d(X,X) ** 2) + + or a k-nearest neighbors connectivity matrix. + + Alternatively, a user-provided affinity matrix can be specified by + setting ``affinity='precomputed'``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_clusters : int, default=8 + The dimension of the projection subspace. + + eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities. If None, then ``'arpack'`` is + used. See [4]_ for more details regarding `'lobpcg'`. + + n_components : int, default=None + Number of eigenvectors to use for the spectral embedding. If None, + defaults to `n_clusters`. + + random_state : int, RandomState instance, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigenvectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + n_init : int, default=10 + Number of time the k-means algorithm will be run with different + centroid seeds. The final results will be the best output of n_init + consecutive runs in terms of inertia. Only used if + ``assign_labels='kmeans'``. + + gamma : float, default=1.0 + Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels. + Ignored for ``affinity='nearest_neighbors'``, ``affinity='precomputed'`` + or ``affinity='precomputed_nearest_neighbors'``. 
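The kernel written out above can also be computed by hand and passed in with ``affinity='precomputed'``; a hedged editor's sketch (the ``gamma`` value is chosen arbitrarily, and this is not part of the source):

import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances

X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=float)
gamma = 1.0  # arbitrary kernel width for illustration
affinity = np.exp(-gamma * pairwise_distances(X) ** 2)  # same form as affinity='rbf'
labels = SpectralClustering(
    n_clusters=2, affinity="precomputed", assign_labels="discretize", random_state=0
).fit(affinity).labels_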
+ + affinity : str or callable, default='rbf' + How to construct the affinity matrix. + - 'nearest_neighbors': construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf': construct the affinity matrix using a radial basis function + (RBF) kernel. + - 'precomputed': interpret ``X`` as a precomputed affinity matrix, + where larger values indicate greater similarity between instances. + - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph + of precomputed distances, and construct a binary affinity matrix + from the ``n_neighbors`` nearest neighbors of each instance. + - one of the kernels supported by + :func:`~sklearn.metrics.pairwise.pairwise_kernels`. + + Only kernels that produce similarity scores (non-negative values that + increase with similarity) should be used. This property is not checked + by the clustering algorithm. + + n_neighbors : int, default=10 + Number of neighbors to use when constructing the affinity matrix using + the nearest neighbors method. Ignored for ``affinity='rbf'``. + + eigen_tol : float, default="auto" + Stopping criterion for eigen decomposition of the Laplacian matrix. + If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"` + values of `tol<1e-5` may lead to convergence issues and should be + avoided. + + .. versionadded:: 1.2 + Added 'auto' option. + + assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans' + The strategy for assigning labels in the embedding space. There are two + ways to assign labels after the Laplacian embedding. k-means is a + popular choice, but it can be sensitive to initialization. + Discretization is another approach which is less sensitive to random + initialization [3]_. + The cluster_qr method [5]_ directly extract clusters from eigenvectors + in spectral clustering. In contrast to k-means and discretization, cluster_qr + has no tuning parameters and runs no iterations, yet may outperform + k-means and discretization in terms of both quality and speed. + + .. versionchanged:: 1.1 + Added new labeling method 'cluster_qr'. + + degree : float, default=3 + Degree of the polynomial kernel. Ignored by other kernels. + + coef0 : float, default=1 + Zero coefficient for polynomial and sigmoid kernels. + Ignored by other kernels. + + kernel_params : dict of str to any, default=None + Parameters (keyword arguments) and values for kernel passed as + callable object. Ignored by other kernels. + + n_jobs : int, default=None + The number of parallel jobs to run when `affinity='nearest_neighbors'` + or `affinity='precomputed_nearest_neighbors'`. The neighbors search + will be done in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + Verbosity mode. + + .. versionadded:: 0.24 + + Attributes + ---------- + affinity_matrix_ : array-like of shape (n_samples, n_samples) + Affinity matrix used for clustering. Available only after calling + ``fit``. 
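Since the label-assignment strategies described above are interchangeable, a quick editor's comparison sketch on synthetic blobs (not part of the source) can be run as follows:

from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.5, random_state=0)
for method in ("kmeans", "discretize", "cluster_qr"):
    labels = SpectralClustering(
        n_clusters=3, assign_labels=method, random_state=0
    ).fit_predict(X)
    # all three strategies should recover the same well-separated blobs,
    # possibly with permuted label ids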
+ + labels_ : ndarray of shape (n_samples,) + Labels of each point + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.cluster.KMeans : K-Means clustering. + sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of + Applications with Noise. + + Notes + ----- + A distance matrix for which 0 indicates identical elements and high values + indicate very dissimilar elements can be transformed into an affinity / + similarity matrix that is well-suited for the algorithm by + applying the Gaussian (aka RBF, heat) kernel:: + + np.exp(- dist_matrix ** 2 / (2. * delta ** 2)) + + where ``delta`` is a free parameter representing the width of the Gaussian + kernel. + + An alternative is to take a symmetric version of the k-nearest neighbors + connectivity matrix of the points. + + If the pyamg package is installed, it is used: this greatly + speeds up computation. + + References + ---------- + .. [1] :doi:`Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik + <10.1109/34.868688>` + + .. [2] :doi:`A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg + <10.1007/s11222-007-9033-z>` + + .. [3] `Multiclass spectral clustering, 2003 + Stella X. Yu, Jianbo Shi + `_ + + .. [4] :doi:`Toward the Optimal Preconditioned Eigensolver: + Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001 + A. V. Knyazev + SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541. + <10.1137/S1064827500366124>` + + .. [5] :doi:`Simple, direct, and efficient multi-way spectral clustering, 2019 + Anil Damle, Victor Minden, Lexing Ying + <10.1093/imaiai/iay008>` + + Examples + -------- + >>> from sklearn.cluster import SpectralClustering + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [1, 0], + ... [4, 7], [3, 5], [3, 6]]) + >>> clustering = SpectralClustering(n_clusters=2, + ... assign_labels='discretize', + ... 
random_state=0).fit(X) + >>> clustering.labels_ + array([1, 1, 1, 0, 0, 0]) + >>> clustering + SpectralClustering(assign_labels='discretize', n_clusters=2, + random_state=0) + + For a comparison of Spectral clustering with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None], + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "n_init": [Interval(Integral, 1, None, closed="left")], + "gamma": [Interval(Real, 0, None, closed="left")], + "affinity": [ + callable, + StrOptions( + set(KERNEL_PARAMS) + | {"nearest_neighbors", "precomputed", "precomputed_nearest_neighbors"} + ), + ], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "eigen_tol": [ + Interval(Real, 0.0, None, closed="left"), + StrOptions({"auto"}), + ], + "assign_labels": [StrOptions({"kmeans", "discretize", "cluster_qr"})], + "degree": [Interval(Real, 0, None, closed="left")], + "coef0": [Interval(Real, None, None, closed="neither")], + "kernel_params": [dict, None], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + } + + def __init__( + self, + n_clusters=8, + *, + eigen_solver=None, + n_components=None, + random_state=None, + n_init=10, + gamma=1.0, + affinity="rbf", + n_neighbors=10, + eigen_tol="auto", + assign_labels="kmeans", + degree=3, + coef0=1, + kernel_params=None, + n_jobs=None, + verbose=False, + ): + self.n_clusters = n_clusters + self.eigen_solver = eigen_solver + self.n_components = n_components + self.random_state = random_state + self.n_init = n_init + self.gamma = gamma + self.affinity = affinity + self.n_neighbors = n_neighbors + self.eigen_tol = eigen_tol + self.assign_labels = assign_labels + self.degree = degree + self.coef0 = coef0 + self.kernel_params = kernel_params + self.n_jobs = n_jobs + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Perform spectral clustering from features, or affinity matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + A fitted instance of the estimator. + """ + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "coo"], + dtype=np.float64, + ensure_min_samples=2, + ) + allow_squared = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + if X.shape[0] == X.shape[1] and not allow_squared: + warnings.warn( + "The spectral clustering API has changed. ``fit``" + "now constructs an affinity matrix from data. To use" + " a custom affinity matrix, " + "set ``affinity=precomputed``." 
+ ) + + if self.affinity == "nearest_neighbors": + connectivity = kneighbors_graph( + X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs + ) + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + elif self.affinity == "precomputed_nearest_neighbors": + estimator = NearestNeighbors( + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed" + ).fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode="connectivity") + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + elif self.affinity == "precomputed": + self.affinity_matrix_ = X + else: + params = self.kernel_params + if params is None: + params = {} + if not callable(self.affinity): + params["gamma"] = self.gamma + params["degree"] = self.degree + params["coef0"] = self.coef0 + self.affinity_matrix_ = pairwise_kernels( + X, metric=self.affinity, filter_params=True, **params + ) + + random_state = check_random_state(self.random_state) + n_components = ( + self.n_clusters if self.n_components is None else self.n_components + ) + # We now obtain the real valued solution matrix to the + # relaxed Ncut problem, solving the eigenvalue problem + # L_sym x = lambda x and recovering u = D^-1/2 x. + # The first eigenvector is constant only for fully connected graphs + # and should be kept for spectral clustering (drop_first = False) + # See spectral_embedding documentation. + maps = _spectral_embedding( + self.affinity_matrix_, + n_components=n_components, + eigen_solver=self.eigen_solver, + random_state=random_state, + eigen_tol=self.eigen_tol, + drop_first=False, + ) + if self.verbose: + print(f"Computing label assignment using {self.assign_labels}") + + if self.assign_labels == "kmeans": + _, self.labels_, _ = k_means( + maps, + self.n_clusters, + random_state=random_state, + n_init=self.n_init, + verbose=self.verbose, + ) + elif self.assign_labels == "cluster_qr": + self.labels_ = cluster_qr(maps) + else: + self.labels_ = discretize(maps, random_state=random_state) + + return self + + def fit_predict(self, X, y=None): + """Perform spectral clustering on `X` and return cluster labels. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Training instances to cluster, similarities / affinities between + instances if ``affinity='precomputed'``, or distances between + instances if ``affinity='precomputed_nearest_neighbors``. If a + sparse matrix is provided in a format other than ``csr_matrix``, + ``csc_matrix``, or ``coo_matrix``, it will be converted into a + sparse ``csr_matrix``. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Cluster labels. 
+ """ + return super().fit_predict(X, y) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.pairwise = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/meson.build b/.venv/lib/python3.12/site-packages/sklearn/cluster/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..6c11619f3ca555c58ed43b1d579548d75cf6aea4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/meson.build @@ -0,0 +1,26 @@ +cluster_extension_metadata = { + '_dbscan_inner': + {'sources': [cython_gen_cpp.process('_dbscan_inner.pyx')]}, + '_hierarchical_fast': + {'sources': [cython_gen_cpp.process('_hierarchical_fast.pyx'), metrics_cython_tree]}, + '_k_means_common': + {'sources': [cython_gen.process('_k_means_common.pyx')], 'dependencies': [openmp_dep]}, + '_k_means_lloyd': + {'sources': [cython_gen.process('_k_means_lloyd.pyx')], 'dependencies': [openmp_dep]}, + '_k_means_elkan': + {'sources': [cython_gen.process('_k_means_elkan.pyx')], 'dependencies': [openmp_dep]}, + '_k_means_minibatch': + {'sources': [cython_gen.process('_k_means_minibatch.pyx')], 'dependencies': [openmp_dep]}, +} + +foreach ext_name, ext_dict : cluster_extension_metadata + py.extension_module( + ext_name, + [ext_dict.get('sources'), utils_cython_tree], + dependencies: [np_dep] + ext_dict.get('dependencies', []), + subdir: 'sklearn/cluster', + install: true + ) +endforeach + +subdir('_hdbscan') diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/common.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/common.py new file mode 100644 index 0000000000000000000000000000000000000000..b1fe047fe230af1c3fbb2ec0b72f3ef20e5aa3aa --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/common.py @@ -0,0 +1,37 @@ +""" +Common utilities for testing clustering. 
+ +""" + +import numpy as np + +############################################################################### +# Generate sample data + + +def generate_clustered_data( + seed=0, n_clusters=3, n_features=2, n_samples_per_cluster=20, std=0.4 +): + prng = np.random.RandomState(seed) + + # the data is voluntary shifted away from zero to check clustering + # algorithm robustness with regards to non centered data + means = ( + np.array( + [ + [1, 1, 1, 0], + [-1, -1, 0, 1], + [1, -1, 1, 1], + [-1, 1, 1, 0], + ] + ) + + 10 + ) + + X = np.empty((0, n_features)) + for i in range(n_clusters): + X = np.r_[ + X, + means[i][:n_features] + std * prng.randn(n_samples_per_cluster, n_features), + ] + return X diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_affinity_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_affinity_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..c3138e59111ed849988dd0e6d3433a4bb251e2a1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_affinity_propagation.py @@ -0,0 +1,321 @@ +""" +Testing for Clustering methods + +""" + +import warnings + +import numpy as np +import pytest + +from sklearn.cluster import AffinityPropagation, affinity_propagation +from sklearn.cluster._affinity_propagation import _equal_similarities_and_preferences +from sklearn.datasets import make_blobs +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.metrics import euclidean_distances +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + +n_clusters = 3 +centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 +X, _ = make_blobs( + n_samples=60, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=0, +) + +# TODO: AffinityPropagation must preserve dtype for its fitted attributes +# and test must be created accordingly to this new behavior. +# For more details, see: https://github.com/scikit-learn/scikit-learn/issues/11000 + + +def test_affinity_propagation(global_random_seed, global_dtype): + """Test consistency of the affinity propagations.""" + S = -euclidean_distances(X.astype(global_dtype, copy=False), squared=True) + preference = np.median(S) * 10 + cluster_centers_indices, labels = affinity_propagation( + S, preference=preference, random_state=global_random_seed + ) + + n_clusters_ = len(cluster_centers_indices) + + assert n_clusters == n_clusters_ + + +def test_affinity_propagation_precomputed(): + """Check equality of precomputed affinity matrix to internally computed affinity + matrix. 
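For orientation, the two input modes this test compares can be sketched as follows (illustrative blob data; only the feature-space mode supports predict):

import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
from sklearn.metrics import euclidean_distances

X, _ = make_blobs(n_samples=60, centers=3, cluster_std=0.4, random_state=0)

# Feature-space mode: similarities are computed internally, predict() works.
af = AffinityPropagation(random_state=0).fit(X)
labels_new = af.predict(X[:5])

# Precomputed mode: pass negative squared Euclidean distances as similarities.
S = -euclidean_distances(X, squared=True)
af_precomputed = AffinityPropagation(affinity="precomputed", random_state=0).fit(S)
labels_precomputed = af_precomputed.labels_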
+ """ + S = -euclidean_distances(X, squared=True) + preference = np.median(S) * 10 + af = AffinityPropagation( + preference=preference, affinity="precomputed", random_state=28 + ) + labels_precomputed = af.fit(S).labels_ + + af = AffinityPropagation(preference=preference, verbose=True, random_state=37) + labels = af.fit(X).labels_ + + assert_array_equal(labels, labels_precomputed) + + cluster_centers_indices = af.cluster_centers_indices_ + + n_clusters_ = len(cluster_centers_indices) + assert np.unique(labels).size == n_clusters_ + assert n_clusters == n_clusters_ + + +def test_affinity_propagation_no_copy(): + """Check behaviour of not copying the input data.""" + S = -euclidean_distances(X, squared=True) + S_original = S.copy() + preference = np.median(S) * 10 + assert not np.allclose(S.diagonal(), preference) + + # with copy=True S should not be modified + affinity_propagation(S, preference=preference, copy=True, random_state=0) + assert_allclose(S, S_original) + assert not np.allclose(S.diagonal(), preference) + assert_allclose(S.diagonal(), np.zeros(S.shape[0])) + + # with copy=False S will be modified inplace + affinity_propagation(S, preference=preference, copy=False, random_state=0) + assert_allclose(S.diagonal(), preference) + + # test that copy=True and copy=False lead to the same result + S = S_original.copy() + af = AffinityPropagation(preference=preference, verbose=True, random_state=0) + + labels = af.fit(X).labels_ + _, labels_no_copy = affinity_propagation( + S, preference=preference, copy=False, random_state=74 + ) + assert_array_equal(labels, labels_no_copy) + + +def test_affinity_propagation_affinity_shape(): + """Check the shape of the affinity matrix when using `affinity_propagation.""" + S = -euclidean_distances(X, squared=True) + err_msg = "The matrix of similarities must be a square array" + with pytest.raises(ValueError, match=err_msg): + affinity_propagation(S[:, :-1]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_affinity_propagation_precomputed_with_sparse_input(csr_container): + err_msg = "Sparse data was passed for X, but dense data is required" + with pytest.raises(TypeError, match=err_msg): + AffinityPropagation(affinity="precomputed").fit(csr_container((3, 3))) + + +def test_affinity_propagation_predict(global_random_seed, global_dtype): + # Test AffinityPropagation.predict + af = AffinityPropagation(affinity="euclidean", random_state=global_random_seed) + X_ = X.astype(global_dtype, copy=False) + labels = af.fit_predict(X_) + labels2 = af.predict(X_) + assert_array_equal(labels, labels2) + + +def test_affinity_propagation_predict_error(): + # Test exception in AffinityPropagation.predict + # Not fitted. + af = AffinityPropagation(affinity="euclidean") + with pytest.raises(NotFittedError): + af.predict(X) + + # Predict not supported when affinity="precomputed". 
+ S = np.dot(X, X.T) + af = AffinityPropagation(affinity="precomputed", random_state=57) + af.fit(S) + with pytest.raises(ValueError, match="expecting 60 features as input"): + af.predict(X) + + +def test_affinity_propagation_fit_non_convergence(global_dtype): + # In case of non-convergence of affinity_propagation(), the cluster + # centers should be an empty array and training samples should be labelled + # as noise (-1) + X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype) + + # Force non-convergence by allowing only a single iteration + af = AffinityPropagation(preference=-10, max_iter=1, random_state=82) + + with pytest.warns(ConvergenceWarning): + af.fit(X) + assert_allclose(np.empty((0, 2)), af.cluster_centers_) + assert_array_equal(np.array([-1, -1, -1]), af.labels_) + + +def test_affinity_propagation_equal_mutual_similarities(global_dtype): + X = np.array([[-1, 1], [1, -1]], dtype=global_dtype) + S = -euclidean_distances(X, squared=True) + + # setting preference > similarity + with pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation(S, preference=0) + + # expect every sample to become an exemplar + assert_array_equal([0, 1], cluster_center_indices) + assert_array_equal([0, 1], labels) + + # setting preference < similarity + with pytest.warns(UserWarning, match="mutually equal"): + cluster_center_indices, labels = affinity_propagation(S, preference=-10) + + # expect one cluster, with arbitrary (first) sample as exemplar + assert_array_equal([0], cluster_center_indices) + assert_array_equal([0, 0], labels) + + # setting different preferences + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + cluster_center_indices, labels = affinity_propagation( + S, preference=[-20, -10], random_state=37 + ) + + # expect one cluster, with highest-preference sample as exemplar + assert_array_equal([1], cluster_center_indices) + assert_array_equal([0, 0], labels) + + +def test_affinity_propagation_predict_non_convergence(global_dtype): + # In case of non-convergence of affinity_propagation(), the cluster + # centers should be an empty array + X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype) + + # Force non-convergence by allowing only a single iteration + with pytest.warns(ConvergenceWarning): + af = AffinityPropagation(preference=-10, max_iter=1, random_state=75).fit(X) + + # At prediction time, consider new samples as noise since there are no + # clusters + to_predict = np.array([[2, 2], [3, 3], [4, 4]]) + with pytest.warns(ConvergenceWarning): + y = af.predict(to_predict) + assert_array_equal(np.array([-1, -1, -1]), y) + + +def test_affinity_propagation_non_convergence_regressiontest(global_dtype): + X = np.array( + [[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]], dtype=global_dtype + ) + af = AffinityPropagation(affinity="euclidean", max_iter=2, random_state=34) + msg = ( + "Affinity propagation did not converge, this model may return degenerate" + " cluster centers and labels." 
+ ) + with pytest.warns(ConvergenceWarning, match=msg): + af.fit(X) + + assert_array_equal(np.array([0, 0, 0]), af.labels_) + + +def test_equal_similarities_and_preferences(global_dtype): + # Unequal distances + X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=global_dtype) + S = -euclidean_distances(X, squared=True) + + assert not _equal_similarities_and_preferences(S, np.array(0)) + assert not _equal_similarities_and_preferences(S, np.array([0, 0])) + assert not _equal_similarities_and_preferences(S, np.array([0, 1])) + + # Equal distances + X = np.array([[0, 0], [1, 1]], dtype=global_dtype) + S = -euclidean_distances(X, squared=True) + + # Different preferences + assert not _equal_similarities_and_preferences(S, np.array([0, 1])) + + # Same preferences + assert _equal_similarities_and_preferences(S, np.array([0, 0])) + assert _equal_similarities_and_preferences(S, np.array(0)) + + +def test_affinity_propagation_random_state(): + """Check that different random states lead to different initialisations + by looking at the center locations after two iterations. + """ + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=300, centers=centers, cluster_std=0.5, random_state=0 + ) + # random_state = 0 + ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=0) + ap.fit(X) + centers0 = ap.cluster_centers_ + + # random_state = 76 + ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=76) + ap.fit(X) + centers76 = ap.cluster_centers_ + # check that the centers have not yet converged to the same solution + assert np.mean((centers0 - centers76) ** 2) > 1 + + +@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array]) +def test_affinity_propagation_convergence_warning_dense_sparse(container, global_dtype): + """ + Check that having sparse or dense `centers` format should not + influence the convergence. + Non-regression test for gh-13334. 
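The degenerate outcome asserted by the non-convergence tests above can be reproduced in a few lines (a sketch using the same three-point fixture):

import warnings
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.exceptions import ConvergenceWarning

X = np.array([[0, 0], [1, 1], [-2, -2]], dtype=float)
with warnings.catch_warnings():
    warnings.simplefilter("ignore", ConvergenceWarning)
    af = AffinityPropagation(preference=-10, max_iter=1, random_state=0).fit(X)

print(af.cluster_centers_.shape)  # (0, 2): no exemplars were found
print(af.labels_)                 # [-1 -1 -1]: every sample is treated as noise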
+ """ + centers = container(np.zeros((1, 10))) + rng = np.random.RandomState(42) + X = rng.rand(40, 10).astype(global_dtype, copy=False) + y = (4 * rng.rand(40)).astype(int) + ap = AffinityPropagation(random_state=46) + ap.fit(X, y) + ap.cluster_centers_ = centers + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + assert_array_equal(ap.predict(X), np.zeros(X.shape[0], dtype=int)) + + +# FIXME; this test is broken with different random states, needs to be revisited +def test_correct_clusters(global_dtype): + # Test to fix incorrect clusters due to dtype change + # (non-regression test for issue #10832) + X = np.array( + [[1, 0, 0, 0], [0, 1, 1, 0], [0, 1, 1, 0], [0, 0, 0, 1]], dtype=global_dtype + ) + afp = AffinityPropagation(preference=1, affinity="precomputed", random_state=0).fit( + X + ) + expected = np.array([0, 1, 1, 2]) + assert_array_equal(afp.labels_, expected) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_input_for_predict(csr_container): + # Test to make sure sparse inputs are accepted for predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + af.fit(X) + labels = af.predict(csr_container((2, 2))) + assert_array_equal(labels, (2, 2)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_input_for_fit_predict(csr_container): + # Test to make sure sparse inputs are accepted for fit_predict + # (non-regression test for issue #20049) + af = AffinityPropagation(affinity="euclidean", random_state=42) + rng = np.random.RandomState(42) + X = csr_container(rng.randint(0, 2, size=(5, 5))) + labels = af.fit_predict(X) + assert_array_equal(labels, (0, 1, 1, 2, 3)) + + +def test_affinity_propagation_equal_points(): + """Make sure we do not assign multiple clusters to equal points. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/20043 + """ + X = np.zeros((8, 1)) + af = AffinityPropagation(affinity="euclidean", damping=0.5, random_state=42).fit(X) + assert np.all(af.labels_ == 0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc845a7bf262c60cf9f039e5ce021d841bdf4d4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bicluster.py @@ -0,0 +1,264 @@ +"""Testing for Spectral Biclustering methods""" + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn.base import BaseEstimator, BiclusterMixin +from sklearn.cluster import SpectralBiclustering, SpectralCoclustering +from sklearn.cluster._bicluster import ( + _bistochastic_normalize, + _log_normalize, + _scale_normalize, +) +from sklearn.datasets import make_biclusters, make_checkerboard +from sklearn.metrics import consensus_score, v_measure_score +from sklearn.model_selection import ParameterGrid +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +class MockBiclustering(BiclusterMixin, BaseEstimator): + # Mock object for testing get_submatrix. + def __init__(self): + pass + + def get_indices(self, i): + # Overridden to reproduce old get_submatrix test. 
+ return ( + np.where([True, True, False, False, True])[0], + np.where([False, False, True, True])[0], + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_get_submatrix(csr_container): + data = np.arange(20).reshape(5, 4) + model = MockBiclustering() + + for X in (data, csr_container(data), data.tolist()): + submatrix = model.get_submatrix(0, X) + if issparse(submatrix): + submatrix = submatrix.toarray() + assert_array_equal(submatrix, [[2, 3], [6, 7], [18, 19]]) + submatrix[:] = -1 + if issparse(X): + X = X.toarray() + assert np.all(X != -1) + + +def _test_shape_indices(model): + # Test get_shape and get_indices on fitted model. + for i in range(model.n_clusters): + m, n = model.get_shape(i) + i_ind, j_ind = model.get_indices(i) + assert len(i_ind) == m + assert len(j_ind) == n + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_spectral_coclustering(global_random_seed, csr_container): + # Test Dhillon's Spectral CoClustering on a simple problem. + param_grid = { + "svd_method": ["randomized", "arpack"], + "n_svd_vecs": [None, 20], + "mini_batch": [False, True], + "init": ["k-means++"], + "n_init": [10], + } + S, rows, cols = make_biclusters( + (30, 30), 3, noise=0.1, random_state=global_random_seed + ) + S -= S.min() # needs to be nonnegative before making it sparse + S = np.where(S < 1, 0, S) # threshold some values + for mat in (S, csr_container(S)): + for kwargs in ParameterGrid(param_grid): + model = SpectralCoclustering( + n_clusters=3, random_state=global_random_seed, **kwargs + ) + model.fit(mat) + + assert model.rows_.shape == (3, 30) + assert_array_equal(model.rows_.sum(axis=0), np.ones(30)) + assert_array_equal(model.columns_.sum(axis=0), np.ones(30)) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + _test_shape_indices(model) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_spectral_biclustering(global_random_seed, csr_container): + # Test Kluger methods on a checkerboard dataset. 
+ S, rows, cols = make_checkerboard( + (30, 30), 3, noise=0.5, random_state=global_random_seed + ) + + non_default_params = { + "method": ["scale", "log"], + "svd_method": ["arpack"], + "n_svd_vecs": [20], + "mini_batch": [True], + } + + for mat in (S, csr_container(S)): + for param_name, param_values in non_default_params.items(): + for param_value in param_values: + model = SpectralBiclustering( + n_clusters=3, + n_init=3, + init="k-means++", + random_state=global_random_seed, + ) + model.set_params(**dict([(param_name, param_value)])) + + if issparse(mat) and model.get_params().get("method") == "log": + # cannot take log of sparse matrix + with pytest.raises(ValueError): + model.fit(mat) + continue + else: + model.fit(mat) + + assert model.rows_.shape == (9, 30) + assert model.columns_.shape == (9, 30) + assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30)) + assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30)) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + _test_shape_indices(model) + + +def _do_scale_test(scaled): + """Check that rows sum to one constant, and columns to another.""" + row_sum = scaled.sum(axis=1) + col_sum = scaled.sum(axis=0) + if issparse(scaled): + row_sum = np.asarray(row_sum).squeeze() + col_sum = np.asarray(col_sum).squeeze() + assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100), decimal=1) + assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100), decimal=1) + + +def _do_bistochastic_test(scaled): + """Check that rows and columns sum to the same constant.""" + _do_scale_test(scaled) + assert_almost_equal(scaled.sum(axis=0).mean(), scaled.sum(axis=1).mean(), decimal=1) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_scale_normalize(global_random_seed, csr_container): + generator = np.random.RandomState(global_random_seed) + X = generator.rand(100, 100) + for mat in (X, csr_container(X)): + scaled, _, _ = _scale_normalize(mat) + _do_scale_test(scaled) + if issparse(mat): + assert issparse(scaled) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_bistochastic_normalize(global_random_seed, csr_container): + generator = np.random.RandomState(global_random_seed) + X = generator.rand(100, 100) + for mat in (X, csr_container(X)): + scaled = _bistochastic_normalize(mat) + _do_bistochastic_test(scaled) + if issparse(mat): + assert issparse(scaled) + + +def test_log_normalize(global_random_seed): + # adding any constant to a log-scaled matrix should make it + # bistochastic + generator = np.random.RandomState(global_random_seed) + mat = generator.rand(100, 100) + scaled = _log_normalize(mat) + 1 + _do_bistochastic_test(scaled) + + +def test_fit_best_piecewise(global_random_seed): + model = SpectralBiclustering(random_state=global_random_seed) + vectors = np.array([[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3], [0, 1, 2, 3, 4, 5]]) + best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2) + assert_array_equal(best, vectors[:2]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_project_and_cluster(global_random_seed, csr_container): + model = SpectralBiclustering(random_state=global_random_seed) + data = np.array([[1, 1, 1], [1, 1, 1], [3, 6, 3], [3, 6, 3]]) + vectors = np.array([[1, 0], [0, 1], [0, 0]]) + for mat in (data, csr_container(data)): + labels = model._project_and_cluster(mat, vectors, n_clusters=2) + assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0) + + +def test_perfect_checkerboard(global_random_seed): + # XXX 
Previously failed on build bot (not reproducible) + model = SpectralBiclustering( + 3, svd_method="arpack", random_state=global_random_seed + ) + + S, rows, cols = make_checkerboard( + (30, 30), 3, noise=0, random_state=global_random_seed + ) + model.fit(S) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + S, rows, cols = make_checkerboard( + (40, 30), 3, noise=0, random_state=global_random_seed + ) + model.fit(S) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + S, rows, cols = make_checkerboard( + (30, 40), 3, noise=0, random_state=global_random_seed + ) + model.fit(S) + assert consensus_score(model.biclusters_, (rows, cols)) == 1 + + +@pytest.mark.parametrize( + "params, type_err, err_msg", + [ + ( + {"n_clusters": 6}, + ValueError, + "n_clusters should be <= n_samples=5", + ), + ( + {"n_clusters": (3, 3, 3)}, + ValueError, + "Incorrect parameter n_clusters", + ), + ( + {"n_clusters": (3, 6)}, + ValueError, + "Incorrect parameter n_clusters", + ), + ( + {"n_components": 3, "n_best": 4}, + ValueError, + "n_best=4 must be <= n_components=3", + ), + ], +) +def test_spectralbiclustering_parameter_validation(params, type_err, err_msg): + """Check parameters validation in `SpectralBiClustering`""" + data = np.arange(25).reshape((5, 5)) + model = SpectralBiclustering(**params) + with pytest.raises(type_err, match=err_msg): + model.fit(data) + + +@pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering())) +def test_n_features_in_(est): + X, _, _ = make_biclusters((3, 3), 3, random_state=0) + + assert not hasattr(est, "n_features_in_") + est.fit(X) + assert est.n_features_in_ == 3 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_birch.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_birch.py new file mode 100644 index 0000000000000000000000000000000000000000..bc87934adaecdb507126097e2de945c677587bee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_birch.py @@ -0,0 +1,250 @@ +""" +Tests for the birch clustering algorithm. 
+""" + +import numpy as np +import pytest + +from sklearn.cluster import AgglomerativeClustering, Birch +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets import make_blobs +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances_argmin, v_measure_score +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_n_samples_leaves_roots(global_random_seed, global_dtype): + # Sanity check for the number of samples in leaves and roots + X, y = make_blobs(n_samples=10, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch() + brc.fit(X) + n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_]) + n_samples_leaves = sum( + [sc.n_samples_ for leaf in brc._get_leaves() for sc in leaf.subclusters_] + ) + assert n_samples_leaves == X.shape[0] + assert n_samples_root == X.shape[0] + + +def test_partial_fit(global_random_seed, global_dtype): + # Test that fit is equivalent to calling partial_fit multiple times + X, y = make_blobs(n_samples=100, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch(n_clusters=3) + brc.fit(X) + brc_partial = Birch(n_clusters=None) + brc_partial.partial_fit(X[:50]) + brc_partial.partial_fit(X[50:]) + assert_allclose(brc_partial.subcluster_centers_, brc.subcluster_centers_) + + # Test that same global labels are obtained after calling partial_fit + # with None + brc_partial.set_params(n_clusters=3) + brc_partial.partial_fit(None) + assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_) + + +def test_birch_predict(global_random_seed, global_dtype): + # Test the predict method predicts the nearest centroid. + rng = np.random.RandomState(global_random_seed) + X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10) + X = X.astype(global_dtype, copy=False) + + # n_samples * n_samples_per_cluster + shuffle_indices = np.arange(30) + rng.shuffle(shuffle_indices) + X_shuffle = X[shuffle_indices, :] + brc = Birch(n_clusters=4, threshold=1.0) + brc.fit(X_shuffle) + + # Birch must preserve inputs' dtype + assert brc.subcluster_centers_.dtype == global_dtype + + assert_array_equal(brc.labels_, brc.predict(X_shuffle)) + centroids = brc.subcluster_centers_ + nearest_centroid = brc.subcluster_labels_[ + pairwise_distances_argmin(X_shuffle, centroids) + ] + assert_allclose(v_measure_score(nearest_centroid, brc.labels_), 1.0) + + +def test_n_clusters(global_random_seed, global_dtype): + # Test that n_clusters param works properly + X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc1 = Birch(n_clusters=10) + brc1.fit(X) + assert len(brc1.subcluster_centers_) > 10 + assert len(np.unique(brc1.labels_)) == 10 + + # Test that n_clusters = Agglomerative Clustering gives + # the same results. + gc = AgglomerativeClustering(n_clusters=10) + brc2 = Birch(n_clusters=gc) + brc2.fit(X) + assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_) + assert_array_equal(brc1.labels_, brc2.labels_) + + # Test that a small number of clusters raises a warning. 
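As a compact illustration of the incremental workflow these tests exercise (a sketch on hypothetical blob data, not part of the test suite):

import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)

brc = Birch(n_clusters=None)
brc.partial_fit(X[:50])      # grow the CF-tree incrementally
brc.partial_fit(X[50:])
brc.set_params(n_clusters=3)
brc.partial_fit(None)        # run only the final global clustering step
labels = brc.predict(X)      # nearest-subcluster assignment, as tested above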
+ brc4 = Birch(threshold=10000.0) + with pytest.warns(ConvergenceWarning): + brc4.fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_X(global_random_seed, global_dtype, csr_container): + # Test that sparse and dense data give same results + X, y = make_blobs(n_samples=100, centers=10, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch(n_clusters=10) + brc.fit(X) + + csr = csr_container(X) + brc_sparse = Birch(n_clusters=10) + brc_sparse.fit(csr) + + # Birch must preserve inputs' dtype + assert brc_sparse.subcluster_centers_.dtype == global_dtype + + assert_array_equal(brc.labels_, brc_sparse.labels_) + assert_allclose(brc.subcluster_centers_, brc_sparse.subcluster_centers_) + + +def test_partial_fit_second_call_error_checks(): + # second partial fit calls will error when n_features is not consistent + # with the first call + X, y = make_blobs(n_samples=100) + brc = Birch(n_clusters=3) + brc.partial_fit(X, y) + + msg = "X has 1 features, but Birch is expecting 2 features" + with pytest.raises(ValueError, match=msg): + brc.partial_fit(X[:, [0]], y) + + +def check_branching_factor(node, branching_factor): + subclusters = node.subclusters_ + assert branching_factor >= len(subclusters) + for cluster in subclusters: + if cluster.child_: + check_branching_factor(cluster.child_, branching_factor) + + +def test_branching_factor(global_random_seed, global_dtype): + # Test that nodes have at max branching_factor number of subclusters + X, y = make_blobs(random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + branching_factor = 9 + + # Purposefully set a low threshold to maximize the subclusters. + brc = Birch(n_clusters=None, branching_factor=branching_factor, threshold=0.01) + brc.fit(X) + check_branching_factor(brc.root_, branching_factor) + brc = Birch(n_clusters=3, branching_factor=branching_factor, threshold=0.01) + brc.fit(X) + check_branching_factor(brc.root_, branching_factor) + + +def check_threshold(birch_instance, threshold): + """Use the leaf linked list for traversal""" + current_leaf = birch_instance.dummy_leaf_.next_leaf_ + while current_leaf: + subclusters = current_leaf.subclusters_ + for sc in subclusters: + assert threshold >= sc.radius + current_leaf = current_leaf.next_leaf_ + + +def test_threshold(global_random_seed, global_dtype): + # Test that the leaf subclusters have a threshold lesser than radius + X, y = make_blobs(n_samples=80, centers=4, random_state=global_random_seed) + X = X.astype(global_dtype, copy=False) + brc = Birch(threshold=0.5, n_clusters=None) + brc.fit(X) + check_threshold(brc, 0.5) + + brc = Birch(threshold=5.0, n_clusters=None) + brc.fit(X) + check_threshold(brc, 5.0) + + +def test_birch_n_clusters_long_int(): + # Check that birch supports n_clusters with np.int64 dtype, for instance + # coming from np.arange. 
#16484 + X, _ = make_blobs(random_state=0) + n_clusters = np.int64(5) + Birch(n_clusters=n_clusters).fit(X) + + +def test_feature_names_out(): + """Check `get_feature_names_out` for `Birch`.""" + X, _ = make_blobs(n_samples=80, n_features=4, random_state=0) + brc = Birch(n_clusters=4) + brc.fit(X) + n_clusters = brc.subcluster_centers_.shape[0] + + names_out = brc.get_feature_names_out() + assert_array_equal([f"birch{i}" for i in range(n_clusters)], names_out) + + +def test_transform_match_across_dtypes(global_random_seed): + X, _ = make_blobs(n_samples=80, n_features=4, random_state=global_random_seed) + brc = Birch(n_clusters=4, threshold=1.1) + Y_64 = brc.fit_transform(X) + Y_32 = brc.fit_transform(X.astype(np.float32)) + + assert_allclose(Y_64, Y_32, atol=1e-6) + + +def test_subcluster_dtype(global_dtype): + X = make_blobs(n_samples=80, n_features=4, random_state=0)[0].astype( + global_dtype, copy=False + ) + brc = Birch(n_clusters=4) + assert brc.fit(X).subcluster_centers_.dtype == global_dtype + + +def test_both_subclusters_updated(): + """Check that both subclusters are updated when a node a split, even when there are + duplicated data points. Non-regression test for #23269. + """ + + X = np.array( + [ + [-2.6192791, -1.5053215], + [-2.9993038, -1.6863596], + [-2.3724914, -1.3438171], + [-2.336792, -1.3417323], + [-2.4089134, -1.3290224], + [-2.3724914, -1.3438171], + [-3.364009, -1.8846745], + [-2.3724914, -1.3438171], + [-2.617677, -1.5003285], + [-2.2960556, -1.3260119], + [-2.3724914, -1.3438171], + [-2.5459878, -1.4533926], + [-2.25979, -1.3003055], + [-2.4089134, -1.3290224], + [-2.3724914, -1.3438171], + [-2.4089134, -1.3290224], + [-2.5459878, -1.4533926], + [-2.3724914, -1.3438171], + [-2.9720619, -1.7058647], + [-2.336792, -1.3417323], + [-2.3724914, -1.3438171], + ], + dtype=np.float32, + ) + + # no error + Birch(branching_factor=5, threshold=1e-5, n_clusters=None).fit(X) + + +# TODO(1.8): Remove +def test_birch_copy_deprecated(): + X, _ = make_blobs(n_samples=80, n_features=4, random_state=0) + brc = Birch(n_clusters=4, copy=True) + with pytest.warns(FutureWarning, match="`copy` was deprecated"): + brc.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bisect_k_means.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bisect_k_means.py new file mode 100644 index 0000000000000000000000000000000000000000..799ddbc086ce0a14397fe5cb4aef607903c01228 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_bisect_k_means.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest + +from sklearn.cluster import BisectingKMeans +from sklearn.metrics import v_measure_score +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + + +@pytest.mark.parametrize("bisecting_strategy", ["biggest_inertia", "largest_cluster"]) +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_three_clusters(bisecting_strategy, init): + """Tries to perform bisect k-means for three clusters to check + if splitting data is performed correctly. 
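The scenario this docstring describes reduces to the following sketch on the same toy array used below; the three groups around [2, 1], [10, 1] and [10, 9] should be recovered.

import numpy as np
from sklearn.cluster import BisectingKMeans

X = np.array(
    [[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]]
)
bkm = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
print(bkm.cluster_centers_)  # approximately [2, 1], [10, 1], [10, 9]
print(bkm.labels_)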
+ """ + X = np.array( + [[1, 1], [10, 1], [3, 1], [10, 0], [2, 1], [10, 2], [10, 8], [10, 9], [10, 10]] + ) + bisect_means = BisectingKMeans( + n_clusters=3, + random_state=0, + bisecting_strategy=bisecting_strategy, + init=init, + ) + bisect_means.fit(X) + + expected_centers = [[2, 1], [10, 1], [10, 9]] + expected_labels = [0, 1, 0, 1, 0, 1, 2, 2, 2] + + assert_allclose( + sorted(expected_centers), sorted(bisect_means.cluster_centers_.tolist()) + ) + assert_allclose(v_measure_score(expected_labels, bisect_means.labels_), 1.0) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse(csr_container): + """Test Bisecting K-Means with sparse data. + + Checks if labels and centers are the same between dense and sparse. + """ + + rng = np.random.RandomState(0) + + X = rng.rand(20, 2) + X[X < 0.8] = 0 + X_csr = csr_container(X) + + bisect_means = BisectingKMeans(n_clusters=3, random_state=0) + + bisect_means.fit(X_csr) + sparse_centers = bisect_means.cluster_centers_ + + bisect_means.fit(X) + normal_centers = bisect_means.cluster_centers_ + + # Check if results is the same for dense and sparse data + assert_allclose(normal_centers, sparse_centers, atol=1e-8) + + +@pytest.mark.parametrize("n_clusters", [4, 5]) +def test_n_clusters(n_clusters): + """Test if resulting labels are in range [0, n_clusters - 1].""" + + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + bisect_means = BisectingKMeans(n_clusters=n_clusters, random_state=0) + bisect_means.fit(X) + + assert_array_equal(np.unique(bisect_means.labels_), np.arange(n_clusters)) + + +def test_one_cluster(): + """Test single cluster.""" + + X = np.array([[1, 2], [10, 2], [10, 8]]) + + bisect_means = BisectingKMeans(n_clusters=1, random_state=0).fit(X) + + # All labels from fit or predict should be equal 0 + assert all(bisect_means.labels_ == 0) + assert all(bisect_means.predict(X) == 0) + + assert_allclose(bisect_means.cluster_centers_, X.mean(axis=0).reshape(1, -1)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_fit_predict(csr_container): + """Check if labels from fit(X) method are same as from fit(X).predict(X).""" + rng = np.random.RandomState(0) + + X = rng.rand(10, 2) + + if csr_container is not None: + X[X < 0.8] = 0 + X = csr_container(X) + + bisect_means = BisectingKMeans(n_clusters=3, random_state=0) + bisect_means.fit(X) + + assert_array_equal(bisect_means.labels_, bisect_means.predict(X)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_dtype_preserved(csr_container, global_dtype): + """Check that centers dtype is the same as input data dtype.""" + rng = np.random.RandomState(0) + X = rng.rand(10, 2).astype(global_dtype, copy=False) + + if csr_container is not None: + X[X < 0.8] = 0 + X = csr_container(X) + + km = BisectingKMeans(n_clusters=3, random_state=0) + km.fit(X) + + assert km.cluster_centers_.dtype == global_dtype + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_float32_float64_equivalence(csr_container): + """Check that the results are the same between float32 and float64.""" + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + if csr_container is not None: + X[X < 0.8] = 0 + X = csr_container(X) + + km64 = BisectingKMeans(n_clusters=3, random_state=0).fit(X) + km32 = BisectingKMeans(n_clusters=3, random_state=0).fit(X.astype(np.float32)) + + assert_allclose(km32.cluster_centers_, km64.cluster_centers_) + assert_array_equal(km32.labels_, km64.labels_) + + 
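The dense/sparse equivalence checked above condenses into this sketch (same random data as the test):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import BisectingKMeans

rng = np.random.RandomState(0)
X = rng.rand(20, 2)
X[X < 0.8] = 0  # mostly zeros, so the CSR copy is genuinely sparse

dense = BisectingKMeans(n_clusters=3, random_state=0).fit(X).cluster_centers_
sparse = BisectingKMeans(n_clusters=3, random_state=0).fit(csr_matrix(X)).cluster_centers_
print(np.allclose(dense, sparse, atol=1e-8))  # expected: True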
+@pytest.mark.parametrize("algorithm", ("lloyd", "elkan")) +def test_no_crash_on_empty_bisections(algorithm): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/27081 + rng = np.random.RandomState(0) + X_train = rng.rand(3000, 10) + bkm = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(X_train) + + # predict on scaled data to trigger pathologic case + # where the inner mask leads to empty bisections. + X_test = 50 * rng.rand(100, 10) + labels = bkm.predict(X_test) # should not crash with idiv by 0 + assert np.isin(np.unique(labels), np.arange(10)).all() + + +def test_one_feature(): + # Check that no error is raised when there is only one feature + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/27236 + X = np.random.normal(size=(128, 1)) + BisectingKMeans(bisecting_strategy="biggest_inertia", random_state=0).fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_dbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_dbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..556f89312d2fc87ab962ab84551f4941ec8b359b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_dbscan.py @@ -0,0 +1,434 @@ +""" +Tests for DBSCAN clustering algorithm +""" + +import pickle +import warnings + +import numpy as np +import pytest +from scipy.spatial import distance + +from sklearn.cluster import DBSCAN, dbscan +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.neighbors import NearestNeighbors +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS + +n_clusters = 3 +X = generate_clustered_data(n_clusters=n_clusters) + + +def test_dbscan_similarity(): + # Tests the DBSCAN algorithm with a similarity array. + # Parameters chosen specifically for this task. + eps = 0.15 + min_samples = 10 + # Compute similarities + D = distance.squareform(distance.pdist(X)) + D /= np.max(D) + # Compute DBSCAN + core_samples, labels = dbscan( + D, metric="precomputed", eps=eps, min_samples=min_samples + ) + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0) + + assert n_clusters_1 == n_clusters + + db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples) + labels = db.fit(D).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_dbscan_feature(): + # Tests the DBSCAN algorithm with a feature vector array. + # Parameters chosen specifically for this task. + # Different eps to other test, because distance is not normalised. 
+ eps = 0.8 + min_samples = 10 + metric = "euclidean" + # Compute DBSCAN + # parameters chosen for task + core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples) + + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples) + labels = db.fit(X).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_dbscan_sparse(lil_container): + core_sparse, labels_sparse = dbscan(lil_container(X), eps=0.8, min_samples=10) + core_dense, labels_dense = dbscan(X, eps=0.8, min_samples=10) + assert_array_equal(core_dense, core_sparse) + assert_array_equal(labels_dense, labels_sparse) + + +@pytest.mark.parametrize("include_self", [False, True]) +def test_dbscan_sparse_precomputed(include_self): + D = pairwise_distances(X) + nn = NearestNeighbors(radius=0.9).fit(X) + X_ = X if include_self else None + D_sparse = nn.radius_neighbors_graph(X=X_, mode="distance") + # Ensure it is sparse not merely on diagonals: + assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1) + core_sparse, labels_sparse = dbscan( + D_sparse, eps=0.8, min_samples=10, metric="precomputed" + ) + core_dense, labels_dense = dbscan(D, eps=0.8, min_samples=10, metric="precomputed") + assert_array_equal(core_dense, core_sparse) + assert_array_equal(labels_dense, labels_sparse) + + +def test_dbscan_sparse_precomputed_different_eps(): + # test that precomputed neighbors graph is filtered if computed with + # a radius larger than DBSCAN's eps. + lower_eps = 0.2 + nn = NearestNeighbors(radius=lower_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode="distance") + dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric="precomputed") + + higher_eps = lower_eps + 0.7 + nn = NearestNeighbors(radius=higher_eps).fit(X) + D_sparse = nn.radius_neighbors_graph(X, mode="distance") + dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric="precomputed") + + assert_array_equal(dbscan_lower[0], dbscan_higher[0]) + assert_array_equal(dbscan_lower[1], dbscan_higher[1]) + + +@pytest.mark.parametrize("metric", ["precomputed", "minkowski"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_dbscan_input_not_modified(metric, csr_container): + # test that the input is not modified by dbscan + X = np.random.RandomState(0).rand(10, 10) + X = csr_container(X) if csr_container is not None else X + X_copy = X.copy() + dbscan(X, metric=metric) + + if csr_container is not None: + assert_array_equal(X.toarray(), X_copy.toarray()) + else: + assert_array_equal(X, X_copy) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dbscan_input_not_modified_precomputed_sparse_nodiag(csr_container): + """Check that we don't modify in-place the pre-computed sparse matrix. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27508 + """ + X = np.random.RandomState(0).rand(10, 10) + # Add zeros on the diagonal that will be implicit when creating + # the sparse matrix. If `X` is modified in-place, the zeros from + # the diagonal will be made explicit. + np.fill_diagonal(X, 0) + X = csr_container(X) + assert all(row != col for row, col in zip(*X.nonzero())) + X_copy = X.copy() + dbscan(X, metric="precomputed") + # Make sure that we did not modify `X` in-place even by creating + # explicit 0s values. 
+ assert X.nnz == X_copy.nnz + assert_array_equal(X.toarray(), X_copy.toarray()) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dbscan_no_core_samples(csr_container): + rng = np.random.RandomState(0) + X = rng.rand(40, 10) + X[X < 0.8] = 0 + + for X_ in [X, csr_container(X)]: + db = DBSCAN(min_samples=6).fit(X_) + assert_array_equal(db.components_, np.empty((0, X_.shape[1]))) + assert_array_equal(db.labels_, -1) + assert db.core_sample_indices_.shape == (0,) + + +def test_dbscan_callable(): + # Tests the DBSCAN algorithm with a callable metric. + # Parameters chosen specifically for this task. + # Different eps to other test, because distance is not normalised. + eps = 0.8 + min_samples = 10 + # metric is the function reference, not the string key. + metric = distance.euclidean + # Compute DBSCAN + # parameters chosen for task + core_samples, labels = dbscan( + X, metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree" + ) + + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + +def test_dbscan_metric_params(): + # Tests that DBSCAN works with the metrics_params argument. + eps = 0.8 + min_samples = 10 + p = 1 + + # Compute DBSCAN with metric_params arg + + with warnings.catch_warnings(record=True) as warns: + db = DBSCAN( + metric="minkowski", + metric_params={"p": p}, + eps=eps, + p=None, + min_samples=min_samples, + algorithm="ball_tree", + ).fit(X) + assert not warns, warns[0].message + core_sample_1, labels_1 = db.core_sample_indices_, db.labels_ + + # Test that sample labels are the same as passing Minkowski 'p' directly + db = DBSCAN( + metric="minkowski", eps=eps, min_samples=min_samples, algorithm="ball_tree", p=p + ).fit(X) + core_sample_2, labels_2 = db.core_sample_indices_, db.labels_ + + assert_array_equal(core_sample_1, core_sample_2) + assert_array_equal(labels_1, labels_2) + + # Minkowski with p=1 should be equivalent to Manhattan distance + db = DBSCAN( + metric="manhattan", eps=eps, min_samples=min_samples, algorithm="ball_tree" + ).fit(X) + core_sample_3, labels_3 = db.core_sample_indices_, db.labels_ + + assert_array_equal(core_sample_1, core_sample_3) + assert_array_equal(labels_1, labels_3) + + with pytest.warns( + SyntaxWarning, + match=( + "Parameter p is found in metric_params. " + "The corresponding parameter from __init__ " + "is ignored." + ), + ): + # Test that checks p is ignored in favor of metric_params={'p': } + db = DBSCAN( + metric="minkowski", + metric_params={"p": p}, + eps=eps, + p=p + 1, + min_samples=min_samples, + algorithm="ball_tree", + ).fit(X) + core_sample_4, labels_4 = db.core_sample_indices_, db.labels_ + + assert_array_equal(core_sample_1, core_sample_4) + assert_array_equal(labels_1, labels_4) + + +def test_dbscan_balltree(): + # Tests the DBSCAN algorithm with balltree for neighbor calculation. 
+ eps = 0.8 + min_samples = 10 + + D = pairwise_distances(X) + core_samples, labels = dbscan( + D, metric="precomputed", eps=eps, min_samples=min_samples + ) + + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(labels)) - int(-1 in labels) + assert n_clusters_1 == n_clusters + + db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_2 = len(set(labels)) - int(-1 in labels) + assert n_clusters_2 == n_clusters + + db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree") + labels = db.fit(X).labels_ + + n_clusters_3 = len(set(labels)) - int(-1 in labels) + assert n_clusters_3 == n_clusters + + db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_4 = len(set(labels)) - int(-1 in labels) + assert n_clusters_4 == n_clusters + + db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm="ball_tree") + labels = db.fit(X).labels_ + + n_clusters_5 = len(set(labels)) - int(-1 in labels) + assert n_clusters_5 == n_clusters + + +def test_input_validation(): + # DBSCAN.fit should accept a list of lists. + X = [[1.0, 2.0], [3.0, 4.0]] + DBSCAN().fit(X) # must not raise exception + + +def test_pickle(): + obj = DBSCAN() + s = pickle.dumps(obj) + assert type(pickle.loads(s)) is obj.__class__ + + +def test_boundaries(): + # ensure min_samples is inclusive of core point + core, _ = dbscan([[0], [1]], eps=2, min_samples=2) + assert 0 in core + # ensure eps is inclusive of circumference + core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2) + assert 0 in core + core, _ = dbscan([[0], [1], [1]], eps=0.99, min_samples=2) + assert 0 not in core + + +def test_weighted_dbscan(global_random_seed): + # ensure sample_weight is validated + with pytest.raises(ValueError): + dbscan([[0], [1]], sample_weight=[2]) + with pytest.raises(ValueError): + dbscan([[0], [1]], sample_weight=[2, 3, 4]) + + # ensure sample_weight has an effect + assert_array_equal([], dbscan([[0], [1]], sample_weight=None, min_samples=6)[0]) + assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5], min_samples=6)[0]) + assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5], min_samples=6)[0]) + assert_array_equal( + [0, 1], dbscan([[0], [1]], sample_weight=[6, 6], min_samples=6)[0] + ) + + # points within eps of each other: + assert_array_equal( + [0, 1], dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0] + ) + # and effect of non-positive and non-integer sample_weight: + assert_array_equal( + [], dbscan([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0] + ) + assert_array_equal( + [0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[0] + ) + assert_array_equal( + [0, 1], dbscan([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0] + ) + assert_array_equal( + [], dbscan([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[0] + ) + + # for non-negative sample_weight, cores should be identical to repetition + rng = np.random.RandomState(global_random_seed) + sample_weight = rng.randint(0, 5, X.shape[0]) + core1, label1 = dbscan(X, sample_weight=sample_weight) + assert len(label1) == len(X) + + X_repeated = np.repeat(X, sample_weight, axis=0) + core_repeated, label_repeated = dbscan(X_repeated) + core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool) + core_repeated_mask[core_repeated] = True + core_mask = np.zeros(X.shape[0], dtype=bool) + core_mask[core1] = True + 
assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask) + + # sample_weight should work with precomputed distance matrix + D = pairwise_distances(X) + core3, label3 = dbscan(D, sample_weight=sample_weight, metric="precomputed") + assert_array_equal(core1, core3) + assert_array_equal(label1, label3) + + # sample_weight should work with estimator + est = DBSCAN().fit(X, sample_weight=sample_weight) + core4 = est.core_sample_indices_ + label4 = est.labels_ + assert_array_equal(core1, core4) + assert_array_equal(label1, label4) + + est = DBSCAN() + label5 = est.fit_predict(X, sample_weight=sample_weight) + core5 = est.core_sample_indices_ + assert_array_equal(core1, core5) + assert_array_equal(label1, label5) + assert_array_equal(label1, est.labels_) + + +@pytest.mark.parametrize("algorithm", ["brute", "kd_tree", "ball_tree"]) +def test_dbscan_core_samples_toy(algorithm): + X = [[0], [2], [3], [4], [6], [8], [10]] + n_samples = len(X) + + # Degenerate case: every sample is a core sample, either with its own + # cluster or including other close core samples. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=1) + assert_array_equal(core_samples, np.arange(n_samples)) + assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4]) + + # With eps=1 and min_samples=2 only the 3 samples from the denser area + # are core samples. All other points are isolated and considered noise. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=2) + assert_array_equal(core_samples, [1, 2, 3]) + assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) + + # Only the sample in the middle of the dense area is core. Its two + # neighbors are edge samples. Remaining samples are noise. + core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=3) + assert_array_equal(core_samples, [2]) + assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1]) + + # It's no longer possible to extract core samples with eps=1: + # everything is noise. 
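The sample_weight semantics exercised above, and the core/noise behaviour of the toy example that continues just below, come down to a weighted count of neighbours within eps; a brief sketch:

from sklearn.cluster import dbscan

# Two points one unit apart; min_samples counts weighted mass within eps.
core, labels = dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)
print(core)  # [0 1]: the combined weight 5 + 1 reaches min_samples
core, labels = dbscan([[0], [1]], eps=1.5, sample_weight=[5, 0], min_samples=6)
print(core)  # []: a zero-weight neighbour no longer contributes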
+ core_samples, labels = dbscan(X, algorithm=algorithm, eps=1, min_samples=4) + assert_array_equal(core_samples, []) + assert_array_equal(labels, np.full(n_samples, -1.0)) + + +def test_dbscan_precomputed_metric_with_degenerate_input_arrays(): + # see https://github.com/scikit-learn/scikit-learn/issues/4641 for + # more details + X = np.eye(10) + labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_ + assert len(set(labels)) == 1 + + X = np.zeros((10, 10)) + labels = DBSCAN(eps=0.5, metric="precomputed").fit(X).labels_ + assert len(set(labels)) == 1 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dbscan_precomputed_metric_with_initial_rows_zero(csr_container): + # sample matrix with initial two row all zero + ar = np.array( + [ + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0], + [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1], + [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0], + ] + ) + matrix = csr_container(ar) + labels = DBSCAN(eps=0.2, metric="precomputed", min_samples=2).fit(matrix).labels_ + assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_feature_agglomeration.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_feature_agglomeration.py new file mode 100644 index 0000000000000000000000000000000000000000..80aa251c358153b0771bd201067fa87f8fb6bfdc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_feature_agglomeration.py @@ -0,0 +1,55 @@ +""" +Tests for sklearn.cluster._feature_agglomeration +""" + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.cluster import FeatureAgglomeration +from sklearn.datasets import make_blobs +from sklearn.utils._testing import assert_array_almost_equal + + +def test_feature_agglomeration(): + n_clusters = 1 + X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features) + + agglo_mean = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.mean) + agglo_median = FeatureAgglomeration(n_clusters=n_clusters, pooling_func=np.median) + agglo_mean.fit(X) + agglo_median.fit(X) + + assert np.size(np.unique(agglo_mean.labels_)) == n_clusters + assert np.size(np.unique(agglo_median.labels_)) == n_clusters + assert np.size(agglo_mean.labels_) == X.shape[1] + assert np.size(agglo_median.labels_) == X.shape[1] + + # Test transform + Xt_mean = agglo_mean.transform(X) + Xt_median = agglo_median.transform(X) + assert Xt_mean.shape[1] == n_clusters + assert Xt_median.shape[1] == n_clusters + assert Xt_mean == np.array([1 / 3.0]) + assert Xt_median == np.array([0.0]) + + # Test inverse transform + X_full_mean = agglo_mean.inverse_transform(Xt_mean) + X_full_median = agglo_median.inverse_transform(Xt_median) + assert np.unique(X_full_mean[0]).size == n_clusters + assert np.unique(X_full_median[0]).size == n_clusters + + assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean) + assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median) + + +def test_feature_agglomeration_feature_names_out(): + """Check `get_feature_names_out` for `FeatureAgglomeration`.""" + X, _ = make_blobs(n_features=6, random_state=0) + agglo = FeatureAgglomeration(n_clusters=3) + agglo.fit(X) + n_clusters = agglo.n_clusters_ + + names_out = agglo.get_feature_names_out() + assert_array_equal( + [f"featureagglomeration{i}" for i in range(n_clusters)], names_out 
+ ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..3b45d9d3cb7aa290e7fac62f359ac518d105579e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hdbscan.py @@ -0,0 +1,582 @@ +""" +Tests for HDBSCAN clustering algorithm +Based on the DBSCAN test code +""" + +import numpy as np +import pytest +from scipy import stats +from scipy.spatial import distance + +from sklearn.cluster import HDBSCAN +from sklearn.cluster._hdbscan._tree import ( + CONDENSED_dtype, + _condense_tree, + _do_labelling, +) +from sklearn.cluster._hdbscan.hdbscan import _OUTLIER_ENCODING +from sklearn.datasets import make_blobs +from sklearn.metrics import fowlkes_mallows_score +from sklearn.metrics.pairwise import _VALID_METRICS, euclidean_distances +from sklearn.neighbors import BallTree, KDTree +from sklearn.preprocessing import StandardScaler +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +X, y = make_blobs(n_samples=200, random_state=10) +X, y = shuffle(X, y, random_state=7) +X = StandardScaler().fit_transform(X) + +ALGORITHMS = [ + "kd_tree", + "ball_tree", + "brute", + "auto", +] + +OUTLIER_SET = {-1} | {out["label"] for _, out in _OUTLIER_ENCODING.items()} + + +def check_label_quality(labels, threshold=0.99): + n_clusters = len(set(labels) - OUTLIER_SET) + assert n_clusters == 3 + assert fowlkes_mallows_score(labels, y) > threshold + + +@pytest.mark.parametrize("outlier_type", _OUTLIER_ENCODING) +def test_outlier_data(outlier_type): + """ + Tests if np.inf and np.nan data are each treated as special outliers. + """ + outlier = { + "infinite": np.inf, + "missing": np.nan, + }[outlier_type] + prob_check = { + "infinite": lambda x, y: x == y, + "missing": lambda x, y: np.isnan(x), + }[outlier_type] + label = _OUTLIER_ENCODING[outlier_type]["label"] + prob = _OUTLIER_ENCODING[outlier_type]["prob"] + + X_outlier = X.copy() + X_outlier[0] = [outlier, 1] + X_outlier[5] = [outlier, outlier] + model = HDBSCAN().fit(X_outlier) + + (missing_labels_idx,) = (model.labels_ == label).nonzero() + assert_array_equal(missing_labels_idx, [0, 5]) + + (missing_probs_idx,) = (prob_check(model.probabilities_, prob)).nonzero() + assert_array_equal(missing_probs_idx, [0, 5]) + + clean_indices = list(range(1, 5)) + list(range(6, 200)) + clean_model = HDBSCAN().fit(X_outlier[clean_indices]) + assert_array_equal(clean_model.labels_, model.labels_[clean_indices]) + + +def test_hdbscan_distance_matrix(): + """ + Tests that HDBSCAN works with precomputed distance matrices, and throws the + appropriate errors when needed. 
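A compact version of the precomputed-distance usage this test describes, mirroring the module-level fixture:

from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(n_samples=200, random_state=10)
X = StandardScaler().fit_transform(X)

D = euclidean_distances(X)  # full pairwise distance matrix
labels = HDBSCAN(metric="precomputed").fit_predict(D)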
+ """ + D = euclidean_distances(X) + D_original = D.copy() + labels = HDBSCAN(metric="precomputed", copy=True).fit_predict(D) + + assert_allclose(D, D_original) + check_label_quality(labels) + + msg = r"The precomputed distance matrix.*has shape" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed", copy=True).fit_predict(X) + + msg = r"The precomputed distance matrix.*values" + # Ensure the matrix is not symmetric + D[0, 1] = 10 + D[1, 0] = 1 + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit_predict(D) + + +@pytest.mark.parametrize("sparse_constructor", [*CSR_CONTAINERS, *CSC_CONTAINERS]) +def test_hdbscan_sparse_distance_matrix(sparse_constructor): + """ + Tests that HDBSCAN works with sparse distance matrices. + """ + D = distance.squareform(distance.pdist(X)) + D /= np.max(D) + + threshold = stats.scoreatpercentile(D.flatten(), 50) + + D[D >= threshold] = 0.0 + D = sparse_constructor(D) + D.eliminate_zeros() + + labels = HDBSCAN(metric="precomputed").fit_predict(D) + check_label_quality(labels) + + +def test_hdbscan_feature_array(): + """ + Tests that HDBSCAN works with feature array, including an arbitrary + goodness of fit check. Note that the check is a simple heuristic. + """ + labels = HDBSCAN().fit_predict(X) + + # Check that clustering is arbitrarily good + # This is a heuristic to guard against regression + check_label_quality(labels) + + +@pytest.mark.parametrize("algo", ALGORITHMS) +@pytest.mark.parametrize("metric", _VALID_METRICS) +def test_hdbscan_algorithms(algo, metric): + """ + Tests that HDBSCAN works with the expected combinations of algorithms and + metrics, or raises the expected errors. + """ + labels = HDBSCAN(algorithm=algo).fit_predict(X) + check_label_quality(labels) + + # Validation for brute is handled by `pairwise_distances` + if algo in ("brute", "auto"): + return + + ALGOS_TREES = { + "kd_tree": KDTree, + "ball_tree": BallTree, + } + metric_params = { + "mahalanobis": {"V": np.eye(X.shape[1])}, + "seuclidean": {"V": np.ones(X.shape[1])}, + "minkowski": {"p": 2}, + "wminkowski": {"p": 2, "w": np.ones(X.shape[1])}, + }.get(metric, None) + + hdb = HDBSCAN( + algorithm=algo, + metric=metric, + metric_params=metric_params, + ) + + if metric not in ALGOS_TREES[algo].valid_metrics: + with pytest.raises(ValueError): + hdb.fit(X) + elif metric == "wminkowski": + with pytest.warns(FutureWarning): + hdb.fit(X) + else: + hdb.fit(X) + + +def test_dbscan_clustering(): + """ + Tests that HDBSCAN can generate a sufficiently accurate dbscan clustering. + This test is more of a sanity check than a rigorous evaluation. + """ + clusterer = HDBSCAN().fit(X) + labels = clusterer.dbscan_clustering(0.3) + + # We use a looser threshold due to dbscan producing a more constrained + # clustering representation + check_label_quality(labels, threshold=0.92) + + +@pytest.mark.parametrize("cut_distance", (0.1, 0.5, 1)) +def test_dbscan_clustering_outlier_data(cut_distance): + """ + Tests if np.inf and np.nan data are each treated as special outliers. 
+ """ + missing_label = _OUTLIER_ENCODING["missing"]["label"] + infinite_label = _OUTLIER_ENCODING["infinite"]["label"] + + X_outlier = X.copy() + X_outlier[0] = [np.inf, 1] + X_outlier[2] = [1, np.nan] + X_outlier[5] = [np.inf, np.nan] + model = HDBSCAN().fit(X_outlier) + labels = model.dbscan_clustering(cut_distance=cut_distance) + + missing_labels_idx = np.flatnonzero(labels == missing_label) + assert_array_equal(missing_labels_idx, [2, 5]) + + infinite_labels_idx = np.flatnonzero(labels == infinite_label) + assert_array_equal(infinite_labels_idx, [0]) + + clean_idx = list(set(range(200)) - set(missing_labels_idx + infinite_labels_idx)) + clean_model = HDBSCAN().fit(X_outlier[clean_idx]) + clean_labels = clean_model.dbscan_clustering(cut_distance=cut_distance) + assert_array_equal(clean_labels, labels[clean_idx]) + + +def test_hdbscan_best_balltree_metric(): + """ + Tests that HDBSCAN using `BallTree` works. + """ + labels = HDBSCAN( + metric="seuclidean", metric_params={"V": np.ones(X.shape[1])} + ).fit_predict(X) + check_label_quality(labels) + + +def test_hdbscan_no_clusters(): + """ + Tests that HDBSCAN correctly does not generate a valid cluster when the + `min_cluster_size` is too large for the data. + """ + labels = HDBSCAN(min_cluster_size=len(X) - 1).fit_predict(X) + assert set(labels).issubset(OUTLIER_SET) + + +def test_hdbscan_min_cluster_size(): + """ + Test that the smallest non-noise cluster has at least `min_cluster_size` + many points + """ + for min_cluster_size in range(2, len(X), 1): + labels = HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(X) + true_labels = [label for label in labels if label != -1] + if len(true_labels) != 0: + assert np.min(np.bincount(true_labels)) >= min_cluster_size + + +def test_hdbscan_callable_metric(): + """ + Tests that HDBSCAN works when passed a callable metric. + """ + metric = distance.euclidean + labels = HDBSCAN(metric=metric).fit_predict(X) + check_label_quality(labels) + + +@pytest.mark.parametrize("tree", ["kd_tree", "ball_tree"]) +def test_hdbscan_precomputed_non_brute(tree): + """ + Tests that HDBSCAN correctly raises an error when passing precomputed data + while requesting a tree-based algorithm. + """ + hdb = HDBSCAN(metric="precomputed", algorithm=tree) + msg = "precomputed is not a valid metric for" + with pytest.raises(ValueError, match=msg): + hdb.fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_hdbscan_sparse(csr_container): + """ + Tests that HDBSCAN works correctly when passing sparse feature data. + Evaluates correctness by comparing against the same data passed as a dense + array. + """ + + dense_labels = HDBSCAN().fit(X).labels_ + check_label_quality(dense_labels) + + _X_sparse = csr_container(X) + X_sparse = _X_sparse.copy() + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) + + # Compare that the sparse and dense non-precomputed routines return the same labels + # where the 0th observation contains the outlier. + for outlier_val, outlier_type in ((np.inf, "infinite"), (np.nan, "missing")): + X_dense = X.copy() + X_dense[0, 0] = outlier_val + dense_labels = HDBSCAN().fit(X_dense).labels_ + check_label_quality(dense_labels) + assert dense_labels[0] == _OUTLIER_ENCODING[outlier_type]["label"] + + X_sparse = _X_sparse.copy() + X_sparse[0, 0] = outlier_val + sparse_labels = HDBSCAN().fit(X_sparse).labels_ + assert_array_equal(dense_labels, sparse_labels) + + msg = "Sparse data matrices only support algorithm `brute`." 
+ with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="euclidean", algorithm="ball_tree").fit(X_sparse) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_hdbscan_centers(algorithm): + """ + Tests that HDBSCAN centers are calculated and stored properly, and are + accurate to the data. + """ + centers = [(0.0, 0.0), (3.0, 3.0)] + H, _ = make_blobs(n_samples=2000, random_state=0, centers=centers, cluster_std=0.5) + hdb = HDBSCAN(store_centers="both").fit(H) + + for center, centroid, medoid in zip(centers, hdb.centroids_, hdb.medoids_): + assert_allclose(center, centroid, rtol=1, atol=0.05) + assert_allclose(center, medoid, rtol=1, atol=0.05) + + # Ensure that nothing is done for noise + hdb = HDBSCAN( + algorithm=algorithm, store_centers="both", min_cluster_size=X.shape[0] + ).fit(X) + assert hdb.centroids_.shape[0] == 0 + assert hdb.medoids_.shape[0] == 0 + + +def test_hdbscan_allow_single_cluster_with_epsilon(): + """ + Tests that HDBSCAN single-cluster selection with epsilon works correctly. + """ + rng = np.random.RandomState(0) + no_structure = rng.rand(150, 2) + # without epsilon we should see many noise points as children of root. + labels = HDBSCAN( + min_cluster_size=5, + cluster_selection_epsilon=0.0, + cluster_selection_method="eom", + allow_single_cluster=True, + ).fit_predict(no_structure) + unique_labels, counts = np.unique(labels, return_counts=True) + assert len(unique_labels) == 2 + + # Arbitrary heuristic. Would prefer something more precise. + assert counts[unique_labels == -1] > 30 + + # for this random seed an epsilon of 0.18 will produce exactly 2 noise + # points at that cut in single linkage. + labels = HDBSCAN( + min_cluster_size=5, + cluster_selection_epsilon=0.18, + cluster_selection_method="eom", + allow_single_cluster=True, + algorithm="kd_tree", + ).fit_predict(no_structure) + unique_labels, counts = np.unique(labels, return_counts=True) + assert len(unique_labels) == 2 + assert counts[unique_labels == -1] == 2 + + +def test_hdbscan_better_than_dbscan(): + """ + Validate that HDBSCAN can properly cluster this difficult synthetic + dataset. Note that DBSCAN fails on this (see HDBSCAN plotting + example) + """ + centers = [[-0.85, -0.85], [-0.85, 0.85], [3, 3], [3, -3]] + X, y = make_blobs( + n_samples=750, + centers=centers, + cluster_std=[0.2, 0.35, 1.35, 1.35], + random_state=0, + ) + labels = HDBSCAN().fit(X).labels_ + + n_clusters = len(set(labels)) - int(-1 in labels) + assert n_clusters == 4 + fowlkes_mallows_score(labels, y) > 0.99 + + +@pytest.mark.parametrize( + "kwargs, X", + [ + ({"metric": "precomputed"}, np.array([[1, np.inf], [np.inf, 1]])), + ({"metric": "precomputed"}, [[1, 2], [2, 1]]), + ({}, [[1, 2], [3, 4]]), + ], +) +def test_hdbscan_usable_inputs(X, kwargs): + """ + Tests that HDBSCAN works correctly for array-likes and precomputed inputs + with non-finite points. + """ + HDBSCAN(min_samples=1, **kwargs).fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_hdbscan_sparse_distances_too_few_nonzero(csr_container): + """ + Tests that HDBSCAN raises the correct error when there are too few + non-zero distances. 
+ """ + X = csr_container(np.zeros((10, 10))) + + msg = "There exists points with fewer than" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit(X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_hdbscan_sparse_distances_disconnected_graph(csr_container): + """ + Tests that HDBSCAN raises the correct error when the distance matrix + has multiple connected components. + """ + # Create symmetric sparse matrix with 2 connected components + X = np.zeros((20, 20)) + X[:5, :5] = 1 + X[5:, 15:] = 1 + X = X + X.T + X = csr_container(X) + msg = "HDBSCAN cannot be performed on a disconnected graph" + with pytest.raises(ValueError, match=msg): + HDBSCAN(metric="precomputed").fit(X) + + +def test_hdbscan_tree_invalid_metric(): + """ + Tests that HDBSCAN correctly raises an error for invalid metric choices. + """ + metric_callable = lambda x: x + msg = ( + ".* is not a valid metric for a .*-based algorithm\\. Please select a different" + " metric\\." + ) + + # Callables are not supported for either + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="kd_tree", metric=metric_callable).fit(X) + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="ball_tree", metric=metric_callable).fit(X) + + # The set of valid metrics for KDTree at the time of writing this test is a + # strict subset of those supported in BallTree + metrics_not_kd = list(set(BallTree.valid_metrics) - set(KDTree.valid_metrics)) + if len(metrics_not_kd) > 0: + with pytest.raises(ValueError, match=msg): + HDBSCAN(algorithm="kd_tree", metric=metrics_not_kd[0]).fit(X) + + +def test_hdbscan_too_many_min_samples(): + """ + Tests that HDBSCAN correctly raises an error when setting `min_samples` + larger than the number of samples. + """ + hdb = HDBSCAN(min_samples=len(X) + 1) + msg = r"min_samples (.*) must be at most" + with pytest.raises(ValueError, match=msg): + hdb.fit(X) + + +def test_hdbscan_precomputed_dense_nan(): + """ + Tests that HDBSCAN correctly raises an error when providing precomputed + distances with `np.nan` values. + """ + X_nan = X.copy() + X_nan[0, 0] = np.nan + msg = "np.nan values found in precomputed-dense" + hdb = HDBSCAN(metric="precomputed") + with pytest.raises(ValueError, match=msg): + hdb.fit(X_nan) + + +@pytest.mark.parametrize("allow_single_cluster", [True, False]) +@pytest.mark.parametrize("epsilon", [0, 0.1]) +def test_labelling_distinct(global_random_seed, allow_single_cluster, epsilon): + """ + Tests that the `_do_labelling` helper function correctly assigns labels. 
+ """ + n_samples = 48 + X, y = make_blobs( + n_samples, + random_state=global_random_seed, + # Ensure the clusters are distinct with no overlap + centers=[ + [0, 0], + [10, 0], + [0, 10], + ], + ) + + est = HDBSCAN().fit(X) + condensed_tree = _condense_tree( + est._single_linkage_tree_, min_cluster_size=est.min_cluster_size + ) + clusters = {n_samples + 2, n_samples + 3, n_samples + 4} + cluster_label_map = {n_samples + 2: 0, n_samples + 3: 1, n_samples + 4: 2} + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters=clusters, + cluster_label_map=cluster_label_map, + allow_single_cluster=allow_single_cluster, + cluster_selection_epsilon=epsilon, + ) + + first_with_label = {_y: np.where(y == _y)[0][0] for _y in list(set(y))} + y_to_labels = {_y: labels[first_with_label[_y]] for _y in list(set(y))} + aligned_target = np.vectorize(y_to_labels.get)(y) + assert_array_equal(labels, aligned_target) + + +def test_labelling_thresholding(): + """ + Tests that the `_do_labelling` helper function correctly thresholds the + incoming lambda values given various `cluster_selection_epsilon` values. + """ + n_samples = 5 + MAX_LAMBDA = 1.5 + condensed_tree = np.array( + [ + (5, 2, MAX_LAMBDA, 1), + (5, 1, 0.1, 1), + (5, 0, MAX_LAMBDA, 1), + (5, 3, 0.2, 1), + (5, 4, 0.3, 1), + ], + dtype=CONDENSED_dtype, + ) + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters={n_samples}, + cluster_label_map={n_samples: 0, n_samples + 1: 1}, + allow_single_cluster=True, + cluster_selection_epsilon=1, + ) + num_noise = condensed_tree["value"] < 1 + assert sum(num_noise) == sum(labels == -1) + + labels = _do_labelling( + condensed_tree=condensed_tree, + clusters={n_samples}, + cluster_label_map={n_samples: 0, n_samples + 1: 1}, + allow_single_cluster=True, + cluster_selection_epsilon=0, + ) + # The threshold should be calculated per-sample based on the largest + # lambda of any simbling node. In this case, all points are siblings + # and the largest value is exactly MAX_LAMBDA. + num_noise = condensed_tree["value"] < MAX_LAMBDA + assert sum(num_noise) == sum(labels == -1) + + +@pytest.mark.parametrize("store_centers", ["centroid", "medoid"]) +def test_hdbscan_error_precomputed_and_store_centers(store_centers): + """Check that we raise an error if the centers are requested together with + a precomputed input matrix. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27893 + """ + rng = np.random.RandomState(0) + X = rng.random((100, 2)) + X_dist = euclidean_distances(X) + err_msg = "Cannot store centers when using a precomputed distance matrix." + with pytest.raises(ValueError, match=err_msg): + HDBSCAN(metric="precomputed", store_centers=store_centers).fit(X_dist) + + +@pytest.mark.parametrize("valid_algo", ["auto", "brute"]) +def test_hdbscan_cosine_metric_valid_algorithm(valid_algo): + """Test that HDBSCAN works with the "cosine" metric when the algorithm is set + to "brute" or "auto". + + Non-regression test for issue #28631 + """ + HDBSCAN(metric="cosine", algorithm=valid_algo).fit_predict(X) + + +@pytest.mark.parametrize("invalid_algo", ["kd_tree", "ball_tree"]) +def test_hdbscan_cosine_metric_invalid_algorithm(invalid_algo): + """Test that HDBSCAN raises an informative error is raised when an unsupported + algorithm is used with the "cosine" metric. 
+ """ + hdbscan = HDBSCAN(metric="cosine", algorithm=invalid_algo) + with pytest.raises(ValueError, match="cosine is not a valid metric"): + hdbscan.fit_predict(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hierarchical.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hierarchical.py new file mode 100644 index 0000000000000000000000000000000000000000..222d4f6cd92649b9d59cb3f69f3d350414493984 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_hierarchical.py @@ -0,0 +1,889 @@ +""" +Several basic tests for hierarchical clustering procedures + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import shutil +from functools import partial +from tempfile import mkdtemp + +import numpy as np +import pytest +from scipy.cluster import hierarchy +from scipy.sparse.csgraph import connected_components + +from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration, ward_tree +from sklearn.cluster._agglomerative import ( + _TREE_BUILDERS, + _fix_connectivity, + _hc_cut, + linkage_tree, +) +from sklearn.cluster._hierarchical_fast import ( + average_merge, + max_merge, + mst_linkage_core, +) +from sklearn.datasets import make_circles, make_moons +from sklearn.feature_extraction.image import grid_to_graph +from sklearn.metrics import DistanceMetric +from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score +from sklearn.metrics.pairwise import ( + PAIRED_DISTANCES, + cosine_distances, + manhattan_distances, + pairwise_distances, +) +from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.neighbors import kneighbors_graph +from sklearn.utils._fast_dict import IntFloatDict +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + create_memmap_backed_data, + ignore_warnings, +) +from sklearn.utils.fixes import LIL_CONTAINERS + + +def test_linkage_misc(): + # Misc tests on linkage + rng = np.random.RandomState(42) + X = rng.normal(size=(5, 5)) + + with pytest.raises(ValueError): + linkage_tree(X, linkage="foo") + + with pytest.raises(ValueError): + linkage_tree(X, connectivity=np.ones((4, 4))) + + # Smoke test FeatureAgglomeration + FeatureAgglomeration().fit(X) + + # test hierarchical clustering on a precomputed distances matrix + dis = cosine_distances(X) + + res = linkage_tree(dis, affinity="precomputed") + assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0]) + + # test hierarchical clustering on a precomputed distances matrix + res = linkage_tree(X, affinity=manhattan_distances) + assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0]) + + +def test_structured_linkage_tree(): + # Check that we obtain the correct solution for structured linkage trees. 
+ rng = np.random.RandomState(0) + mask = np.ones([10, 10], dtype=bool) + # Avoiding a mask with only 'True' entries + mask[4:7, 4:7] = 0 + X = rng.randn(50, 100) + connectivity = grid_to_graph(*mask.shape) + for tree_builder in _TREE_BUILDERS.values(): + children, n_components, n_leaves, parent = tree_builder( + X.T, connectivity=connectivity + ) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + # Check that ward_tree raises a ValueError with a connectivity matrix + # of the wrong shape + with pytest.raises(ValueError): + tree_builder(X.T, connectivity=np.ones((4, 4))) + # Check that fitting with no samples raises an error + with pytest.raises(ValueError): + tree_builder(X.T[:0], connectivity=connectivity) + + +def test_unstructured_linkage_tree(): + # Check that we obtain the correct solution for unstructured linkage trees. + rng = np.random.RandomState(0) + X = rng.randn(50, 100) + for this_X in (X, X[0]): + # With specified a number of clusters just for the sake of + # raising a warning and testing the warning code + with ignore_warnings(): + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = ward_tree(this_X.T, n_clusters=10) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + + for tree_builder in _TREE_BUILDERS.values(): + for this_X in (X, X[0]): + with ignore_warnings(): + with pytest.warns(UserWarning): + children, n_nodes, n_leaves, parent = tree_builder( + this_X.T, n_clusters=10 + ) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + + +def test_height_linkage_tree(): + # Check that the height of the results of linkage tree is sorted. + rng = np.random.RandomState(0) + mask = np.ones([10, 10], dtype=bool) + X = rng.randn(50, 100) + connectivity = grid_to_graph(*mask.shape) + for linkage_func in _TREE_BUILDERS.values(): + children, n_nodes, n_leaves, parent = linkage_func( + X.T, connectivity=connectivity + ) + n_nodes = 2 * X.shape[1] - 1 + assert len(children) + n_leaves == n_nodes + + +def test_zero_cosine_linkage_tree(): + # Check that zero vectors in X produce an error when + # 'cosine' affinity is used + X = np.array([[0, 1], [0, 0]]) + msg = "Cosine affinity cannot be used when X contains zero vectors" + with pytest.raises(ValueError, match=msg): + linkage_tree(X, affinity="cosine") + + +@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)]) +@pytest.mark.parametrize("compute_distances", [True, False]) +@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"]) +def test_agglomerative_clustering_distances( + n_clusters, compute_distances, distance_threshold, linkage +): + # Check that when `compute_distances` is True or `distance_threshold` is + # given, the fitted model has an attribute `distances_`. 
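+ # Background: `distances_` stores the linkage distance of each merge, so a
+ # fully built tree over n samples has n - 1 entries (the shape asserted
+ # below). The attribute is only kept when requested via
+ # `compute_distances=True` or implied by a `distance_threshold`, since
+ # storing it adds overhead.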
+ rng = np.random.RandomState(0) + mask = np.ones([10, 10], dtype=bool) + n_samples = 100 + X = rng.randn(n_samples, 50) + connectivity = grid_to_graph(*mask.shape) + + clustering = AgglomerativeClustering( + n_clusters=n_clusters, + connectivity=connectivity, + linkage=linkage, + distance_threshold=distance_threshold, + compute_distances=compute_distances, + ) + clustering.fit(X) + if compute_distances or (distance_threshold is not None): + assert hasattr(clustering, "distances_") + n_children = clustering.children_.shape[0] + n_nodes = n_children + 1 + assert clustering.distances_.shape == (n_nodes - 1,) + else: + assert not hasattr(clustering, "distances_") + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_agglomerative_clustering(global_random_seed, lil_container): + # Check that we obtain the correct number of clusters with + # agglomerative clustering. + rng = np.random.RandomState(global_random_seed) + mask = np.ones([10, 10], dtype=bool) + n_samples = 100 + X = rng.randn(n_samples, 50) + connectivity = grid_to_graph(*mask.shape) + for linkage in ("ward", "complete", "average", "single"): + clustering = AgglomerativeClustering( + n_clusters=10, connectivity=connectivity, linkage=linkage + ) + clustering.fit(X) + # test caching + try: + tempdir = mkdtemp() + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=connectivity, + memory=tempdir, + linkage=linkage, + ) + clustering.fit(X) + labels = clustering.labels_ + assert np.size(np.unique(labels)) == 10 + finally: + shutil.rmtree(tempdir) + # Turn caching off now + clustering = AgglomerativeClustering( + n_clusters=10, connectivity=connectivity, linkage=linkage + ) + # Check that we obtain the same solution with early-stopping of the + # tree building + clustering.compute_full_tree = False + clustering.fit(X) + assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) + clustering.connectivity = None + clustering.fit(X) + assert np.size(np.unique(clustering.labels_)) == 10 + # Check that we raise a TypeError on dense matrices + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=lil_container(connectivity.toarray()[:10, :10]), + linkage=linkage, + ) + with pytest.raises(ValueError): + clustering.fit(X) + + # Test that using ward with another metric than euclidean raises an + # exception + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=connectivity.toarray(), + metric="manhattan", + linkage="ward", + ) + with pytest.raises(ValueError): + clustering.fit(X) + + # Test using another metric than euclidean works with linkage complete + for metric in PAIRED_DISTANCES.keys(): + # Compare our (structured) implementation to scipy + clustering = AgglomerativeClustering( + n_clusters=10, + connectivity=np.ones((n_samples, n_samples)), + metric=metric, + linkage="complete", + ) + clustering.fit(X) + clustering2 = AgglomerativeClustering( + n_clusters=10, connectivity=None, metric=metric, linkage="complete" + ) + clustering2.fit(X) + assert_almost_equal( + normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1 + ) + + # Test that using a distance matrix (affinity = 'precomputed') has same + # results (with connectivity constraints) + clustering = AgglomerativeClustering( + n_clusters=10, connectivity=connectivity, linkage="complete" + ) + clustering.fit(X) + X_dist = pairwise_distances(X) + clustering2 = AgglomerativeClustering( + n_clusters=10, + connectivity=connectivity, + metric="precomputed", + linkage="complete", + ) + 
clustering2.fit(X_dist) + assert_array_equal(clustering.labels_, clustering2.labels_) + + +def test_agglomerative_clustering_memory_mapped(): + """AgglomerativeClustering must work on mem-mapped dataset. + + Non-regression test for issue #19875. + """ + rng = np.random.RandomState(0) + Xmm = create_memmap_backed_data(rng.randn(50, 100)) + AgglomerativeClustering(metric="euclidean", linkage="single").fit(Xmm) + + +def test_ward_agglomeration(global_random_seed): + # Check that we obtain the correct solution in a simplistic case + rng = np.random.RandomState(global_random_seed) + mask = np.ones([10, 10], dtype=bool) + X = rng.randn(50, 100) + connectivity = grid_to_graph(*mask.shape) + agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity) + agglo.fit(X) + assert np.size(np.unique(agglo.labels_)) == 5 + + X_red = agglo.transform(X) + assert X_red.shape[1] == 5 + X_full = agglo.inverse_transform(X_red) + assert np.unique(X_full[0]).size == 5 + assert_array_almost_equal(agglo.transform(X_full), X_red) + + # Check that fitting with no samples raises a ValueError + with pytest.raises(ValueError): + agglo.fit(X[:0]) + + +def test_single_linkage_clustering(): + # Check that we get the correct result in two emblematic cases + moons, moon_labels = make_moons(noise=0.05, random_state=42) + clustering = AgglomerativeClustering(n_clusters=2, linkage="single") + clustering.fit(moons) + assert_almost_equal( + normalized_mutual_info_score(clustering.labels_, moon_labels), 1 + ) + + circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42) + clustering = AgglomerativeClustering(n_clusters=2, linkage="single") + clustering.fit(circles) + assert_almost_equal( + normalized_mutual_info_score(clustering.labels_, circle_labels), 1 + ) + + +def assess_same_labelling(cut1, cut2): + """Util for comparison with scipy""" + co_clust = [] + for cut in [cut1, cut2]: + n = len(cut) + k = cut.max() + 1 + ecut = np.zeros((n, k)) + ecut[np.arange(n), cut] = 1 + co_clust.append(np.dot(ecut, ecut.T)) + assert (co_clust[0] == co_clust[1]).all() + + +def test_sparse_scikit_vs_scipy(global_random_seed): + # Test scikit linkage with full connectivity (i.e. 
unstructured) vs scipy + n, p, k = 10, 5, 3 + rng = np.random.RandomState(global_random_seed) + + # Not using a lil_matrix here, just to check that non sparse + # matrices are well handled + connectivity = np.ones((n, n)) + for linkage in _TREE_BUILDERS.keys(): + for i in range(5): + X = 0.1 * rng.normal(size=(n, p)) + X -= 4.0 * np.arange(n)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out = hierarchy.linkage(X, method=linkage) + + children_ = out[:, :2].astype(int, copy=False) + children, _, n_leaves, _ = _TREE_BUILDERS[linkage]( + X, connectivity=connectivity + ) + + # Sort the order of child nodes per row for consistency + children.sort(axis=1) + assert_array_equal( + children, + children_, + "linkage tree differs from scipy impl for linkage: " + linkage, + ) + + cut = _hc_cut(k, children, n_leaves) + cut_ = _hc_cut(k, children_, n_leaves) + assess_same_labelling(cut, cut_) + + # Test error management in _hc_cut + with pytest.raises(ValueError): + _hc_cut(n_leaves + 1, children, n_leaves) + + +# Make sure our custom mst_linkage_core gives +# the same results as scipy's builtin +def test_vector_scikit_single_vs_scipy_single(global_random_seed): + n_samples, n_features, n_clusters = 10, 5, 3 + rng = np.random.RandomState(global_random_seed) + X = 0.1 * rng.normal(size=(n_samples, n_features)) + X -= 4.0 * np.arange(n_samples)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out = hierarchy.linkage(X, method="single") + children_scipy = out[:, :2].astype(int) + + children, _, n_leaves, _ = _TREE_BUILDERS["single"](X) + + # Sort the order of child nodes per row for consistency + children.sort(axis=1) + assert_array_equal( + children, + children_scipy, + "linkage tree differs from scipy impl for single linkage.", + ) + + cut = _hc_cut(n_clusters, children, n_leaves) + cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves) + assess_same_labelling(cut, cut_scipy) + + +@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) +def test_mst_linkage_core_memory_mapped(metric_param_grid): + """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. + + Non-regression test for issue #19875. + """ + rng = np.random.RandomState(seed=1) + X = rng.normal(size=(20, 4)) + Xmm = create_memmap_backed_data(X) + metric, param_grid = metric_param_grid + keys = param_grid.keys() + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + distance_metric = DistanceMetric.get_metric(metric, **kwargs) + mst = mst_linkage_core(X, distance_metric) + mst_mm = mst_linkage_core(Xmm, distance_metric) + np.testing.assert_equal(mst, mst_mm) + + +def test_identical_points(): + # Ensure identical points are handled correctly when using mst with + # a sparse connectivity matrix + X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]]) + true_labels = np.array([0, 0, 1, 1, 2, 2]) + connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) + connectivity = 0.5 * (connectivity + connectivity.T) + connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean") + + for linkage in ("single", "average", "average", "ward"): + clustering = AgglomerativeClustering( + n_clusters=3, linkage=linkage, connectivity=connectivity + ) + clustering.fit(X) + + assert_almost_equal( + normalized_mutual_info_score(clustering.labels_, true_labels), 1 + ) + + +def test_connectivity_propagation(): + # Check that connectivity in the ward tree is propagated correctly during + # merging. 
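+ # Background: when two nodes are merged, the union of their neighborhoods in
+ # the connectivity graph must be carried over to the new node; if it is not,
+ # later merges can look up edges of nodes that no longer exist and fail with
+ # the IndexError mentioned below. The 15 nearly-duplicate points combined
+ # with a 10-NN graph force many early merges, which makes this an effective
+ # trigger for that failure mode.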
+ X = np.array( + [ + (0.014, 0.120), + (0.014, 0.099), + (0.014, 0.097), + (0.017, 0.153), + (0.017, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.153), + (0.018, 0.152), + (0.018, 0.149), + (0.018, 0.144), + ] + ) + connectivity = kneighbors_graph(X, 10, include_self=False) + ward = AgglomerativeClustering( + n_clusters=4, connectivity=connectivity, linkage="ward" + ) + # If changes are not propagated correctly, fit crashes with an + # IndexError + ward.fit(X) + + +def test_ward_tree_children_order(global_random_seed): + # Check that children are ordered in the same way for both structured and + # unstructured versions of ward_tree. + + # test on five random datasets + n, p = 10, 5 + rng = np.random.RandomState(global_random_seed) + + connectivity = np.ones((n, n)) + for i in range(5): + X = 0.1 * rng.normal(size=(n, p)) + X -= 4.0 * np.arange(n)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out_unstructured = ward_tree(X) + out_structured = ward_tree(X, connectivity=connectivity) + + assert_array_equal(out_unstructured[0], out_structured[0]) + + +def test_ward_linkage_tree_return_distance(global_random_seed): + # Test return_distance option on linkage and ward trees + + # test that return_distance when set true, gives same + # output on both structured and unstructured clustering. + n, p = 10, 5 + rng = np.random.RandomState(global_random_seed) + + connectivity = np.ones((n, n)) + for i in range(5): + X = 0.1 * rng.normal(size=(n, p)) + X -= 4.0 * np.arange(n)[:, np.newaxis] + X -= X.mean(axis=1)[:, np.newaxis] + + out_unstructured = ward_tree(X, return_distance=True) + out_structured = ward_tree(X, connectivity=connectivity, return_distance=True) + + # get children + children_unstructured = out_unstructured[0] + children_structured = out_structured[0] + + # check if we got the same clusters + assert_array_equal(children_unstructured, children_structured) + + # check if the distances are the same + dist_unstructured = out_unstructured[-1] + dist_structured = out_structured[-1] + + assert_array_almost_equal(dist_unstructured, dist_structured) + + for linkage in ["average", "complete", "single"]: + structured_items = linkage_tree( + X, connectivity=connectivity, linkage=linkage, return_distance=True + )[-1] + unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[ + -1 + ] + structured_dist = structured_items[-1] + unstructured_dist = unstructured_items[-1] + structured_children = structured_items[0] + unstructured_children = unstructured_items[0] + assert_array_almost_equal(structured_dist, unstructured_dist) + assert_array_almost_equal(structured_children, unstructured_children) + + # test on the following dataset where we know the truth + # taken from scipy/cluster/tests/hierarchy_test_data.py + X = np.array( + [ + [1.43054825, -7.5693489], + [6.95887839, 6.82293382], + [2.87137846, -9.68248579], + [7.87974764, -6.05485803], + [8.24018364, -6.09495602], + [7.39020262, 8.54004355], + ] + ) + # truth + linkage_X_ward = np.array( + [ + [3.0, 4.0, 0.36265956, 2.0], + [1.0, 5.0, 1.77045373, 2.0], + [0.0, 2.0, 2.55760419, 2.0], + [6.0, 8.0, 9.10208346, 4.0], + [7.0, 9.0, 24.7784379, 6.0], + ] + ) + + linkage_X_complete = np.array( + [ + [3.0, 4.0, 0.36265956, 2.0], + [1.0, 5.0, 1.77045373, 2.0], + [0.0, 2.0, 2.55760419, 2.0], + [6.0, 8.0, 6.96742194, 4.0], + [7.0, 9.0, 18.77445997, 6.0], + ] + ) + + linkage_X_average = np.array( + [ + [3.0, 4.0, 0.36265956, 2.0], + [1.0, 5.0, 
1.77045373, 2.0], + [0.0, 2.0, 2.55760419, 2.0], + [6.0, 8.0, 6.55832839, 4.0], + [7.0, 9.0, 15.44089605, 6.0], + ] + ) + + n_samples, n_features = np.shape(X) + connectivity_X = np.ones((n_samples, n_samples)) + + out_X_unstructured = ward_tree(X, return_distance=True) + out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True) + + # check that the labels are the same + assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0]) + assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0]) + + # check that the distances are correct + assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4]) + assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4]) + + linkage_options = ["complete", "average", "single"] + X_linkage_truth = [linkage_X_complete, linkage_X_average] + for linkage, X_truth in zip(linkage_options, X_linkage_truth): + out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage) + out_X_structured = linkage_tree( + X, connectivity=connectivity_X, linkage=linkage, return_distance=True + ) + + # check that the labels are the same + assert_array_equal(X_truth[:, :2], out_X_unstructured[0]) + assert_array_equal(X_truth[:, :2], out_X_structured[0]) + + # check that the distances are correct + assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4]) + assert_array_almost_equal(X_truth[:, 2], out_X_structured[4]) + + +def test_connectivity_fixing_non_lil(): + # Check non regression of a bug if a non item assignable connectivity is + # provided with more than one component. + # create dummy data + x = np.array([[0, 0], [1, 1]]) + # create a mask with several components to force connectivity fixing + m = np.array([[True, False], [False, True]]) + c = grid_to_graph(n_x=2, n_y=2, mask=m) + w = AgglomerativeClustering(connectivity=c, linkage="ward") + with pytest.warns(UserWarning): + w.fit(x) + + +def test_int_float_dict(): + rng = np.random.RandomState(0) + keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False)) + values = rng.rand(len(keys)) + + d = IntFloatDict(keys, values) + for key, value in zip(keys, values): + assert d[key] == value + + other_keys = np.arange(50, dtype=np.intp)[::2] + other_values = np.full(50, 0.5)[::2] + other = IntFloatDict(other_keys, other_values) + # Complete smoke test + max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) + average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) + + +def test_connectivity_callable(): + rng = np.random.RandomState(0) + X = rng.rand(20, 5) + connectivity = kneighbors_graph(X, 3, include_self=False) + aglc1 = AgglomerativeClustering(connectivity=connectivity) + aglc2 = AgglomerativeClustering( + connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False) + ) + aglc1.fit(X) + aglc2.fit(X) + assert_array_equal(aglc1.labels_, aglc2.labels_) + + +def test_connectivity_ignores_diagonal(): + rng = np.random.RandomState(0) + X = rng.rand(20, 5) + connectivity = kneighbors_graph(X, 3, include_self=False) + connectivity_include_self = kneighbors_graph(X, 3, include_self=True) + aglc1 = AgglomerativeClustering(connectivity=connectivity) + aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) + aglc1.fit(X) + aglc2.fit(X) + assert_array_equal(aglc1.labels_, aglc2.labels_) + + +def test_compute_full_tree(): + # Test that the full tree is computed if n_clusters is small + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + connectivity = kneighbors_graph(X, 5, include_self=False) 
+ + # When n_clusters is less, the full tree should be built + # that is the number of merges should be n_samples - 1 + agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) + agc.fit(X) + n_samples = X.shape[0] + n_nodes = agc.children_.shape[0] + assert n_nodes == n_samples - 1 + + # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. + # we should stop when there are n_clusters. + n_clusters = 101 + X = rng.randn(200, 2) + connectivity = kneighbors_graph(X, 10, include_self=False) + agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) + agc.fit(X) + n_samples = X.shape[0] + n_nodes = agc.children_.shape[0] + assert n_nodes == n_samples - n_clusters + + +def test_n_components(): + # Test n_components returned by linkage, average and ward tree + rng = np.random.RandomState(0) + X = rng.rand(5, 5) + + # Connectivity matrix having five components. + connectivity = np.eye(5) + + for linkage_func in _TREE_BUILDERS.values(): + assert ignore_warnings(linkage_func)(X, connectivity=connectivity)[1] == 5 + + +def test_affinity_passed_to_fix_connectivity(): + # Test that the affinity parameter is actually passed to the pairwise + # function + + size = 2 + rng = np.random.RandomState(0) + X = rng.randn(size, size) + mask = np.array([True, False, False, True]) + + connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray) + + class FakeAffinity: + def __init__(self): + self.counter = 0 + + def increment(self, *args, **kwargs): + self.counter += 1 + return self.counter + + fa = FakeAffinity() + + linkage_tree(X, connectivity=connectivity, affinity=fa.increment) + + assert fa.counter == 3 + + +@pytest.mark.parametrize("linkage", ["ward", "complete", "average"]) +def test_agglomerative_clustering_with_distance_threshold(linkage, global_random_seed): + # Check that we obtain the correct number of clusters with + # agglomerative clustering with distance_threshold. 
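+ # Background: with `n_clusters=None` and a `distance_threshold`, the full
+ # tree is built and then cut so that no surviving merge has a distance at or
+ # above the threshold. The number of clusters therefore equals
+ # np.count_nonzero(distances >= distance_threshold) + 1, which is exactly how
+ # the expected value is recomputed from the tree builder below. Minimal
+ # usage sketch (illustrative values only):
+ #   model = AgglomerativeClustering(n_clusters=None, distance_threshold=10).fit(X)
+ #   model.n_clusters_, model.labels_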
+ rng = np.random.RandomState(global_random_seed) + mask = np.ones([10, 10], dtype=bool) + n_samples = 100 + X = rng.randn(n_samples, 50) + connectivity = grid_to_graph(*mask.shape) + # test when distance threshold is set to 10 + distance_threshold = 10 + for conn in [None, connectivity]: + clustering = AgglomerativeClustering( + n_clusters=None, + distance_threshold=distance_threshold, + connectivity=conn, + linkage=linkage, + ) + clustering.fit(X) + clusters_produced = clustering.labels_ + num_clusters_produced = len(np.unique(clustering.labels_)) + # test if the clusters produced match the point in the linkage tree + # where the distance exceeds the threshold + tree_builder = _TREE_BUILDERS[linkage] + children, n_components, n_leaves, parent, distances = tree_builder( + X, connectivity=conn, n_clusters=None, return_distance=True + ) + num_clusters_at_threshold = ( + np.count_nonzero(distances >= distance_threshold) + 1 + ) + # test number of clusters produced + assert num_clusters_at_threshold == num_clusters_produced + # test clusters produced + clusters_at_threshold = _hc_cut( + n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves + ) + assert np.array_equiv(clusters_produced, clusters_at_threshold) + + +def test_small_distance_threshold(global_random_seed): + rng = np.random.RandomState(global_random_seed) + n_samples = 10 + X = rng.randint(-300, 300, size=(n_samples, 3)) + # this should result in all data in their own clusters, given that + # their pairwise distances are bigger than .1 (which may not be the case + # with a different random seed). + clustering = AgglomerativeClustering( + n_clusters=None, distance_threshold=1.0, linkage="single" + ).fit(X) + # check that the pairwise distances are indeed all larger than .1 + all_distances = pairwise_distances(X, metric="minkowski", p=2) + np.fill_diagonal(all_distances, np.inf) + assert np.all(all_distances > 0.1) + assert clustering.n_clusters_ == n_samples + + +def test_cluster_distances_with_distance_threshold(global_random_seed): + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + X = rng.randint(-10, 10, size=(n_samples, 3)) + # check the distances within the clusters and with other clusters + distance_threshold = 4 + clustering = AgglomerativeClustering( + n_clusters=None, distance_threshold=distance_threshold, linkage="single" + ).fit(X) + labels = clustering.labels_ + D = pairwise_distances(X, metric="minkowski", p=2) + # to avoid taking the 0 diagonal in min() + np.fill_diagonal(D, np.inf) + for label in np.unique(labels): + in_cluster_mask = labels == label + max_in_cluster_distance = ( + D[in_cluster_mask][:, in_cluster_mask].min(axis=0).max() + ) + min_out_cluster_distance = ( + D[in_cluster_mask][:, ~in_cluster_mask].min(axis=0).min() + ) + # single data point clusters only have that inf diagonal here + if in_cluster_mask.sum() > 1: + assert max_in_cluster_distance < distance_threshold + assert min_out_cluster_distance >= distance_threshold + + +@pytest.mark.parametrize("linkage", ["ward", "complete", "average"]) +@pytest.mark.parametrize( + ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])] +) +def test_agglomerative_clustering_with_distance_threshold_edge_case( + linkage, threshold, y_true +): + # test boundary case of distance_threshold matching the distance + X = [[0], [1]] + clusterer = AgglomerativeClustering( + n_clusters=None, distance_threshold=threshold, linkage=linkage + ) + y_pred = clusterer.fit_predict(X) + assert adjusted_rand_score(y_true, y_pred) == 
1 + + +def test_dist_threshold_invalid_parameters(): + X = [[0], [1]] + with pytest.raises(ValueError, match="Exactly one of "): + AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X) + + with pytest.raises(ValueError, match="Exactly one of "): + AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X) + + X = [[0], [1]] + with pytest.raises(ValueError, match="compute_full_tree must be True if"): + AgglomerativeClustering( + n_clusters=None, distance_threshold=1, compute_full_tree=False + ).fit(X) + + +def test_invalid_shape_precomputed_dist_matrix(): + # Check that an error is raised when affinity='precomputed' + # and a non square matrix is passed (PR #16257). + rng = np.random.RandomState(0) + X = rng.rand(5, 3) + with pytest.raises( + ValueError, + match=r"Distance matrix should be square, got matrix of shape \(5, 3\)", + ): + AgglomerativeClustering(metric="precomputed", linkage="complete").fit(X) + + +def test_precomputed_connectivity_metric_with_2_connected_components(): + """Check that connecting components works when connectivity and + affinity are both precomputed and the number of connected components is + greater than 1. Non-regression test for #16151. + """ + + connectivity_matrix = np.array( + [ + [0, 1, 1, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 0], + ] + ) + # ensure that connectivity_matrix has two connected components + assert connected_components(connectivity_matrix)[0] == 2 + + rng = np.random.RandomState(0) + X = rng.randn(5, 10) + + X_dist = pairwise_distances(X) + clusterer_precomputed = AgglomerativeClustering( + metric="precomputed", connectivity=connectivity_matrix, linkage="complete" + ) + msg = "Completing it to avoid stopping the tree early" + with pytest.warns(UserWarning, match=msg): + clusterer_precomputed.fit(X_dist) + + clusterer = AgglomerativeClustering( + connectivity=connectivity_matrix, linkage="complete" + ) + with pytest.warns(UserWarning, match=msg): + clusterer.fit(X) + + assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_) + assert_array_equal(clusterer.children_, clusterer_precomputed.children_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_k_means.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_k_means.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab602d32d1330fe738ce7a24cd4b4c68cdf9c15 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_k_means.py @@ -0,0 +1,1364 @@ +"""Testing for K-means""" + +import re +import sys +from io import StringIO + +import numpy as np +import pytest +from scipy import sparse as sp + +from sklearn.base import clone +from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus +from sklearn.cluster._k_means_common import ( + _euclidean_dense_dense_wrapper, + _euclidean_sparse_dense_wrapper, + _inertia_dense, + _inertia_sparse, + _is_same_clustering, + _relocate_empty_clusters_dense, + _relocate_empty_clusters_sparse, +) +from sklearn.cluster._kmeans import _labels_inertia, _mini_batch_step +from sklearn.datasets import make_blobs +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances, pairwise_distances_argmin +from sklearn.metrics.cluster import v_measure_score +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, + create_memmap_backed_data, +) +from sklearn.utils.extmath import 
row_norms +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.parallel import _get_threadpool_controller + +# non centered, sparse centers to check the +centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] +) +n_samples = 100 +n_clusters, n_features = centers.shape +X, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 +) +X_as_any_csr = [container(X) for container in CSR_CONTAINERS] +data_containers = [np.array] + CSR_CONTAINERS +data_containers_ids = ( + ["dense", "sparse_matrix", "sparse_array"] + if len(X_as_any_csr) == 2 + else ["dense", "sparse_matrix"] +) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algo", ["lloyd", "elkan"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_kmeans_results(array_constr, algo, dtype): + # Checks that KMeans works as intended on toy dataset by comparing with + # expected results computed by hand. + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) + sample_weight = [3, 1, 1, 3] + init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) + + expected_labels = [0, 0, 1, 1] + expected_inertia = 0.375 + expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) + expected_n_iter = 2 + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X, sample_weight=sample_weight) + + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.inertia_, expected_inertia) + assert_allclose(kmeans.cluster_centers_, expected_centers) + assert kmeans.n_iter_ == expected_n_iter + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algo", ["lloyd", "elkan"]) +def test_kmeans_relocated_clusters(array_constr, algo): + # check that empty clusters are relocated as expected + X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) + + # second center too far from others points will be empty at first iter + init_centers = np.array([[0.5, 0.5], [3, 3]]) + + kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) + kmeans.fit(X) + + expected_n_iter = 3 + expected_inertia = 0.25 + assert_allclose(kmeans.inertia_, expected_inertia) + assert kmeans.n_iter_ == expected_n_iter + + # There are two acceptable ways of relocating clusters in this example, the output + # depends on how the argpartition strategy breaks ties. We accept both outputs. 
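+ # Background: a center that ends an iteration with no assigned points is
+ # relocated onto the sample(s) currently contributing the most to inertia,
+ # i.e. the points farthest from their assigned center. In this symmetric toy
+ # dataset two such samples are tied, so two label-permuted but equivalent
+ # solutions are possible and both are accepted below.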
+ try: + expected_labels = [0, 0, 1, 1] + expected_centers = [[0.25, 0], [0.75, 1]] + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.cluster_centers_, expected_centers) + except AssertionError: + expected_labels = [1, 1, 0, 0] + expected_centers = [[0.75, 1.0], [0.25, 0.0]] + assert_array_equal(kmeans.labels_, expected_labels) + assert_allclose(kmeans.cluster_centers_, expected_centers) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +def test_relocate_empty_clusters(array_constr): + # test for the _relocate_empty_clusters_(dense/sparse) helpers + + # Synthetic dataset with 3 obvious clusters of different sizes + X = np.array([-10.0, -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) + X = array_constr(X) + sample_weight = np.ones(10) + + # centers all initialized to the first point of X + centers_old = np.array([-10.0, -10, -10]).reshape(-1, 1) + + # With this initialization, all points will be assigned to the first center + # At this point a center in centers_new is the weighted sum of the points + # it contains if it's not empty, otherwise it is the same as before. + centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) + weight_in_clusters = np.array([10.0, 0, 0]) + labels = np.zeros(10, dtype=np.int32) + + if array_constr is np.array: + _relocate_empty_clusters_dense( + X, sample_weight, centers_old, centers_new, weight_in_clusters, labels + ) + else: + _relocate_empty_clusters_sparse( + X.data, + X.indices, + X.indptr, + sample_weight, + centers_old, + centers_new, + weight_in_clusters, + labels, + ) + + # The relocation scheme will take the 2 points farthest from the center and + # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The + # first center will be updated to contain the other 8 points. + assert_array_equal(weight_in_clusters, [8, 1, 1]) + assert_allclose(centers_new, [[-36], [10], [9.5]]) + + +@pytest.mark.parametrize("distribution", ["normal", "blobs"]) +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("tol", [1e-2, 1e-8, 1e-100, 0]) +def test_kmeans_elkan_results(distribution, array_constr, tol, global_random_seed): + # Check that results are identical between lloyd and elkan algorithms + rnd = np.random.RandomState(global_random_seed) + if distribution == "normal": + X = rnd.normal(size=(5000, 10)) + else: + X, _ = make_blobs(random_state=rnd) + X[X < 0] = 0 + X = array_constr(X) + + km_lloyd = KMeans(n_clusters=5, random_state=global_random_seed, n_init=1, tol=tol) + km_elkan = KMeans( + algorithm="elkan", + n_clusters=5, + random_state=global_random_seed, + n_init=1, + tol=tol, + ) + + km_lloyd.fit(X) + km_elkan.fit(X) + assert_allclose(km_elkan.cluster_centers_, km_lloyd.cluster_centers_) + assert_array_equal(km_elkan.labels_, km_lloyd.labels_) + assert km_elkan.n_iter_ == km_lloyd.n_iter_ + assert km_elkan.inertia_ == pytest.approx(km_lloyd.inertia_, rel=1e-6) + + +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +def test_kmeans_convergence(algorithm, global_random_seed): + # Check that KMeans stops when convergence is reached when tol=0. 
(#16075) + rnd = np.random.RandomState(global_random_seed) + X = rnd.normal(size=(5000, 10)) + max_iter = 300 + + km = KMeans( + algorithm=algorithm, + n_clusters=5, + random_state=global_random_seed, + n_init=1, + tol=0, + max_iter=max_iter, + ).fit(X) + + assert km.n_iter_ < max_iter + + +@pytest.mark.parametrize("X_csr", X_as_any_csr) +def test_minibatch_update_consistency(X_csr, global_random_seed): + # Check that dense and sparse minibatch update give the same results + rng = np.random.RandomState(global_random_seed) + + centers_old = centers + rng.normal(size=centers.shape) + centers_old_csr = centers_old.copy() + + centers_new = np.zeros_like(centers_old) + centers_new_csr = np.zeros_like(centers_old_csr) + + weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) + weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) + + sample_weight = np.ones(X.shape[0], dtype=X.dtype) + + # extract a small minibatch + X_mb = X[:10] + X_mb_csr = X_csr[:10] + sample_weight_mb = sample_weight[:10] + + # step 1: compute the dense minibatch update + old_inertia = _mini_batch_step( + X_mb, + sample_weight_mb, + centers_old, + centers_new, + weight_sums, + np.random.RandomState(global_random_seed), + random_reassign=False, + ) + assert old_inertia > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels, new_inertia = _labels_inertia(X_mb, sample_weight_mb, centers_new) + assert new_inertia > 0.0 + assert new_inertia < old_inertia + + # step 2: compute the sparse minibatch update + old_inertia_csr = _mini_batch_step( + X_mb_csr, + sample_weight_mb, + centers_old_csr, + centers_new_csr, + weight_sums_csr, + np.random.RandomState(global_random_seed), + random_reassign=False, + ) + assert old_inertia_csr > 0.0 + + # compute the new inertia on the same batch to check that it decreased + labels_csr, new_inertia_csr = _labels_inertia( + X_mb_csr, sample_weight_mb, centers_new_csr + ) + assert new_inertia_csr > 0.0 + assert new_inertia_csr < old_inertia_csr + + # step 3: check that sparse and dense updates lead to the same results + assert_array_equal(labels, labels_csr) + assert_allclose(centers_new, centers_new_csr) + assert_allclose(old_inertia, old_inertia_csr) + assert_allclose(new_inertia, new_inertia_csr) + + +def _check_fitted_model(km): + # check that the number of clusters centers and distinct labels match + # the expectation + centers = km.cluster_centers_ + assert centers.shape == (n_clusters, n_features) + + labels = km.labels_ + assert np.unique(labels).shape[0] == n_clusters + + # check that the labels assignment are perfect (up to a permutation) + assert_allclose(v_measure_score(true_labels, labels), 1.0) + assert km.inertia_ > 0.0 + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +@pytest.mark.parametrize( + "init", + ["random", "k-means++", centers, lambda X, k, random_state: centers], + ids=["random", "k-means++", "ndarray", "callable"], +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_all_init(Estimator, input_data, init): + # Check KMeans and MiniBatchKMeans with all possible init. 
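+ # Background: the parametrization above covers the four supported `init`
+ # forms: "random" (n_clusters samples drawn from X), "k-means++"
+ # (probabilistic seeding that spreads centers by squared distance), an
+ # explicit (n_clusters, n_features) array, and a callable taking
+ # (X, n_clusters, random_state) and returning such an array. String inits
+ # are restarted n_init times, while explicit centers need a single init,
+ # hence the switch below. Illustrative sketch (hypothetical values):
+ #   KMeans(n_clusters=3, init="k-means++", n_init=10, random_state=0).fit(X)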
+ n_init = 10 if isinstance(init, str) else 1 + km = Estimator( + init=init, n_clusters=n_clusters, random_state=42, n_init=n_init + ).fit(input_data) + _check_fitted_model(km) + + +@pytest.mark.parametrize( + "init", + ["random", "k-means++", centers, lambda X, k, random_state: centers], + ids=["random", "k-means++", "ndarray", "callable"], +) +def test_minibatch_kmeans_partial_fit_init(init): + # Check MiniBatchKMeans init with partial_fit + n_init = 10 if isinstance(init, str) else 1 + km = MiniBatchKMeans( + init=init, n_clusters=n_clusters, random_state=0, n_init=n_init + ) + for i in range(100): + # "random" init requires many batches to recover the true labels. + km.partial_fit(X) + _check_fitted_model(km) + + +@pytest.mark.parametrize( + "init, expected_n_init", + [ + ("k-means++", 1), + ("random", "default"), + ( + lambda X, n_clusters, random_state: random_state.uniform( + size=(n_clusters, X.shape[1]) + ), + "default", + ), + ("array-like", 1), + ], +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_kmeans_init_auto_with_initial_centroids(Estimator, init, expected_n_init): + """Check that `n_init="auto"` chooses the right number of initializations. + Non-regression test for #26657: + https://github.com/scikit-learn/scikit-learn/pull/26657 + """ + n_sample, n_features, n_clusters = 100, 10, 5 + X = np.random.randn(n_sample, n_features) + if init == "array-like": + init = np.random.randn(n_clusters, n_features) + if expected_n_init == "default": + expected_n_init = 3 if Estimator is MiniBatchKMeans else 10 + + kmeans = Estimator(n_clusters=n_clusters, init=init, n_init="auto").fit(X) + assert kmeans._n_init == expected_n_init + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_fortran_aligned_data(Estimator, global_random_seed): + # Check that KMeans works with fortran-aligned data. + X_fortran = np.asfortranarray(X) + centers_fortran = np.asfortranarray(centers) + + km_c = Estimator( + n_clusters=n_clusters, init=centers, n_init=1, random_state=global_random_seed + ).fit(X) + km_f = Estimator( + n_clusters=n_clusters, + init=centers_fortran, + n_init=1, + random_state=global_random_seed, + ).fit(X_fortran) + assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) + assert_array_equal(km_c.labels_, km_f.labels_) + + +def test_minibatch_kmeans_verbose(): + # Check verbose mode of MiniBatchKMeans for better coverage. + km = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + km.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +@pytest.mark.parametrize("tol", [1e-2, 0]) +def test_kmeans_verbose(algorithm, tol, capsys): + # Check verbose mode of KMeans for better coverage. 
+ X = np.random.RandomState(0).normal(size=(5000, 10)) + + KMeans( + algorithm=algorithm, + n_clusters=n_clusters, + random_state=42, + init="random", + n_init=1, + tol=tol, + verbose=1, + ).fit(X) + + captured = capsys.readouterr() + + assert re.search(r"Initialization complete", captured.out) + assert re.search(r"Iteration [0-9]+, inertia", captured.out) + + if tol == 0: + assert re.search(r"strict convergence", captured.out) + else: + assert re.search(r"center shift .* within tolerance", captured.out) + + +def test_minibatch_kmeans_warning_init_size(): + # Check that a warning is raised when init_size is smaller than n_clusters + with pytest.warns( + RuntimeWarning, match=r"init_size.* should be larger than n_clusters" + ): + MiniBatchKMeans(init_size=10, n_clusters=20).fit(X) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_warning_n_init_precomputed_centers(Estimator): + # Check that a warning is raised when n_init > 1 and an array is passed for + # the init parameter. + with pytest.warns( + RuntimeWarning, + match="Explicit initial center position passed: performing only one init", + ): + Estimator(init=centers, n_clusters=n_clusters, n_init=10).fit(X) + + +def test_minibatch_sensible_reassign(global_random_seed): + # check that identical initial clusters are reassigned + # also a regression test for when there are more desired reassignments than + # samples. + zeroed_X, true_labels = make_blobs( + n_samples=100, centers=5, random_state=global_random_seed + ) + zeroed_X[::2, :] = 0 + + km = MiniBatchKMeans( + n_clusters=20, batch_size=10, random_state=global_random_seed, init="random" + ).fit(zeroed_X) + # there should not be too many exact zero cluster centers + num_non_zero_clusters = km.cluster_centers_.any(axis=1).sum() + assert num_non_zero_clusters > 9, f"{num_non_zero_clusters=} is too small" + + # do the same with batch-size > X.shape[0] (regression test) + km = MiniBatchKMeans( + n_clusters=20, batch_size=200, random_state=global_random_seed, init="random" + ).fit(zeroed_X) + # there should not be too many exact zero cluster centers + num_non_zero_clusters = km.cluster_centers_.any(axis=1).sum() + assert num_non_zero_clusters > 9, f"{num_non_zero_clusters=} is too small" + + # do the same with partial_fit API + km = MiniBatchKMeans(n_clusters=20, random_state=global_random_seed, init="random") + for i in range(100): + km.partial_fit(zeroed_X) + # there should not be too many exact zero cluster centers + num_non_zero_clusters = km.cluster_centers_.any(axis=1).sum() + assert num_non_zero_clusters > 9, f"{num_non_zero_clusters=} is too small" + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +def test_minibatch_reassign(input_data, global_random_seed): + # Check the reassignment part of the minibatch step with very high or very + # low reassignment ratio. 
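+    # Start from the "perfect" centers (per-cluster means of the true labels): a
+    # large reassignment_ratio should then degrade them, while a tiny one should
+    # leave them untouched.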
+ perfect_centers = np.empty((n_clusters, n_features)) + for i in range(n_clusters): + perfect_centers[i] = X[true_labels == i].mean(axis=0) + + sample_weight = np.ones(n_samples) + centers_new = np.empty_like(perfect_centers) + + # Give a perfect initialization, but a large reassignment_ratio, as a + # result many centers should be reassigned and the model should no longer + # be good + score_before = -_labels_inertia(input_data, sample_weight, perfect_centers, 1)[1] + + _mini_batch_step( + input_data, + sample_weight, + perfect_centers, + centers_new, + np.zeros(n_clusters), + np.random.RandomState(global_random_seed), + random_reassign=True, + reassignment_ratio=1, + ) + + score_after = -_labels_inertia(input_data, sample_weight, centers_new, 1)[1] + + assert score_before > score_after + + # Give a perfect initialization, with a small reassignment_ratio, + # no center should be reassigned. + _mini_batch_step( + input_data, + sample_weight, + perfect_centers, + centers_new, + np.zeros(n_clusters), + np.random.RandomState(global_random_seed), + random_reassign=True, + reassignment_ratio=1e-15, + ) + + assert_allclose(centers_new, perfect_centers) + + +def test_minibatch_with_many_reassignments(): + # Test for the case that the number of clusters to reassign is bigger + # than the batch_size. Run the test with 100 clusters and a batch_size of + # 10 because it turned out that these values ensure that the number of + # clusters to reassign is always bigger than the batch_size. + MiniBatchKMeans( + n_clusters=100, + batch_size=10, + init_size=n_samples, + random_state=42, + verbose=True, + ).fit(X) + + +def test_minibatch_kmeans_init_size(): + # Check the internal _init_size attribute of MiniBatchKMeans + + # default init size should be 3 * batch_size + km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) + assert km._init_size == 15 + + # if 3 * batch size < n_clusters, it should then be 3 * n_clusters + km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) + assert km._init_size == 30 + + # it should not be larger than n_samples + km = MiniBatchKMeans( + n_clusters=10, batch_size=5, n_init=1, init_size=n_samples + 1 + ).fit(X) + assert km._init_size == n_samples + + +@pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) +def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): + # Check convergence detection based on ewa batch inertia or on + # small center change. + X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) + + km = MiniBatchKMeans( + n_clusters=3, + init=centers, + batch_size=20, + tol=tol, + random_state=0, + max_iter=10, + n_init=1, + verbose=1, + max_no_improvement=max_no_improvement, + ) + + km.fit(X) + assert 1 < km.n_iter_ < 10 + + captured = capsys.readouterr() + if max_no_improvement is None: + assert "Converged (small centers change)" in captured.out + if tol == 0: + assert "Converged (lack of improvement in inertia)" in captured.out + + +def test_minibatch_iter_steps(): + # Check consistency of n_iter_ and n_steps_ attributes. 
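+    # For instance (illustrative numbers only), with 100 samples and a batch size
+    # of 30, 7 minibatch steps process 210 samples and therefore span
+    # ceil(7 * 30 / 100) = 3 started epochs.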
+ batch_size = 30 + n_samples = X.shape[0] + km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0).fit(X) + + # n_iter_ is the number of started epochs + assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) + assert isinstance(km.n_iter_, int) + + # without stopping condition, max_iter should be reached + km = MiniBatchKMeans( + n_clusters=3, + batch_size=batch_size, + random_state=0, + tol=0, + max_no_improvement=None, + max_iter=10, + ).fit(X) + + assert km.n_iter_ == 10 + assert km.n_steps_ == (10 * n_samples) // batch_size + assert isinstance(km.n_steps_, int) + + +def test_kmeans_copyx(): + # Check that copy_x=False returns nearly equal X after de-centering. + my_X = X.copy() + km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) + km.fit(my_X) + _check_fitted_model(km) + + # check that my_X is de-centered + assert_allclose(my_X, X) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_score_max_iter(Estimator, global_random_seed): + # Check that fitting KMeans or MiniBatchKMeans with more iterations gives + # better score + X = np.random.RandomState(global_random_seed).randn(100, 10) + + km1 = Estimator(n_init=1, random_state=global_random_seed, max_iter=1) + s1 = km1.fit(X).score(X) + km2 = Estimator(n_init=1, random_state=global_random_seed, max_iter=10) + s2 = km2.fit(X).score(X) + assert s2 > s1 + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize( + "Estimator, algorithm", + [(KMeans, "lloyd"), (KMeans, "elkan"), (MiniBatchKMeans, None)], +) +@pytest.mark.parametrize("max_iter", [2, 100]) +def test_kmeans_predict( + Estimator, algorithm, array_constr, max_iter, global_dtype, global_random_seed +): + # Check the predict method and the equivalence between fit.predict and + # fit_predict. + X, _ = make_blobs( + n_samples=200, n_features=10, centers=10, random_state=global_random_seed + ) + X = array_constr(X, dtype=global_dtype) + + km = Estimator( + n_clusters=10, + init="random", + n_init=10, + max_iter=max_iter, + random_state=global_random_seed, + ) + if algorithm is not None: + km.set_params(algorithm=algorithm) + km.fit(X) + labels = km.labels_ + + # re-predict labels for training set using predict + pred = km.predict(X) + assert_array_equal(pred, labels) + + # re-predict labels for training set using fit_predict + pred = km.fit_predict(X) + assert_array_equal(pred, labels) + + # predict centroid labels + pred = km.predict(km.cluster_centers_) + assert_array_equal(pred, np.arange(10)) + + +@pytest.mark.parametrize("X_csr", X_as_any_csr) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_dense_sparse(Estimator, X_csr, global_random_seed): + # Check that the results are the same for dense and sparse input. 
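+    # Use a random, non-uniform sample_weight so the dense/sparse equivalence also
+    # exercises the weighted code path.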
+    sample_weight = np.random.RandomState(global_random_seed).random_sample(
+        (n_samples,)
+    )
+    km_dense = Estimator(
+        n_clusters=n_clusters, random_state=global_random_seed, n_init=1
+    )
+    km_dense.fit(X, sample_weight=sample_weight)
+    km_sparse = Estimator(
+        n_clusters=n_clusters, random_state=global_random_seed, n_init=1
+    )
+    km_sparse.fit(X_csr, sample_weight=sample_weight)
+
+    assert_array_equal(km_dense.labels_, km_sparse.labels_)
+    assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_)
+
+
+@pytest.mark.parametrize("X_csr", X_as_any_csr)
+@pytest.mark.parametrize(
+    "init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]
+)
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_predict_dense_sparse(Estimator, init, X_csr):
+    # check that models trained on sparse input also work for dense input at
+    # predict time and vice versa.
+    n_init = 10 if isinstance(init, str) else 1
+    km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, random_state=0)
+
+    km.fit(X_csr)
+    assert_array_equal(km.predict(X), km.labels_)
+
+    km.fit(X)
+    assert_array_equal(km.predict(X_csr), km.labels_)
+
+
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
+@pytest.mark.parametrize("dtype", [np.int32, np.int64])
+@pytest.mark.parametrize("init", ["k-means++", "ndarray"])
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_integer_input(Estimator, array_constr, dtype, init, global_random_seed):
+    # Check that KMeans and MiniBatchKMeans work with integer input.
+    X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]])
+    X = array_constr(X_dense, dtype=dtype)
+
+    n_init = 1 if init == "ndarray" else 10
+    init = X_dense[:2] if init == "ndarray" else init
+
+    km = Estimator(
+        n_clusters=2, init=init, n_init=n_init, random_state=global_random_seed
+    )
+    if Estimator is MiniBatchKMeans:
+        km.set_params(batch_size=2)
+
+    km.fit(X)
+
+    # Internally integer input should be converted to float64
+    assert km.cluster_centers_.dtype == np.float64
+
+    expected_labels = [0, 1, 1, 0, 0, 1]
+    assert_allclose(v_measure_score(km.labels_, expected_labels), 1.0)
+
+    # Same with partial_fit (#14314)
+    if Estimator is MiniBatchKMeans:
+        km = clone(km).partial_fit(X)
+        assert km.cluster_centers_.dtype == np.float64
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_transform(Estimator, global_random_seed):
+    # Check the transform method
+    km = Estimator(n_clusters=n_clusters, random_state=global_random_seed).fit(X)
+
+    # Transforming cluster_centers_ should return the pairwise distances
+    # between centers
+    Xt = km.transform(km.cluster_centers_)
+    assert_allclose(Xt, pairwise_distances(km.cluster_centers_))
+    # In particular, diagonal must be 0
+    assert_array_equal(Xt.diagonal(), np.zeros(n_clusters))
+
+    # Transforming X should return the pairwise distances between X and the
+    # centers
+    Xt = km.transform(X)
+    assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_))
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_fit_transform(Estimator, global_random_seed):
+    # Check equivalence between fit.transform and fit_transform
+    X1 = Estimator(random_state=global_random_seed, n_init=1).fit(X).transform(X)
+    X2 = Estimator(random_state=global_random_seed, n_init=1).fit_transform(X)
+    assert_allclose(X1, X2)
+
+
+def test_n_init(global_random_seed):
+    # Check that increasing the number of init increases the quality
+    previous_inertia = np.inf
+    for n_init in [1, 5, 10]:
+        # set max_iter=1 to avoid finding the global minimum and get the same
+        # inertia each time
+        km = KMeans(
+            n_clusters=n_clusters,
+            init="random",
+            n_init=n_init,
+            random_state=global_random_seed,
+            max_iter=1,
+        ).fit(X)
+        assert km.inertia_ <= previous_inertia
+        previous_inertia = km.inertia_
+
+
+def test_k_means_function(global_random_seed):
+    # test calling the k_means function directly
+    cluster_centers, labels, inertia = k_means(
+        X, n_clusters=n_clusters, sample_weight=None, random_state=global_random_seed
+    )
+
+    assert cluster_centers.shape == (n_clusters, n_features)
+    assert np.unique(labels).shape[0] == n_clusters
+
+    # check that the labels assignment are perfect (up to a permutation)
+    assert_allclose(v_measure_score(true_labels, labels), 1.0)
+    assert inertia > 0.0
+
+
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_float_precision(Estimator, input_data, global_random_seed):
+    # Check that the results are the same for single and double precision.
+    km = Estimator(n_init=1, random_state=global_random_seed)
+
+    inertia = {}
+    Xt = {}
+    centers = {}
+    labels = {}
+
+    for dtype in [np.float64, np.float32]:
+        X = input_data.astype(dtype, copy=False)
+        km.fit(X)
+
+        inertia[dtype] = km.inertia_
+        Xt[dtype] = km.transform(X)
+        centers[dtype] = km.cluster_centers_
+        labels[dtype] = km.labels_
+
+        # dtype of cluster centers has to be the dtype of the input data
+        assert km.cluster_centers_.dtype == dtype
+
+        # same with partial_fit
+        if Estimator is MiniBatchKMeans:
+            km.partial_fit(X[0:3])
+            assert km.cluster_centers_.dtype == dtype
+
+    # compare arrays with low precision since the difference between 32 and
+    # 64 bit comes from an accumulation of rounding errors.
+    assert_allclose(inertia[np.float32], inertia[np.float64], rtol=1e-4)
+    assert_allclose(Xt[np.float32], Xt[np.float64], atol=Xt[np.float64].max() * 1e-4)
+    assert_allclose(
+        centers[np.float32], centers[np.float64], atol=centers[np.float64].max() * 1e-4
+    )
+    assert_array_equal(labels[np.float32], labels[np.float64])
+
+
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_centers_not_mutated(Estimator, dtype):
+    # Check that KMeans and MiniBatchKMeans won't mutate the user provided
+    # init centers silently even if input data and init centers have the same
+    # type.
+ X_new_type = X.astype(dtype, copy=False) + centers_new_type = centers.astype(dtype, copy=False) + + km = Estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) + km.fit(X_new_type) + + assert not np.may_share_memory(km.cluster_centers_, centers_new_type) + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +def test_kmeans_init_fitted_centers(input_data): + # Check that starting fitting from a local optimum shouldn't change the + # solution + km1 = KMeans(n_clusters=n_clusters).fit(input_data) + km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, n_init=1).fit( + input_data + ) + + assert_allclose(km1.cluster_centers_, km2.cluster_centers_) + + +def test_kmeans_warns_less_centers_than_unique_points(global_random_seed): + # Check KMeans when the number of found clusters is smaller than expected + X = np.asarray([[0, 0], [0, 1], [1, 0], [1, 0]]) # last point is duplicated + km = KMeans(n_clusters=4, random_state=global_random_seed) + + # KMeans should warn that fewer labels than cluster centers have been used + msg = ( + r"Number of distinct clusters \(3\) found smaller than " + r"n_clusters \(4\). Possibly due to duplicate points in X." + ) + with pytest.warns(ConvergenceWarning, match=msg): + km.fit(X) + # only three distinct points, so only three clusters + # can have points assigned to them + assert set(km.labels_) == set(range(3)) + + +def _sort_centers(centers): + return np.sort(centers, axis=0) + + +def test_weighted_vs_repeated(global_random_seed): + # Check that a sample weight of N should yield the same result as an N-fold + # repetition of the sample. Valid only if init is precomputed, otherwise + # rng produces different results. Not valid for MinibatchKMeans due to rng + # to extract minibatches. + sample_weight = np.random.RandomState(global_random_seed).randint( + 1, 5, size=n_samples + ) + X_repeat = np.repeat(X, sample_weight, axis=0) + + km = KMeans( + init=centers, n_init=1, n_clusters=n_clusters, random_state=global_random_seed + ) + + km_weighted = clone(km).fit(X, sample_weight=sample_weight) + repeated_labels = np.repeat(km_weighted.labels_, sample_weight) + km_repeated = clone(km).fit(X_repeat) + + assert_array_equal(km_repeated.labels_, repeated_labels) + assert_allclose(km_weighted.inertia_, km_repeated.inertia_) + assert_allclose( + _sort_centers(km_weighted.cluster_centers_), + _sort_centers(km_repeated.cluster_centers_), + ) + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_unit_weights_vs_no_weights(Estimator, input_data, global_random_seed): + # Check that not passing sample weights should be equivalent to passing + # sample weights all equal to one. 
+ sample_weight = np.ones(n_samples) + + km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1) + km_none = clone(km).fit(input_data, sample_weight=None) + km_ones = clone(km).fit(input_data, sample_weight=sample_weight) + + assert_array_equal(km_none.labels_, km_ones.labels_) + assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) + + +@pytest.mark.parametrize( + "input_data", + [X] + X_as_any_csr, + ids=data_containers_ids, +) +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_scaled_weights(Estimator, input_data, global_random_seed): + # Check that scaling all sample weights by a common factor + # shouldn't change the result + sample_weight = np.random.RandomState(global_random_seed).uniform(size=n_samples) + + km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1) + km_orig = clone(km).fit(input_data, sample_weight=sample_weight) + km_scaled = clone(km).fit(input_data, sample_weight=0.5 * sample_weight) + + assert_array_equal(km_orig.labels_, km_scaled.labels_) + assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) + + +def test_kmeans_elkan_iter_attribute(): + # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off + # it's right value (#11340). + km = KMeans(algorithm="elkan", max_iter=1).fit(X) + assert km.n_iter_ == 1 + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +def test_kmeans_empty_cluster_relocated(array_constr): + # check that empty clusters are correctly relocated when using sample + # weights (#13486) + X = array_constr([[-1], [1]]) + sample_weight = [1.9, 0.1] + init = np.array([[-1], [10]]) + + km = KMeans(n_clusters=2, init=init, n_init=1) + km.fit(X, sample_weight=sample_weight) + + assert len(set(km.labels_)) == 2 + assert_allclose(km.cluster_centers_, [[-1], [1]]) + + +@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) +def test_result_equal_in_diff_n_threads(Estimator, global_random_seed): + # Check that KMeans/MiniBatchKMeans give the same results in parallel mode + # than in sequential mode. + rnd = np.random.RandomState(global_random_seed) + X = rnd.normal(size=(50, 10)) + + with _get_threadpool_controller().limit(limits=1, user_api="openmp"): + result_1 = ( + Estimator(n_clusters=n_clusters, random_state=global_random_seed) + .fit(X) + .labels_ + ) + with _get_threadpool_controller().limit(limits=2, user_api="openmp"): + result_2 = ( + Estimator(n_clusters=n_clusters, random_state=global_random_seed) + .fit(X) + .labels_ + ) + assert_array_equal(result_1, result_2) + + +def test_warning_elkan_1_cluster(): + # Check warning messages specific to KMeans + with pytest.warns( + RuntimeWarning, + match="algorithm='elkan' doesn't make sense for a single cluster", + ): + KMeans(n_clusters=1, algorithm="elkan").fit(X) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algo", ["lloyd", "elkan"]) +def test_k_means_1_iteration(array_constr, algo, global_random_seed): + # check the results after a single iteration (E-step M-step E-step) by + # comparing against a pure python implementation. 
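+    # The pure python reference below does one assignment (E) step, one
+    # center-update (M) step and a final assignment, which is what
+    # KMeans(max_iter=1) reports through labels_ and cluster_centers_.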
+ X = np.random.RandomState(global_random_seed).uniform(size=(100, 5)) + init_centers = X[:5] + X = array_constr(X) + + def py_kmeans(X, init): + new_centers = init.copy() + labels = pairwise_distances_argmin(X, init) + for label in range(init.shape[0]): + new_centers[label] = X[labels == label].mean(axis=0) + labels = pairwise_distances_argmin(X, new_centers) + return labels, new_centers + + py_labels, py_centers = py_kmeans(X, init_centers) + + cy_kmeans = KMeans( + n_clusters=5, n_init=1, init=init_centers, algorithm=algo, max_iter=1 + ).fit(X) + cy_labels = cy_kmeans.labels_ + cy_centers = cy_kmeans.cluster_centers_ + + assert_array_equal(py_labels, cy_labels) + assert_allclose(py_centers, cy_centers) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("squared", [True, False]) +def test_euclidean_distance(dtype, squared, global_random_seed): + # Check that the _euclidean_(dense/sparse)_dense helpers produce correct + # results + rng = np.random.RandomState(global_random_seed) + a_sparse = sp.random( + 1, 100, density=0.5, format="csr", random_state=rng, dtype=dtype + ) + a_dense = a_sparse.toarray().reshape(-1) + b = rng.randn(100).astype(dtype, copy=False) + b_squared_norm = (b**2).sum() + + expected = ((a_dense - b) ** 2).sum() + expected = expected if squared else np.sqrt(expected) + + distance_dense_dense = _euclidean_dense_dense_wrapper(a_dense, b, squared) + distance_sparse_dense = _euclidean_sparse_dense_wrapper( + a_sparse.data, a_sparse.indices, b, b_squared_norm, squared + ) + + rtol = 1e-4 if dtype == np.float32 else 1e-7 + assert_allclose(distance_dense_dense, distance_sparse_dense, rtol=rtol) + assert_allclose(distance_dense_dense, expected, rtol=rtol) + assert_allclose(distance_sparse_dense, expected, rtol=rtol) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_inertia(dtype, global_random_seed): + # Check that the _inertia_(dense/sparse) helpers produce correct results. + rng = np.random.RandomState(global_random_seed) + X_sparse = sp.random( + 100, 10, density=0.5, format="csr", random_state=rng, dtype=dtype + ) + X_dense = X_sparse.toarray() + sample_weight = rng.randn(100).astype(dtype, copy=False) + centers = rng.randn(5, 10).astype(dtype, copy=False) + labels = rng.randint(5, size=100, dtype=np.int32) + + distances = ((X_dense - centers[labels]) ** 2).sum(axis=1) + expected = np.sum(distances * sample_weight) + + inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels, n_threads=1) + inertia_sparse = _inertia_sparse( + X_sparse, sample_weight, centers, labels, n_threads=1 + ) + + rtol = 1e-4 if dtype == np.float32 else 1e-6 + assert_allclose(inertia_dense, inertia_sparse, rtol=rtol) + assert_allclose(inertia_dense, expected, rtol=rtol) + assert_allclose(inertia_sparse, expected, rtol=rtol) + + # Check the single_label parameter. 
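+    # With single_label, only samples whose label equals the requested value should
+    # contribute; the expectation below is recomputed on that mask.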
+    label = 1
+    mask = labels == label
+    distances = ((X_dense[mask] - centers[label]) ** 2).sum(axis=1)
+    expected = np.sum(distances * sample_weight[mask])
+
+    inertia_dense = _inertia_dense(
+        X_dense, sample_weight, centers, labels, n_threads=1, single_label=label
+    )
+    inertia_sparse = _inertia_sparse(
+        X_sparse, sample_weight, centers, labels, n_threads=1, single_label=label
+    )
+
+    assert_allclose(inertia_dense, inertia_sparse, rtol=rtol)
+    assert_allclose(inertia_dense, expected, rtol=rtol)
+    assert_allclose(inertia_sparse, expected, rtol=rtol)
+
+
+@pytest.mark.parametrize("Klass, default_n_init", [(KMeans, 10), (MiniBatchKMeans, 3)])
+def test_n_init_auto(Klass, default_n_init):
+    est = Klass(n_init="auto", init="k-means++")
+    est.fit(X)
+    assert est._n_init == 1
+
+    est = Klass(n_init="auto", init="random")
+    est.fit(X)
+    assert est._n_init == default_n_init
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+def test_sample_weight_unchanged(Estimator):
+    # Check that sample_weight is not modified in place by KMeans (#17204)
+    X = np.array([[1], [2], [4]])
+    sample_weight = np.array([0.5, 0.2, 0.3])
+    Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight)
+
+    assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3]))
+
+
+@pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
+@pytest.mark.parametrize(
+    "param, match",
+    [
+        ({"n_clusters": n_samples + 1}, r"n_samples.* should be >= n_clusters"),
+        (
+            {"init": X[:2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of clusters",
+        ),
+        (
+            {"init": lambda X_, k, random_state: X_[:2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of clusters",
+        ),
+        (
+            {"init": X[:8, :2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of features of the data",
+        ),
+        (
+            {"init": lambda X_, k, random_state: X_[:8, :2]},
+            r"The shape of the initial centers .* does not match "
+            r"the number of features of the data",
+        ),
+    ],
+)
+def test_wrong_params(Estimator, param, match):
+    # Check that errors are raised with a clear error message when wrong values
+    # are passed for the parameters
+    # Set n_init=1 by default to avoid warning with precomputed init
+    km = Estimator(n_init=1)
+    with pytest.raises(ValueError, match=match):
+        km.set_params(**param).fit(X)
+
+
+@pytest.mark.parametrize(
+    "param, match",
+    [
+        (
+            {"x_squared_norms": X[:2]},
+            r"The length of x_squared_norms .* should "
+            r"be equal to the length of n_samples",
+        ),
+    ],
+)
+def test_kmeans_plusplus_wrong_params(param, match):
+    with pytest.raises(ValueError, match=match):
+        kmeans_plusplus(X, n_clusters, **param)
+
+
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+)
+@pytest.mark.parametrize("dtype", [np.float64, np.float32])
+def test_kmeans_plusplus_output(input_data, dtype, global_random_seed):
+    # Check for the correct number of seeds and all positive values
+    data = input_data.astype(dtype)
+    centers, indices = kmeans_plusplus(
+        data, n_clusters, random_state=global_random_seed
+    )
+
+    # Check there are the correct number of indices and that all indices are
+    # positive and within the number of samples
+    assert indices.shape[0] == n_clusters
+    assert (indices >= 0).all()
+    assert (indices <= data.shape[0]).all()
+
+    # Check for the correct number of seeds and that they are bound by the data
+    assert centers.shape[0] == n_clusters
+    assert (centers.max(axis=0) <= data.max(axis=0)).all()
+    assert 
(centers.min(axis=0) >= data.min(axis=0)).all() + + # Check that indices correspond to reported centers + # Use X for comparison rather than data, test still works against centers + # calculated with sparse data. + assert_allclose(X[indices].astype(dtype), centers) + + +@pytest.mark.parametrize("x_squared_norms", [row_norms(X, squared=True), None]) +def test_kmeans_plusplus_norms(x_squared_norms): + # Check that defining x_squared_norms returns the same as default=None. + centers, indices = kmeans_plusplus(X, n_clusters, x_squared_norms=x_squared_norms) + + assert_allclose(X[indices], centers) + + +def test_kmeans_plusplus_dataorder(global_random_seed): + # Check that memory layout does not effect result + centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=global_random_seed) + + X_fortran = np.asfortranarray(X) + + centers_fortran, _ = kmeans_plusplus( + X_fortran, n_clusters, random_state=global_random_seed + ) + + assert_allclose(centers_c, centers_fortran) + + +def test_is_same_clustering(): + # Sanity check for the _is_same_clustering utility function + labels1 = np.array([1, 0, 0, 1, 2, 0, 2, 1], dtype=np.int32) + assert _is_same_clustering(labels1, labels1, 3) + + # these other labels represent the same clustering since we can retrieve the first + # labels by simply renaming the labels: 0 -> 1, 1 -> 2, 2 -> 0. + labels2 = np.array([0, 2, 2, 0, 1, 2, 1, 0], dtype=np.int32) + assert _is_same_clustering(labels1, labels2, 3) + + # these other labels do not represent the same clustering since not all ones are + # mapped to a same value + labels3 = np.array([1, 0, 0, 2, 2, 0, 2, 1], dtype=np.int32) + assert not _is_same_clustering(labels1, labels3, 3) + + +@pytest.mark.parametrize( + "kwargs", ({"init": np.str_("k-means++")}, {"init": [[0, 0], [1, 1]], "n_init": 1}) +) +def test_kmeans_with_array_like_or_np_scalar_init(kwargs): + """Check that init works with numpy scalar strings. + + Non-regression test for #21964. + """ + X = np.asarray([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=np.float64) + + clustering = KMeans(n_clusters=2, **kwargs) + # Does not raise + clustering.fit(X) + + +@pytest.mark.parametrize( + "Klass, method", + [(KMeans, "fit"), (MiniBatchKMeans, "fit"), (MiniBatchKMeans, "partial_fit")], +) +def test_feature_names_out(Klass, method): + """Check `feature_names_out` for `KMeans` and `MiniBatchKMeans`.""" + class_name = Klass.__name__.lower() + kmeans = Klass() + getattr(kmeans, method)(X) + n_clusters = kmeans.cluster_centers_.shape[0] + + names_out = kmeans.get_feature_names_out() + assert_array_equal([f"{class_name}{i}" for i in range(n_clusters)], names_out) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) +def test_predict_does_not_change_cluster_centers(csr_container): + """Check that predict does not change cluster centers. + + Non-regression test for gh-24253. + """ + X, _ = make_blobs(n_samples=200, n_features=10, centers=10, random_state=0) + if csr_container is not None: + X = csr_container(X) + + kmeans = KMeans() + y_pred1 = kmeans.fit_predict(X) + # Make cluster_centers readonly + kmeans.cluster_centers_ = create_memmap_backed_data(kmeans.cluster_centers_) + kmeans.labels_ = create_memmap_backed_data(kmeans.labels_) + + y_pred2 = kmeans.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_sample_weight_init(init, global_random_seed): + """Check that sample weight is used during init. 
+ + `_init_centroids` is shared across all classes inheriting from _BaseKMeans so + it's enough to check for KMeans. + """ + rng = np.random.RandomState(global_random_seed) + X, _ = make_blobs( + n_samples=200, n_features=10, centers=10, random_state=global_random_seed + ) + x_squared_norms = row_norms(X, squared=True) + + kmeans = KMeans() + clusters_weighted = kmeans._init_centroids( + X=X, + x_squared_norms=x_squared_norms, + init=init, + sample_weight=rng.uniform(size=X.shape[0]), + n_centroids=5, + random_state=np.random.RandomState(global_random_seed), + ) + clusters = kmeans._init_centroids( + X=X, + x_squared_norms=x_squared_norms, + init=init, + sample_weight=np.ones(X.shape[0]), + n_centroids=5, + random_state=np.random.RandomState(global_random_seed), + ) + with pytest.raises(AssertionError): + assert_allclose(clusters_weighted, clusters) + + +@pytest.mark.parametrize("init", ["k-means++", "random"]) +def test_sample_weight_zero(init, global_random_seed): + """Check that if sample weight is 0, this sample won't be chosen. + + `_init_centroids` is shared across all classes inheriting from _BaseKMeans so + it's enough to check for KMeans. + """ + rng = np.random.RandomState(global_random_seed) + X, _ = make_blobs( + n_samples=100, n_features=5, centers=5, random_state=global_random_seed + ) + sample_weight = rng.uniform(size=X.shape[0]) + sample_weight[::2] = 0 + x_squared_norms = row_norms(X, squared=True) + + kmeans = KMeans() + clusters_weighted = kmeans._init_centroids( + X=X, + x_squared_norms=x_squared_norms, + init=init, + sample_weight=sample_weight, + n_centroids=10, + random_state=np.random.RandomState(global_random_seed), + ) + # No center should be one of the 0 sample weight point + # (i.e. be at a distance=0 from it) + d = euclidean_distances(X[::2], clusters_weighted) + assert not np.any(np.isclose(d, 0)) + + +@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) +@pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) +def test_relocating_with_duplicates(algorithm, array_constr): + """Check that kmeans stops when there are more centers than non-duplicate samples + + Non-regression test for issue: + https://github.com/scikit-learn/scikit-learn/issues/28055 + """ + X = np.array([[0, 0], [1, 1], [1, 1], [1, 0], [0, 1]]) + km = KMeans(n_clusters=5, init=X, algorithm=algorithm) + + msg = r"Number of distinct clusters \(4\) found smaller than n_clusters \(5\)" + with pytest.warns(ConvergenceWarning, match=msg): + km.fit(array_constr(X)) + + assert km.n_iter_ == 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_mean_shift.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_mean_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..7216a064ccbc729de42688a48cae3b0be6e89bfa --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_mean_shift.py @@ -0,0 +1,215 @@ +""" +Testing for mean shift clustering methods + +""" + +import warnings + +import numpy as np +import pytest + +from sklearn.cluster import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift +from sklearn.datasets import make_blobs +from sklearn.metrics import v_measure_score +from sklearn.utils._testing import assert_allclose, assert_array_equal + +n_clusters = 3 +centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 +X, _ = make_blobs( + n_samples=300, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=11, +) + + +def test_convergence_of_1d_constant_data(): 
+ # Test convergence using 1D constant data + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/28926 + model = MeanShift() + n_iter = model.fit(np.ones(10).reshape(-1, 1)).n_iter_ + assert n_iter < model.max_iter + + +def test_estimate_bandwidth(): + # Test estimate_bandwidth + bandwidth = estimate_bandwidth(X, n_samples=200) + assert 0.9 <= bandwidth <= 1.5 + + +def test_estimate_bandwidth_1sample(global_dtype): + # Test estimate_bandwidth when n_samples=1 and quantile<1, so that + # n_neighbors is set to 1. + bandwidth = estimate_bandwidth( + X.astype(global_dtype, copy=False), n_samples=1, quantile=0.3 + ) + + assert bandwidth.dtype == X.dtype + assert bandwidth == pytest.approx(0.0, abs=1e-5) + + +@pytest.mark.parametrize( + "bandwidth, cluster_all, expected, first_cluster_label", + [(1.2, True, 3, 0), (1.2, False, 4, -1)], +) +def test_mean_shift( + global_dtype, bandwidth, cluster_all, expected, first_cluster_label +): + # Test MeanShift algorithm + X_with_global_dtype = X.astype(global_dtype, copy=False) + ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all) + labels = ms.fit(X_with_global_dtype).labels_ + labels_unique = np.unique(labels) + n_clusters_ = len(labels_unique) + assert n_clusters_ == expected + assert labels_unique[0] == first_cluster_label + assert ms.cluster_centers_.dtype == global_dtype + + cluster_centers, labels_mean_shift = mean_shift( + X_with_global_dtype, cluster_all=cluster_all + ) + labels_mean_shift_unique = np.unique(labels_mean_shift) + n_clusters_mean_shift = len(labels_mean_shift_unique) + assert n_clusters_mean_shift == expected + assert labels_mean_shift_unique[0] == first_cluster_label + assert cluster_centers.dtype == global_dtype + + +def test_parallel(global_dtype, global_random_seed): + centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 + X, _ = make_blobs( + n_samples=50, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=global_random_seed, + ) + + X = X.astype(global_dtype, copy=False) + + ms1 = MeanShift(n_jobs=2) + ms1.fit(X) + + ms2 = MeanShift() + ms2.fit(X) + + assert_allclose(ms1.cluster_centers_, ms2.cluster_centers_) + assert ms1.cluster_centers_.dtype == ms2.cluster_centers_.dtype + assert_array_equal(ms1.labels_, ms2.labels_) + + +def test_meanshift_predict(global_dtype): + # Test MeanShift.predict + ms = MeanShift(bandwidth=1.2) + X_with_global_dtype = X.astype(global_dtype, copy=False) + labels = ms.fit_predict(X_with_global_dtype) + labels2 = ms.predict(X_with_global_dtype) + assert_array_equal(labels, labels2) + + +def test_meanshift_all_orphans(): + # init away from the data, crash with a sensible warning + ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]]) + msg = "No point was within bandwidth=0.1" + with pytest.raises(ValueError, match=msg): + ms.fit( + X, + ) + + +def test_unfitted(): + # Non-regression: before fit, there should be not fitted attributes. 
+ ms = MeanShift() + assert not hasattr(ms, "cluster_centers_") + assert not hasattr(ms, "labels_") + + +def test_cluster_intensity_tie(global_dtype): + X = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]], dtype=global_dtype) + c1 = MeanShift(bandwidth=2).fit(X) + + X = np.array([[4, 7], [3, 5], [3, 6], [1, 1], [2, 1], [1, 0]], dtype=global_dtype) + c2 = MeanShift(bandwidth=2).fit(X) + assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0]) + assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1]) + + +def test_bin_seeds(global_dtype): + # Test the bin seeding technique which can be used in the mean shift + # algorithm + # Data is just 6 points in the plane + X = np.array( + [[1.0, 1.0], [1.4, 1.4], [1.8, 1.2], [2.0, 1.0], [2.1, 1.1], [0.0, 0.0]], + dtype=global_dtype, + ) + + # With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be + # found + ground_truth = {(1.0, 1.0), (2.0, 1.0), (0.0, 0.0)} + test_bins = get_bin_seeds(X, 1, 1) + test_result = set(tuple(p) for p in test_bins) + assert len(ground_truth.symmetric_difference(test_result)) == 0 + + # With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be + # found + ground_truth = {(1.0, 1.0), (2.0, 1.0)} + test_bins = get_bin_seeds(X, 1, 2) + test_result = set(tuple(p) for p in test_bins) + assert len(ground_truth.symmetric_difference(test_result)) == 0 + + # With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found + # we bail and use the whole data here. + with warnings.catch_warnings(record=True): + test_bins = get_bin_seeds(X, 0.01, 1) + assert_allclose(test_bins, X) + + # tight clusters around [0, 0] and [1, 1], only get two bins + X, _ = make_blobs( + n_samples=100, + n_features=2, + centers=[[0, 0], [1, 1]], + cluster_std=0.1, + random_state=0, + ) + X = X.astype(global_dtype, copy=False) + test_bins = get_bin_seeds(X, 1) + assert_array_equal(test_bins, [[0, 0], [1, 1]]) + + +@pytest.mark.parametrize("max_iter", [1, 100]) +def test_max_iter(max_iter): + clusters1, _ = mean_shift(X, max_iter=max_iter) + ms = MeanShift(max_iter=max_iter).fit(X) + clusters2 = ms.cluster_centers_ + + assert ms.n_iter_ <= ms.max_iter + assert len(clusters1) == len(clusters2) + + for c1, c2 in zip(clusters1, clusters2): + assert np.allclose(c1, c2) + + +def test_mean_shift_zero_bandwidth(global_dtype): + # Check that mean shift works when the estimated bandwidth is 0. + X = np.array([1, 1, 1, 2, 2, 2, 3, 3], dtype=global_dtype).reshape(-1, 1) + + # estimate_bandwidth with default args returns 0 on this dataset + bandwidth = estimate_bandwidth(X) + assert bandwidth == 0 + + # get_bin_seeds with a 0 bin_size should return the dataset itself + assert get_bin_seeds(X, bin_size=bandwidth) is X + + # MeanShift with binning and a 0 estimated bandwidth should be equivalent + # to no binning. 
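+    # bandwidth=None lets MeanShift re-estimate the bandwidth internally (0 here,
+    # as asserted above), so bin seeding degenerates to using every sample as a
+    # seed and both fits should agree.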
+ ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X) + ms_nobinning = MeanShift(bin_seeding=False).fit(X) + expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2]) + + assert v_measure_score(ms_binning.labels_, expected_labels) == pytest.approx(1) + assert v_measure_score(ms_nobinning.labels_, expected_labels) == pytest.approx(1) + assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_optics.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_optics.py new file mode 100644 index 0000000000000000000000000000000000000000..02184ea454d65cc1f2d9d95f265e52d307c30543 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_optics.py @@ -0,0 +1,874 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np +import pytest + +from sklearn.cluster import DBSCAN, OPTICS +from sklearn.cluster._optics import _extend_region, _extract_xi_labels +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets import make_blobs +from sklearn.exceptions import DataConversionWarning, EfficiencyWarning +from sklearn.metrics.cluster import contingency_matrix +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import CSR_CONTAINERS + +rng = np.random.RandomState(0) +n_points_per_cluster = 10 +C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2) +C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2) +C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2) +C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2) +C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2) +C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2) +X = np.vstack((C1, C2, C3, C4, C5, C6)) + + +@pytest.mark.parametrize( + ("r_plot", "end"), + [ + [[10, 8.9, 8.8, 8.7, 7, 10], 3], + [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0], + [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4], + [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4], + ], +) +def test_extend_downward(r_plot, end): + r_plot = np.array(r_plot) + ratio = r_plot[:-1] / r_plot[1:] + steep_downward = ratio >= 1 / 0.9 + upward = ratio < 1 + + e = _extend_region(steep_downward, upward, 0, 2) + assert e == end + + +@pytest.mark.parametrize( + ("r_plot", "end"), + [ + [[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6], + [[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0], + [[1, 2, 2.1, 2, np.inf], 0], + [[1, 2, 2.1, np.inf], 2], + ], +) +def test_extend_upward(r_plot, end): + r_plot = np.array(r_plot) + ratio = r_plot[:-1] / r_plot[1:] + steep_upward = ratio <= 0.9 + downward = ratio > 1 + + e = _extend_region(steep_upward, downward, 0, 2) + assert e == end + + +@pytest.mark.parametrize( + ("ordering", "clusters", "expected"), + [ + [[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]], + [[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]], + [[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]], + [[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]], + ], +) +def test_the_extract_xi_labels(ordering, clusters, expected): + labels = _extract_xi_labels(ordering, clusters) + + assert_array_equal(labels, expected) + + +def test_extract_xi(global_dtype): + # small and easy test (no clusters around other clusters) + # but with a clear noise data. + # global_random_seed is not used here since the expected labels + # are hardcoded for these specific data. 
+ rng = np.random.RandomState(0) + n_points_per_cluster = 5 + + C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2) + C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2) + C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2) + C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2) + C5 = [3, -2] + 0.6 * rng.randn(n_points_per_cluster, 2) + C6 = [5, 6] + 0.2 * rng.randn(n_points_per_cluster, 2) + + X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)).astype( + global_dtype, copy=False + ) + expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, -1, [4] * 5] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + + clust = OPTICS( + min_samples=3, min_cluster_size=2, max_eps=20, cluster_method="xi", xi=0.4 + ).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + # check float min_samples and min_cluster_size + clust = OPTICS( + min_samples=0.1, min_cluster_size=0.08, max_eps=20, cluster_method="xi", xi=0.4 + ).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)).astype( + global_dtype, copy=False + ) + expected_labels = np.r_[ + [1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, -1, -1, [4] * 5 + ] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + + clust = OPTICS( + min_samples=3, min_cluster_size=3, max_eps=20, cluster_method="xi", xi=0.3 + ).fit(X) + # this may fail if the predecessor correction is not at work! + assert_array_equal(clust.labels_, expected_labels) + + C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]] + C2 = [[10, 10], [10, 9], [10, 11], [9, 10]] + C3 = [[100, 100], [100, 90], [100, 110], [90, 100]] + X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False) + expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] + X, expected_labels = shuffle(X, expected_labels, random_state=rng) + + clust = OPTICS( + min_samples=2, min_cluster_size=2, max_eps=np.inf, cluster_method="xi", xi=0.04 + ).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + +def test_cluster_hierarchy(global_dtype, global_random_seed): + rng = np.random.RandomState(global_random_seed) + n_points_per_cluster = 100 + C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2).astype( + global_dtype, copy=False + ) + C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2).astype( + global_dtype, copy=False + ) + X = np.vstack((C1, C2)) + X = shuffle(X, random_state=rng) + + clusters = OPTICS(min_samples=20, xi=0.2).fit(X).cluster_hierarchy_ + assert clusters.shape == (2, 2) + + # The first cluster should contain all point from C1 but due to how the data is + # generated, some points from C2 may end up in it. + assert 100 <= np.diff(clusters[0]) + 1 <= 115 + # The second cluster should contain all points from C1 and C2. + assert np.diff(clusters[-1]) + 1 == 200 + + +@pytest.mark.parametrize( + "csr_container, metric", + [(None, "minkowski")] + [(container, "euclidean") for container in CSR_CONTAINERS], +) +def test_correct_number_of_clusters(metric, csr_container): + # in 'auto' mode + + n_clusters = 3 + X = generate_clustered_data(n_clusters=n_clusters) + # Parameters chosen specifically for this task. 
+ # Compute OPTICS + clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=0.1, metric=metric) + clust.fit(csr_container(X) if csr_container is not None else X) + # number of clusters, ignoring noise if present + n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) + assert n_clusters_1 == n_clusters + + # check attribute types and sizes + assert clust.labels_.shape == (len(X),) + assert clust.labels_.dtype.kind == "i" + + assert clust.reachability_.shape == (len(X),) + assert clust.reachability_.dtype.kind == "f" + + assert clust.core_distances_.shape == (len(X),) + assert clust.core_distances_.dtype.kind == "f" + + assert clust.ordering_.shape == (len(X),) + assert clust.ordering_.dtype.kind == "i" + assert set(clust.ordering_) == set(range(len(X))) + + +def test_minimum_number_of_sample_check(): + # test that we check a minimum number of samples + msg = "min_samples must be no greater than" + + # Compute OPTICS + X = [[1, 1]] + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0) + + # Run the fit + with pytest.raises(ValueError, match=msg): + clust.fit(X) + + +def test_bad_extract(): + # Test an extraction of eps too close to original eps + msg = "Specify an epsilon smaller than 0.15. Got 0.3." + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 + ) + + # Compute OPTICS + clust = OPTICS(max_eps=5.0 * 0.03, cluster_method="dbscan", eps=0.3, min_samples=10) + with pytest.raises(ValueError, match=msg): + clust.fit(X) + + +def test_bad_reachability(): + msg = "All reachability values are inf. Set a larger max_eps." + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 + ) + + with pytest.warns(UserWarning, match=msg): + clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015) + clust.fit(X) + + +def test_nowarn_if_metric_bool_data_bool(): + # make sure no warning is raised if metric and data are both boolean + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18996 + + pairwise_metric = "rogerstanimoto" + X = np.random.randint(2, size=(5, 2), dtype=bool) + + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + + OPTICS(metric=pairwise_metric).fit(X) + + +def test_warn_if_metric_bool_data_no_bool(): + # make sure a *single* conversion warning is raised if metric is boolean + # but data isn't + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18996 + + pairwise_metric = "rogerstanimoto" + X = np.random.randint(2, size=(5, 2), dtype=np.int32) + msg = f"Data will be converted to boolean for metric {pairwise_metric}" + + with pytest.warns(DataConversionWarning, match=msg) as warn_record: + # Silence a DeprecationWarning from joblib <= 1.5.1 in Python 3.14+. 
+ warnings.filterwarnings( + "ignore", + message="'asyncio.iscoroutinefunction' is deprecated", + category=DeprecationWarning, + ) + OPTICS(metric=pairwise_metric).fit(X) + assert len(warn_record) == 1 + + +def test_nowarn_if_metric_no_bool(): + # make sure no conversion warning is raised if + # metric isn't boolean, no matter what the data type is + pairwise_metric = "minkowski" + X_bool = np.random.randint(2, size=(5, 2), dtype=bool) + X_num = np.random.randint(2, size=(5, 2), dtype=np.int32) + + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + + # fit boolean data + OPTICS(metric=pairwise_metric).fit(X_bool) + # fit numeric data + OPTICS(metric=pairwise_metric).fit(X_num) + + +def test_close_extract(): + # Test extract where extraction eps is close to scaled max_eps + + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=750, centers=centers, cluster_std=0.4, random_state=0 + ) + + # Compute OPTICS + clust = OPTICS(max_eps=1.0, cluster_method="dbscan", eps=0.3, min_samples=10).fit(X) + # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters + assert max(clust.labels_) == 2 + + +@pytest.mark.parametrize("eps", [0.1, 0.3, 0.5]) +@pytest.mark.parametrize("min_samples", [3, 10, 20]) +@pytest.mark.parametrize( + "csr_container, metric", + [(None, "minkowski"), (None, "euclidean")] + + [(container, "euclidean") for container in CSR_CONTAINERS], +) +def test_dbscan_optics_parity(eps, min_samples, metric, global_dtype, csr_container): + # Test that OPTICS clustering labels are <= 5% difference of DBSCAN + + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs( + n_samples=150, centers=centers, cluster_std=0.4, random_state=0 + ) + X = csr_container(X) if csr_container is not None else X + + X = X.astype(global_dtype, copy=False) + + # calculate optics with dbscan extract at 0.3 epsilon + op = OPTICS( + min_samples=min_samples, cluster_method="dbscan", eps=eps, metric=metric + ).fit(X) + + # calculate dbscan labels + db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) + + contingency = contingency_matrix(db.labels_, op.labels_) + agree = min( + np.sum(np.max(contingency, axis=0)), np.sum(np.max(contingency, axis=1)) + ) + disagree = X.shape[0] - agree + + percent_mismatch = np.round((disagree - 1) / X.shape[0], 2) + + # verify label mismatch is <= 5% labels + assert percent_mismatch <= 0.05 + + +def test_min_samples_edge_case(global_dtype): + C1 = [[0, 0], [0, 0.1], [0, -0.1]] + C2 = [[10, 10], [10, 9], [10, 11]] + C3 = [[100, 100], [100, 96], [100, 106]] + X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False) + + expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3] + clust = OPTICS(min_samples=3, max_eps=7, cluster_method="xi", xi=0.04).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3] + clust = OPTICS(min_samples=3, max_eps=3, cluster_method="xi", xi=0.04).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + expected_labels = np.r_[[-1] * 9] + with pytest.warns(UserWarning, match="All reachability values"): + clust = OPTICS(min_samples=4, max_eps=3, cluster_method="xi", xi=0.04).fit(X) + assert_array_equal(clust.labels_, expected_labels) + + +# try arbitrary minimum sizes +@pytest.mark.parametrize("min_cluster_size", range(2, X.shape[0] // 10, 23)) +def test_min_cluster_size(min_cluster_size, global_dtype): + redX = X[::2].astype(global_dtype, copy=False) # reduce for speed + clust = OPTICS(min_samples=9, 
min_cluster_size=min_cluster_size).fit(redX) + cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1]) + if cluster_sizes.size: + assert min(cluster_sizes) >= min_cluster_size + # check behaviour is the same when min_cluster_size is a fraction + clust_frac = OPTICS( + min_samples=9, + min_cluster_size=min_cluster_size / redX.shape[0], + ) + clust_frac.fit(redX) + assert_array_equal(clust.labels_, clust_frac.labels_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_min_cluster_size_invalid2(csr_container): + clust = OPTICS(min_cluster_size=len(X) + 1) + with pytest.raises(ValueError, match="must be no greater than the "): + clust.fit(X) + + clust = OPTICS(min_cluster_size=len(X) + 1, metric="euclidean") + with pytest.raises(ValueError, match="must be no greater than the "): + clust.fit(csr_container(X)) + + +def test_processing_order(): + # Ensure that we consider all unprocessed points, + # not only direct neighbors. when picking the next point. + Y = [[0], [10], [-10], [25]] + + clust = OPTICS(min_samples=3, max_eps=15).fit(Y) + assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) + assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf]) + assert_array_equal(clust.ordering_, [0, 1, 2, 3]) + + +def test_compare_to_ELKI(): + # Expected values, computed with (future) ELKI 0.7.5 using: + # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter + # -algorithm clustering.optics.OPTICSHeap -optics.minpts 5 + # where the FixedDBIDsFilter gives 0-indexed ids. + r1 = [ + np.inf, + 1.0574896366427478, + 0.7587934993548423, + 0.7290174038973836, + 0.7290174038973836, + 0.7290174038973836, + 0.6861627576116127, + 0.7587934993548423, + 0.9280118450166668, + 1.1748022534146194, + 3.3355455741292257, + 0.49618389254482587, + 0.2552805046961355, + 0.2552805046961355, + 0.24944622248445714, + 0.24944622248445714, + 0.24944622248445714, + 0.2552805046961355, + 0.2552805046961355, + 0.3086779122185853, + 4.163024452756142, + 1.623152630340929, + 0.45315840475822655, + 0.25468325192031926, + 0.2254004358159971, + 0.18765711877083036, + 0.1821471333893275, + 0.1821471333893275, + 0.18765711877083036, + 0.18765711877083036, + 0.2240202988740153, + 1.154337614548715, + 1.342604473837069, + 1.323308536402633, + 0.8607514948648837, + 0.27219111215810565, + 0.13260875220533205, + 0.13260875220533205, + 0.09890587675958984, + 0.09890587675958984, + 0.13548790801634494, + 0.1575483940837384, + 0.17515137170530226, + 0.17575920159442388, + 0.27219111215810565, + 0.6101447895405373, + 1.3189208094864302, + 1.323308536402633, + 2.2509184159764577, + 2.4517810628594527, + 3.675977064404973, + 3.8264795626020365, + 2.9130735341510614, + 2.9130735341510614, + 2.9130735341510614, + 2.9130735341510614, + 2.8459300127258036, + 2.8459300127258036, + 2.8459300127258036, + 3.0321982337972537, + ] + o1 = [ + 0, + 3, + 6, + 4, + 7, + 8, + 2, + 9, + 5, + 1, + 31, + 30, + 32, + 34, + 33, + 38, + 39, + 35, + 37, + 36, + 44, + 21, + 23, + 24, + 22, + 25, + 27, + 29, + 26, + 28, + 20, + 40, + 45, + 46, + 10, + 15, + 11, + 13, + 17, + 19, + 18, + 12, + 16, + 14, + 47, + 49, + 43, + 48, + 42, + 41, + 53, + 57, + 51, + 52, + 56, + 59, + 54, + 55, + 58, + 50, + ] + p1 = [ + -1, + 0, + 3, + 6, + 6, + 6, + 8, + 3, + 7, + 5, + 1, + 31, + 30, + 30, + 34, + 34, + 34, + 32, + 32, + 37, + 36, + 44, + 21, + 23, + 24, + 22, + 25, + 25, + 22, + 22, + 22, + 21, + 40, + 45, + 46, + 10, + 15, + 15, + 13, + 13, + 15, + 11, + 19, + 15, + 10, + 47, + 12, + 45, + 14, + 43, + 42, + 53, + 57, + 
57, + 57, + 57, + 59, + 59, + 59, + 58, + ] + + # Tests against known extraction array + # Does NOT work with metric='euclidean', because sklearn euclidean has + # worse numeric precision. 'minkowski' is slower but more accurate. + clust1 = OPTICS(min_samples=5).fit(X) + + assert_array_equal(clust1.ordering_, np.array(o1)) + assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1)) + assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1)) + # ELKI currently does not print the core distances (which are not used much + # in literature, but we can at least ensure to have this consistency: + for i in clust1.ordering_[1:]: + assert clust1.reachability_[i] >= clust1.core_distances_[clust1.predecessor_[i]] + + # Expected values, computed with (future) ELKI 0.7.5 using + r2 = [ + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + 0.27219111215810565, + 0.13260875220533205, + 0.13260875220533205, + 0.09890587675958984, + 0.09890587675958984, + 0.13548790801634494, + 0.1575483940837384, + 0.17515137170530226, + 0.17575920159442388, + 0.27219111215810565, + 0.4928068613197889, + np.inf, + 0.2666183922512113, + 0.18765711877083036, + 0.1821471333893275, + 0.1821471333893275, + 0.1821471333893275, + 0.18715928772277457, + 0.18765711877083036, + 0.18765711877083036, + 0.25468325192031926, + np.inf, + 0.2552805046961355, + 0.2552805046961355, + 0.24944622248445714, + 0.24944622248445714, + 0.24944622248445714, + 0.2552805046961355, + 0.2552805046961355, + 0.3086779122185853, + 0.34466409325984865, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + ] + o2 = [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 15, + 11, + 13, + 17, + 19, + 18, + 12, + 16, + 14, + 47, + 46, + 20, + 22, + 25, + 23, + 27, + 29, + 24, + 26, + 28, + 21, + 30, + 32, + 34, + 33, + 38, + 39, + 35, + 37, + 36, + 31, + 40, + 41, + 42, + 43, + 44, + 45, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + ] + p2 = [ + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + 10, + 15, + 15, + 13, + 13, + 15, + 11, + 19, + 15, + 10, + 47, + -1, + 20, + 22, + 25, + 25, + 25, + 25, + 22, + 22, + 23, + -1, + 30, + 30, + 34, + 34, + 34, + 32, + 32, + 37, + 38, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + -1, + ] + clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X) + + assert_array_equal(clust2.ordering_, np.array(o2)) + assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2)) + assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2)) + + index = np.where(clust1.core_distances_ <= 0.5)[0] + assert_allclose(clust1.core_distances_[index], clust2.core_distances_[index]) + + +def test_extract_dbscan(global_dtype, global_random_seed): + # testing an easy dbscan case. Not including clusters with different + # densities. 
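+    # Four well separated blobs of identical density, so a single DBSCAN
+    # extraction at eps=0.5 should recover exactly the labels 0-3.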
+ rng = np.random.RandomState(global_random_seed) + n_points_per_cluster = 20 + C1 = [-5, -2] + 0.2 * rng.randn(n_points_per_cluster, 2) + C2 = [4, -1] + 0.2 * rng.randn(n_points_per_cluster, 2) + C3 = [1, 2] + 0.2 * rng.randn(n_points_per_cluster, 2) + C4 = [-2, 3] + 0.2 * rng.randn(n_points_per_cluster, 2) + X = np.vstack((C1, C2, C3, C4)).astype(global_dtype, copy=False) + + clust = OPTICS(cluster_method="dbscan", eps=0.5).fit(X) + assert_array_equal( + np.sort(np.unique(clust.labels_[clust.labels_ != -1])), [0, 1, 2, 3] + ) + + +@pytest.mark.parametrize("csr_container", [None] + CSR_CONTAINERS) +def test_precomputed_dists(global_dtype, csr_container): + redX = X[::2].astype(global_dtype, copy=False) + dists = pairwise_distances(redX, metric="euclidean") + dists = csr_container(dists) if csr_container is not None else dists + with warnings.catch_warnings(): + warnings.simplefilter("ignore", EfficiencyWarning) + clust1 = OPTICS(min_samples=10, algorithm="brute", metric="precomputed").fit( + dists + ) + clust2 = OPTICS(min_samples=10, algorithm="brute", metric="euclidean").fit(redX) + + assert_allclose(clust1.reachability_, clust2.reachability_) + assert_array_equal(clust1.labels_, clust2.labels_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_optics_input_not_modified_precomputed_sparse_nodiag( + csr_container, global_random_seed +): + """Check that we don't modify in-place the pre-computed sparse matrix. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27508 + """ + X = np.random.RandomState(global_random_seed).rand(6, 6) + # Add zeros on the diagonal that will be implicit when creating + # the sparse matrix. If `X` is modified in-place, the zeros from + # the diagonal will be made explicit. + np.fill_diagonal(X, 0) + X = csr_container(X) + assert all(row != col for row, col in zip(*X.nonzero())) + X_copy = X.copy() + OPTICS(metric="precomputed").fit(X) + # Make sure that we did not modify `X` in-place even by creating + # explicit 0s values. + assert X.nnz == X_copy.nnz + assert_array_equal(X.toarray(), X_copy.toarray()) + + +def test_optics_predecessor_correction_ordering(): + """Check that cluster correction using predecessor is working as expected. + + In the following example, the predecessor correction was not working properly + since it was not using the right indices. + + This non-regression test check that reordering the data does not change the results. 
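+    The labels found on the reordered data are compared against the original
+    labels mapped through the same permutation.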
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26324 + """ + X_1 = np.array([1, 2, 3, 1, 8, 8, 7, 100]).reshape(-1, 1) + reorder = [0, 1, 2, 4, 5, 6, 7, 3] + X_2 = X_1[reorder] + + optics_1 = OPTICS(min_samples=3, metric="euclidean").fit(X_1) + optics_2 = OPTICS(min_samples=3, metric="euclidean").fit(X_2) + + assert_array_equal(optics_1.labels_[reorder], optics_2.labels_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_spectral.py b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_spectral.py new file mode 100644 index 0000000000000000000000000000000000000000..71b11c9fe151c310f4fd5a60f99323360e493506 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cluster/tests/test_spectral.py @@ -0,0 +1,335 @@ +"""Testing for Spectral Clustering methods""" + +import pickle +import re + +import numpy as np +import pytest +from scipy.linalg import LinAlgError + +from sklearn.cluster import SpectralClustering, spectral_clustering +from sklearn.cluster._spectral import cluster_qr, discretize +from sklearn.datasets import make_blobs +from sklearn.feature_extraction import img_to_graph +from sklearn.metrics import adjusted_rand_score +from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS + +try: + from pyamg import smoothed_aggregation_solver # noqa: F401 + + amg_loaded = True +except ImportError: + amg_loaded = False + +centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10 +X, _ = make_blobs( + n_samples=60, + n_features=2, + centers=centers, + cluster_std=0.4, + shuffle=True, + random_state=0, +) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg")) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_spectral_clustering( + eigen_solver, assign_labels, csr_container, global_random_seed +): + S = np.array( + [ + [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], + [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], + ] + ) + + for mat in (S, csr_container(S)): + model = SpectralClustering( + random_state=global_random_seed, + n_clusters=2, + affinity="precomputed", + eigen_solver=eigen_solver, + assign_labels=assign_labels, + ).fit(mat) + labels = model.labels_ + if labels[0] == 0: + labels = 1 - labels + + assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1 + + model_copy = pickle.loads(pickle.dumps(model)) + assert model_copy.n_clusters == model.n_clusters + assert model_copy.eigen_solver == model.eigen_solver + assert_array_equal(model_copy.labels_, model.labels_) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_spectral_clustering_sparse(assign_labels, coo_container, global_random_seed): + X, y = make_blobs( + n_samples=20, + random_state=global_random_seed, + centers=[[1, 1], [-1, -1]], + cluster_std=0.01, + ) + + S = rbf_kernel(X, gamma=1) + S = np.maximum(S - 1e-4, 0) + S = coo_container(S) + + labels = ( + SpectralClustering( + random_state=global_random_seed, + n_clusters=2, + affinity="precomputed", + 
assign_labels=assign_labels, + ) + .fit(S) + .labels_ + ) + assert adjusted_rand_score(y, labels) == 1 + + +def test_precomputed_nearest_neighbors_filtering(global_random_seed): + # Test precomputed graph filtering when containing too many neighbors + X, y = make_blobs( + n_samples=250, + random_state=global_random_seed, + centers=[[1, 1, 1], [-1, -1, -1]], + cluster_std=0.01, + ) + + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(X) + graph = nn.kneighbors_graph(X, mode="distance") + labels = ( + SpectralClustering( + random_state=global_random_seed, + n_clusters=2, + affinity="precomputed_nearest_neighbors", + n_neighbors=n_neighbors, + ) + .fit(graph) + .labels_ + ) + results.append(labels) + + assert_array_equal(results[0], results[1]) + + +def test_affinities(global_random_seed): + # Note: in the following, random_state has been selected to have + # a dataset that yields a stable eigen decomposition both when built + # on OSX and Linux + X, y = make_blobs( + n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 + ) + # nearest neighbors affinity + sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors", random_state=0) + with pytest.warns(UserWarning, match="not fully connected"): + sp.fit(X) + assert adjusted_rand_score(y, sp.labels_) == 1 + + sp = SpectralClustering(n_clusters=2, gamma=2, random_state=global_random_seed) + labels = sp.fit(X).labels_ + assert adjusted_rand_score(y, labels) == 1 + + X = check_random_state(10).rand(10, 5) * 10 + + kernels_available = kernel_metrics() + for kern in kernels_available: + # Additive chi^2 gives a negative similarity matrix which + # doesn't make sense for spectral clustering + if kern != "additive_chi2": + sp = SpectralClustering(n_clusters=2, affinity=kern, random_state=0) + labels = sp.fit(X).labels_ + assert (X.shape[0],) == labels.shape + + sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1, random_state=0) + labels = sp.fit(X).labels_ + assert (X.shape[0],) == labels.shape + + def histogram(x, y, **kwargs): + # Histogram kernel implemented as a callable. + assert kwargs == {} # no kernel_params that we didn't ask for + return np.minimum(x, y).sum() + + sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0) + labels = sp.fit(X).labels_ + assert (X.shape[0],) == labels.shape + + +def test_cluster_qr(global_random_seed): + # cluster_qr by itself should not be used for clustering generic data + # other than the rows of the eigenvectors within spectral clustering, + # but cluster_qr must still preserve the labels for different dtypes + # of the generic fixed input even if the labels may be meaningless. + random_state = np.random.RandomState(seed=global_random_seed) + n_samples, n_components = 10, 5 + data = random_state.randn(n_samples, n_components) + labels_float64 = cluster_qr(data.astype(np.float64)) + # Each sample is assigned a cluster identifier + assert labels_float64.shape == (n_samples,) + # All components should be covered by the assignment + assert np.array_equal(np.unique(labels_float64), np.arange(n_components)) + # Single precision data should yield the same cluster assignments + labels_float32 = cluster_qr(data.astype(np.float32)) + assert np.array_equal(labels_float64, labels_float32) + + +def test_cluster_qr_permutation_invariance(global_random_seed): + # cluster_qr must be invariant to sample permutation. 
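+    # i.e. clustering a permuted copy of the data must give the same labels as
+    # clustering the original data and then applying that permutation to the labels.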
+ random_state = np.random.RandomState(seed=global_random_seed) + n_samples, n_components = 100, 5 + data = random_state.randn(n_samples, n_components) + perm = random_state.permutation(n_samples) + assert np.array_equal( + cluster_qr(data)[perm], + cluster_qr(data[perm]), + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +@pytest.mark.parametrize("n_samples", [50, 100, 150, 500]) +def test_discretize(n_samples, coo_container, global_random_seed): + # Test the discretize using a noise assignment matrix + random_state = np.random.RandomState(seed=global_random_seed) + for n_class in range(2, 10): + # random class labels + y_true = random_state.randint(0, n_class + 1, n_samples) + y_true = np.array(y_true, float) + # noise class assignment matrix + y_indicator = coo_container( + (np.ones(n_samples), (np.arange(n_samples), y_true)), + shape=(n_samples, n_class + 1), + ) + y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn( + n_samples, n_class + 1 + ) + y_pred = discretize(y_true_noisy, random_state=random_state) + assert adjusted_rand_score(y_true, y_pred) > 0.8 + + +def test_spectral_clustering_with_arpack_amg_solvers(global_random_seed): + # Test that spectral_clustering is the same for arpack and amg solver + # Based on toy example from plot_segmentation_toy.py + + # a small two coin image + x, y = np.indices((40, 40)) + + center1, center2 = (14, 12), (20, 25) + radius1, radius2 = 8, 7 + + circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1**2 + circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2**2 + + circles = circle1 | circle2 + mask = circles.copy() + img = circles.astype(float) + + graph = img_to_graph(img, mask=mask) + graph.data = np.exp(-graph.data / graph.data.std()) + + labels_arpack = spectral_clustering( + graph, n_clusters=2, eigen_solver="arpack", random_state=global_random_seed + ) + + assert len(np.unique(labels_arpack)) == 2 + + if amg_loaded: + labels_amg = spectral_clustering( + graph, n_clusters=2, eigen_solver="amg", random_state=global_random_seed + ) + assert adjusted_rand_score(labels_arpack, labels_amg) == 1 + else: + with pytest.raises(ValueError): + spectral_clustering(graph, n_clusters=2, eigen_solver="amg", random_state=0) + + +def test_n_components(global_random_seed): + # Test that after adding n_components, result is different and + # n_components = n_clusters by default + X, y = make_blobs( + n_samples=20, + random_state=global_random_seed, + centers=[[1, 1], [-1, -1]], + cluster_std=0.01, + ) + sp = SpectralClustering(n_clusters=2, random_state=global_random_seed) + labels = sp.fit(X).labels_ + # set n_components = n_cluster and test if result is the same + labels_same_ncomp = ( + SpectralClustering( + n_clusters=2, n_components=2, random_state=global_random_seed + ) + .fit(X) + .labels_ + ) + # test that n_components=n_clusters by default + assert_array_equal(labels, labels_same_ncomp) + + # test that n_components affect result + # n_clusters=8 by default, and set n_components=2 + labels_diff_ncomp = ( + SpectralClustering(n_components=2, random_state=global_random_seed) + .fit(X) + .labels_ + ) + assert not np.array_equal(labels, labels_diff_ncomp) + + +@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr")) +def test_verbose(assign_labels, capsys): + # Check verbose mode of KMeans for better coverage. 
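+    # The KMeans-specific messages are only asserted for the "kmeans"
+    # parametrization of assign_labels; the label-assignment message is
+    # expected for every parametrization.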
+ X, y = make_blobs( + n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 + ) + + SpectralClustering(n_clusters=2, random_state=42, verbose=1).fit(X) + + captured = capsys.readouterr() + + assert re.search(r"Computing label assignment using", captured.out) + + if assign_labels == "kmeans": + assert re.search(r"Initialization complete", captured.out) + assert re.search(r"Iteration [0-9]+, inertia", captured.out) + + +def test_spectral_clustering_np_matrix_raises(): + """Check that spectral_clustering raises an informative error when passed + a np.matrix. See #10993""" + X = np.matrix([[0.0, 2.0], [2.0, 0.0]]) + + msg = r"np\.matrix is not supported. Please convert to a numpy array" + with pytest.raises(TypeError, match=msg): + spectral_clustering(X) + + +def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch): + """Check that discretize raises LinAlgError when svd never converges. + + Non-regression test for #21380 + """ + + def new_svd(*args, **kwargs): + raise LinAlgError() + + monkeypatch.setattr(np.linalg, "svd", new_svd) + vectors = np.ones((10, 4)) + + with pytest.raises(LinAlgError, match="SVD did not converge"): + discretize(vectors) diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/compose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..842a86ba21d9b7e2e738284faf1a394e3f6ae7e9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/__init__.py @@ -0,0 +1,23 @@ +"""Meta-estimators for building composite models with transformers. + +In addition to its current contents, this module will eventually be home to +refurbished versions of :class:`~sklearn.pipeline.Pipeline` and +:class:`~sklearn.pipeline.FeatureUnion`. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._column_transformer import ( + ColumnTransformer, + make_column_selector, + make_column_transformer, +) +from ._target import TransformedTargetRegressor + +__all__ = [ + "ColumnTransformer", + "TransformedTargetRegressor", + "make_column_selector", + "make_column_transformer", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..940d9194dd97657f0cd746b89b482ea4d75d8852 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py @@ -0,0 +1,1604 @@ +""" +The :mod:`sklearn.compose._column_transformer` module implements utilities +to work with heterogeneous data and to apply different transformers to +different columns. 
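+Its public entry points, re-exported by :mod:`sklearn.compose`, are
+``ColumnTransformer``, ``make_column_transformer`` and ``make_column_selector``.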
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections import Counter +from functools import partial +from itertools import chain +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from ..base import TransformerMixin, _fit_context, clone +from ..pipeline import _fit_transform_one, _name_estimators, _transform_one +from ..preprocessing import FunctionTransformer +from ..utils import Bunch +from ..utils._indexing import _determine_key_type, _get_column_indices, _safe_indexing +from ..utils._metadata_requests import METHODS +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils._set_output import ( + _get_container_adapter, + _get_output_config, + _safe_set_output, +) +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import _BaseComposition +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names, + _check_feature_names_in, + _check_n_features, + _get_feature_names, + _is_pandas_df, + _num_samples, + check_array, + check_is_fitted, +) + +__all__ = ["ColumnTransformer", "make_column_selector", "make_column_transformer"] + + +_ERR_MSG_1DCOLUMN = ( + "1D data passed to a transformer that expects 2D data. " + "Try to specify the column selection as a list of one " + "item instead of a scalar." +) + + +class ColumnTransformer(TransformerMixin, _BaseComposition): + """Applies transformers to columns of an array or pandas DataFrame. + + This estimator allows different columns or column subsets of the input + to be transformed separately and the features generated by each transformer + will be concatenated to form a single feature space. + This is useful for heterogeneous or columnar data, to combine several + feature extraction mechanisms or transformations into a single transformer. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + transformers : list of tuples + List of (name, transformer, columns) tuples specifying the + transformer objects to be applied to subsets of the data. + + name : str + Like in Pipeline and FeatureUnion, this allows the transformer and + its parameters to be set using ``set_params`` and searched in grid + search. + transformer : {'drop', 'passthrough'} or estimator + Estimator must support :term:`fit` and :term:`transform`. + Special-cased strings 'drop' and 'passthrough' are accepted as + well, to indicate to drop the columns or to pass them through + untransformed, respectively. + columns : str, array-like of str, int, array-like of int, \ + array-like of bool, slice or callable + Indexes the data on its second axis. Integers are interpreted as + positional columns, while strings can reference DataFrame columns + by name. A scalar string or int should be used where + ``transformer`` expects X to be a 1d array-like (vector), + otherwise a 2d array will be passed to the transformer. + A callable is passed the input data `X` and can return any of the + above. To select multiple columns by name or dtype, you can use + :obj:`make_column_selector`. 
+ + remainder : {'drop', 'passthrough'} or estimator, default='drop' + By default, only the specified columns in `transformers` are + transformed and combined in the output, and the non-specified + columns are dropped. (default of ``'drop'``). + By specifying ``remainder='passthrough'``, all remaining columns that + were not specified in `transformers`, but present in the data passed + to `fit` will be automatically passed through. This subset of columns + is concatenated with the output of the transformers. For dataframes, + extra columns not seen during `fit` will be excluded from the output + of `transform`. + By setting ``remainder`` to be an estimator, the remaining + non-specified columns will use the ``remainder`` estimator. The + estimator must support :term:`fit` and :term:`transform`. + Note that using this feature requires that the DataFrame columns + input at :term:`fit` and :term:`transform` have identical order. + + sparse_threshold : float, default=0.3 + If the output of the different transformers contains sparse matrices, + these will be stacked as a sparse matrix if the overall density is + lower than this value. Use ``sparse_threshold=0`` to always return + dense. When the transformed output consists of all dense data, the + stacked result will be dense, and this keyword will be ignored. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + transformer_weights : dict, default=None + Multiplicative weights for features per transformer. The output of the + transformer is multiplied by these weights. Keys are transformer names, + values the weights. + + verbose : bool, default=False + If True, the time elapsed while fitting each transformer will be + printed as it is completed. + + verbose_feature_names_out : bool, str or Callable[[str, str], str], default=True + + - If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix + all feature names with the name of the transformer that generated that + feature. It is equivalent to setting + `verbose_feature_names_out="{transformer_name}__{feature_name}"`. + - If False, :meth:`ColumnTransformer.get_feature_names_out` will not + prefix any feature names and will error if feature names are not + unique. + - If ``Callable[[str, str], str]``, + :meth:`ColumnTransformer.get_feature_names_out` will rename all the features + using the name of the transformer. The first argument of the callable is the + transformer name and the second argument is the feature name. The returned + string will be the new feature name. + - If ``str``, it must be a string ready for formatting. The given string will + be formatted using two field names: ``transformer_name`` and ``feature_name``. + e.g. ``"{feature_name}__{transformer_name}"``. See :meth:`str.format` method + from the standard library for more info. + + .. versionadded:: 1.0 + + .. versionchanged:: 1.6 + `verbose_feature_names_out` can be a callable or a string to be formatted. + + force_int_remainder_cols : bool, default=False + This parameter has no effect. + + .. note:: + If you do not access the list of columns for the remainder columns + in the `transformers_` fitted attribute, you do not need to set + this parameter. + + .. versionadded:: 1.5 + + .. versionchanged:: 1.7 + The default value for `force_int_remainder_cols` will change from + `True` to `False` in version 1.7. + + .. 
deprecated:: 1.7 + `force_int_remainder_cols` is deprecated and will be removed in 1.9. + + Attributes + ---------- + transformers_ : list + The collection of fitted transformers as tuples of (name, + fitted_transformer, column). `fitted_transformer` can be an estimator, + or `'drop'`; `'passthrough'` is replaced with an equivalent + :class:`~sklearn.preprocessing.FunctionTransformer`. In case there were + no columns selected, this will be the unfitted transformer. If there + are remaining columns, the final element is a tuple of the form: + ('remainder', transformer, remaining_columns) corresponding to the + ``remainder`` parameter. If there are remaining columns, then + ``len(transformers_)==len(transformers)+1``, otherwise + ``len(transformers_)==len(transformers)``. + + .. versionadded:: 1.7 + The format of the remaining columns now attempts to match that of the other + transformers: if all columns were provided as column names (`str`), the + remaining columns are stored as column names; if all columns were provided + as mask arrays (`bool`), so are the remaining columns; in all other cases + the remaining columns are stored as indices (`int`). + + named_transformers_ : :class:`~sklearn.utils.Bunch` + Read-only attribute to access any transformer by given name. + Keys are transformer names and values are the fitted transformer + objects. + + sparse_output_ : bool + Boolean flag indicating whether the output of ``transform`` is a + sparse matrix or a dense numpy array, which depends on the output + of the individual transformers and the `sparse_threshold` keyword. + + output_indices_ : dict + A dictionary from each transformer name to a slice, where the slice + corresponds to indices in the transformed output. This is useful to + inspect which transformer is responsible for which transformed + feature(s). + + .. versionadded:: 1.0 + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying transformers expose such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + make_column_transformer : Convenience function for + combining the outputs of multiple transformer objects applied to + column subsets of the original feature space. + make_column_selector : Convenience function for selecting + columns based on datatype or the columns name with a regex pattern. + + Notes + ----- + The order of the columns in the transformed feature matrix follows the + order of how the columns are specified in the `transformers` list. + Columns of the original feature matrix that are not specified are + dropped from the resulting transformed feature matrix, unless specified + in the `passthrough` keyword. Those columns specified with `passthrough` + are added at the right to the output of the transformers. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.compose import ColumnTransformer + >>> from sklearn.preprocessing import Normalizer + >>> ct = ColumnTransformer( + ... [("norm1", Normalizer(norm='l1'), [0, 1]), + ... ("norm2", Normalizer(norm='l1'), slice(2, 4))]) + >>> X = np.array([[0., 1., 2., 2.], + ... [1., 1., 0., 1.]]) + >>> # Normalizer scales each row of X to unit norm. A separate scaling + >>> # is applied for the two first and two last elements of each + >>> # row independently. + >>> ct.fit_transform(X) + array([[0. 
, 1. , 0.5, 0.5], + [0.5, 0.5, 0. , 1. ]]) + + :class:`ColumnTransformer` can be configured with a transformer that requires + a 1d array by setting the column to a string: + + >>> from sklearn.feature_extraction.text import CountVectorizer + >>> from sklearn.preprocessing import MinMaxScaler + >>> import pandas as pd # doctest: +SKIP + >>> X = pd.DataFrame({ + ... "documents": ["First item", "second one here", "Is this the last?"], + ... "width": [3, 4, 5], + ... }) # doctest: +SKIP + >>> # "documents" is a string which configures ColumnTransformer to + >>> # pass the documents column as a 1d array to the CountVectorizer + >>> ct = ColumnTransformer( + ... [("text_preprocess", CountVectorizer(), "documents"), + ... ("num_preprocess", MinMaxScaler(), ["width"])]) + >>> X_trans = ct.fit_transform(X) # doctest: +SKIP + + For a more detailed example of usage, see + :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. + """ + + _parameter_constraints: dict = { + "transformers": [list, Hidden(tuple)], + "remainder": [ + StrOptions({"drop", "passthrough"}), + HasMethods(["fit", "transform"]), + HasMethods(["fit_transform", "transform"]), + ], + "sparse_threshold": [Interval(Real, 0, 1, closed="both")], + "n_jobs": [Integral, None], + "transformer_weights": [dict, None], + "verbose": ["verbose"], + "verbose_feature_names_out": ["boolean", str, callable], + "force_int_remainder_cols": ["boolean", Hidden(StrOptions({"deprecated"}))], + } + + def __init__( + self, + transformers, + *, + remainder="drop", + sparse_threshold=0.3, + n_jobs=None, + transformer_weights=None, + verbose=False, + verbose_feature_names_out=True, + force_int_remainder_cols="deprecated", + ): + self.transformers = transformers + self.remainder = remainder + self.sparse_threshold = sparse_threshold + self.n_jobs = n_jobs + self.transformer_weights = transformer_weights + self.verbose = verbose + self.verbose_feature_names_out = verbose_feature_names_out + self.force_int_remainder_cols = force_int_remainder_cols + + @property + def _transformers(self): + """ + Internal list of transformer only containing the name and + transformers, dropping the columns. + + DO NOT USE: This is for the implementation of get_params via + BaseComposition._get_params which expects lists of tuples of len 2. + + To iterate through the transformers, use ``self._iter`` instead. + """ + try: + return [(name, trans) for name, trans, _ in self.transformers] + except (TypeError, ValueError): + return self.transformers + + @_transformers.setter + def _transformers(self, value): + """DO NOT USE: This is for the implementation of set_params via + BaseComposition._get_params which gives lists of tuples of len 2. + """ + try: + self.transformers = [ + (name, trans, col) + for ((name, trans), (_, _, col)) in zip(value, self.transformers) + ] + except (TypeError, ValueError): + self.transformers = value + + def set_output(self, *, transform=None): + """Set the output container when `"transform"` and `"fit_transform"` are called. + + Calling `set_output` will set the output of all estimators in `transformers` + and `transformers_`. + + Parameters + ---------- + transform : {"default", "pandas", "polars"}, default=None + Configure output of `transform` and `fit_transform`. + + - `"default"`: Default output format of a transformer + - `"pandas"`: DataFrame output + - `"polars"`: Polars output + - `None`: Transform configuration is unchanged + + .. versionadded:: 1.4 + `"polars"` option was added. 
+ + Returns + ------- + self : estimator instance + Estimator instance. + """ + super().set_output(transform=transform) + + transformers = ( + trans + for _, trans, _ in chain( + self.transformers, getattr(self, "transformers_", []) + ) + if trans not in {"passthrough", "drop"} + ) + for trans in transformers: + _safe_set_output(trans, transform=transform) + + if self.remainder not in {"passthrough", "drop"}: + _safe_set_output(self.remainder, transform=transform) + + return self + + def get_params(self, deep=True): + """Get parameters for this estimator. + + Returns the parameters given in the constructor as well as the + estimators contained within the `transformers` of the + `ColumnTransformer`. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + return self._get_params("_transformers", deep=deep) + + def set_params(self, **kwargs): + """Set the parameters of this estimator. + + Valid parameter keys can be listed with ``get_params()``. Note that you + can directly set the parameters of the estimators contained in + `transformers` of `ColumnTransformer`. + + Parameters + ---------- + **kwargs : dict + Estimator parameters. + + Returns + ------- + self : ColumnTransformer + This estimator. + """ + self._set_params("_transformers", **kwargs) + return self + + def _iter(self, fitted, column_as_labels, skip_drop, skip_empty_columns): + """ + Generate (name, trans, columns, weight) tuples. + + + Parameters + ---------- + fitted : bool + If True, use the fitted transformers (``self.transformers_``) to + iterate through transformers, else use the transformers passed by + the user (``self.transformers``). + + column_as_labels : bool + If True, columns are returned as string labels. If False, columns + are returned as they were given by the user. This can only be True + if the ``ColumnTransformer`` is already fitted. + + skip_drop : bool + If True, 'drop' transformers are filtered out. + + skip_empty_columns : bool + If True, transformers with empty selected columns are filtered out. + + Yields + ------ + A generator of tuples containing: + - name : the name of the transformer + - transformer : the transformer object + - columns : the columns for that transformer + - weight : the weight of the transformer + """ + if fitted: + transformers = self.transformers_ + else: + # interleave the validated column specifiers + transformers = [ + (name, trans, column) + for (name, trans, _), column in zip(self.transformers, self._columns) + ] + # add transformer tuple for remainder + if self._remainder[2]: + transformers = chain(transformers, [self._remainder]) + + get_weight = (self.transformer_weights or {}).get + + for name, trans, columns in transformers: + if skip_drop and trans == "drop": + continue + if skip_empty_columns and _is_empty_column_selection(columns): + continue + + if column_as_labels: + # Convert all columns to using their string labels + columns_is_scalar = np.isscalar(columns) + + indices = self._transformer_to_input_indices[name] + columns = self.feature_names_in_[indices] + + if columns_is_scalar: + # selection is done with one dimension + columns = columns[0] + + yield (name, trans, columns, get_weight(name)) + + def _validate_transformers(self): + """Validate names of transformers and the transformers themselves. + + This checks whether given transformers have the required methods, i.e. 
+ `fit` or `fit_transform` and `transform` implemented. + """ + if not self.transformers: + return + + names, transformers, _ = zip(*self.transformers) + + # validate names + self._validate_names(names) + + # validate estimators + for t in transformers: + if t in ("drop", "passthrough"): + continue + if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( + t, "transform" + ): + # Used to validate the transformers in the `transformers` list + raise TypeError( + "All estimators should implement fit and " + "transform, or can be 'drop' or 'passthrough' " + "specifiers. '%s' (type %s) doesn't." % (t, type(t)) + ) + + def _validate_column_callables(self, X): + """ + Converts callable column specifications. + + This stores a dictionary of the form `{step_name: column_indices}` and + calls the `columns` on `X` if `columns` is a callable for a given + transformer. + + The results are then stored in `self._transformer_to_input_indices`. + """ + all_columns = [] + transformer_to_input_indices = {} + for name, _, columns in self.transformers: + if callable(columns): + columns = columns(X) + all_columns.append(columns) + transformer_to_input_indices[name] = _get_column_indices(X, columns) + + self._columns = all_columns + self._transformer_to_input_indices = transformer_to_input_indices + + def _validate_remainder(self, X): + """ + Validates ``remainder`` and defines ``_remainder`` targeting + the remaining columns. + """ + cols = set(chain(*self._transformer_to_input_indices.values())) + remaining = sorted(set(range(self.n_features_in_)) - cols) + self._transformer_to_input_indices["remainder"] = remaining + remainder_cols = self._get_remainder_cols(remaining) + self._remainder = ("remainder", self.remainder, remainder_cols) + + def _get_remainder_cols_dtype(self): + try: + all_dtypes = {_determine_key_type(c) for (*_, c) in self.transformers} + if len(all_dtypes) == 1: + return next(iter(all_dtypes)) + except ValueError: + # _determine_key_type raises a ValueError if some transformer + # columns are Callables + return "int" + return "int" + + def _get_remainder_cols(self, indices): + dtype = self._get_remainder_cols_dtype() + if dtype == "str": + return list(self.feature_names_in_[indices]) + if dtype == "bool": + return [i in indices for i in range(self.n_features_in_)] + return indices + + @property + def named_transformers_(self): + """Access the fitted transformer by name. + + Read-only attribute to access any transformer by given name. + Keys are transformer names and values are the fitted transformer + objects. + """ + # Use Bunch object to improve autocomplete + return Bunch(**{name: trans for name, trans, _ in self.transformers_}) + + def _get_feature_name_out_for_transformer(self, name, trans, feature_names_in): + """Gets feature names of transformer. + + Used in conjunction with self._iter(fitted=True) in get_feature_names_out. + """ + column_indices = self._transformer_to_input_indices[name] + names = feature_names_in[column_indices] + # An actual transformer + if not hasattr(trans, "get_feature_names_out"): + raise AttributeError( + f"Transformer {name} (type {type(trans).__name__}) does " + "not provide get_feature_names_out." + ) + return trans.get_feature_names_out(names) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. 
+ + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + + # List of tuples (name, feature_names_out) + transformer_with_feature_names_out = [] + for name, trans, *_ in self._iter( + fitted=True, + column_as_labels=False, + skip_empty_columns=True, + skip_drop=True, + ): + feature_names_out = self._get_feature_name_out_for_transformer( + name, trans, input_features + ) + if feature_names_out is None: + continue + transformer_with_feature_names_out.append((name, feature_names_out)) + + if not transformer_with_feature_names_out: + # No feature names + return np.array([], dtype=object) + + return self._add_prefix_for_feature_names_out( + transformer_with_feature_names_out + ) + + def _add_prefix_for_feature_names_out(self, transformer_with_feature_names_out): + """Add prefix for feature names out that includes the transformer names. + + Parameters + ---------- + transformer_with_feature_names_out : list of tuples of (str, array-like of str) + The tuple consistent of the transformer's name and its feature names out. + + Returns + ------- + feature_names_out : ndarray of shape (n_features,), dtype=str + Transformed feature names. + """ + feature_names_out_callable = None + if callable(self.verbose_feature_names_out): + feature_names_out_callable = self.verbose_feature_names_out + elif isinstance(self.verbose_feature_names_out, str): + feature_names_out_callable = partial( + _feature_names_out_with_str_format, + str_format=self.verbose_feature_names_out, + ) + elif self.verbose_feature_names_out is True: + feature_names_out_callable = partial( + _feature_names_out_with_str_format, + str_format="{transformer_name}__{feature_name}", + ) + + if feature_names_out_callable is not None: + # Prefix the feature names out with the transformers name + names = list( + chain.from_iterable( + (feature_names_out_callable(name, i) for i in feature_names_out) + for name, feature_names_out in transformer_with_feature_names_out + ) + ) + return np.asarray(names, dtype=object) + + # verbose_feature_names_out is False + # Check that names are all unique without a prefix + feature_names_count = Counter( + chain.from_iterable(s for _, s in transformer_with_feature_names_out) + ) + top_6_overlap = [ + name for name, count in feature_names_count.most_common(6) if count > 1 + ] + top_6_overlap.sort() + if top_6_overlap: + if len(top_6_overlap) == 6: + # There are more than 5 overlapping names, we only show the 5 + # of the feature names + names_repr = str(top_6_overlap[:5])[:-1] + ", ...]" + else: + names_repr = str(top_6_overlap) + raise ValueError( + f"Output feature names: {names_repr} are not unique. Please set " + "verbose_feature_names_out=True to add prefixes to feature names" + ) + + return np.concatenate( + [name for _, name in transformer_with_feature_names_out], + ) + + def _update_fitted_transformers(self, transformers): + """Set self.transformers_ from given transformers. 
+ + Parameters + ---------- + transformers : list of estimators + The fitted estimators as the output of + `self._call_func_on_transformers(func=_fit_transform_one, ...)`. + That function doesn't include 'drop' or transformers for which no + column is selected. 'drop' is kept as is, and for the no-column + transformers the unfitted transformer is put in + `self.transformers_`. + """ + # transformers are fitted; excludes 'drop' cases + fitted_transformers = iter(transformers) + transformers_ = [] + + for name, old, column, _ in self._iter( + fitted=False, + column_as_labels=False, + skip_drop=False, + skip_empty_columns=False, + ): + if old == "drop": + trans = "drop" + elif _is_empty_column_selection(column): + trans = old + else: + trans = next(fitted_transformers) + transformers_.append((name, trans, column)) + + # sanity check that transformers is exhausted + assert not list(fitted_transformers) + self.transformers_ = transformers_ + + def _validate_output(self, result): + """ + Ensure that the output of each transformer is 2D. Otherwise + hstack can raise an error or produce incorrect results. + """ + names = [ + name + for name, _, _, _ in self._iter( + fitted=True, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + ] + for Xs, name in zip(result, names): + if not getattr(Xs, "ndim", 0) == 2 and not hasattr(Xs, "__dataframe__"): + raise ValueError( + "The output of the '{0}' transformer should be 2D (numpy array, " + "scipy sparse array, dataframe).".format(name) + ) + if _get_output_config("transform", self)["dense"] == "pandas": + return + try: + import pandas as pd + except ImportError: + return + for Xs, name in zip(result, names): + if not _is_pandas_df(Xs): + continue + for col_name, dtype in Xs.dtypes.to_dict().items(): + if getattr(dtype, "na_value", None) is not pd.NA: + continue + if pd.NA not in Xs[col_name].values: + continue + class_name = self.__class__.__name__ + raise ValueError( + f"The output of the '{name}' transformer for column" + f" '{col_name}' has dtype {dtype} and uses pandas.NA to" + " represent null values. Storing this output in a numpy array" + " can cause errors in downstream scikit-learn estimators, and" + " inefficiencies. To avoid this problem you can (i)" + " store the output in a pandas DataFrame by using" + f" {class_name}.set_output(transform='pandas') or (ii) modify" + f" the input data or the '{name}' transformer to avoid the" + " presence of pandas.NA (for example by using" + " pandas.DataFrame.astype)." + ) + + def _record_output_indices(self, Xs): + """ + Record which transformer produced which column. + """ + idx = 0 + self.output_indices_ = {} + + for transformer_idx, (name, _, _, _) in enumerate( + self._iter( + fitted=True, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + ): + n_columns = Xs[transformer_idx].shape[1] + self.output_indices_[name] = slice(idx, idx + n_columns) + idx += n_columns + + # `_iter` only generates transformers that have a non empty + # selection. 
Here we set empty slices for transformers that + # generate no output, which are safe for indexing + all_names = [t[0] for t in self.transformers] + ["remainder"] + for name in all_names: + if name not in self.output_indices_: + self.output_indices_[name] = slice(0, 0) + + def _log_message(self, name, idx, total): + if not self.verbose: + return None + return "(%d of %d) Processing %s" % (idx, total, name) + + def _call_func_on_transformers(self, X, y, func, column_as_labels, routed_params): + """ + Private function to fit and/or transform on demand. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + The data to be used in fit and/or transform. + + y : array-like of shape (n_samples,) + Targets. + + func : callable + Function to call, which can be _fit_transform_one or + _transform_one. + + column_as_labels : bool + Used to iterate through transformers. If True, columns are returned + as strings. If False, columns are returned as they were given by + the user. Can be True only if the ``ColumnTransformer`` is already + fitted. + + routed_params : dict + The routed parameters as the output from ``process_routing``. + + Returns + ------- + Return value (transformers and/or transformed X data) depends + on the passed function. + """ + if func is _fit_transform_one: + fitted = False + else: # func is _transform_one + fitted = True + + transformers = list( + self._iter( + fitted=fitted, + column_as_labels=column_as_labels, + skip_drop=True, + skip_empty_columns=True, + ) + ) + try: + jobs = [] + for idx, (name, trans, columns, weight) in enumerate(transformers, start=1): + if func is _fit_transform_one: + if trans == "passthrough": + output_config = _get_output_config("transform", self) + trans = FunctionTransformer( + accept_sparse=True, + check_inverse=False, + feature_names_out="one-to-one", + ).set_output(transform=output_config["dense"]) + + extra_args = dict( + message_clsname="ColumnTransformer", + message=self._log_message(name, idx, len(transformers)), + ) + else: # func is _transform_one + extra_args = {} + jobs.append( + delayed(func)( + transformer=clone(trans) if not fitted else trans, + X=_safe_indexing(X, columns, axis=1), + y=y, + weight=weight, + **extra_args, + params=routed_params[name], + ) + ) + + return Parallel(n_jobs=self.n_jobs)(jobs) + + except ValueError as e: + if "Expected 2D array, got 1D array instead" in str(e): + raise ValueError(_ERR_MSG_1DCOLUMN) from e + else: + raise + + def fit(self, X, y=None, **params): + """Fit all transformers using X. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + Input data, of which specified subsets are used to fit the + transformers. + + y : array-like of shape (n_samples,...), default=None + Targets for supervised learning. + + **params : dict, default=None + Parameters to be passed to the underlying transformers' ``fit`` and + ``transform`` methods. + + You can only pass this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. versionadded:: 1.4 + + Returns + ------- + self : ColumnTransformer + This estimator. 
+ """ + _raise_for_params(params, self, "fit") + # we use fit_transform to make sure to set sparse_output_ (for which we + # need the transformed data) to have consistent output type in predict + self.fit_transform(X, y=y, **params) + return self + + @_fit_context( + # estimators in ColumnTransformer.transformers are not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None, **params): + """Fit all transformers, transform the data and concatenate results. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + Input data, of which specified subsets are used to fit the + transformers. + + y : array-like of shape (n_samples,), default=None + Targets for supervised learning. + + **params : dict, default=None + Parameters to be passed to the underlying transformers' ``fit`` and + ``transform`` methods. + + You can only pass this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. versionadded:: 1.4 + + Returns + ------- + X_t : {array-like, sparse matrix} of \ + shape (n_samples, sum_n_components) + Horizontally stacked results of transformers. sum_n_components is the + sum of n_components (output dimension) over transformers. If + any result is a sparse matrix, everything will be converted to + sparse matrices. + """ + _raise_for_params(params, self, "fit_transform") + _check_feature_names(self, X, reset=True) + + if self.force_int_remainder_cols != "deprecated": + warnings.warn( + "The parameter `force_int_remainder_cols` is deprecated and will be " + "removed in 1.9. It has no effect. Leave it to its default value to " + "avoid this warning.", + FutureWarning, + ) + + X = _check_X(X) + # set n_features_in_ attribute + _check_n_features(self, X, reset=True) + self._validate_transformers() + n_samples = _num_samples(X) + + self._validate_column_callables(X) + self._validate_remainder(X) + + if _routing_enabled(): + routed_params = process_routing(self, "fit_transform", **params) + else: + routed_params = self._get_empty_routing() + + result = self._call_func_on_transformers( + X, + y, + _fit_transform_one, + column_as_labels=False, + routed_params=routed_params, + ) + + if not result: + self._update_fitted_transformers([]) + # All transformers are None + return np.zeros((n_samples, 0)) + + Xs, transformers = zip(*result) + + # determine if concatenated output will be sparse or not + if any(sparse.issparse(X) for X in Xs): + nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs) + total = sum( + X.shape[0] * X.shape[1] if sparse.issparse(X) else X.size for X in Xs + ) + density = nnz / total + self.sparse_output_ = density < self.sparse_threshold + else: + self.sparse_output_ = False + + self._update_fitted_transformers(transformers) + self._validate_output(Xs) + self._record_output_indices(Xs) + + return self._hstack(list(Xs), n_samples=n_samples) + + def transform(self, X, **params): + """Transform X separately by each transformer, concatenate results. + + Parameters + ---------- + X : {array-like, dataframe} of shape (n_samples, n_features) + The data to be transformed by subset. + + **params : dict, default=None + Parameters to be passed to the underlying transformers' ``transform`` + method. + + You can only pass this if metadata routing is enabled, which you + can enable using ``sklearn.set_config(enable_metadata_routing=True)``. + + .. 
versionadded:: 1.4 + + Returns + ------- + X_t : {array-like, sparse matrix} of \ + shape (n_samples, sum_n_components) + Horizontally stacked results of transformers. sum_n_components is the + sum of n_components (output dimension) over transformers. If + any result is a sparse matrix, everything will be converted to + sparse matrices. + """ + _raise_for_params(params, self, "transform") + check_is_fitted(self) + X = _check_X(X) + + # If ColumnTransformer is fit using a dataframe, and now a dataframe is + # passed to be transformed, we select columns by name instead. This + # enables the user to pass X at transform time with extra columns which + # were not present in fit time, and the order of the columns doesn't + # matter. + fit_dataframe_and_transform_dataframe = hasattr(self, "feature_names_in_") and ( + _is_pandas_df(X) or hasattr(X, "__dataframe__") + ) + + n_samples = _num_samples(X) + column_names = _get_feature_names(X) + + if fit_dataframe_and_transform_dataframe: + named_transformers = self.named_transformers_ + # check that all names seen in fit are in transform, unless + # they were dropped + non_dropped_indices = [ + ind + for name, ind in self._transformer_to_input_indices.items() + if name in named_transformers and named_transformers[name] != "drop" + ] + + all_indices = set(chain(*non_dropped_indices)) + all_names = set(self.feature_names_in_[ind] for ind in all_indices) + + diff = all_names - set(column_names) + if diff: + raise ValueError(f"columns are missing: {diff}") + else: + # ndarray was used for fitting or transforming, thus we only + # check that n_features_in_ is consistent + _check_n_features(self, X, reset=False) + + if _routing_enabled(): + routed_params = process_routing(self, "transform", **params) + else: + routed_params = self._get_empty_routing() + + Xs = self._call_func_on_transformers( + X, + None, + _transform_one, + column_as_labels=fit_dataframe_and_transform_dataframe, + routed_params=routed_params, + ) + self._validate_output(Xs) + + if not Xs: + # All transformers are None + return np.zeros((n_samples, 0)) + + return self._hstack(list(Xs), n_samples=n_samples) + + def _hstack(self, Xs, *, n_samples): + """Stacks Xs horizontally. + + This allows subclasses to control the stacking behavior, while reusing + everything else from ColumnTransformer. + + Parameters + ---------- + Xs : list of {array-like, sparse matrix, dataframe} + The container to concatenate. + n_samples : int + The number of samples in the input data to checking the transformation + consistency. + """ + if self.sparse_output_: + try: + # since all columns should be numeric before stacking them + # in a sparse matrix, `check_array` is used for the + # dtype conversion if necessary. + converted_Xs = [ + check_array(X, accept_sparse=True, ensure_all_finite=False) + for X in Xs + ] + except ValueError as e: + raise ValueError( + "For a sparse output, all columns should " + "be a numeric or convertible to a numeric." 
+ ) from e + + return sparse.hstack(converted_Xs).tocsr() + else: + Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs] + adapter = _get_container_adapter("transform", self) + if adapter and all(adapter.is_supported_container(X) for X in Xs): + # rename before stacking as it avoids to error on temporary duplicated + # columns + transformer_names = [ + t[0] + for t in self._iter( + fitted=True, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + ] + feature_names_outs = [X.columns for X in Xs if X.shape[1] != 0] + if self.verbose_feature_names_out: + # `_add_prefix_for_feature_names_out` takes care about raising + # an error if there are duplicated columns. + feature_names_outs = self._add_prefix_for_feature_names_out( + list(zip(transformer_names, feature_names_outs)) + ) + else: + # check for duplicated columns and raise if any + feature_names_outs = list(chain.from_iterable(feature_names_outs)) + feature_names_count = Counter(feature_names_outs) + if any(count > 1 for count in feature_names_count.values()): + duplicated_feature_names = sorted( + name + for name, count in feature_names_count.items() + if count > 1 + ) + err_msg = ( + "Duplicated feature names found before concatenating the" + " outputs of the transformers:" + f" {duplicated_feature_names}.\n" + ) + for transformer_name, X in zip(transformer_names, Xs): + if X.shape[1] == 0: + continue + dup_cols_in_transformer = sorted( + set(X.columns).intersection(duplicated_feature_names) + ) + if len(dup_cols_in_transformer): + err_msg += ( + f"Transformer {transformer_name} has conflicting " + f"columns names: {dup_cols_in_transformer}.\n" + ) + raise ValueError( + err_msg + + "Either make sure that the transformers named above " + "do not generate columns with conflicting names or set " + "verbose_feature_names_out=True to automatically " + "prefix to the output feature names with the name " + "of the transformer to prevent any conflicting " + "names." + ) + + names_idx = 0 + for X in Xs: + if X.shape[1] == 0: + continue + names_out = feature_names_outs[names_idx : names_idx + X.shape[1]] + adapter.rename_columns(X, names_out) + names_idx += X.shape[1] + + output = adapter.hstack(Xs) + output_samples = output.shape[0] + if output_samples != n_samples: + raise ValueError( + "Concatenating DataFrames from the transformer's output lead to" + " an inconsistent number of samples. The output may have Pandas" + " Indexes that do not match, or that transformers are returning" + " number of samples which are not the same as the number input" + " samples." 
+ ) + + return output + + return np.hstack(Xs) + + def _sk_visual_block_(self): + if isinstance(self.remainder, str) and self.remainder == "drop": + transformers = self.transformers + elif hasattr(self, "_remainder"): + remainder_columns = self._remainder[2] + if ( + hasattr(self, "feature_names_in_") + and remainder_columns + and not all(isinstance(col, str) for col in remainder_columns) + ): + remainder_columns = self.feature_names_in_[remainder_columns].tolist() + transformers = chain( + self.transformers, [("remainder", self.remainder, remainder_columns)] + ) + else: + transformers = chain(self.transformers, [("remainder", self.remainder, "")]) + + names, transformers, name_details = zip(*transformers) + return _VisualBlock( + "parallel", transformers, names=names, name_details=name_details + ) + + def __getitem__(self, key): + try: + return self.named_transformers_[key] + except AttributeError as e: + raise TypeError( + "ColumnTransformer is subscriptable after it is fitted" + ) from e + except KeyError as e: + raise KeyError(f"'{key}' is not a valid transformer name") from e + + def _get_empty_routing(self): + """Return empty routing. + + Used while routing can be disabled. + + TODO: Remove when ``set_config(enable_metadata_routing=False)`` is no + more an option. + """ + return Bunch( + **{ + name: Bunch(**{method: {} for method in METHODS}) + for name, step, _, _ in self._iter( + fitted=False, + column_as_labels=False, + skip_drop=True, + skip_empty_columns=True, + ) + } + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + # Here we don't care about which columns are used for which + # transformers, and whether or not a transformer is used at all, which + # might happen if no columns are selected for that transformer. We + # request all metadata requested by all transformers. + transformers = chain(self.transformers, [("remainder", self.remainder, None)]) + for name, step, _ in transformers: + method_mapping = MethodMapping() + if hasattr(step, "fit_transform"): + ( + method_mapping.add(caller="fit", callee="fit_transform").add( + caller="fit_transform", callee="fit_transform" + ) + ) + else: + ( + method_mapping.add(caller="fit", callee="fit") + .add(caller="fit", callee="transform") + .add(caller="fit_transform", callee="fit") + .add(caller="fit_transform", callee="transform") + ) + method_mapping.add(caller="transform", callee="transform") + router.add(method_mapping=method_mapping, **{name: step}) + + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + try: + tags.input_tags.sparse = all( + get_tags(trans).input_tags.sparse + for name, trans, _ in self.transformers + if trans not in {"passthrough", "drop"} + ) + except Exception: + # If `transformers` does not comply with our API (list of tuples) + # then it will fail. In this case, we assume that `sparse` is False + # but the parameter validation will raise an error during `fit`. + pass # pragma: no cover + return tags + + +def _check_X(X): + """Use check_array only when necessary, e.g. 
on lists and other non-array-likes.""" + if ( + (hasattr(X, "__array__") and hasattr(X, "shape")) + or hasattr(X, "__dataframe__") + or sparse.issparse(X) + ): + return X + return check_array(X, ensure_all_finite="allow-nan", dtype=object) + + +def _is_empty_column_selection(column): + """ + Return True if the column selection is empty (empty list or all-False + boolean array). + + """ + if ( + hasattr(column, "dtype") + # Not necessarily a numpy dtype, can be a pandas dtype as well + and isinstance(column.dtype, np.dtype) + and np.issubdtype(column.dtype, np.bool_) + ): + return not column.any() + elif hasattr(column, "__len__"): + return len(column) == 0 or ( + all(isinstance(col, bool) for col in column) and not any(column) + ) + else: + return False + + +def _get_transformer_list(estimators): + """ + Construct (name, trans, column) tuples from list + + """ + transformers, columns = zip(*estimators) + names, _ = zip(*_name_estimators(transformers)) + + transformer_list = list(zip(names, transformers, columns)) + return transformer_list + + +# This function is not validated using validate_params because +# it's just a factory for ColumnTransformer. +def make_column_transformer( + *transformers, + remainder="drop", + sparse_threshold=0.3, + n_jobs=None, + verbose=False, + verbose_feature_names_out=True, + force_int_remainder_cols="deprecated", +): + """Construct a ColumnTransformer from the given transformers. + + This is a shorthand for the ColumnTransformer constructor; it does not + require, and does not permit, naming the transformers. Instead, they will + be given names automatically based on their types. It also does not allow + weighting with ``transformer_weights``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *transformers : tuples + Tuples of the form (transformer, columns) specifying the + transformer objects to be applied to subsets of the data. + + transformer : {'drop', 'passthrough'} or estimator + Estimator must support :term:`fit` and :term:`transform`. + Special-cased strings 'drop' and 'passthrough' are accepted as + well, to indicate to drop the columns or to pass them through + untransformed, respectively. + columns : str, array-like of str, int, array-like of int, slice, \ + array-like of bool or callable + Indexes the data on its second axis. Integers are interpreted as + positional columns, while strings can reference DataFrame columns + by name. A scalar string or int should be used where + ``transformer`` expects X to be a 1d array-like (vector), + otherwise a 2d array will be passed to the transformer. + A callable is passed the input data `X` and can return any of the + above. To select multiple columns by name or dtype, you can use + :obj:`make_column_selector`. + + remainder : {'drop', 'passthrough'} or estimator, default='drop' + By default, only the specified columns in `transformers` are + transformed and combined in the output, and the non-specified + columns are dropped. (default of ``'drop'``). + By specifying ``remainder='passthrough'``, all remaining columns that + were not specified in `transformers` will be automatically passed + through. This subset of columns is concatenated with the output of + the transformers. + By setting ``remainder`` to be an estimator, the remaining + non-specified columns will use the ``remainder`` estimator. The + estimator must support :term:`fit` and :term:`transform`. 
+ + sparse_threshold : float, default=0.3 + If the transformed output consists of a mix of sparse and dense data, + it will be stacked as a sparse matrix if the density is lower than this + value. Use ``sparse_threshold=0`` to always return dense. + When the transformed output consists of all sparse or all dense data, + the stacked result will be sparse or dense, respectively, and this + keyword will be ignored. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + If True, the time elapsed while fitting each transformer will be + printed as it is completed. + + verbose_feature_names_out : bool, default=True + If True, :meth:`ColumnTransformer.get_feature_names_out` will prefix + all feature names with the name of the transformer that generated that + feature. + If False, :meth:`ColumnTransformer.get_feature_names_out` will not + prefix any feature names and will error if feature names are not + unique. + + .. versionadded:: 1.0 + + force_int_remainder_cols : bool, default=True + This parameter has no effect. + + .. note:: + If you do not access the list of columns for the remainder columns + in the :attr:`ColumnTransformer.transformers_` fitted attribute, + you do not need to set this parameter. + + .. versionadded:: 1.5 + + .. versionchanged:: 1.7 + The default value for `force_int_remainder_cols` will change from + `True` to `False` in version 1.7. + + .. deprecated:: 1.7 + `force_int_remainder_cols` is deprecated and will be removed in version 1.9. + + Returns + ------- + ct : ColumnTransformer + Returns a :class:`ColumnTransformer` object. + + See Also + -------- + ColumnTransformer : Class that allows combining the + outputs of multiple transformer objects used on column subsets + of the data into a single feature space. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder + >>> from sklearn.compose import make_column_transformer + >>> make_column_transformer( + ... (StandardScaler(), ['numerical_column']), + ... (OneHotEncoder(), ['categorical_column'])) + ColumnTransformer(transformers=[('standardscaler', StandardScaler(...), + ['numerical_column']), + ('onehotencoder', OneHotEncoder(...), + ['categorical_column'])]) + """ + # transformer_weights keyword is not passed through because the user + # would need to know the automatically generated names of the transformers + transformer_list = _get_transformer_list(transformers) + return ColumnTransformer( + transformer_list, + n_jobs=n_jobs, + remainder=remainder, + sparse_threshold=sparse_threshold, + verbose=verbose, + verbose_feature_names_out=verbose_feature_names_out, + force_int_remainder_cols=force_int_remainder_cols, + ) + + +class make_column_selector: + """Create a callable to select columns to be used with + :class:`ColumnTransformer`. + + :func:`make_column_selector` can select columns based on datatype or the + columns name with a regex. When using multiple selection criteria, **all** + criteria must match for a column to be selected. + + For an example of how to use :func:`make_column_selector` within a + :class:`ColumnTransformer` to select columns based on data type (i.e. + `dtype`), refer to + :ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. 
+ + Parameters + ---------- + pattern : str, default=None + Name of columns containing this regex pattern will be included. If + None, column selection will not be selected based on pattern. + + dtype_include : column dtype or list of column dtypes, default=None + A selection of dtypes to include. For more details, see + :meth:`pandas.DataFrame.select_dtypes`. + + dtype_exclude : column dtype or list of column dtypes, default=None + A selection of dtypes to exclude. For more details, see + :meth:`pandas.DataFrame.select_dtypes`. + + Returns + ------- + selector : callable + Callable for column selection to be used by a + :class:`ColumnTransformer`. + + See Also + -------- + ColumnTransformer : Class that allows combining the + outputs of multiple transformer objects used on column subsets + of the data into a single feature space. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder + >>> from sklearn.compose import make_column_transformer + >>> from sklearn.compose import make_column_selector + >>> import numpy as np + >>> import pandas as pd # doctest: +SKIP + >>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'], + ... 'rating': [5, 3, 4, 5]}) # doctest: +SKIP + >>> ct = make_column_transformer( + ... (StandardScaler(), + ... make_column_selector(dtype_include=np.number)), # rating + ... (OneHotEncoder(), + ... make_column_selector(dtype_include=object))) # city + >>> ct.fit_transform(X) # doctest: +SKIP + array([[ 0.90453403, 1. , 0. , 0. ], + [-1.50755672, 1. , 0. , 0. ], + [-0.30151134, 0. , 1. , 0. ], + [ 0.90453403, 0. , 0. , 1. ]]) + """ + + def __init__(self, pattern=None, *, dtype_include=None, dtype_exclude=None): + self.pattern = pattern + self.dtype_include = dtype_include + self.dtype_exclude = dtype_exclude + + def __call__(self, df): + """Callable for column selection to be used by a + :class:`ColumnTransformer`. + + Parameters + ---------- + df : dataframe of shape (n_features, n_samples) + DataFrame to select columns from. 
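+
+        Returns
+        -------
+        cols : list of str
+            Names of the selected columns (the list produced by
+            ``cols.tolist()`` in the method body below).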
+ """ + if not hasattr(df, "iloc"): + raise ValueError( + "make_column_selector can only be applied to pandas dataframes" + ) + df_row = df.iloc[:1] + if self.dtype_include is not None or self.dtype_exclude is not None: + df_row = df_row.select_dtypes( + include=self.dtype_include, exclude=self.dtype_exclude + ) + cols = df_row.columns + if self.pattern is not None: + cols = cols[cols.str.contains(self.pattern, regex=True)] + return cols.tolist() + + +def _feature_names_out_with_str_format( + transformer_name: str, feature_name: str, str_format: str +) -> str: + return str_format.format( + transformer_name=transformer_name, feature_name=feature_name + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/_target.py b/.venv/lib/python3.12/site-packages/sklearn/compose/_target.py new file mode 100644 index 0000000000000000000000000000000000000000..7f713767b30cb8ce0cb5724f2252e427df05a788 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/_target.py @@ -0,0 +1,397 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..base import BaseEstimator, RegressorMixin, _fit_context, clone +from ..exceptions import NotFittedError +from ..linear_model import LinearRegression +from ..preprocessing import FunctionTransformer +from ..utils import Bunch, _safe_indexing, check_array +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import HasMethods +from ..utils._tags import get_tags +from ..utils.validation import check_is_fitted + +__all__ = ["TransformedTargetRegressor"] + + +class TransformedTargetRegressor(RegressorMixin, BaseEstimator): + """Meta-estimator to regress on a transformed target. + + Useful for applying a non-linear transformation to the target `y` in + regression problems. This transformation can be given as a Transformer + such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a + function and its inverse such as `np.log` and `np.exp`. + + The computation during :meth:`fit` is:: + + regressor.fit(X, func(y)) + + or:: + + regressor.fit(X, transformer.transform(y)) + + The computation during :meth:`predict` is:: + + inverse_func(regressor.predict(X)) + + or:: + + transformer.inverse_transform(regressor.predict(X)) + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + regressor : object, default=None + Regressor object such as derived from + :class:`~sklearn.base.RegressorMixin`. This regressor will + automatically be cloned each time prior to fitting. If `regressor is + None`, :class:`~sklearn.linear_model.LinearRegression` is created and used. + + transformer : object, default=None + Estimator object such as derived from + :class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time + as `func` and `inverse_func`. If `transformer is None` as well as + `func` and `inverse_func`, the transformer will be an identity + transformer. Note that the transformer will be cloned during fitting. + Also, the transformer is restricting `y` to be a numpy array. + + func : function, default=None + Function to apply to `y` before passing to :meth:`fit`. Cannot be set + at the same time as `transformer`. If `func is None`, the function used will be + the identity function. If `func` is set, `inverse_func` also needs to be + provided. The function needs to return a 2-dimensional array. 
+ + inverse_func : function, default=None + Function to apply to the prediction of the regressor. Cannot be set at + the same time as `transformer`. The inverse function is used to return + predictions to the same space of the original training labels. If + `inverse_func` is set, `func` also needs to be provided. The inverse + function needs to return a 2-dimensional array. + + check_inverse : bool, default=True + Whether to check that `transform` followed by `inverse_transform` + or `func` followed by `inverse_func` leads to the original targets. + + Attributes + ---------- + regressor_ : object + Fitted regressor. + + transformer_ : object + Transformer used in :meth:`fit` and :meth:`predict`. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying regressor exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.preprocessing.FunctionTransformer : Construct a transformer from an + arbitrary callable. + + Notes + ----- + Internally, the target `y` is always converted into a 2-dimensional array + to be used by scikit-learn transformers. At the time of prediction, the + output will be reshaped to a have the same number of dimensions as `y`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.compose import TransformedTargetRegressor + >>> tt = TransformedTargetRegressor(regressor=LinearRegression(), + ... func=np.log, inverse_func=np.exp) + >>> X = np.arange(4).reshape(-1, 1) + >>> y = np.exp(2 * X).ravel() + >>> tt.fit(X, y) + TransformedTargetRegressor(...) + >>> tt.score(X, y) + 1.0 + >>> tt.regressor_.coef_ + array([2.]) + + For a more detailed example use case refer to + :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py`. + """ + + _parameter_constraints: dict = { + "regressor": [HasMethods(["fit", "predict"]), None], + "transformer": [HasMethods("transform"), None], + "func": [callable, None], + "inverse_func": [callable, None], + "check_inverse": ["boolean"], + } + + def __init__( + self, + regressor=None, + *, + transformer=None, + func=None, + inverse_func=None, + check_inverse=True, + ): + self.regressor = regressor + self.transformer = transformer + self.func = func + self.inverse_func = inverse_func + self.check_inverse = check_inverse + + def _fit_transformer(self, y): + """Check transformer and fit transformer. + + Create the default transformer, fit it and make additional inverse + check on a subset (optional). + + """ + if self.transformer is not None and ( + self.func is not None or self.inverse_func is not None + ): + raise ValueError( + "'transformer' and functions 'func'/'inverse_func' cannot both be set." + ) + elif self.transformer is not None: + self.transformer_ = clone(self.transformer) + else: + if (self.func is not None and self.inverse_func is None) or ( + self.func is None and self.inverse_func is not None + ): + lacking_param, existing_param = ( + ("func", "inverse_func") + if self.func is None + else ("inverse_func", "func") + ) + raise ValueError( + f"When '{existing_param}' is provided, '{lacking_param}' must also" + f" be provided. If {lacking_param} is supposed to be the default," + " you need to explicitly pass it the identity function." 
+ ) + self.transformer_ = FunctionTransformer( + func=self.func, + inverse_func=self.inverse_func, + validate=True, + check_inverse=self.check_inverse, + ) + # We are transforming the target here and not the features, so we set the + # output of FunctionTransformer() to be a numpy array (default) and to not + # depend on the global configuration: + self.transformer_.set_output(transform="default") + # XXX: sample_weight is not currently passed to the + # transformer. However, if transformer starts using sample_weight, the + # code should be modified accordingly. At the time to consider the + # sample_prop feature, it is also a good use case to be considered. + self.transformer_.fit(y) + if self.check_inverse: + idx_selected = slice(None, None, max(1, y.shape[0] // 10)) + y_sel = _safe_indexing(y, idx_selected) + y_sel_t = self.transformer_.transform(y_sel) + if not np.allclose(y_sel, self.transformer_.inverse_transform(y_sel_t)): + warnings.warn( + ( + "The provided functions or transformer are" + " not strictly inverse of each other. If" + " you are sure you want to proceed regardless" + ", set 'check_inverse=False'" + ), + UserWarning, + ) + + @_fit_context( + # TransformedTargetRegressor.regressor/transformer are not validated yet. + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `fit` method of the underlying regressor. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the underlying regressor. + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + if y is None: + raise ValueError( + f"This {self.__class__.__name__} estimator " + "requires y to be passed, but the target y is None." + ) + y = check_array( + y, + input_name="y", + accept_sparse=False, + ensure_all_finite=True, + ensure_2d=False, + dtype="numeric", + allow_nd=True, + ) + + # store the number of dimension of the target to predict an array of + # similar shape at predict + self._training_dim = y.ndim + + # transformers are designed to modify X which is 2d dimensional, we + # need to modify y accordingly. + if y.ndim == 1: + y_2d = y.reshape(-1, 1) + else: + y_2d = y + self._fit_transformer(y_2d) + + # transform y and convert back to 1d array if needed + y_trans = self.transformer_.transform(y_2d) + # FIXME: a FunctionTransformer can return a 1D array even when validate + # is set to True. Therefore, we need to check the number of dimension + # first. 
+ if y_trans.ndim == 2 and y_trans.shape[1] == 1 and self._training_dim == 1: + y_trans = y_trans.squeeze(axis=1) + + self.regressor_ = self._get_regressor(get_clone=True) + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch(regressor=Bunch(fit=fit_params)) + + self.regressor_.fit(X, y_trans, **routed_params.regressor.fit) + + if hasattr(self.regressor_, "feature_names_in_"): + self.feature_names_in_ = self.regressor_.feature_names_in_ + + return self + + def predict(self, X, **predict_params): + """Predict using the base regressor, applying inverse. + + The regressor is used to predict and the `inverse_func` or + `inverse_transform` is applied before returning the prediction. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + **predict_params : dict of str -> object + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `predict` method of the underlying regressor. + + - If `enable_metadata_routing=True`: Parameters safely routed to the + `predict` method of the underlying regressor. + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + y_hat : ndarray of shape (n_samples,) + Predicted values. + """ + check_is_fitted(self) + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + routed_params = Bunch(regressor=Bunch(predict=predict_params)) + + pred = self.regressor_.predict(X, **routed_params.regressor.predict) + if pred.ndim == 1: + pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1)) + else: + pred_trans = self.transformer_.inverse_transform(pred) + if ( + self._training_dim == 1 + and pred_trans.ndim == 2 + and pred_trans.shape[1] == 1 + ): + pred_trans = pred_trans.squeeze(axis=1) + + return pred_trans + + def __sklearn_tags__(self): + regressor = self._get_regressor() + tags = super().__sklearn_tags__() + tags.regressor_tags.poor_score = True + tags.input_tags.sparse = get_tags(regressor).input_tags.sparse + tags.target_tags.multi_output = get_tags(regressor).target_tags.multi_output + return tags + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`.""" + # For consistency with other estimators we raise a AttributeError so + # that hasattr() returns False the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.regressor_.n_features_in_ + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + regressor=self._get_regressor(), + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="predict", callee="predict"), + ) + return router + + def _get_regressor(self, get_clone=False): + if self.regressor is None: + return LinearRegression() + + return clone(self.regressor) if get_clone else self.regressor diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_column_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_column_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c7c69c657f2eab27e71df599d7507be8effe73cc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_column_transformer.py @@ -0,0 +1,2804 @@ +""" +Test the ColumnTransformer. +""" + +import pickle +import re +import warnings + +import joblib +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import sparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.compose import ( + ColumnTransformer, + make_column_selector, + make_column_transformer, +) +from sklearn.exceptions import NotFittedError +from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_selection import VarianceThreshold +from sklearn.preprocessing import ( + FunctionTransformer, + Normalizer, + OneHotEncoder, + StandardScaler, +) +from sklearn.tests.metadata_routing_common import ( + ConsumingTransformer, + _Registry, + check_recorded_metadata, +) +from sklearn.utils._indexing import _safe_indexing +from sklearn.utils._testing import ( + _convert_container, + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS, parse_version + + +class Trans(TransformerMixin, BaseEstimator): + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + # 1D Series -> 2D DataFrame + if hasattr(X, "to_frame"): + return X.to_frame() + # 1D array -> 2D array + if getattr(X, "ndim", 2) == 1: + return np.atleast_2d(X).T + return X + + +class DoubleTrans(BaseEstimator): + def fit(self, X, y=None): + return self + + def transform(self, X): + return 2 * X + + +class SparseMatrixTrans(BaseEstimator): + def __init__(self, csr_container): + self.csr_container = csr_container + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + n_samples = len(X) + return self.csr_container(sparse.eye(n_samples, n_samples)) + + +class TransNo2D(BaseEstimator): + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + return X + + +class TransRaise(BaseEstimator): + def fit(self, X, y=None): + raise ValueError("specific message") + + def transform(self, X, y=None): + raise ValueError("specific message") + + +def test_column_transformer(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + X_res_first1D = np.array([0, 1, 2]) + X_res_second1D = np.array([2, 4, 6]) + X_res_first = X_res_first1D.reshape(-1, 1) + X_res_both = X_array + + cases = [ + # single column 1D / 2D + (0, X_res_first), + ([0], X_res_first), + # list-like + ([0, 1], X_res_both), + (np.array([0, 1]), X_res_both), + # slice + 
(slice(0, 1), X_res_first), + (slice(0, 2), X_res_both), + # boolean mask + (np.array([True, False]), X_res_first), + ([True, False], X_res_first), + (np.array([True, True]), X_res_both), + ([True, True], X_res_both), + ] + + for selection, res in cases: + ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop") + assert_array_equal(ct.fit_transform(X_array), res) + assert_array_equal(ct.fit(X_array).transform(X_array), res) + + # callable that returns any of the allowed specifiers + ct = ColumnTransformer( + [("trans", Trans(), lambda x: selection)], remainder="drop" + ) + assert_array_equal(ct.fit_transform(X_array), res) + assert_array_equal(ct.fit(X_array).transform(X_array), res) + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + + # test with transformer_weights + transformer_weights = {"trans1": 0.1, "trans2": 10} + both = ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], + transformer_weights=transformer_weights, + ) + res = np.vstack( + [ + transformer_weights["trans1"] * X_res_first1D, + transformer_weights["trans2"] * X_res_second1D, + ] + ).T + assert_array_equal(both.fit_transform(X_array), res) + assert_array_equal(both.fit(X_array).transform(X_array), res) + assert len(both.transformers_) == 2 + + both = ColumnTransformer( + [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1} + ) + assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both) + assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * X_res_both) + assert len(both.transformers_) == 1 + + +def test_column_transformer_tuple_transformers_parameter(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + transformers = [("trans1", Trans(), [0]), ("trans2", Trans(), [1])] + + ct_with_list = ColumnTransformer(transformers) + ct_with_tuple = ColumnTransformer(tuple(transformers)) + + assert_array_equal( + ct_with_list.fit_transform(X_array), ct_with_tuple.fit_transform(X_array) + ) + assert_array_equal( + ct_with_list.fit(X_array).transform(X_array), + ct_with_tuple.fit(X_array).transform(X_array), + ) + + +@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"]) +def test_column_transformer_dataframe(constructor_name): + if constructor_name == "dataframe": + dataframe_lib = pytest.importorskip("pandas") + else: + dataframe_lib = pytest.importorskip(constructor_name) + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = _convert_container( + X_array, constructor_name, columns_name=["first", "second"] + ) + + X_res_first = np.array([0, 1, 2]).reshape(-1, 1) + X_res_both = X_array + + cases = [ + # String keys: label based + # list + (["first"], X_res_first), + (["first", "second"], X_res_both), + # slice + (slice("first", "second"), X_res_both), + # int keys: positional + # list + ([0], X_res_first), + ([0, 1], X_res_both), + (np.array([0, 1]), X_res_both), + # slice + (slice(0, 1), X_res_first), + (slice(0, 2), X_res_both), + # boolean mask + (np.array([True, False]), X_res_first), + ([True, False], X_res_first), + ] + if constructor_name == "dataframe": + # Scalars are only supported for pandas dataframes. 
+ cases.extend( + [ + # scalar + (0, X_res_first), + ("first", X_res_first), + ( + dataframe_lib.Series([True, False], index=["first", "second"]), + X_res_first, + ), + ] + ) + + for selection, res in cases: + ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop") + assert_array_equal(ct.fit_transform(X_df), res) + assert_array_equal(ct.fit(X_df).transform(X_df), res) + + # callable that returns any of the allowed specifiers + ct = ColumnTransformer( + [("trans", Trans(), lambda X: selection)], remainder="drop" + ) + assert_array_equal(ct.fit_transform(X_df), res) + assert_array_equal(ct.fit(X_df).transform(X_df), res) + + ct = ColumnTransformer( + [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])] + ) + assert_array_equal(ct.fit_transform(X_df), X_res_both) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + assert_array_equal(ct.fit_transform(X_df), X_res_both) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + # test with transformer_weights + transformer_weights = {"trans1": 0.1, "trans2": 10} + both = ColumnTransformer( + [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])], + transformer_weights=transformer_weights, + ) + res = np.vstack( + [ + transformer_weights["trans1"] * X_df["first"], + transformer_weights["trans2"] * X_df["second"], + ] + ).T + assert_array_equal(both.fit_transform(X_df), res) + assert_array_equal(both.fit(X_df).transform(X_df), res) + assert len(both.transformers_) == 2 + assert both.transformers_[-1][0] != "remainder" + + # test multiple columns + both = ColumnTransformer( + [("trans", Trans(), ["first", "second"])], transformer_weights={"trans": 0.1} + ) + assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) + assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) + assert len(both.transformers_) == 1 + assert both.transformers_[-1][0] != "remainder" + + both = ColumnTransformer( + [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1} + ) + assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) + assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) + assert len(both.transformers_) == 1 + assert both.transformers_[-1][0] != "remainder" + + # ensure pandas object is passed through + + class TransAssert(BaseEstimator): + def __init__(self, expected_type_transform): + self.expected_type_transform = expected_type_transform + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + assert isinstance(X, self.expected_type_transform) + if isinstance(X, dataframe_lib.Series): + X = X.to_frame() + return X + + ct = ColumnTransformer( + [ + ( + "trans", + TransAssert(expected_type_transform=dataframe_lib.DataFrame), + ["first", "second"], + ) + ] + ) + ct.fit_transform(X_df) + + if constructor_name == "dataframe": + # DataFrame protocol does not have 1d columns, so we only test on Pandas + # dataframes. 
+ ct = ColumnTransformer( + [ + ( + "trans", + TransAssert(expected_type_transform=dataframe_lib.Series), + "first", + ) + ], + remainder="drop", + ) + ct.fit_transform(X_df) + + # Only test on pandas because the dataframe protocol requires string column + # names + # integer column spec + integer column names -> still use positional + X_df2 = X_df.copy() + X_df2.columns = [1, 0] + ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop") + assert_array_equal(ct.fit_transform(X_df2), X_res_first) + assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first) + + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert ct.transformers_[-1][1] == "drop" + assert_array_equal(ct.transformers_[-1][2], [1]) + + +@pytest.mark.parametrize("pandas", [True, False], ids=["pandas", "numpy"]) +@pytest.mark.parametrize( + "column_selection", + [[], np.array([False, False]), [False, False]], + ids=["list", "bool", "bool_int"], +) +@pytest.mark.parametrize("callable_column", [False, True]) +def test_column_transformer_empty_columns(pandas, column_selection, callable_column): + # test case that ensures that the column transformer does also work when + # a given transformer doesn't have any columns to work on + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_both = X_array + + if pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X_array, columns=["first", "second"]) + else: + X = X_array + + if callable_column: + column = lambda X: column_selection + else: + column = column_selection + + ct = ColumnTransformer( + [("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), column)] + ) + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) + assert len(ct.transformers_) == 2 + assert isinstance(ct.transformers_[1][1], TransRaise) + + ct = ColumnTransformer( + [("trans1", TransRaise(), column), ("trans2", Trans(), [0, 1])] + ) + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) + assert len(ct.transformers_) == 2 + assert isinstance(ct.transformers_[0][1], TransRaise) + + ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="passthrough") + assert_array_equal(ct.fit_transform(X), X_res_both) + assert_array_equal(ct.fit(X).transform(X), X_res_both) + assert len(ct.transformers_) == 2 # including remainder + assert isinstance(ct.transformers_[0][1], TransRaise) + + fixture = np.array([[], [], []]) + ct = ColumnTransformer([("trans", TransRaise(), column)], remainder="drop") + assert_array_equal(ct.fit_transform(X), fixture) + assert_array_equal(ct.fit(X).transform(X), fixture) + assert len(ct.transformers_) == 2 # including remainder + assert isinstance(ct.transformers_[0][1], TransRaise) + + +def test_column_transformer_output_indices(): + # Checks for the output_indices_ attribute + X_array = np.arange(6).reshape(3, 2) + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == { + "trans1": slice(0, 1), + "trans2": slice(1, 2), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]]) + + # test with transformer_weights and multiple columns + ct = ColumnTransformer( + [("trans", Trans(), [0, 1])], transformer_weights={"trans": 0.1} + ) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == 
{"trans": slice(0, 2), "remainder": slice(0, 0)} + assert_array_equal(X_trans[:, [0, 1]], X_trans[:, ct.output_indices_["trans"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + # test case that ensures that the attribute does also work when + # a given transformer doesn't have any columns to work on + ct = ColumnTransformer([("trans1", Trans(), [0, 1]), ("trans2", TransRaise(), [])]) + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == { + "trans1": slice(0, 2), + "trans2": slice(0, 0), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0, 1]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["trans2"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + ct = ColumnTransformer([("trans", TransRaise(), [])], remainder="passthrough") + X_trans = ct.fit_transform(X_array) + assert ct.output_indices_ == {"trans": slice(0, 0), "remainder": slice(0, 2)} + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["trans"]]) + assert_array_equal(X_trans[:, [0, 1]], X_trans[:, ct.output_indices_["remainder"]]) + + +def test_column_transformer_output_indices_df(): + # Checks for the output_indices_ attribute with data frames + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["first", "second"]) + + ct = ColumnTransformer( + [("trans1", Trans(), ["first"]), ("trans2", Trans(), ["second"])] + ) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ == { + "trans1": slice(0, 1), + "trans2": slice(1, 2), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", Trans(), [1])]) + X_trans = ct.fit_transform(X_df) + assert ct.output_indices_ == { + "trans1": slice(0, 1), + "trans2": slice(1, 2), + "remainder": slice(0, 0), + } + assert_array_equal(X_trans[:, [0]], X_trans[:, ct.output_indices_["trans1"]]) + assert_array_equal(X_trans[:, [1]], X_trans[:, ct.output_indices_["trans2"]]) + assert_array_equal(X_trans[:, []], X_trans[:, ct.output_indices_["remainder"]]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_sparse_array(csr_container): + X_sparse = csr_container(sparse.eye(3, 2)) + + # no distinction between 1D and 2D + X_res_first = X_sparse[:, [0]] + X_res_both = X_sparse + + for col in [(0,), [0], slice(0, 1)]: + for remainder, res in [("drop", X_res_first), ("passthrough", X_res_both)]: + ct = ColumnTransformer( + [("trans", Trans(), col)], remainder=remainder, sparse_threshold=0.8 + ) + assert sparse.issparse(ct.fit_transform(X_sparse)) + assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res) + assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse), res) + + for col in [[0, 1], slice(0, 2)]: + ct = ColumnTransformer([("trans", Trans(), col)], sparse_threshold=0.8) + assert sparse.issparse(ct.fit_transform(X_sparse)) + assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both) + assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse), X_res_both) + + +def test_column_transformer_list(): + X_list = [[1, float("nan"), "a"], [0, 0, "b"]] + expected_result = np.array( + [ + [1, float("nan"), 1, 0], + [-1, 0, 0, 1], + ] + ) 
+ + ct = ColumnTransformer( + [ + ("numerical", StandardScaler(), [0, 1]), + ("categorical", OneHotEncoder(), [2]), + ] + ) + + assert_array_equal(ct.fit_transform(X_list), expected_result) + assert_array_equal(ct.fit(X_list).transform(X_list), expected_result) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_sparse_stacking(csr_container): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + col_trans = ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)], + sparse_threshold=0.8, + ) + col_trans.fit(X_array) + X_trans = col_trans.transform(X_array) + assert sparse.issparse(X_trans) + assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1) + assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0])) + assert len(col_trans.transformers_) == 2 + assert col_trans.transformers_[-1][0] != "remainder" + + col_trans = ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", SparseMatrixTrans(csr_container), 1)], + sparse_threshold=0.1, + ) + col_trans.fit(X_array) + X_trans = col_trans.transform(X_array) + assert not sparse.issparse(X_trans) + assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1) + assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0])) + + +def test_column_transformer_mixed_cols_sparse(): + df = np.array([["a", 1, True], ["b", 2, False]], dtype="O") + + ct = make_column_transformer( + (OneHotEncoder(), [0]), ("passthrough", [1, 2]), sparse_threshold=1.0 + ) + + # this shouldn't fail, since boolean can be coerced into a numeric + # See: https://github.com/scikit-learn/scikit-learn/issues/11912 + X_trans = ct.fit_transform(df) + assert X_trans.format == "csr" + assert_array_equal(X_trans.toarray(), np.array([[1, 0, 1, 1], [0, 1, 2, 0]])) + + ct = make_column_transformer( + (OneHotEncoder(), [0]), ("passthrough", [0]), sparse_threshold=1.0 + ) + with pytest.raises(ValueError, match="For a sparse output, all columns should"): + # this fails since strings `a` and `b` cannot be + # coerced into a numeric. 
+ ct.fit_transform(df) + + +def test_column_transformer_sparse_threshold(): + X_array = np.array([["a", "b"], ["A", "B"]], dtype=object).T + # above data has sparsity of 4 / 8 = 0.5 + + # apply threshold even if all sparse + col_trans = ColumnTransformer( + [("trans1", OneHotEncoder(), [0]), ("trans2", OneHotEncoder(), [1])], + sparse_threshold=0.2, + ) + res = col_trans.fit_transform(X_array) + assert not sparse.issparse(res) + assert not col_trans.sparse_output_ + + # mixed -> sparsity of (4 + 2) / 8 = 0.75 + for thres in [0.75001, 1]: + col_trans = ColumnTransformer( + [ + ("trans1", OneHotEncoder(sparse_output=True), [0]), + ("trans2", OneHotEncoder(sparse_output=False), [1]), + ], + sparse_threshold=thres, + ) + res = col_trans.fit_transform(X_array) + assert sparse.issparse(res) + assert col_trans.sparse_output_ + + for thres in [0.75, 0]: + col_trans = ColumnTransformer( + [ + ("trans1", OneHotEncoder(sparse_output=True), [0]), + ("trans2", OneHotEncoder(sparse_output=False), [1]), + ], + sparse_threshold=thres, + ) + res = col_trans.fit_transform(X_array) + assert not sparse.issparse(res) + assert not col_trans.sparse_output_ + + # if nothing is sparse -> no sparse + for thres in [0.33, 0, 1]: + col_trans = ColumnTransformer( + [ + ("trans1", OneHotEncoder(sparse_output=False), [0]), + ("trans2", OneHotEncoder(sparse_output=False), [1]), + ], + sparse_threshold=thres, + ) + res = col_trans.fit_transform(X_array) + assert not sparse.issparse(res) + assert not col_trans.sparse_output_ + + +def test_column_transformer_error_msg_1D(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + + col_trans = ColumnTransformer([("trans", StandardScaler(), 0)]) + msg = "1D data passed to a transformer" + with pytest.raises(ValueError, match=msg): + col_trans.fit(X_array) + + with pytest.raises(ValueError, match=msg): + col_trans.fit_transform(X_array) + + col_trans = ColumnTransformer([("trans", TransRaise(), 0)]) + for func in [col_trans.fit, col_trans.fit_transform]: + with pytest.raises(ValueError, match="specific message"): + func(X_array) + + +def test_2D_transformer_output(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + # if one transformer is dropped, test that name is still correct + ct = ColumnTransformer([("trans1", "drop", 0), ("trans2", TransNo2D(), 1)]) + + msg = "the 'trans2' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_array) + # because fit is also doing transform, this raises already on fit + with pytest.raises(ValueError, match=msg): + ct.fit(X_array) + + +def test_2D_transformer_output_pandas(): + pd = pytest.importorskip("pandas") + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = pd.DataFrame(X_array, columns=["col1", "col2"]) + + # if one transformer is dropped, test that name is still correct + ct = ColumnTransformer([("trans1", TransNo2D(), "col1")]) + msg = "the 'trans1' transformer should be 2D" + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X_df) + # because fit is also doing transform, this raises already on fit + with pytest.raises(ValueError, match=msg): + ct.fit(X_df) + + +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +def test_column_transformer_invalid_columns(remainder): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + # general invalid + for col in [1.5, ["string", 1], slice(1, "s"), np.array([1.0])]: + ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder) + with pytest.raises(ValueError, match="No valid specification"): + ct.fit(X_array) + + 
# invalid for arrays + for col in ["string", ["string", "other"], slice("a", "b")]: + ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder) + with pytest.raises(ValueError, match="Specifying the columns"): + ct.fit(X_array) + + # transformed n_features does not match fitted n_features + col = [0, 1] + ct = ColumnTransformer([("trans", Trans(), col)], remainder=remainder) + ct.fit(X_array) + X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T + msg = "X has 3 features, but ColumnTransformer is expecting 2 features as input." + with pytest.raises(ValueError, match=msg): + ct.transform(X_array_more) + X_array_fewer = np.array( + [ + [0, 1, 2], + ] + ).T + err_msg = ( + "X has 1 features, but ColumnTransformer is expecting 2 features as input." + ) + with pytest.raises(ValueError, match=err_msg): + ct.transform(X_array_fewer) + + +def test_column_transformer_invalid_transformer(): + class NoTrans(BaseEstimator): + def fit(self, X, y=None): + return self + + def predict(self, X): + return X + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + ct = ColumnTransformer([("trans", NoTrans(), [0])]) + msg = "All estimators should implement fit and transform" + with pytest.raises(TypeError, match=msg): + ct.fit(X_array) + + +def test_make_column_transformer(): + scaler = StandardScaler() + norm = Normalizer() + ct = make_column_transformer((scaler, "first"), (norm, ["second"])) + names, transformers, columns = zip(*ct.transformers) + assert names == ("standardscaler", "normalizer") + assert transformers == (scaler, norm) + assert columns == ("first", ["second"]) + + +def test_make_column_transformer_pandas(): + pd = pytest.importorskip("pandas") + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = pd.DataFrame(X_array, columns=["first", "second"]) + norm = Normalizer() + ct1 = ColumnTransformer([("norm", Normalizer(), X_df.columns)]) + ct2 = make_column_transformer((norm, X_df.columns)) + assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df)) + + +def test_make_column_transformer_kwargs(): + scaler = StandardScaler() + norm = Normalizer() + ct = make_column_transformer( + (scaler, "first"), + (norm, ["second"]), + n_jobs=3, + remainder="drop", + sparse_threshold=0.5, + ) + assert ( + ct.transformers + == make_column_transformer((scaler, "first"), (norm, ["second"])).transformers + ) + assert ct.n_jobs == 3 + assert ct.remainder == "drop" + assert ct.sparse_threshold == 0.5 + # invalid keyword parameters should raise an error message + msg = re.escape( + "make_column_transformer() got an unexpected " + "keyword argument 'transformer_weights'" + ) + with pytest.raises(TypeError, match=msg): + make_column_transformer( + (scaler, "first"), + (norm, ["second"]), + transformer_weights={"pca": 10, "Transf": 1}, + ) + + +def test_make_column_transformer_remainder_transformer(): + scaler = StandardScaler() + norm = Normalizer() + remainder = StandardScaler() + ct = make_column_transformer( + (scaler, "first"), (norm, ["second"]), remainder=remainder + ) + assert ct.remainder == remainder + + +def test_column_transformer_get_set_params(): + ct = ColumnTransformer( + [("trans1", StandardScaler(), [0]), ("trans2", StandardScaler(), [1])] + ) + + exp = { + "n_jobs": None, + "remainder": "drop", + "sparse_threshold": 0.3, + "trans1": ct.transformers[0][1], + "trans1__copy": True, + "trans1__with_mean": True, + "trans1__with_std": True, + "trans2": ct.transformers[1][1], + "trans2__copy": True, + "trans2__with_mean": True, + "trans2__with_std": True, + "transformers": 
ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + + assert ct.get_params() == exp + + ct.set_params(trans1__with_mean=False) + assert not ct.get_params()["trans1__with_mean"] + + ct.set_params(trans1="passthrough") + exp = { + "n_jobs": None, + "remainder": "drop", + "sparse_threshold": 0.3, + "trans1": "passthrough", + "trans2": ct.transformers[1][1], + "trans2__copy": True, + "trans2__with_mean": True, + "trans2__with_std": True, + "transformers": ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + + assert ct.get_params() == exp + + +def test_column_transformer_named_estimators(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer( + [ + ("trans1", StandardScaler(), [0]), + ("trans2", StandardScaler(with_std=False), [1]), + ] + ) + assert not hasattr(ct, "transformers_") + ct.fit(X_array) + assert hasattr(ct, "transformers_") + assert isinstance(ct.named_transformers_["trans1"], StandardScaler) + assert isinstance(ct.named_transformers_.trans1, StandardScaler) + assert isinstance(ct.named_transformers_["trans2"], StandardScaler) + assert isinstance(ct.named_transformers_.trans2, StandardScaler) + assert not ct.named_transformers_.trans2.with_std + # check it are fitted transformers + assert ct.named_transformers_.trans1.mean_ == 1.0 + + +def test_column_transformer_cloning(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + + ct = ColumnTransformer([("trans", StandardScaler(), [0])]) + ct.fit(X_array) + assert not hasattr(ct.transformers[0][1], "mean_") + assert hasattr(ct.transformers_[0][1], "mean_") + + ct = ColumnTransformer([("trans", StandardScaler(), [0])]) + ct.fit_transform(X_array) + assert not hasattr(ct.transformers[0][1], "mean_") + assert hasattr(ct.transformers_[0][1], "mean_") + + +def test_column_transformer_get_feature_names(): + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer([("trans", Trans(), [0, 1])]) + # raise correct error when not fitted + with pytest.raises(NotFittedError): + ct.get_feature_names_out() + # raise correct error when no feature names are available + ct.fit(X_array) + msg = re.escape( + "Transformer trans (type Trans) does not provide get_feature_names_out" + ) + with pytest.raises(AttributeError, match=msg): + ct.get_feature_names_out() + + +def test_column_transformer_special_strings(): + # one 'drop' -> ignore + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "drop", [1])]) + exp = np.array([[0.0], [1.0], [2.0]]) + assert_array_equal(ct.fit_transform(X_array), exp) + assert_array_equal(ct.fit(X_array).transform(X_array), exp) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + # all 'drop' -> return shape 0 array + ct = ColumnTransformer([("trans1", "drop", [0]), ("trans2", "drop", [1])]) + assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0)) + assert_array_equal(ct.fit_transform(X_array).shape, (3, 0)) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + # 'passthrough' + X_array = np.array([[0.0, 1.0, 2.0], [2.0, 4.0, 6.0]]).T + ct = ColumnTransformer([("trans1", Trans(), [0]), ("trans2", "passthrough", [1])]) + exp = X_array + assert_array_equal(ct.fit_transform(X_array), exp) + 
assert_array_equal(ct.fit(X_array).transform(X_array), exp) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] != "remainder" + + +def test_column_transformer_remainder(): + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + + X_res_first = np.array([0, 1, 2]).reshape(-1, 1) + X_res_second = np.array([2, 4, 6]).reshape(-1, 1) + X_res_both = X_array + + # default drop + ct = ColumnTransformer([("trans1", Trans(), [0])]) + assert_array_equal(ct.fit_transform(X_array), X_res_first) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert ct.transformers_[-1][1] == "drop" + assert_array_equal(ct.transformers_[-1][2], [1]) + + # specify passthrough + ct = ColumnTransformer([("trans", Trans(), [0])], remainder="passthrough") + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert_array_equal(ct.transformers_[-1][2], [1]) + + # column order is not preserved (passed through added to end) + ct = ColumnTransformer([("trans1", Trans(), [1])], remainder="passthrough") + assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1]) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1]) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert_array_equal(ct.transformers_[-1][2], [0]) + + # passthrough when all actual transformers are skipped + ct = ColumnTransformer([("trans1", "drop", [0])], remainder="passthrough") + assert_array_equal(ct.fit_transform(X_array), X_res_second) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert_array_equal(ct.transformers_[-1][2], [1]) + + # check default for make_column_transformer + ct = make_column_transformer((Trans(), [0])) + assert ct.remainder == "drop" + + +@pytest.mark.parametrize( + "cols1, cols2, expected_remainder_cols", + [ + ([0], [False, True, False], [2]), # mix types + ([0], [1], [2]), # ints + (lambda x: [0], lambda x: [1], [2]), # callables + (["A"], ["B"], ["C"]), # all strings + ([True, False, False], [False, True, False], [False, False, True]), # all bools + ], +) +def test_column_transformer_remainder_dtypes(cols1, cols2, expected_remainder_cols): + """Check that the remainder columns format matches the format of the other + columns when they're all strings or masks. + """ + X = np.ones((1, 3)) + + if isinstance(cols1, list) and isinstance(cols1[0], str): + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X, columns=["A", "B", "C"]) + + # if inputs are column names store remainder columns as column names + ct = make_column_transformer( + (Trans(), cols1), + (Trans(), cols2), + remainder="passthrough", + ) + ct.fit_transform(X) + assert ct.transformers_[-1][-1] == expected_remainder_cols + + +# TODO(1.9): remove this test +@pytest.mark.parametrize("force_int_remainder_cols", [True, False]) +def test_force_int_remainder_cols_deprecation(force_int_remainder_cols): + """Check that ColumnTransformer raises a FutureWarning when + force_int_remainder_cols is set. 
+ """ + X = np.ones((1, 3)) + ct = ColumnTransformer( + [("T1", Trans(), [0]), ("T2", Trans(), [1])], + remainder="passthrough", + force_int_remainder_cols=force_int_remainder_cols, + ) + + with pytest.warns(FutureWarning, match="`force_int_remainder_cols` is deprecated"): + ct.fit(X) + + +@pytest.mark.parametrize( + "key, expected_cols", + [ + ([0], [1]), + (np.array([0]), [1]), + (slice(0, 1), [1]), + (np.array([True, False]), [False, True]), + ], +) +def test_column_transformer_remainder_numpy(key, expected_cols): + # test different ways that columns are specified with passthrough + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_both = X_array + + ct = ColumnTransformer( + [("trans1", Trans(), key)], + remainder="passthrough", + ) + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert ct.transformers_[-1][2] == expected_cols + + +@pytest.mark.parametrize( + "key, expected_cols", + [ + ([0], [1]), + (slice(0, 1), [1]), + (np.array([True, False]), [False, True]), + (["first"], ["second"]), + ("pd-index", ["second"]), + (np.array(["first"]), ["second"]), + (np.array(["first"], dtype=object), ["second"]), + (slice(None, "first"), ["second"]), + (slice("first", "first"), ["second"]), + ], +) +def test_column_transformer_remainder_pandas(key, expected_cols): + # test different ways that columns are specified with passthrough + pd = pytest.importorskip("pandas") + if isinstance(key, str) and key == "pd-index": + key = pd.Index(["first"]) + + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_df = pd.DataFrame(X_array, columns=["first", "second"]) + X_res_both = X_array + + ct = ColumnTransformer( + [("trans1", Trans(), key)], + remainder="passthrough", + ) + assert_array_equal(ct.fit_transform(X_df), X_res_both) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], FunctionTransformer) + assert ct.transformers_[-1][2] == expected_cols + + +@pytest.mark.parametrize( + "key, expected_cols", + [ + ([0], [1, 2]), + (np.array([0]), [1, 2]), + (slice(0, 1), [1, 2]), + (np.array([True, False, False]), [False, True, True]), + ], +) +def test_column_transformer_remainder_transformer(key, expected_cols): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + X_res_both = X_array.copy() + + # second and third columns are doubled when remainder = DoubleTrans + X_res_both[:, 1:3] *= 2 + + ct = ColumnTransformer( + [("trans1", Trans(), key)], + remainder=DoubleTrans(), + ) + + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], DoubleTrans) + assert ct.transformers_[-1][2] == expected_cols + + +def test_column_transformer_no_remaining_remainder_transformer(): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + ct = ColumnTransformer([("trans1", Trans(), [0, 1, 2])], remainder=DoubleTrans()) + + assert_array_equal(ct.fit_transform(X_array), X_array) + assert_array_equal(ct.fit(X_array).transform(X_array), X_array) + assert len(ct.transformers_) == 1 + assert ct.transformers_[-1][0] != "remainder" + + +def 
test_column_transformer_drops_all_remainder_transformer(): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + # columns are doubled when remainder = DoubleTrans + X_res_both = 2 * X_array.copy()[:, 1:3] + + ct = ColumnTransformer([("trans1", "drop", [0])], remainder=DoubleTrans()) + + assert_array_equal(ct.fit_transform(X_array), X_res_both) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], DoubleTrans) + assert_array_equal(ct.transformers_[-1][2], [1, 2]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_sparse_remainder_transformer(csr_container): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + ct = ColumnTransformer( + [("trans1", Trans(), [0])], + remainder=SparseMatrixTrans(csr_container), + sparse_threshold=0.8, + ) + + X_trans = ct.fit_transform(X_array) + assert sparse.issparse(X_trans) + # SparseMatrixTrans creates 3 features for each column. There is + # one column in ``transformers``, thus: + assert X_trans.shape == (3, 3 + 1) + + exp_array = np.hstack((X_array[:, 0].reshape(-1, 1), np.eye(3))) + assert_array_equal(X_trans.toarray(), exp_array) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans) + assert_array_equal(ct.transformers_[-1][2], [1, 2]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_column_transformer_drop_all_sparse_remainder_transformer(csr_container): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + ct = ColumnTransformer( + [("trans1", "drop", [0])], + remainder=SparseMatrixTrans(csr_container), + sparse_threshold=0.8, + ) + + X_trans = ct.fit_transform(X_array) + assert sparse.issparse(X_trans) + + # SparseMatrixTrans creates 3 features for each column, thus: + assert X_trans.shape == (3, 3) + assert_array_equal(X_trans.toarray(), np.eye(3)) + assert len(ct.transformers_) == 2 + assert ct.transformers_[-1][0] == "remainder" + assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans) + assert_array_equal(ct.transformers_[-1][2], [1, 2]) + + +def test_column_transformer_get_set_params_with_remainder(): + ct = ColumnTransformer( + [("trans1", StandardScaler(), [0])], remainder=StandardScaler() + ) + + exp = { + "n_jobs": None, + "remainder": ct.remainder, + "remainder__copy": True, + "remainder__with_mean": True, + "remainder__with_std": True, + "sparse_threshold": 0.3, + "trans1": ct.transformers[0][1], + "trans1__copy": True, + "trans1__with_mean": True, + "trans1__with_std": True, + "transformers": ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + + assert ct.get_params() == exp + + ct.set_params(remainder__with_std=False) + assert not ct.get_params()["remainder__with_std"] + + ct.set_params(trans1="passthrough") + exp = { + "n_jobs": None, + "remainder": ct.remainder, + "remainder__copy": True, + "remainder__with_mean": True, + "remainder__with_std": False, + "sparse_threshold": 0.3, + "trans1": "passthrough", + "transformers": ct.transformers, + "transformer_weights": None, + "verbose_feature_names_out": True, + "verbose": False, + "force_int_remainder_cols": "deprecated", + } + assert ct.get_params() == exp + + +def test_column_transformer_no_estimators(): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 
4]]).astype("float").T + ct = ColumnTransformer([], remainder=StandardScaler()) + + params = ct.get_params() + assert params["remainder__with_mean"] + + X_trans = ct.fit_transform(X_array) + assert X_trans.shape == X_array.shape + assert len(ct.transformers_) == 1 + assert ct.transformers_[-1][0] == "remainder" + assert ct.transformers_[-1][2] == [0, 1, 2] + + +@pytest.mark.parametrize( + ["est", "pattern"], + [ + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], + remainder=DoubleTrans(), + ), + ( + r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n" + r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], + remainder="passthrough", + ), + ( + r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n" + r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", "drop", [1])], + remainder="passthrough", + ), + ( + r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", "passthrough", [1])], + remainder="passthrough", + ), + ( + r"\[ColumnTransformer\].*\(1 of 3\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 3\) Processing trans2.* total=.*\n" + r"\[ColumnTransformer\].*\(3 of 3\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer([("trans1", Trans(), [0])], remainder="passthrough"), + ( + r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 2\) Processing remainder.* total=.*\n$" + ), + ), + ( + ColumnTransformer( + [("trans1", Trans(), [0]), ("trans2", Trans(), [1])], remainder="drop" + ), + ( + r"\[ColumnTransformer\].*\(1 of 2\) Processing trans1.* total=.*\n" + r"\[ColumnTransformer\].*\(2 of 2\) Processing trans2.* total=.*\n$" + ), + ), + ( + ColumnTransformer([("trans1", Trans(), [0])], remainder="drop"), + r"\[ColumnTransformer\].*\(1 of 1\) Processing trans1.* total=.*\n$", + ), + ], +) +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +def test_column_transformer_verbose(est, pattern, method, capsys): + X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T + + func = getattr(est, method) + est.set_params(verbose=False) + func(X_array) + assert not capsys.readouterr().out, "Got output for verbose=False" + + est.set_params(verbose=True) + func(X_array) + assert re.match(pattern, capsys.readouterr()[0]) + + +def test_column_transformer_no_estimators_set_params(): + ct = ColumnTransformer([]).set_params(n_jobs=2) + assert ct.n_jobs == 2 + + +def test_column_transformer_callable_specifier(): + # assert that function gets the full array + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_first = np.array([[0, 1, 2]]).T + + def func(X): + assert_array_equal(X, X_array) + return [0] + + ct = ColumnTransformer([("trans", Trans(), func)], remainder="drop") + assert_array_equal(ct.fit_transform(X_array), X_res_first) + assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first) + assert callable(ct.transformers[0][2]) + assert ct.transformers_[0][2] == [0] + + +def test_column_transformer_callable_specifier_dataframe(): + # assert that 
function gets the full dataframe + pd = pytest.importorskip("pandas") + X_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_res_first = np.array([[0, 1, 2]]).T + + X_df = pd.DataFrame(X_array, columns=["first", "second"]) + + def func(X): + assert_array_equal(X.columns, X_df.columns) + assert_array_equal(X.values, X_df.values) + return ["first"] + + ct = ColumnTransformer([("trans", Trans(), func)], remainder="drop") + assert_array_equal(ct.fit_transform(X_df), X_res_first) + assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first) + assert callable(ct.transformers[0][2]) + assert ct.transformers_[0][2] == ["first"] + + +def test_column_transformer_negative_column_indexes(): + X = np.random.randn(2, 2) + X_categories = np.array([[1], [2]]) + X = np.concatenate([X, X_categories], axis=1) + + ohe = OneHotEncoder() + + tf_1 = ColumnTransformer([("ohe", ohe, [-1])], remainder="passthrough") + tf_2 = ColumnTransformer([("ohe", ohe, [2])], remainder="passthrough") + assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X)) + + +@pytest.mark.parametrize("array_type", [np.asarray, *CSR_CONTAINERS]) +def test_column_transformer_mask_indexing(array_type): + # Regression test for #14510 + # Boolean array-like does not behave as boolean array with sparse matrices. + X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) + X = array_type(X) + column_transformer = ColumnTransformer( + [("identity", FunctionTransformer(), [False, True, False, True])] + ) + X_trans = column_transformer.fit_transform(X) + assert X_trans.shape == (3, 2) + + +def test_n_features_in(): + # make sure n_features_in is what is passed as input to the column + # transformer. + + X = [[1, 2], [3, 4], [5, 6]] + ct = ColumnTransformer([("a", DoubleTrans(), [0]), ("b", DoubleTrans(), [1])]) + assert not hasattr(ct, "n_features_in_") + ct.fit(X) + assert ct.n_features_in_ == 2 + + +@pytest.mark.parametrize( + "cols, pattern, include, exclude", + [ + (["col_int", "col_float"], None, np.number, None), + (["col_int", "col_float"], None, None, [object, "string"]), + (["col_int", "col_float"], None, [int, float], None), + (["col_str"], None, [object, "string"], None), + (["col_float"], None, [float], None), + (["col_float"], None, float, None), + (["col_float"], "at$", [np.number], None), + (["col_int"], None, [int], None), + (["col_int"], "^col_int", [np.number], None), + (["col_float", "col_str"], "float|str", None, None), + (["col_str"], "^col_s", None, [int]), + ([], "str$", float, None), + ( + ["col_int", "col_float", "col_str"], + None, + [np.number, object, "string"], + None, + ), + ], +) +def test_make_column_selector_with_select_dtypes(cols, pattern, include, exclude): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame( + { + "col_int": np.array([0, 1, 2], dtype=int), + "col_float": np.array([0.0, 1.0, 2.0], dtype=float), + "col_str": ["one", "two", "three"], + }, + columns=["col_int", "col_float", "col_str"], + ) + + selector = make_column_selector( + dtype_include=include, dtype_exclude=exclude, pattern=pattern + ) + + assert_array_equal(selector(X_df), cols) + + +def test_column_transformer_with_make_column_selector(): + # Functional test for column transformer + column selector + pd = pytest.importorskip("pandas") + X_df = pd.DataFrame( + { + "col_int": np.array([0, 1, 2], dtype=int), + "col_float": np.array([0.0, 1.0, 2.0], dtype=float), + "col_cat": ["one", "two", "one"], + "col_str": ["low", "middle", "high"], + }, + columns=["col_int", "col_float", "col_cat", "col_str"], + ) + X_df["col_str"] = 
X_df["col_str"].astype("category") + + cat_selector = make_column_selector(dtype_include=["category", object, "string"]) + num_selector = make_column_selector(dtype_include=np.number) + + ohe = OneHotEncoder() + scaler = StandardScaler() + + ct_selector = make_column_transformer((ohe, cat_selector), (scaler, num_selector)) + ct_direct = make_column_transformer( + (ohe, ["col_cat", "col_str"]), (scaler, ["col_float", "col_int"]) + ) + + X_selector = ct_selector.fit_transform(X_df) + X_direct = ct_direct.fit_transform(X_df) + + assert_allclose(X_selector, X_direct) + + +def test_make_column_selector_error(): + selector = make_column_selector(dtype_include=np.number) + X = np.array([[0.1, 0.2]]) + msg = "make_column_selector can only be applied to pandas dataframes" + with pytest.raises(ValueError, match=msg): + selector(X) + + +def test_make_column_selector_pickle(): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame( + { + "col_int": np.array([0, 1, 2], dtype=int), + "col_float": np.array([0.0, 1.0, 2.0], dtype=float), + "col_str": ["one", "two", "three"], + }, + columns=["col_int", "col_float", "col_str"], + ) + + selector = make_column_selector(dtype_include=[object]) + selector_picked = pickle.loads(pickle.dumps(selector)) + + assert_array_equal(selector(X_df), selector_picked(X_df)) + + +@pytest.mark.parametrize( + "empty_col", + [[], np.array([], dtype=int), lambda x: []], + ids=["list", "array", "callable"], +) +def test_feature_names_empty_columns(empty_col): + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) + + ct = ColumnTransformer( + transformers=[ + ("ohe", OneHotEncoder(), ["col1", "col2"]), + ("empty_features", OneHotEncoder(), empty_col), + ], + ) + + ct.fit(df) + assert_array_equal( + ct.get_feature_names_out(), ["ohe__col1_a", "ohe__col1_b", "ohe__col2_z"] + ) + + +@pytest.mark.parametrize( + "selector", + [ + [1], + lambda x: [1], + ["col2"], + lambda x: ["col2"], + [False, True], + lambda x: [False, True], + ], +) +def test_feature_names_out_pandas(selector): + """Checks name when selecting only the second column""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]}) + ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) + ct.fit(df) + + assert_array_equal(ct.get_feature_names_out(), ["ohe__col2_z"]) + + +@pytest.mark.parametrize( + "selector", [[1], lambda x: [1], [False, True], lambda x: [False, True]] +) +def test_feature_names_out_non_pandas(selector): + """Checks name when selecting the second column with numpy array""" + X = [["a", "z"], ["a", "z"], ["b", "z"]] + ct = ColumnTransformer([("ohe", OneHotEncoder(), selector)]) + ct.fit(X) + + assert_array_equal(ct.get_feature_names_out(), ["ohe__x1_z"]) + + +@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()]) +def test_sk_visual_block_remainder(remainder): + # remainder='passthrough' or an estimator will be shown in repr_html + ohe = OneHotEncoder() + ct = ColumnTransformer( + transformers=[("ohe", ohe, ["col1", "col2"])], remainder=remainder + ) + visual_block = ct._sk_visual_block_() + assert visual_block.names == ("ohe", "remainder") + assert visual_block.name_details == (["col1", "col2"], "") + assert visual_block.estimators == (ohe, remainder) + + +def test_sk_visual_block_remainder_drop(): + # remainder='drop' is not shown in repr_html + ohe = OneHotEncoder() + ct = ColumnTransformer(transformers=[("ohe", ohe, ["col1", "col2"])]) + visual_block = 
ct._sk_visual_block_() + assert visual_block.names == ("ohe",) + assert visual_block.name_details == (["col1", "col2"],) + assert visual_block.estimators == (ohe,) + + +@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()]) +def test_sk_visual_block_remainder_fitted_pandas(remainder): + # Remainder shows the columns after fitting + pd = pytest.importorskip("pandas") + ohe = OneHotEncoder() + ct = ColumnTransformer( + transformers=[("ohe", ohe, ["col1", "col2"])], + remainder=remainder, + ) + df = pd.DataFrame( + { + "col1": ["a", "b", "c"], + "col2": ["z", "z", "z"], + "col3": [1, 2, 3], + "col4": [3, 4, 5], + } + ) + ct.fit(df) + visual_block = ct._sk_visual_block_() + assert visual_block.names == ("ohe", "remainder") + assert visual_block.name_details == (["col1", "col2"], ["col3", "col4"]) + assert visual_block.estimators == (ohe, remainder) + + +@pytest.mark.parametrize("remainder", ["passthrough", StandardScaler()]) +def test_sk_visual_block_remainder_fitted_numpy(remainder): + # Remainder shows the indices after fitting + X = np.array([[1, 2, 3], [4, 5, 6]], dtype=float) + scaler = StandardScaler() + ct = ColumnTransformer( + transformers=[("scale", scaler, [0, 2])], remainder=remainder + ) + ct.fit(X) + visual_block = ct._sk_visual_block_() + assert visual_block.names == ("scale", "remainder") + assert visual_block.name_details == ([0, 2], [1]) + assert visual_block.estimators == (scaler, remainder) + + +@pytest.mark.parametrize("explicit_colname", ["first", "second", 0, 1]) +@pytest.mark.parametrize("remainder", [Trans(), "passthrough", "drop"]) +def test_column_transformer_reordered_column_names_remainder( + explicit_colname, remainder +): + """Test the interaction between remainder and column transformer""" + pd = pytest.importorskip("pandas") + + X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T + X_fit_df = pd.DataFrame(X_fit_array, columns=["first", "second"]) + + X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T + X_trans_df = pd.DataFrame(X_trans_array, columns=["second", "first"]) + + tf = ColumnTransformer([("bycol", Trans(), explicit_colname)], remainder=remainder) + + tf.fit(X_fit_df) + X_fit_trans = tf.transform(X_fit_df) + + # Changing the order still works + X_trans = tf.transform(X_trans_df) + assert_allclose(X_trans, X_fit_trans) + + # extra columns are ignored + X_extended_df = X_fit_df.copy() + X_extended_df["third"] = [3, 6, 9] + X_trans = tf.transform(X_extended_df) + assert_allclose(X_trans, X_fit_trans) + + if isinstance(explicit_colname, str): + # Raise error if columns are specified by names but input only allows + # to specify by position, e.g. numpy array instead of a pandas df. 
+ X_array = X_fit_array.copy() + err_msg = "Specifying the columns" + with pytest.raises(ValueError, match=err_msg): + tf.transform(X_array) + + +def test_feature_name_validation_missing_columns_drop_passthough(): + """Test the interaction between {'drop', 'passthrough'} and + missing column names.""" + pd = pytest.importorskip("pandas") + + X = np.ones(shape=(3, 4)) + df = pd.DataFrame(X, columns=["a", "b", "c", "d"]) + + df_dropped = df.drop("c", axis=1) + + # with remainder='passthrough', all columns seen during `fit` must be + # present + tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="passthrough") + tf.fit(df) + msg = r"columns are missing: {'c'}" + with pytest.raises(ValueError, match=msg): + tf.transform(df_dropped) + + # with remainder='drop', it is allowed to have column 'c' missing + tf = ColumnTransformer([("bycol", Trans(), [1])], remainder="drop") + tf.fit(df) + + df_dropped_trans = tf.transform(df_dropped) + df_fit_trans = tf.transform(df) + assert_allclose(df_dropped_trans, df_fit_trans) + + # bycol drops 'c', thus it is allowed for 'c' to be missing + tf = ColumnTransformer([("bycol", "drop", ["c"])], remainder="passthrough") + tf.fit(df) + df_dropped_trans = tf.transform(df_dropped) + df_fit_trans = tf.transform(df) + assert_allclose(df_dropped_trans, df_fit_trans) + + +def test_feature_names_in_(): + """Feature names are stored in column transformer. + + Column transformer deliberately does not check for column name consistency. + It only checks that the non-dropped names seen in `fit` are seen + in `transform`. This behavior is already tested in + `test_feature_name_validation_missing_columns_drop_passthough`""" + + pd = pytest.importorskip("pandas") + + feature_names = ["a", "c", "d"] + df = pd.DataFrame([[1, 2, 3]], columns=feature_names) + ct = ColumnTransformer([("bycol", Trans(), ["a", "d"])], remainder="passthrough") + + ct.fit(df) + assert_array_equal(ct.feature_names_in_, feature_names) + assert isinstance(ct.feature_names_in_, np.ndarray) + assert ct.feature_names_in_.dtype == object + + +class TransWithNames(Trans): + def __init__(self, feature_names_out=None): + self.feature_names_out = feature_names_out + + def get_feature_names_out(self, input_features=None): + if self.feature_names_out is not None: + return np.asarray(self.feature_names_out, dtype=object) + return input_features + + +@pytest.mark.parametrize( + "transformers, remainder, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "passthrough", + ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["bycol1__d", "bycol1__c", "bycol2__d"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["bycol1__b", "remainder__a", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]), + ], + "passthrough", + ["bycol1__pca1", "bycol1__pca2", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["d"]), + ("bycol2", "passthrough", ["b"]), + ], + "drop", + ["bycol1__a", "bycol1__b", "bycol2__b"], + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ("bycol2", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ], + "passthrough", + [ + "bycol1__pca0", + "bycol1__pca1", + "bycol2__pca0", + "bycol2__pca1", + "remainder__a", + "remainder__c", + "remainder__d", + ], + ), 
+ ( + [ + ("bycol1", "drop", ["d"]), + ], + "drop", + [], + ), + ( + [ + ("bycol1", TransWithNames(), slice(1, 3)), + ], + "drop", + ["bycol1__b", "bycol1__c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice(3, 4)), + ], + "passthrough", + ["bycol1__b", "remainder__a", "remainder__c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice(3, 4)), + ], + "passthrough", + ["bycol1__d", "bycol1__c", "bycol2__d", "remainder__a", "remainder__b"], + ), + ( + [ + ("bycol1", TransWithNames(), slice("b", "c")), + ], + "drop", + ["bycol1__b", "bycol1__c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice("c", "d")), + ], + "passthrough", + ["bycol1__b", "remainder__a"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("c", "d")), + ], + "passthrough", + [ + "bycol1__d", + "bycol1__c", + "bycol2__c", + "bycol2__d", + "remainder__a", + "remainder__b", + ], + ), + ], +) +def test_verbose_feature_names_out_true(transformers, remainder, expected_names): + """Check feature_names_out for verbose_feature_names_out=True (default)""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + ) + ct.fit(df) + + names = ct.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected_names) + + +def _feature_names_out_callable_name_clash(trans_name: str, feat_name: str): + return f"{trans_name[:2]}++{feat_name}" + + +def _feature_names_out_callable_upper(trans_name: str, feat_name: str): + return f"{trans_name.upper()}={feat_name.upper()}" + + +@pytest.mark.parametrize( + "transformers, remainder, verbose_feature_names_out, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "passthrough", + _feature_names_out_callable_name_clash, + ["by++d", "by++c", "by++d", "re++a", "re++b"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + "{feature_name}-{transformer_name}", + ["d-bycol1", "c-bycol1", "d-bycol2"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("c", "d")), + ], + "passthrough", + _feature_names_out_callable_upper, + [ + "BYCOL1=D", + "BYCOL1=C", + "BYCOL2=C", + "BYCOL2=D", + "REMAINDER=A", + "REMAINDER=B", + ], + ), + ], +) +def test_verbose_feature_names_out_callable_or_str( + transformers, remainder, verbose_feature_names_out, expected_names +): + """Check feature_names_out for verbose_feature_names_out=True (default)""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + verbose_feature_names_out=verbose_feature_names_out, + ) + ct.fit(df) + + names = ct.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected_names) + + +@pytest.mark.parametrize( + "transformers, remainder, expected_names", + [ + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", ["a"]), + ], + "passthrough", + ["d", "c", "a", "b"], + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["d", "c"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["a", "d"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", ["d"]), + ], 
+ "passthrough", + ["b", "a", "c"], + ), + ( + [ + ("bycol1", TransWithNames(["pca1", "pca2"]), ["a", "b", "d"]), + ], + "passthrough", + ["pca1", "pca2", "c"], + ), + ( + [ + ("bycol1", TransWithNames(["a", "c"]), ["d"]), + ("bycol2", "passthrough", ["d"]), + ], + "drop", + ["a", "c", "d"], + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(2)]), ["b"]), + ("bycol2", TransWithNames([f"kpca{i}" for i in range(2)]), ["b"]), + ], + "passthrough", + ["pca0", "pca1", "kpca0", "kpca1", "a", "c", "d"], + ), + ( + [ + ("bycol1", "drop", ["d"]), + ], + "drop", + [], + ), + ( + [ + ("bycol1", TransWithNames(), slice(1, 2)), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["b", "a", "c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice(3, 4)), + ], + "passthrough", + ["b", "a", "c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice(0, 2)), + ], + "drop", + ["d", "c", "a", "b"], + ), + ( + [ + ("bycol1", TransWithNames(), slice("a", "b")), + ("bycol2", "drop", ["d"]), + ], + "passthrough", + ["a", "b", "c"], + ), + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "drop", slice("c", "d")), + ], + "passthrough", + ["b", "a"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("a", "b")), + ], + "drop", + ["d", "c", "a", "b"], + ), + ( + [ + ("bycol1", TransWithNames(), ["d", "c"]), + ("bycol2", "passthrough", slice("b", "b")), + ], + "drop", + ["d", "c", "b"], + ), + ], +) +def test_verbose_feature_names_out_false(transformers, remainder, expected_names): + """Check feature_names_out for verbose_feature_names_out=False""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + verbose_feature_names_out=False, + ) + ct.fit(df) + + names = ct.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected_names) + + +@pytest.mark.parametrize( + "transformers, remainder, colliding_columns", + [ + ( + [ + ("bycol1", TransWithNames(), ["b"]), + ("bycol2", "passthrough", ["b"]), + ], + "drop", + "['b']", + ), + ( + [ + ("bycol1", TransWithNames(["c", "d"]), ["c"]), + ("bycol2", "passthrough", ["c"]), + ], + "drop", + "['c']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "passthrough", ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "drop", ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["c", "b"]), ["b"]), + ("bycol2", "passthrough", ["c", "b"]), + ], + "drop", + "['b', 'c']", + ), + ( + [ + ("bycol1", TransWithNames(["a"]), ["b"]), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["a"]), ["b"]), + ], + "passthrough", + "['a']", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["b"]), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]), + ("bycol2", TransWithNames([f"pca{i}" for i in range(6)]), ["b"]), + ], + "passthrough", + "['pca0', 'pca1', 'pca2', 'pca3', 'pca4', ...]", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), slice(1, 2)), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames(["a", 
"b"]), ["b"]), + ("bycol2", "passthrough", slice(0, 1)), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), slice("b", "c")), + ("bycol2", "passthrough", ["a"]), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ( + [ + ("bycol1", TransWithNames(["a", "b"]), ["b"]), + ("bycol2", "passthrough", slice("a", "a")), + ("bycol3", TransWithNames(["b"]), ["c"]), + ], + "passthrough", + "['a', 'b']", + ), + ], +) +def test_verbose_feature_names_out_false_errors( + transformers, remainder, colliding_columns +): + """Check feature_names_out for verbose_feature_names_out=False""" + + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"]) + ct = ColumnTransformer( + transformers, + remainder=remainder, + verbose_feature_names_out=False, + ) + ct.fit(df) + + msg = re.escape( + f"Output feature names: {colliding_columns} are not unique. Please set " + "verbose_feature_names_out=True to add prefixes to feature names" + ) + with pytest.raises(ValueError, match=msg): + ct.get_feature_names_out() + + +@pytest.mark.parametrize("verbose_feature_names_out", [True, False]) +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +def test_column_transformer_set_output(verbose_feature_names_out, remainder): + """Check column transformer behavior with set_output.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame([[1, 2, 3, 4]], columns=["a", "b", "c", "d"], index=[10]) + ct = ColumnTransformer( + [("first", TransWithNames(), ["a", "c"]), ("second", TransWithNames(), ["d"])], + remainder=remainder, + verbose_feature_names_out=verbose_feature_names_out, + ) + X_trans = ct.fit_transform(df) + assert isinstance(X_trans, np.ndarray) + + ct.set_output(transform="pandas") + + df_test = pd.DataFrame([[1, 2, 3, 4]], columns=df.columns, index=[20]) + X_trans = ct.transform(df_test) + assert isinstance(X_trans, pd.DataFrame) + + feature_names_out = ct.get_feature_names_out() + assert_array_equal(X_trans.columns, feature_names_out) + assert_array_equal(X_trans.index, df_test.index) + + +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +@pytest.mark.parametrize("fit_transform", [True, False]) +def test_column_transform_set_output_mixed(remainder, fit_transform): + """Check ColumnTransformer outputs mixed types correctly.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "pet": pd.Series(["dog", "cat", "snake"], dtype="category"), + "color": pd.Series(["green", "blue", "red"], dtype="object"), + "age": [1.4, 2.1, 4.4], + "height": [20, 40, 10], + "distance": pd.Series([20, pd.NA, 100], dtype="Int32"), + } + ) + ct = ColumnTransformer( + [ + ( + "color_encode", + OneHotEncoder(sparse_output=False, dtype="int8"), + ["color"], + ), + ("age", StandardScaler(), ["age"]), + ], + remainder=remainder, + verbose_feature_names_out=False, + ).set_output(transform="pandas") + if fit_transform: + X_trans = ct.fit_transform(df) + else: + X_trans = ct.fit(df).transform(df) + + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ct.get_feature_names_out()) + + expected_dtypes = { + "color_blue": "int8", + "color_green": "int8", + "color_red": "int8", + "age": "float64", + "pet": "category", + "height": "int64", + "distance": "Int32", + } + for col, dtype in X_trans.dtypes.items(): + assert dtype == expected_dtypes[col] + + +@pytest.mark.parametrize("remainder", ["drop", "passthrough"]) +def 
test_column_transform_set_output_after_fitting(remainder): + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "pet": pd.Series(["dog", "cat", "snake"], dtype="category"), + "age": [1.4, 2.1, 4.4], + "height": [20, 40, 10], + } + ) + ct = ColumnTransformer( + [ + ( + "color_encode", + OneHotEncoder(sparse_output=False, dtype="int16"), + ["pet"], + ), + ("age", StandardScaler(), ["age"]), + ], + remainder=remainder, + verbose_feature_names_out=False, + ) + + # fit without calling set_output + X_trans = ct.fit_transform(df) + assert isinstance(X_trans, np.ndarray) + assert X_trans.dtype == "float64" + + ct.set_output(transform="pandas") + X_trans_df = ct.transform(df) + expected_dtypes = { + "pet_cat": "int16", + "pet_dog": "int16", + "pet_snake": "int16", + "height": "int64", + "age": "float64", + } + for col, dtype in X_trans_df.dtypes.items(): + assert dtype == expected_dtypes[col] + + +# PandasOutTransformer that does not define get_feature_names_out and always expects +# the input to be a DataFrame. +class PandasOutTransformer(BaseEstimator): + def __init__(self, offset=1.0): + self.offset = offset + + def fit(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return self + + def transform(self, X, y=None): + pd = pytest.importorskip("pandas") + assert isinstance(X, pd.DataFrame) + return X - self.offset + + def set_output(self, transform=None): + # This transformer will always output a DataFrame regardless of the + # configuration. + return self + + +@pytest.mark.parametrize( + "trans_1, expected_verbose_names, expected_non_verbose_names", + [ + ( + PandasOutTransformer(offset=2.0), + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ( + "drop", + ["trans_0__feat1"], + ["feat1"], + ), + ( + "passthrough", + ["trans_0__feat1", "trans_1__feat0"], + ["feat1", "feat0"], + ), + ], +) +def test_transformers_with_pandas_out_but_not_feature_names_out( + trans_1, expected_verbose_names, expected_non_verbose_names +): + """Check that set_config(transform="pandas") is compatible with more transformers. + + Specifically, if transformers returns a DataFrame, but does not define + `get_feature_names_out`. + """ + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"feat0": [1.0, 2.0, 3.0], "feat1": [2.0, 3.0, 4.0]}) + ct = ColumnTransformer( + [ + ("trans_0", PandasOutTransformer(offset=3.0), ["feat1"]), + ("trans_1", trans_1, ["feat0"]), + ] + ) + X_trans_np = ct.fit_transform(X_df) + assert isinstance(X_trans_np, np.ndarray) + + # `ct` does not have `get_feature_names_out` because `PandasOutTransformer` does + # not define the method. + with pytest.raises(AttributeError, match="not provide get_feature_names_out"): + ct.get_feature_names_out() + + # The feature names are prefixed because verbose_feature_names_out=True is default + ct.set_output(transform="pandas") + X_trans_df0 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df0.columns, expected_verbose_names) + + ct.set_params(verbose_feature_names_out=False) + X_trans_df1 = ct.fit_transform(X_df) + assert_array_equal(X_trans_df1.columns, expected_non_verbose_names) + + +@pytest.mark.parametrize( + "empty_selection", + [[], np.array([False, False]), [False, False]], + ids=["list", "bool", "bool_int"], +) +def test_empty_selection_pandas_output(empty_selection): + """Check that pandas output works when there is an empty selection. 
+ + Non-regression test for gh-25487 + """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"]) + ct = ColumnTransformer( + [ + ("categorical", "passthrough", empty_selection), + ("numerical", StandardScaler(), ["a", "b"]), + ], + verbose_feature_names_out=True, + ) + ct.set_output(transform="pandas") + X_out = ct.fit_transform(X) + assert_array_equal(X_out.columns, ["numerical__a", "numerical__b"]) + + ct.set_params(verbose_feature_names_out=False) + X_out = ct.fit_transform(X) + assert_array_equal(X_out.columns, ["a", "b"]) + + +def test_raise_error_if_index_not_aligned(): + """Check column transformer raises error if indices are not aligned. + + Non-regression test for gh-26210. + """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame([[1.0, 2.2], [3.0, 1.0]], columns=["a", "b"], index=[8, 3]) + reset_index_transformer = FunctionTransformer( + lambda x: x.reset_index(drop=True), feature_names_out="one-to-one" + ) + + ct = ColumnTransformer( + [ + ("num1", "passthrough", ["a"]), + ("num2", reset_index_transformer, ["b"]), + ], + ) + ct.set_output(transform="pandas") + msg = ( + "Concatenating DataFrames from the transformer's output lead to" + " an inconsistent number of samples. The output may have Pandas" + " Indexes that do not match." + ) + with pytest.raises(ValueError, match=msg): + ct.fit_transform(X) + + +def test_remainder_set_output(): + """Check that the output is set for the remainder. + + Non-regression test for #26306. + """ + + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [True, False, True], "b": [1, 2, 3]}) + + ct = make_column_transformer( + (VarianceThreshold(), make_column_selector(dtype_include=bool)), + remainder=VarianceThreshold(), + verbose_feature_names_out=False, + ) + ct.set_output(transform="pandas") + + out = ct.fit_transform(df) + pd.testing.assert_frame_equal(out, df) + + ct.set_output(transform="default") + out = ct.fit_transform(df) + assert isinstance(out, np.ndarray) + + +def test_transform_pd_na(): + """Check behavior when a tranformer's output contains pandas.NA + + It should raise an error unless the output config is set to 'pandas'. 
+ """ + pd = pytest.importorskip("pandas") + if not hasattr(pd, "Float64Dtype"): + pytest.skip( + "The issue with pd.NA tested here does not happen in old versions that do" + " not have the extension dtypes" + ) + df = pd.DataFrame({"a": [1.5, None]}) + ct = make_column_transformer(("passthrough", ["a"])) + # No warning with non-extension dtypes and np.nan + with warnings.catch_warnings(): + warnings.simplefilter("error") + ct.fit_transform(df) + df = df.convert_dtypes() + + # Error with extension dtype and pd.NA + with pytest.raises(ValueError, match=r"set_output\(transform='pandas'\)"): + ct.fit_transform(df) + + # No error when output is set to pandas + ct.set_output(transform="pandas") + ct.fit_transform(df) + ct.set_output(transform="default") + + # No error when there are no pd.NA + ct.fit_transform(df.fillna(-1.0)) + + +def test_dataframe_different_dataframe_libraries(): + """Check fitting and transforming on pandas and polars dataframes.""" + pd = pytest.importorskip("pandas") + pl = pytest.importorskip("polars") + X_train_np = np.array([[0, 1], [2, 4], [4, 5]]) + X_test_np = np.array([[1, 2], [1, 3], [2, 3]]) + + # Fit on pandas and transform on polars + X_train_pd = pd.DataFrame(X_train_np, columns=["a", "b"]) + X_test_pl = pl.DataFrame(X_test_np, schema=["a", "b"]) + + ct = make_column_transformer((Trans(), [0, 1])) + ct.fit(X_train_pd) + + out_pl_in = ct.transform(X_test_pl) + assert_array_equal(out_pl_in, X_test_np) + + # Fit on polars and transform on pandas + X_train_pl = pl.DataFrame(X_train_np, schema=["a", "b"]) + X_test_pd = pd.DataFrame(X_test_np, columns=["a", "b"]) + ct.fit(X_train_pl) + + out_pd_in = ct.transform(X_test_pd) + assert_array_equal(out_pd_in, X_test_np) + + +def test_column_transformer__getitem__(): + """Check __getitem__ for ColumnTransformer.""" + X = np.array([[0, 1, 2], [3, 4, 5]]) + ct = ColumnTransformer([("t1", Trans(), [0, 1]), ("t2", Trans(), [1, 2])]) + + msg = "ColumnTransformer is subscriptable after it is fitted" + with pytest.raises(TypeError, match=msg): + ct["t1"] + + ct.fit(X) + assert ct["t1"] is ct.named_transformers_["t1"] + assert ct["t2"] is ct.named_transformers_["t2"] + + msg = "'does_not_exist' is not a valid transformer name" + with pytest.raises(KeyError, match=msg): + ct["does_not_exist"] + + +@pytest.mark.parametrize("transform_output", ["default", "pandas"]) +def test_column_transformer_remainder_passthrough_naming_consistency(transform_output): + """Check that when `remainder="passthrough"`, inconsistent naming is handled + correctly by the underlying `FunctionTransformer`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28232 + """ + pd = pytest.importorskip("pandas") + X = pd.DataFrame(np.random.randn(10, 4)) + + preprocessor = ColumnTransformer( + transformers=[("scaler", StandardScaler(), [0, 1])], + remainder="passthrough", + ).set_output(transform=transform_output) + X_trans = preprocessor.fit_transform(X) + assert X_trans.shape == X.shape + + expected_column_names = [ + "scaler__x0", + "scaler__x1", + "remainder__x2", + "remainder__x3", + ] + if hasattr(X_trans, "columns"): + assert X_trans.columns.tolist() == expected_column_names + assert preprocessor.get_feature_names_out().tolist() == expected_column_names + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +def test_column_transformer_column_renaming(dataframe_lib): + """Check that we properly rename columns when using `ColumnTransformer` and + selected columns are redundant between transformers. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28260 + """ + lib = pytest.importorskip(dataframe_lib) + + df = lib.DataFrame({"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}) + + transformer = ColumnTransformer( + transformers=[ + ("A", "passthrough", ["x1", "x2", "x3"]), + ("B", FunctionTransformer(), ["x1", "x2"]), + ("C", StandardScaler(), ["x1", "x3"]), + # special case of a transformer returning 0-columns, e.g feature selector + ( + "D", + FunctionTransformer(lambda x: _safe_indexing(x, [], axis=1)), + ["x1", "x2", "x3"], + ), + ], + verbose_feature_names_out=True, + ).set_output(transform=dataframe_lib) + df_trans = transformer.fit_transform(df) + assert list(df_trans.columns) == [ + "A__x1", + "A__x2", + "A__x3", + "B__x1", + "B__x2", + "C__x1", + "C__x3", + ] + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +def test_column_transformer_error_with_duplicated_columns(dataframe_lib): + """Check that we raise an error when using `ColumnTransformer` and + the columns names are duplicated between transformers.""" + lib = pytest.importorskip(dataframe_lib) + + df = lib.DataFrame({"x1": [1, 2, 3], "x2": [10, 20, 30], "x3": [100, 200, 300]}) + + transformer = ColumnTransformer( + transformers=[ + ("A", "passthrough", ["x1", "x2", "x3"]), + ("B", FunctionTransformer(), ["x1", "x2"]), + ("C", StandardScaler(), ["x1", "x3"]), + # special case of a transformer returning 0-columns, e.g feature selector + ( + "D", + FunctionTransformer(lambda x: _safe_indexing(x, [], axis=1)), + ["x1", "x2", "x3"], + ), + ], + verbose_feature_names_out=False, + ).set_output(transform=dataframe_lib) + err_msg = re.escape( + "Duplicated feature names found before concatenating the outputs of the " + "transformers: ['x1', 'x2', 'x3'].\n" + "Transformer A has conflicting columns names: ['x1', 'x2', 'x3'].\n" + "Transformer B has conflicting columns names: ['x1', 'x2'].\n" + "Transformer C has conflicting columns names: ['x1', 'x3'].\n" + ) + with pytest.raises(ValueError, match=err_msg): + transformer.fit_transform(df) + + +@pytest.mark.skipif( + parse_version(joblib.__version__) < parse_version("1.3"), + reason="requires joblib >= 1.3", +) +def test_column_transformer_auto_memmap(): + """Check that ColumnTransformer works in parallel with joblib's auto-memmapping. + + non-regression test for issue #28781 + """ + X = np.random.RandomState(0).uniform(size=(3, 4)) + + scaler = StandardScaler(copy=False) + + transformer = ColumnTransformer( + transformers=[("scaler", scaler, [0])], + n_jobs=2, + ) + + with joblib.parallel_backend("loky", max_nbytes=1): + Xt = transformer.fit_transform(X) + + assert_allclose(Xt, StandardScaler().fit_transform(X[:, [0]])) + + +def test_column_transformer_non_default_index(): + """Check index handling when both pd.Series and pd.DataFrame slices are used in + ColumnTransformer. + + Non-regression test for issue #31546. 
+ """ + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "dict_col": [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}], + "dummy_col": [1, 2], + }, + index=[1, 2], + ) + t = make_column_transformer( + (DictVectorizer(sparse=False), "dict_col"), + (FunctionTransformer(), ["dummy_col"]), + ) + t.set_output(transform="pandas") + X = t.fit_transform(df) + assert list(X.index) == [1, 2] + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"]) +def test_routing_passed_metadata_not_supported(method): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + trs = ColumnTransformer([("trans", Trans(), [0])]).fit(X, y) + + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + getattr(trs, method)([[1]], sample_weight=[1], prop="a") + + +@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_for_column_transformer(method): + """Test that metadata is routed correctly for column transformer.""" + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + registry = _Registry() + sample_weight, metadata = [1], "a" + trs = ColumnTransformer( + [ + ( + "trans", + ConsumingTransformer(registry=registry) + .set_fit_request(sample_weight=True, metadata=True) + .set_transform_request(sample_weight=True, metadata=True), + [0], + ) + ] + ) + + if method == "transform": + trs.fit(X, y, sample_weight=sample_weight, metadata=metadata) + trs.transform(X, sample_weight=sample_weight, metadata=metadata) + else: + getattr(trs, method)(X, y, sample_weight=sample_weight, metadata=metadata) + + assert len(registry) + for _trs in registry: + check_recorded_metadata( + obj=_trs, + method=method, + parent=method, + sample_weight=sample_weight, + metadata=metadata, + ) + + +@config_context(enable_metadata_routing=True) +def test_metadata_routing_no_fit_transform(): + """Test metadata routing when the sub-estimator doesn't implement + ``fit_transform``.""" + + class NoFitTransform(BaseEstimator): + def fit(self, X, y=None, sample_weight=None, metadata=None): + assert sample_weight + assert metadata + return self + + def transform(self, X, sample_weight=None, metadata=None): + assert sample_weight + assert metadata + return X + + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + sample_weight, metadata = [1], "a" + trs = ColumnTransformer( + [ + ( + "trans", + NoFitTransform() + .set_fit_request(sample_weight=True, metadata=True) + .set_transform_request(sample_weight=True, metadata=True), + [0], + ) + ] + ) + + trs.fit(X, y, sample_weight=sample_weight, metadata=metadata) + trs.fit_transform(X, y, sample_weight=sample_weight, metadata=metadata) + + +@pytest.mark.parametrize("method", ["transform", "fit_transform", "fit"]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_error_for_column_transformer(method): + """Test that the right error is raised when metadata is not requested.""" + X = np.array([[0, 1, 2], [2, 4, 6]]).T + y = [1, 2, 3] + sample_weight, metadata = [1], "a" + trs = ColumnTransformer([("trans", ConsumingTransformer(), [0])]) + + error_message = ( + "[sample_weight, metadata] are passed but are not explicitly set as requested" + f" or not requested for ConsumingTransformer.{method}" + ) + with pytest.raises(ValueError, 
match=re.escape(error_message)): + if method == "transform": + trs.fit(X, y) + trs.transform(X, sample_weight=sample_weight, metadata=metadata) + else: + getattr(trs, method)(X, y, sample_weight=sample_weight, metadata=metadata) + + +@config_context(enable_metadata_routing=True) +def test_get_metadata_routing_works_without_fit(): + # Regression test for https://github.com/scikit-learn/scikit-learn/issues/28186 + # Make sure ct.get_metadata_routing() works w/o having called fit. + ct = ColumnTransformer([("trans", ConsumingTransformer(), [0])]) + ct.get_metadata_routing() + + +@config_context(enable_metadata_routing=True) +def test_remainder_request_always_present(): + # Test that remainder request is always present. + ct = ColumnTransformer( + [("trans", StandardScaler(), [0])], + remainder=ConsumingTransformer() + .set_fit_request(metadata=True) + .set_transform_request(metadata=True), + ) + router = ct.get_metadata_routing() + assert router.consumes("fit", ["metadata"]) == set(["metadata"]) + + +@config_context(enable_metadata_routing=True) +def test_unused_transformer_request_present(): + # Test that the request of a transformer is always present even when not + # used due to no selected columns. + ct = ColumnTransformer( + [ + ( + "trans", + ConsumingTransformer() + .set_fit_request(metadata=True) + .set_transform_request(metadata=True), + lambda X: [], + ) + ] + ) + router = ct.get_metadata_routing() + assert router.consumes("fit", ["metadata"]) == set(["metadata"]) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_target.py b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_target.py new file mode 100644 index 0000000000000000000000000000000000000000..19dcfb5dc7f031f8b1a5303c84e84389fbcccc1e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/compose/tests/test_target.py @@ -0,0 +1,439 @@ +import warnings + +import numpy as np +import pytest + +from sklearn import config_context, datasets +from sklearn.base import BaseEstimator, TransformerMixin, clone +from sklearn.compose import TransformedTargetRegressor +from sklearn.dummy import DummyRegressor +from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.utils._testing import assert_allclose + +friedman = datasets.make_friedman1(random_state=0) + + +def test_transform_target_regressor_error(): + X, y = friedman + # provide a transformer and functions at the same time + regr = TransformedTargetRegressor( + regressor=LinearRegression(), + transformer=StandardScaler(), + func=np.exp, + inverse_func=np.log, + ) + with pytest.raises( + ValueError, + match="'transformer' and functions 'func'/'inverse_func' cannot both be set.", + ): + regr.fit(X, y) + # fit with sample_weight with a regressor which does not support it + sample_weight = np.ones((y.shape[0],)) + regr = TransformedTargetRegressor( + regressor=OrthogonalMatchingPursuit(), transformer=StandardScaler() + ) + with pytest.raises( + TypeError, + match=r"fit\(\) got an unexpected keyword argument 'sample_weight'", + ): + regr.fit(X, y, sample_weight=sample_weight) + + # one of (func, inverse_func) is given but the other one is not + regr = TransformedTargetRegressor(func=np.exp) + with pytest.raises( + ValueError, + match="When 'func' is provided, 'inverse_func' must also be provided", + ): + regr.fit(X, y) + + regr = 
TransformedTargetRegressor(inverse_func=np.log) + with pytest.raises( + ValueError, + match="When 'inverse_func' is provided, 'func' must also be provided", + ): + regr.fit(X, y) + + +def test_transform_target_regressor_invertible(): + X, y = friedman + regr = TransformedTargetRegressor( + regressor=LinearRegression(), + func=np.sqrt, + inverse_func=np.log, + check_inverse=True, + ) + with pytest.warns( + UserWarning, + match=(r"The provided functions.* are not strictly inverse of each other"), + ): + regr.fit(X, y) + regr = TransformedTargetRegressor( + regressor=LinearRegression(), func=np.sqrt, inverse_func=np.log + ) + regr.set_params(check_inverse=False) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + regr.fit(X, y) + + +def _check_standard_scaled(y, y_pred): + y_mean = np.mean(y, axis=0) + y_std = np.std(y, axis=0) + assert_allclose((y - y_mean) / y_std, y_pred) + + +def _check_shifted_by_one(y, y_pred): + assert_allclose(y + 1, y_pred) + + +def test_transform_target_regressor_functions(): + X, y = friedman + regr = TransformedTargetRegressor( + regressor=LinearRegression(), func=np.log, inverse_func=np.exp + ) + y_pred = regr.fit(X, y).predict(X) + # check the transformer output + y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze() + assert_allclose(np.log(y), y_tran) + assert_allclose( + y, regr.transformer_.inverse_transform(y_tran.reshape(-1, 1)).squeeze() + ) + assert y.shape == y_pred.shape + assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X))) + # check the regressor output + lr = LinearRegression().fit(X, regr.func(y)) + assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel()) + + +def test_transform_target_regressor_functions_multioutput(): + X = friedman[0] + y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T + regr = TransformedTargetRegressor( + regressor=LinearRegression(), func=np.log, inverse_func=np.exp + ) + y_pred = regr.fit(X, y).predict(X) + # check the transformer output + y_tran = regr.transformer_.transform(y) + assert_allclose(np.log(y), y_tran) + assert_allclose(y, regr.transformer_.inverse_transform(y_tran)) + assert y.shape == y_pred.shape + assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X))) + # check the regressor output + lr = LinearRegression().fit(X, regr.func(y)) + assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel()) + + +@pytest.mark.parametrize( + "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)] +) +def test_transform_target_regressor_1d_transformer(X, y): + # All transformer in scikit-learn expect 2D data. FunctionTransformer with + # validate=False lift this constraint without checking that the input is a + # 2D vector. We check the consistency of the data shape using a 1D and 2D y + # array. 
+ transformer = FunctionTransformer( + func=lambda x: x + 1, inverse_func=lambda x: x - 1 + ) + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + # consistency forward transform + y_tran = regr.transformer_.transform(y) + _check_shifted_by_one(y, y_tran) + assert y.shape == y_pred.shape + # consistency inverse transform + assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze()) + # consistency of the regressor + lr = LinearRegression() + transformer2 = clone(transformer) + lr.fit(X, transformer2.fit_transform(y)) + y_lr_pred = lr.predict(X) + assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred)) + assert_allclose(regr.regressor_.coef_, lr.coef_) + + +@pytest.mark.parametrize( + "X,y", [friedman, (friedman[0], np.vstack((friedman[1], friedman[1] ** 2 + 1)).T)] +) +def test_transform_target_regressor_2d_transformer(X, y): + # Check consistency with transformer accepting only 2D array and a 1D/2D y + # array. + transformer = StandardScaler() + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + # consistency forward transform + if y.ndim == 1: # create a 2D array and squeeze results + y_tran = regr.transformer_.transform(y.reshape(-1, 1)) + else: + y_tran = regr.transformer_.transform(y) + _check_standard_scaled(y, y_tran.squeeze()) + assert y.shape == y_pred.shape + # consistency inverse transform + assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze()) + # consistency of the regressor + lr = LinearRegression() + transformer2 = clone(transformer) + if y.ndim == 1: # create a 2D array and squeeze results + lr.fit(X, transformer2.fit_transform(y.reshape(-1, 1)).squeeze()) + y_lr_pred = lr.predict(X).reshape(-1, 1) + y_pred2 = transformer2.inverse_transform(y_lr_pred).squeeze() + else: + lr.fit(X, transformer2.fit_transform(y)) + y_lr_pred = lr.predict(X) + y_pred2 = transformer2.inverse_transform(y_lr_pred) + + assert_allclose(y_pred, y_pred2) + assert_allclose(regr.regressor_.coef_, lr.coef_) + + +def test_transform_target_regressor_2d_transformer_multioutput(): + # Check consistency with transformer accepting only 2D array and a 2D y + # array. 
+ X = friedman[0] + y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T + transformer = StandardScaler() + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + # consistency forward transform + y_tran = regr.transformer_.transform(y) + _check_standard_scaled(y, y_tran) + assert y.shape == y_pred.shape + # consistency inverse transform + assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze()) + # consistency of the regressor + lr = LinearRegression() + transformer2 = clone(transformer) + lr.fit(X, transformer2.fit_transform(y)) + y_lr_pred = lr.predict(X) + assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred)) + assert_allclose(regr.regressor_.coef_, lr.coef_) + + +def test_transform_target_regressor_3d_target(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/18866 + # Check with a 3D target with a transformer that reshapes the target + X = friedman[0] + y = np.tile(friedman[1].reshape(-1, 1, 1), [1, 3, 2]) + + def flatten_data(data): + return data.reshape(data.shape[0], -1) + + def unflatten_data(data): + return data.reshape(data.shape[0], -1, 2) + + transformer = FunctionTransformer(func=flatten_data, inverse_func=unflatten_data) + regr = TransformedTargetRegressor( + regressor=LinearRegression(), transformer=transformer + ) + y_pred = regr.fit(X, y).predict(X) + assert y.shape == y_pred.shape + + +def test_transform_target_regressor_multi_to_single(): + X = friedman[0] + y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)]) + + def func(y): + out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2) + return out[:, np.newaxis] + + def inverse_func(y): + return y + + tt = TransformedTargetRegressor( + func=func, inverse_func=inverse_func, check_inverse=False + ) + tt.fit(X, y) + y_pred_2d_func = tt.predict(X) + assert y_pred_2d_func.shape == (100, 1) + + # force that the function only return a 1D array + def func(y): + return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2) + + tt = TransformedTargetRegressor( + func=func, inverse_func=inverse_func, check_inverse=False + ) + tt.fit(X, y) + y_pred_1d_func = tt.predict(X) + assert y_pred_1d_func.shape == (100, 1) + + assert_allclose(y_pred_1d_func, y_pred_2d_func) + + +class DummyCheckerArrayTransformer(TransformerMixin, BaseEstimator): + def fit(self, X, y=None): + assert isinstance(X, np.ndarray) + return self + + def transform(self, X): + assert isinstance(X, np.ndarray) + return X + + def inverse_transform(self, X): + assert isinstance(X, np.ndarray) + return X + + +class DummyCheckerListRegressor(DummyRegressor): + def fit(self, X, y, sample_weight=None): + assert isinstance(X, list) + return super().fit(X, y, sample_weight) + + def predict(self, X): + assert isinstance(X, list) + return super().predict(X) + + +def test_transform_target_regressor_ensure_y_array(): + # check that the target ``y`` passed to the transformer will always be a + # numpy array. Similarly, if ``X`` is passed as a list, we check that the + # predictor receive as it is. 
+ X, y = friedman + tt = TransformedTargetRegressor( + transformer=DummyCheckerArrayTransformer(), + regressor=DummyCheckerListRegressor(), + check_inverse=False, + ) + tt.fit(X.tolist(), y.tolist()) + tt.predict(X.tolist()) + with pytest.raises(AssertionError): + tt.fit(X, y.tolist()) + with pytest.raises(AssertionError): + tt.predict(X) + + +class DummyTransformer(TransformerMixin, BaseEstimator): + """Dummy transformer which count how many time fit was called.""" + + def __init__(self, fit_counter=0): + self.fit_counter = fit_counter + + def fit(self, X, y=None): + self.fit_counter += 1 + return self + + def transform(self, X): + return X + + def inverse_transform(self, X): + return X + + +@pytest.mark.parametrize("check_inverse", [False, True]) +def test_transform_target_regressor_count_fit(check_inverse): + # regression test for gh-issue #11618 + # check that we only call a single time fit for the transformer + X, y = friedman + ttr = TransformedTargetRegressor( + transformer=DummyTransformer(), check_inverse=check_inverse + ) + ttr.fit(X, y) + assert ttr.transformer_.fit_counter == 1 + + +class DummyRegressorWithExtraFitParams(DummyRegressor): + def fit(self, X, y, sample_weight=None, check_input=True): + # on the test below we force this to false, we make sure this is + # actually passed to the regressor + assert not check_input + return super().fit(X, y, sample_weight) + + +def test_transform_target_regressor_pass_fit_parameters(): + X, y = friedman + regr = TransformedTargetRegressor( + regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer() + ) + + regr.fit(X, y, check_input=False) + assert regr.transformer_.fit_counter == 1 + + +def test_transform_target_regressor_route_pipeline(): + X, y = friedman + + regr = TransformedTargetRegressor( + regressor=DummyRegressorWithExtraFitParams(), transformer=DummyTransformer() + ) + estimators = [("normalize", StandardScaler()), ("est", regr)] + + pip = Pipeline(estimators) + pip.fit(X, y, **{"est__check_input": False}) + + assert regr.transformer_.fit_counter == 1 + + +class DummyRegressorWithExtraPredictParams(DummyRegressor): + def predict(self, X, check_input=True): + # In the test below we make sure that the check input parameter is + # passed as false + self.predict_called = True + assert not check_input + return super().predict(X) + + +def test_transform_target_regressor_pass_extra_predict_parameters(): + # Checks that predict kwargs are passed to regressor. 
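+    # A brief note on how the check works: DummyRegressorWithExtraPredictParams
+    # (defined above) asserts that it receives ``check_input=False`` and records
+    # ``predict_called``, so a passing test means the kwarg was forwarded.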
+ X, y = friedman + regr = TransformedTargetRegressor( + regressor=DummyRegressorWithExtraPredictParams(), transformer=DummyTransformer() + ) + + regr.fit(X, y) + regr.predict(X, check_input=False) + assert regr.regressor_.predict_called + + +@pytest.mark.parametrize("output_format", ["pandas", "polars"]) +def test_transform_target_regressor_not_warns_with_global_output_set(output_format): + """Test that TransformedTargetRegressor will not raise warnings if + set_config(transform_output="pandas"/"polars") is set globally; regression test for + issue #29361.""" + X, y = datasets.make_regression() + y = np.abs(y) + 1 + with config_context(transform_output=output_format): + with warnings.catch_warnings(): + warnings.simplefilter("error") + TransformedTargetRegressor( + regressor=LinearRegression(), func=np.log, inverse_func=np.exp + ).fit(X, y) + + +class ValidateDimensionRegressor(BaseEstimator): + """A regressor that expects the target to have a specific number of dimensions.""" + + def __init__(self, ndim): + self.ndim = ndim + + def fit(self, X, y): + assert y.ndim == self.ndim + + def predict(self, X): + pass # pragma: no cover + + +@pytest.mark.parametrize("ndim", [1, 2]) +def test_transform_target_regressor_preserves_input_shape(ndim): + """Check that TransformedTargetRegressor internally preserves the shape of the input + + non-regression test for issue #26530. + """ + X, y = datasets.make_regression(n_samples=10, n_features=5, random_state=42) + if ndim == 2: + y = y.reshape(-1, 1) + + regr = TransformedTargetRegressor(regressor=ValidateDimensionRegressor(ndim)) + regr.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..65817ef7b977b84bcd4c8eb913866d54ce756999 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/__init__.py @@ -0,0 +1,46 @@ +"""Methods and algorithms to robustly estimate covariance. + +They estimate the covariance of features at given sets of points, as well as the +precision matrix defined as the inverse of the covariance. Covariance estimation is +closely related to the theory of Gaussian graphical models. 
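+
+A minimal usage sketch with synthetic, illustrative data (the estimators in this
+module all follow the usual fit/attribute pattern):
+
+    >>> import numpy as np
+    >>> from sklearn.covariance import EmpiricalCovariance
+    >>> X = np.random.RandomState(0).randn(50, 3)
+    >>> cov = EmpiricalCovariance().fit(X)
+    >>> cov.covariance_.shape
+    (3, 3)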
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._elliptic_envelope import EllipticEnvelope +from ._empirical_covariance import ( + EmpiricalCovariance, + empirical_covariance, + log_likelihood, +) +from ._graph_lasso import GraphicalLasso, GraphicalLassoCV, graphical_lasso +from ._robust_covariance import MinCovDet, fast_mcd +from ._shrunk_covariance import ( + OAS, + LedoitWolf, + ShrunkCovariance, + ledoit_wolf, + ledoit_wolf_shrinkage, + oas, + shrunk_covariance, +) + +__all__ = [ + "OAS", + "EllipticEnvelope", + "EmpiricalCovariance", + "GraphicalLasso", + "GraphicalLassoCV", + "LedoitWolf", + "MinCovDet", + "ShrunkCovariance", + "empirical_covariance", + "fast_mcd", + "graphical_lasso", + "ledoit_wolf", + "ledoit_wolf_shrinkage", + "log_likelihood", + "oas", + "shrunk_covariance", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_elliptic_envelope.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_elliptic_envelope.py new file mode 100644 index 0000000000000000000000000000000000000000..71fb72ccd683d04a708162774487922b719cbe4c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_elliptic_envelope.py @@ -0,0 +1,266 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +import numpy as np + +from ..base import OutlierMixin, _fit_context +from ..metrics import accuracy_score +from ..utils._param_validation import Interval +from ..utils.validation import check_is_fitted +from ._robust_covariance import MinCovDet + + +class EllipticEnvelope(OutlierMixin, MinCovDet): + """An object for detecting outliers in a Gaussian distributed dataset. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, the support of robust location and covariance estimates + is computed, and a covariance estimate is recomputed from it, + without centering the data. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, the robust location and covariance are directly computed + with the FastMCD algorithm without additional treatment. + + support_fraction : float, default=None + The proportion of points to be included in the support of the raw + MCD estimate. If None, the minimum value of support_fraction will + be used within the algorithm: `(n_samples + n_features + 1) / 2 * n_samples`. + Range is (0, 1). + + contamination : float, default=0.1 + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. Range is (0, 0.5]. + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling + the data. Pass an int for reproducible results across multiple function + calls. See :term:`Glossary `. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated robust location. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated robust covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute the + robust estimates of location and shape. + + offset_ : float + Offset used to define the decision function from the raw scores. 
+ We have the relation: ``decision_function = score_samples - offset_``. + The offset depends on the contamination parameter and is defined in + such a way we obtain the expected number of outliers (samples with + decision function < 0) in training. + + .. versionadded:: 0.20 + + raw_location_ : ndarray of shape (n_features,) + The raw robust estimated location before correction and re-weighting. + + raw_covariance_ : ndarray of shape (n_features, n_features) + The raw robust estimated covariance before correction and re-weighting. + + raw_support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the raw robust estimates of location and shape, before correction + and re-weighting. + + dist_ : ndarray of shape (n_samples,) + Mahalanobis distances of the training set (on which :meth:`fit` is + called) observations. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + LedoitWolf : LedoitWolf Estimator. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. + + Notes + ----- + Outlier detection from covariance estimation may break or not + perform well in high-dimensional settings. In particular, one will + always take care to work with ``n_samples > n_features ** 2``. + + References + ---------- + .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the + minimum covariance determinant estimator" Technometrics 41(3), 212 + (1999) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import EllipticEnvelope + >>> true_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0], + ... cov=true_cov, + ... size=500) + >>> cov = EllipticEnvelope(random_state=0).fit(X) + >>> # predict returns 1 for an inlier and -1 for an outlier + >>> cov.predict([[0, 0], + ... [3, 3]]) + array([ 1, -1]) + >>> cov.covariance_ + array([[0.7411, 0.2535], + [0.2535, 0.3053]]) + >>> cov.location_ + array([0.0813 , 0.0427]) + """ + + _parameter_constraints: dict = { + **MinCovDet._parameter_constraints, + "contamination": [Interval(Real, 0, 0.5, closed="right")], + } + + def __init__( + self, + *, + store_precision=True, + assume_centered=False, + support_fraction=None, + contamination=0.1, + random_state=None, + ): + super().__init__( + store_precision=store_precision, + assume_centered=assume_centered, + support_fraction=support_fraction, + random_state=random_state, + ) + self.contamination = contamination + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the EllipticEnvelope model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. 
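+
+        Notes
+        -----
+        Fitting first runs the parent :class:`MinCovDet` fit, then sets
+        ``offset_`` to the ``contamination`` quantile of the negated
+        Mahalanobis distances of the training samples, so that roughly a
+        ``contamination`` fraction of the training data receives a negative
+        decision function value.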
+ """ + super().fit(X) + self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination) + return self + + def decision_function(self, X): + """Compute the decision function of the given observations. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + decision : ndarray of shape (n_samples,) + Decision function of the samples. + It is equal to the shifted Mahalanobis distances. + The threshold for being an outlier is 0, which ensures a + compatibility with other outlier detection algorithms. + """ + check_is_fitted(self) + negative_mahal_dist = self.score_samples(X) + return negative_mahal_dist - self.offset_ + + def score_samples(self, X): + """Compute the negative Mahalanobis distances. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + negative_mahal_distances : array-like of shape (n_samples,) + Opposite of the Mahalanobis distances. + """ + check_is_fitted(self) + return -self.mahalanobis(X) + + def predict(self, X): + """ + Predict labels (1 inlier, -1 outlier) of X according to fitted model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and +1 for inliers. + """ + values = self.decision_function(X) + is_inlier = np.full(values.shape[0], -1, dtype=int) + is_inlier[values >= 0] = 1 + + return is_inlier + + def score(self, X, y, sample_weight=None): + """Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) w.r.t. y. + """ + return accuracy_score(y, self.predict(X), sample_weight=sample_weight) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_empirical_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_empirical_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..c8ee198cc477275da749de0c212c4c874937f51b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_empirical_covariance.py @@ -0,0 +1,370 @@ +""" +Maximum likelihood covariance estimator. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# avoid division truncation +import warnings + +import numpy as np +from scipy import linalg + +from sklearn.utils import metadata_routing + +from .. import config_context +from ..base import BaseEstimator, _fit_context +from ..metrics.pairwise import pairwise_distances +from ..utils import check_array +from ..utils._param_validation import validate_params +from ..utils.extmath import fast_logdet +from ..utils.validation import validate_data + + +@validate_params( + { + "emp_cov": [np.ndarray], + "precision": [np.ndarray], + }, + prefer_skip_nested_validation=True, +) +def log_likelihood(emp_cov, precision): + """Compute the sample mean of the log_likelihood under a covariance model. 
+ + Computes the empirical expected log-likelihood, allowing for universal + comparison (beyond this software package), and accounts for normalization + terms and scaling. + + Parameters + ---------- + emp_cov : ndarray of shape (n_features, n_features) + Maximum Likelihood Estimator of covariance. + + precision : ndarray of shape (n_features, n_features) + The precision matrix of the covariance model to be tested. + + Returns + ------- + log_likelihood_ : float + Sample mean of the log-likelihood. + """ + p = precision.shape[0] + log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision) + log_likelihood_ -= p * np.log(2 * np.pi) + log_likelihood_ /= 2.0 + return log_likelihood_ + + +@validate_params( + { + "X": ["array-like"], + "assume_centered": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def empirical_covariance(X, *, assume_centered=False): + """Compute the Maximum likelihood covariance estimator. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + assume_centered : bool, default=False + If `True`, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If `False`, data will be centered before computation. + + Returns + ------- + covariance : ndarray of shape (n_features, n_features) + Empirical covariance (Maximum Likelihood Estimator). + + Examples + -------- + >>> from sklearn.covariance import empirical_covariance + >>> X = [[1,1,1],[1,1,1],[1,1,1], + ... [0,0,0],[0,0,0],[0,0,0]] + >>> empirical_covariance(X) + array([[0.25, 0.25, 0.25], + [0.25, 0.25, 0.25], + [0.25, 0.25, 0.25]]) + """ + X = check_array(X, ensure_2d=False, ensure_all_finite=False) + + if X.ndim == 1: + X = np.reshape(X, (1, -1)) + + if X.shape[0] == 1: + warnings.warn( + "Only one sample available. You may want to reshape your data array" + ) + + if assume_centered: + covariance = np.dot(X.T, X) / X.shape[0] + else: + covariance = np.cov(X.T, bias=1) + + if covariance.ndim == 0: + covariance = np.array([[covariance]]) + return covariance + + +class EmpiricalCovariance(BaseEstimator): + """Maximum likelihood covariance estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specifies if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data are not centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False (default), data are centered before computation. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo-inverse matrix. + (stored only if store_precision is True) + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + LedoitWolf : LedoitWolf Estimator. 
+ MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import EmpiricalCovariance + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> cov = EmpiricalCovariance().fit(X) + >>> cov.covariance_ + array([[0.7569, 0.2818], + [0.2818, 0.3928]]) + >>> cov.location_ + array([0.0622, 0.0193]) + """ + + # X_test should have been called X + __metadata_request__score = {"X_test": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "store_precision": ["boolean"], + "assume_centered": ["boolean"], + } + + def __init__(self, *, store_precision=True, assume_centered=False): + self.store_precision = store_precision + self.assume_centered = assume_centered + + def _set_covariance(self, covariance): + """Saves the covariance and precision estimates + + Storage is done accordingly to `self.store_precision`. + Precision stored only if invertible. + + Parameters + ---------- + covariance : array-like of shape (n_features, n_features) + Estimated covariance matrix to be stored, and from which precision + is computed. + """ + covariance = check_array(covariance) + # set covariance + self.covariance_ = covariance + # set precision + if self.store_precision: + self.precision_ = linalg.pinvh(covariance, check_finite=False) + else: + self.precision_ = None + + def get_precision(self): + """Getter for the precision matrix. + + Returns + ------- + precision_ : array-like of shape (n_features, n_features) + The precision matrix associated to the current covariance object. + """ + if self.store_precision: + precision = self.precision_ + else: + precision = linalg.pinvh(self.covariance_, check_finite=False) + return precision + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the maximum likelihood covariance estimator to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + covariance = empirical_covariance(X, assume_centered=self.assume_centered) + self._set_covariance(covariance) + + return self + + def score(self, X_test, y=None): + """Compute the log-likelihood of `X_test` under the estimated Gaussian model. + + The Gaussian model is defined by its mean and covariance matrix which are + represented respectively by `self.location_` and `self.covariance_`. + + Parameters + ---------- + X_test : array-like of shape (n_samples, n_features) + Test data of which we compute the likelihood, where `n_samples` is + the number of samples and `n_features` is the number of features. + `X_test` is assumed to be drawn from the same distribution than + the data used in fit (including centering). + + y : Ignored + Not used, present for API consistency by convention. 
+ + Returns + ------- + res : float + The log-likelihood of `X_test` with `self.location_` and `self.covariance_` + as estimators of the Gaussian model mean and covariance matrix respectively. + """ + X_test = validate_data(self, X_test, reset=False) + # compute empirical covariance of the test set + test_cov = empirical_covariance(X_test - self.location_, assume_centered=True) + # compute log likelihood + res = log_likelihood(test_cov, self.get_precision()) + + return res + + def error_norm(self, comp_cov, norm="frobenius", scaling=True, squared=True): + """Compute the Mean Squared Error between two covariance estimators. + + Parameters + ---------- + comp_cov : array-like of shape (n_features, n_features) + The covariance to compare with. + + norm : {"frobenius", "spectral"}, default="frobenius" + The type of norm used to compute the error. Available error types: + - 'frobenius' (default): sqrt(tr(A^t.A)) + - 'spectral': sqrt(max(eigenvalues(A^t.A)) + where A is the error ``(comp_cov - self.covariance_)``. + + scaling : bool, default=True + If True (default), the squared error norm is divided by n_features. + If False, the squared error norm is not rescaled. + + squared : bool, default=True + Whether to compute the squared error norm or the error norm. + If True (default), the squared error norm is returned. + If False, the error norm is returned. + + Returns + ------- + result : float + The Mean Squared Error (in the sense of the Frobenius norm) between + `self` and `comp_cov` covariance estimators. + """ + # compute the error + error = comp_cov - self.covariance_ + # compute the error norm + if norm == "frobenius": + squared_norm = np.sum(error**2) + elif norm == "spectral": + squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error))) + else: + raise NotImplementedError( + "Only spectral and frobenius norms are implemented" + ) + # optionally scale the error norm + if scaling: + squared_norm = squared_norm / error.shape[0] + # finally get either the squared norm or the norm + if squared: + result = squared_norm + else: + result = np.sqrt(squared_norm) + + return result + + def mahalanobis(self, X): + """Compute the squared Mahalanobis distances of given observations. + + For a detailed example of how outliers affects the Mahalanobis distance, + see :ref:`sphx_glr_auto_examples_covariance_plot_mahalanobis_distances.py`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The observations, the Mahalanobis distances of the which we + compute. Observations are assumed to be drawn from the same + distribution than the data used in fit. + + Returns + ------- + dist : ndarray of shape (n_samples,) + Squared Mahalanobis distances of the observations. + """ + X = validate_data(self, X, reset=False) + + precision = self.get_precision() + with config_context(assume_finite=True): + # compute mahalanobis distances + dist = pairwise_distances( + X, self.location_[np.newaxis, :], metric="mahalanobis", VI=precision + ) + + return np.reshape(dist, (len(X),)) ** 2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_graph_lasso.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_graph_lasso.py new file mode 100644 index 0000000000000000000000000000000000000000..e94663120216dbeab7f8edd963554b9653e58221 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_graph_lasso.py @@ -0,0 +1,1145 @@ +"""GraphicalLasso: sparse inverse covariance estimation with an l1-penalized +estimator. 
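+
+Up to an additive constant, the estimators in this module minimize the
+penalized negative log-likelihood
+
+    -log(det(K)) + trace(S @ K) + alpha * sum_{i != j} |K[i, j]|
+
+over precision matrices K, where S is the empirical covariance and alpha
+controls the sparsity of K; the diagonal of K is not penalized.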
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import operator +import sys +import time +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import linalg + +from ..base import _fit_context +from ..exceptions import ConvergenceWarning + +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from ..linear_model import _cd_fast as cd_fast # type: ignore[attr-defined] +from ..linear_model import lars_path_gram +from ..model_selection import check_cv, cross_val_score +from ..utils import Bunch +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _is_arraylike_not_scalar, + check_random_state, + check_scalar, + validate_data, +) +from . import EmpiricalCovariance, empirical_covariance, log_likelihood + + +# Helper functions to compute the objective and dual objective functions +# of the l1-penalized estimator +def _objective(mle, precision_, alpha): + """Evaluation of the graphical-lasso objective function + + the objective function is made of a shifted scaled version of the + normalized log-likelihood (i.e. its empirical mean over the samples) and a + penalisation term to promote sparsity + """ + p = precision_.shape[0] + cost = -2.0 * log_likelihood(mle, precision_) + p * np.log(2 * np.pi) + cost += alpha * (np.abs(precision_).sum() - np.abs(np.diag(precision_)).sum()) + return cost + + +def _dual_gap(emp_cov, precision_, alpha): + """Expression of the dual gap convergence criterion + + The specific definition is given in Duchi "Projected Subgradient Methods + for Learning Sparse Gaussians". + """ + gap = np.sum(emp_cov * precision_) + gap -= precision_.shape[0] + gap += alpha * (np.abs(precision_).sum() - np.abs(np.diag(precision_)).sum()) + return gap + + +# The g-lasso algorithm +def _graphical_lasso( + emp_cov, + alpha, + *, + cov_init=None, + mode="cd", + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + eps=np.finfo(np.float64).eps, +): + _, n_features = emp_cov.shape + if alpha == 0: + # Early return without regularization + precision_ = linalg.inv(emp_cov) + cost = -2.0 * log_likelihood(emp_cov, precision_) + cost += n_features * np.log(2 * np.pi) + d_gap = np.sum(emp_cov * precision_) - n_features + return emp_cov, precision_, (cost, d_gap), 0 + + if cov_init is None: + covariance_ = emp_cov.copy() + else: + covariance_ = cov_init.copy() + # As a trivial regularization (Tikhonov like), we scale down the + # off-diagonal coefficients of our starting point: This is needed, as + # in the cross-validation the cov_init can easily be + # ill-conditioned, and the CV loop blows. Beside, this takes + # conservative stand-point on the initial conditions, and it tends to + # make the convergence go faster. 
+ covariance_ *= 0.95 + diagonal = emp_cov.flat[:: n_features + 1] + covariance_.flat[:: n_features + 1] = diagonal + precision_ = linalg.pinvh(covariance_) + + indices = np.arange(n_features) + i = 0 # initialize the counter to be robust to `max_iter=0` + costs = list() + # The different l1 regression solver have different numerical errors + if mode == "cd": + errors = dict(over="raise", invalid="ignore") + else: + errors = dict(invalid="raise") + try: + # be robust to the max_iter=0 edge case, see: + # https://github.com/scikit-learn/scikit-learn/issues/4134 + d_gap = np.inf + # set a sub_covariance buffer + sub_covariance = np.copy(covariance_[1:, 1:], order="C") + for i in range(max_iter): + for idx in range(n_features): + # To keep the contiguous matrix `sub_covariance` equal to + # covariance_[indices != idx].T[indices != idx] + # we only need to update 1 column and 1 line when idx changes + if idx > 0: + di = idx - 1 + sub_covariance[di] = covariance_[di][indices != idx] + sub_covariance[:, di] = covariance_[:, di][indices != idx] + else: + sub_covariance[:] = covariance_[1:, 1:] + row = emp_cov[idx, indices != idx] + with np.errstate(**errors): + if mode == "cd": + # Use coordinate descent + coefs = -( + precision_[indices != idx, idx] + / (precision_[idx, idx] + 1000 * eps) + ) + coefs, _, _, _ = cd_fast.enet_coordinate_descent_gram( + coefs, + alpha, + 0, + sub_covariance, + row, + row, + max_iter, + enet_tol, + check_random_state(None), + False, + ) + else: # mode == "lars" + _, _, coefs = lars_path_gram( + Xy=row, + Gram=sub_covariance, + n_samples=row.size, + alpha_min=alpha / (n_features - 1), + copy_Gram=True, + eps=eps, + method="lars", + return_path=False, + ) + # Update the precision matrix + precision_[idx, idx] = 1.0 / ( + covariance_[idx, idx] + - np.dot(covariance_[indices != idx, idx], coefs) + ) + precision_[indices != idx, idx] = -precision_[idx, idx] * coefs + precision_[idx, indices != idx] = -precision_[idx, idx] * coefs + coefs = np.dot(sub_covariance, coefs) + covariance_[idx, indices != idx] = coefs + covariance_[indices != idx, idx] = coefs + if not np.isfinite(precision_.sum()): + raise FloatingPointError( + "The system is too ill-conditioned for this solver" + ) + d_gap = _dual_gap(emp_cov, precision_, alpha) + cost = _objective(emp_cov, precision_, alpha) + if verbose: + print( + "[graphical_lasso] Iteration % 3i, cost % 3.2e, dual gap %.3e" + % (i, cost, d_gap) + ) + costs.append((cost, d_gap)) + if np.abs(d_gap) < tol: + break + if not np.isfinite(cost) and i > 0: + raise FloatingPointError( + "Non SPD result: the system is too ill-conditioned for this solver" + ) + else: + warnings.warn( + "graphical_lasso: did not converge after %i iteration: dual gap: %.3e" + % (max_iter, d_gap), + ConvergenceWarning, + ) + except FloatingPointError as e: + e.args = (e.args[0] + ". The system is too ill-conditioned for this solver",) + raise e + + return covariance_, precision_, costs, i + 1 + + +def alpha_max(emp_cov): + """Find the maximum alpha for which there are some non-zeros off-diagonal. + + Parameters + ---------- + emp_cov : ndarray of shape (n_features, n_features) + The sample covariance matrix. + + Notes + ----- + This results from the bound for the all the Lasso that are solved + in GraphicalLasso: each time, the row of cov corresponds to Xy. As the + bound for alpha is given by `max(abs(Xy))`, the result follows. 
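+
+    For instance, with ``emp_cov = [[2., .5], [.5, 1.]]`` the largest
+    off-diagonal magnitude is 0.5, so the returned value is 0.5.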
+ """ + A = np.copy(emp_cov) + A.flat[:: A.shape[0] + 1] = 0 + return np.max(np.abs(A)) + + +@validate_params( + { + "emp_cov": ["array-like"], + "return_costs": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def graphical_lasso( + emp_cov, + alpha, + *, + mode="cd", + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + return_costs=False, + eps=np.finfo(np.float64).eps, + return_n_iter=False, +): + """L1-penalized covariance estimator. + + Read more in the :ref:`User Guide `. + + .. versionchanged:: v0.20 + graph_lasso has been renamed to graphical_lasso + + Parameters + ---------- + emp_cov : array-like of shape (n_features, n_features) + Empirical covariance from which to compute the covariance estimate. + + alpha : float + The regularization parameter: the higher alpha, the more + regularization, the sparser the inverse covariance. + Range is (0, inf]. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where p > n. Elsewhere prefer cd + which is more numerically stable. + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. Range is (0, inf]. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. Range is (0, inf]. + + max_iter : int, default=100 + The maximum number of iterations. + + verbose : bool, default=False + If verbose is True, the objective function and dual gap are + printed at each iteration. + + return_costs : bool, default=False + If return_costs is True, the objective function and dual gap + at each iteration are returned. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + covariance : ndarray of shape (n_features, n_features) + The estimated covariance matrix. + + precision : ndarray of shape (n_features, n_features) + The estimated (sparse) precision matrix. + + costs : list of (objective, dual_gap) pairs + The list of values of the objective function and the dual gap at + each iteration. Returned only if return_costs is True. + + n_iter : int + Number of iterations. Returned only if `return_n_iter` is set to True. + + See Also + -------- + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with + cross-validated choice of the l1 penalty. + + Notes + ----- + The algorithm employed to solve this problem is the GLasso algorithm, + from the Friedman 2008 Biostatistics paper. It is the same algorithm + as in the R `glasso` package. + + One possible difference with the `glasso` R package is that the + diagonal coefficients are not penalized. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_spd_matrix + >>> from sklearn.covariance import empirical_covariance, graphical_lasso + >>> true_cov = make_sparse_spd_matrix(n_dim=3,random_state=42) + >>> rng = np.random.RandomState(42) + >>> X = rng.multivariate_normal(mean=np.zeros(3), cov=true_cov, size=3) + >>> emp_cov = empirical_covariance(X, assume_centered=True) + >>> emp_cov, _ = graphical_lasso(emp_cov, alpha=0.05) + >>> emp_cov + array([[ 1.687, 0.212, -0.209], + [ 0.212, 0.221, -0.0817], + [-0.209, -0.0817, 0.232]]) + """ + model = GraphicalLasso( + alpha=alpha, + mode=mode, + covariance="precomputed", + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + verbose=verbose, + eps=eps, + assume_centered=True, + ).fit(emp_cov) + + output = [model.covariance_, model.precision_] + if return_costs: + output.append(model.costs_) + if return_n_iter: + output.append(model.n_iter_) + return tuple(output) + + +class BaseGraphicalLasso(EmpiricalCovariance): + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "tol": [Interval(Real, 0, None, closed="right")], + "enet_tol": [Interval(Real, 0, None, closed="right")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "mode": [StrOptions({"cd", "lars"})], + "verbose": ["verbose"], + "eps": [Interval(Real, 0, None, closed="both")], + } + _parameter_constraints.pop("store_precision") + + def __init__( + self, + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + mode="cd", + verbose=False, + eps=np.finfo(np.float64).eps, + assume_centered=False, + ): + super().__init__(assume_centered=assume_centered) + self.tol = tol + self.enet_tol = enet_tol + self.max_iter = max_iter + self.mode = mode + self.verbose = verbose + self.eps = eps + + +class GraphicalLasso(BaseGraphicalLasso): + """Sparse inverse covariance estimation with an l1-penalized estimator. + + For a usage example see + :ref:`sphx_glr_auto_examples_applications_plot_stock_market.py`. + + Read more in the :ref:`User Guide `. + + .. versionchanged:: v0.20 + GraphLasso has been renamed to GraphicalLasso + + Parameters + ---------- + alpha : float, default=0.01 + The regularization parameter: the higher alpha, the more + regularization, the sparser the inverse covariance. + Range is (0, inf]. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where p > n. Elsewhere prefer cd + which is more numerically stable. + + covariance : "precomputed", default=None + If covariance is "precomputed", the input data in `fit` is assumed + to be the covariance matrix. If `None`, the empirical covariance + is estimated from the data `X`. + + .. versionadded:: 1.3 + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. Range is (0, inf]. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. Range is (0, inf]. + + max_iter : int, default=100 + The maximum number of iterations. + + verbose : bool, default=False + If verbose is True, the objective function and dual gap are + plotted at each iteration. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. 
Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + .. versionadded:: 1.3 + + assume_centered : bool, default=False + If True, data are not centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False, data are centered before computation. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + + n_iter_ : int + Number of iterations run. + + costs_ : list of (objective, dual_gap) pairs + The list of values of the objective function and the dual gap at + each iteration. Returned only if return_costs is True. + + .. versionadded:: 1.3 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + graphical_lasso : L1-penalized covariance estimator. + GraphicalLassoCV : Sparse inverse covariance with + cross-validated choice of the l1 penalty. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import GraphicalLasso + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) + >>> np.random.seed(0) + >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], + ... cov=true_cov, + ... size=200) + >>> cov = GraphicalLasso().fit(X) + >>> np.around(cov.covariance_, decimals=3) + array([[0.816, 0.049, 0.218, 0.019], + [0.049, 0.364, 0.017, 0.034], + [0.218, 0.017, 0.322, 0.093], + [0.019, 0.034, 0.093, 0.69 ]]) + >>> np.around(cov.location_, decimals=3) + array([0.073, 0.04 , 0.038, 0.143]) + """ + + _parameter_constraints: dict = { + **BaseGraphicalLasso._parameter_constraints, + "alpha": [Interval(Real, 0, None, closed="both")], + "covariance": [StrOptions({"precomputed"}), None], + } + + def __init__( + self, + alpha=0.01, + *, + mode="cd", + covariance=None, + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + eps=np.finfo(np.float64).eps, + assume_centered=False, + ): + super().__init__( + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + mode=mode, + verbose=verbose, + eps=eps, + assume_centered=assume_centered, + ) + self.alpha = alpha + self.covariance = covariance + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the GraphicalLasso model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. 
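+
+        Notes
+        -----
+        When ``covariance="precomputed"``, ``X`` is taken to be an already
+        computed covariance matrix and is used as-is (and ``location_`` is set
+        to zeros); otherwise the empirical covariance of ``X`` is computed
+        first and the penalized estimate is derived from it.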
+ """ + # Covariance does not make sense for a single feature + X = validate_data(self, X, ensure_min_features=2, ensure_min_samples=2) + + if self.covariance == "precomputed": + emp_cov = X.copy() + self.location_ = np.zeros(X.shape[1]) + else: + emp_cov = empirical_covariance(X, assume_centered=self.assume_centered) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + + self.covariance_, self.precision_, self.costs_, self.n_iter_ = _graphical_lasso( + emp_cov, + alpha=self.alpha, + cov_init=None, + mode=self.mode, + tol=self.tol, + enet_tol=self.enet_tol, + max_iter=self.max_iter, + verbose=self.verbose, + eps=self.eps, + ) + return self + + +# Cross-validation with GraphicalLasso +def graphical_lasso_path( + X, + alphas, + cov_init=None, + X_test=None, + mode="cd", + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + verbose=False, + eps=np.finfo(np.float64).eps, +): + """l1-penalized covariance estimator along a path of decreasing alphas + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + alphas : array-like of shape (n_alphas,) + The list of regularization parameters, decreasing order. + + cov_init : array of shape (n_features, n_features), default=None + The initial guess for the covariance. + + X_test : array of shape (n_test_samples, n_features), default=None + Optional test matrix to measure generalisation error. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where p > n. Elsewhere prefer cd + which is more numerically stable. + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. The tolerance must be a positive + number. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. The tolerance must be a positive number. + + max_iter : int, default=100 + The maximum number of iterations. This parameter should be a strictly + positive integer. + + verbose : int or bool, default=False + The higher the verbosity flag, the more information is printed + during the fitting. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + .. versionadded:: 1.3 + + Returns + ------- + covariances_ : list of shape (n_alphas,) of ndarray of shape \ + (n_features, n_features) + The estimated covariance matrices. + + precisions_ : list of shape (n_alphas,) of ndarray of shape \ + (n_features, n_features) + The estimated (sparse) precision matrices. + + scores_ : list of shape (n_alphas,), dtype=float + The generalisation error (log-likelihood) on the test data. + Returned only if test data is passed. 
+ """ + inner_verbose = max(0, verbose - 1) + emp_cov = empirical_covariance(X) + if cov_init is None: + covariance_ = emp_cov.copy() + else: + covariance_ = cov_init + covariances_ = list() + precisions_ = list() + scores_ = list() + if X_test is not None: + test_emp_cov = empirical_covariance(X_test) + + for alpha in alphas: + try: + # Capture the errors, and move on + covariance_, precision_, _, _ = _graphical_lasso( + emp_cov, + alpha=alpha, + cov_init=covariance_, + mode=mode, + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + verbose=inner_verbose, + eps=eps, + ) + covariances_.append(covariance_) + precisions_.append(precision_) + if X_test is not None: + this_score = log_likelihood(test_emp_cov, precision_) + except FloatingPointError: + this_score = -np.inf + covariances_.append(np.nan) + precisions_.append(np.nan) + if X_test is not None: + if not np.isfinite(this_score): + this_score = -np.inf + scores_.append(this_score) + if verbose == 1: + sys.stderr.write(".") + elif verbose > 1: + if X_test is not None: + print( + "[graphical_lasso_path] alpha: %.2e, score: %.2e" + % (alpha, this_score) + ) + else: + print("[graphical_lasso_path] alpha: %.2e" % alpha) + if X_test is not None: + return covariances_, precisions_, scores_ + return covariances_, precisions_ + + +class GraphicalLassoCV(BaseGraphicalLasso): + """Sparse inverse covariance w/ cross-validated choice of the l1 penalty. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + .. versionchanged:: v0.20 + GraphLassoCV has been renamed to GraphicalLassoCV + + Parameters + ---------- + alphas : int or array-like of shape (n_alphas,), dtype=float, default=4 + If an integer is given, it fixes the number of points on the + grids of alpha to be used. If a list is given, it gives the + grid to be used. See the notes in the class docstring for + more details. Range is [1, inf) for an integer. + Range is (0, inf] for an array-like of floats. + + n_refinements : int, default=4 + The number of times the grid is refined. Not used if explicit + values of alphas are passed. Range is [1, inf). + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.20 + ``cv`` default value if None changed from 3-fold to 5-fold. + + tol : float, default=1e-4 + The tolerance to declare convergence: if the dual gap goes below + this value, iterations are stopped. Range is (0, inf]. + + enet_tol : float, default=1e-4 + The tolerance for the elastic net solver used to calculate the descent + direction. This parameter controls the accuracy of the search direction + for a given column update, not of the overall parameter estimate. Only + used for mode='cd'. Range is (0, inf]. + + max_iter : int, default=100 + Maximum number of iterations. + + mode : {'cd', 'lars'}, default='cd' + The Lasso solver to use: coordinate descent or LARS. Use LARS for + very sparse underlying graphs, where number of features is greater + than number of samples. Elsewhere prefer cd which is more numerically + stable. 
+ + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + verbose : bool, default=False + If verbose is True, the objective function and duality gap are + printed at each iteration. + + eps : float, default=eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Default is `np.finfo(np.float64).eps`. + + .. versionadded:: 1.3 + + assume_centered : bool, default=False + If True, data are not centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False, data are centered before computation. + + Attributes + ---------- + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated precision matrix (inverse covariance). + + costs_ : list of (objective, dual_gap) pairs + The list of values of the objective function and the dual gap at + each iteration. Returned only if return_costs is True. + + .. versionadded:: 1.3 + + alpha_ : float + Penalization parameter selected. + + cv_results_ : dict of ndarrays + A dict with keys: + + alphas : ndarray of shape (n_alphas,) + All penalization parameters explored. + + split(k)_test_score : ndarray of shape (n_alphas,) + Log-likelihood score on left-out data across (k)th fold. + + .. versionadded:: 1.0 + + mean_test_score : ndarray of shape (n_alphas,) + Mean of scores over the folds. + + .. versionadded:: 1.0 + + std_test_score : ndarray of shape (n_alphas,) + Standard deviation of scores over the folds. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + graphical_lasso : L1-penalized covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + + Notes + ----- + The search for the optimal penalization parameter (`alpha`) is done on an + iteratively refined grid: first the cross-validated scores on a grid are + computed, then a new refined grid is centered around the maximum, and so + on. + + One of the challenges which is faced here is that the solvers can + fail to converge to a well-conditioned estimate. The corresponding + values of `alpha` then come out as missing values, but the optimum may + be close to these missing values. + + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import GraphicalLassoCV + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) + >>> np.random.seed(0) + >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], + ... cov=true_cov, + ... 
size=200) + >>> cov = GraphicalLassoCV().fit(X) + >>> np.around(cov.covariance_, decimals=3) + array([[0.816, 0.051, 0.22 , 0.017], + [0.051, 0.364, 0.018, 0.036], + [0.22 , 0.018, 0.322, 0.094], + [0.017, 0.036, 0.094, 0.69 ]]) + >>> np.around(cov.location_, decimals=3) + array([0.073, 0.04 , 0.038, 0.143]) + + For an example comparing :class:`sklearn.covariance.GraphicalLassoCV`, + :func:`sklearn.covariance.ledoit_wolf` shrinkage and the empirical covariance + on high-dimensional gaussian data, see + :ref:`sphx_glr_auto_examples_covariance_plot_sparse_cov.py`. + """ + + _parameter_constraints: dict = { + **BaseGraphicalLasso._parameter_constraints, + "alphas": [Interval(Integral, 0, None, closed="left"), "array-like"], + "n_refinements": [Interval(Integral, 1, None, closed="left")], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + } + + def __init__( + self, + *, + alphas=4, + n_refinements=4, + cv=None, + tol=1e-4, + enet_tol=1e-4, + max_iter=100, + mode="cd", + n_jobs=None, + verbose=False, + eps=np.finfo(np.float64).eps, + assume_centered=False, + ): + super().__init__( + tol=tol, + enet_tol=enet_tol, + max_iter=max_iter, + mode=mode, + verbose=verbose, + eps=eps, + assume_centered=assume_centered, + ) + self.alphas = alphas + self.n_refinements = n_refinements + self.cv = cv + self.n_jobs = n_jobs + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, **params): + """Fit the GraphicalLasso covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + y : Ignored + Not used, present for API consistency by convention. + + **params : dict, default=None + Parameters to be passed to the CV splitter and the + cross_val_score function. + + .. versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns the instance itself. 
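+
+        Notes
+        -----
+        When ``alphas`` is an integer, the candidate penalties are generated
+        on a log scale between ``0.01 * alpha_max`` and ``alpha_max`` (the
+        largest penalty for which the off-diagonal entries are not all zero),
+        and the grid is re-centered around the best cross-validated alpha for
+        ``n_refinements`` passes in total. Passing an explicit array of alphas
+        disables this refinement.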
+ """ + # Covariance does not make sense for a single feature + _raise_for_params(params, self, "fit") + + X = validate_data(self, X, ensure_min_features=2) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + emp_cov = empirical_covariance(X, assume_centered=self.assume_centered) + + cv = check_cv(self.cv, y, classifier=False) + + # List of (alpha, scores, covs) + path = list() + n_alphas = self.alphas + inner_verbose = max(0, self.verbose - 1) + + if _is_arraylike_not_scalar(n_alphas): + for alpha in self.alphas: + check_scalar( + alpha, + "alpha", + Real, + min_val=0, + max_val=np.inf, + include_boundaries="right", + ) + alphas = self.alphas + n_refinements = 1 + else: + n_refinements = self.n_refinements + alpha_1 = alpha_max(emp_cov) + alpha_0 = 1e-2 * alpha_1 + alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1), n_alphas)[::-1] + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch(splitter=Bunch(split={})) + + t0 = time.time() + for i in range(n_refinements): + with warnings.catch_warnings(): + # No need to see the convergence warnings on this grid: + # they will always be points that will not converge + # during the cross-validation + warnings.simplefilter("ignore", ConvergenceWarning) + # Compute the cross-validated loss on the current grid + + # NOTE: Warm-restarting graphical_lasso_path has been tried, + # and this did not allow to gain anything + # (same execution time with or without). + this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( + delayed(graphical_lasso_path)( + X[train], + alphas=alphas, + X_test=X[test], + mode=self.mode, + tol=self.tol, + enet_tol=self.enet_tol, + max_iter=int(0.1 * self.max_iter), + verbose=inner_verbose, + eps=self.eps, + ) + for train, test in cv.split(X, y, **routed_params.splitter.split) + ) + + # Little danse to transform the list in what we need + covs, _, scores = zip(*this_path) + covs = zip(*covs) + scores = zip(*scores) + path.extend(zip(alphas, scores, covs)) + path = sorted(path, key=operator.itemgetter(0), reverse=True) + + # Find the maximum (avoid using built in 'max' function to + # have a fully-reproducible selection of the smallest alpha + # in case of equality) + best_score = -np.inf + last_finite_idx = 0 + for index, (alpha, scores, _) in enumerate(path): + this_score = np.mean(scores) + if this_score >= 0.1 / np.finfo(np.float64).eps: + this_score = np.nan + if np.isfinite(this_score): + last_finite_idx = index + if this_score >= best_score: + best_score = this_score + best_index = index + + # Refine the grid + if best_index == 0: + # We do not need to go back: we have chosen + # the highest value of alpha for which there are + # non-zero coefficients + alpha_1 = path[0][0] + alpha_0 = path[1][0] + elif best_index == last_finite_idx and not best_index == len(path) - 1: + # We have non-converged models on the upper bound of the + # grid, we need to refine the grid there + alpha_1 = path[best_index][0] + alpha_0 = path[best_index + 1][0] + elif best_index == len(path) - 1: + alpha_1 = path[best_index][0] + alpha_0 = 0.01 * path[best_index][0] + else: + alpha_1 = path[best_index - 1][0] + alpha_0 = path[best_index + 1][0] + + if not _is_arraylike_not_scalar(n_alphas): + alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0), n_alphas + 2) + alphas = alphas[1:-1] + + if self.verbose and n_refinements > 1: + print( + "[GraphicalLassoCV] Done refinement % 2i out of %i: % 3is" + % (i + 1, 
n_refinements, time.time() - t0) + ) + + path = list(zip(*path)) + grid_scores = list(path[1]) + alphas = list(path[0]) + # Finally, compute the score with alpha = 0 + alphas.append(0) + grid_scores.append( + cross_val_score( + EmpiricalCovariance(), + X, + cv=cv, + n_jobs=self.n_jobs, + verbose=inner_verbose, + params=params, + ) + ) + grid_scores = np.array(grid_scores) + + self.cv_results_ = {"alphas": np.array(alphas)} + + for i in range(grid_scores.shape[1]): + self.cv_results_[f"split{i}_test_score"] = grid_scores[:, i] + + self.cv_results_["mean_test_score"] = np.mean(grid_scores, axis=1) + self.cv_results_["std_test_score"] = np.std(grid_scores, axis=1) + + best_alpha = alphas[best_index] + self.alpha_ = best_alpha + + # Finally fit the model with the selected alpha + self.covariance_, self.precision_, self.costs_, self.n_iter_ = _graphical_lasso( + emp_cov, + alpha=best_alpha, + mode=self.mode, + tol=self.tol, + enet_tol=self.enet_tol, + max_iter=self.max_iter, + verbose=inner_verbose, + eps=self.eps, + ) + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_robust_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_robust_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..81fc194c6e410da364db9eba432e7201e6ab44cb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_robust_covariance.py @@ -0,0 +1,874 @@ +""" +Robust location and covariance estimators. + +Here are implemented estimators that are resistant to outliers. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.stats import chi2 + +from ..base import _fit_context +from ..utils import check_array, check_random_state +from ..utils._param_validation import Interval +from ..utils.extmath import fast_logdet +from ..utils.validation import validate_data +from ._empirical_covariance import EmpiricalCovariance, empirical_covariance + + +# Minimum Covariance Determinant +# Implementing of an algorithm by Rousseeuw & Van Driessen described in +# (A Fast Algorithm for the Minimum Covariance Determinant Estimator, +# 1999, American Statistical Association and the American Society +# for Quality, TECHNOMETRICS) +# XXX Is this really a public function? It's not listed in the docs or +# exported by sklearn.covariance. Deprecate? +def c_step( + X, + n_support, + remaining_iterations=30, + initial_estimates=None, + verbose=False, + cov_computation_method=empirical_covariance, + random_state=None, +): + """C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data set in which we look for the n_support observations whose + scatter matrix has minimum determinant. + + n_support : int + Number of observations to compute the robust estimates of location + and covariance from. 
This parameter must be greater than + `n_samples / 2`. + + remaining_iterations : int, default=30 + Number of iterations to perform. + According to [Rouseeuw1999]_, two iterations are sufficient to get + close to the minimum, and we never need more than 30 to reach + convergence. + + initial_estimates : tuple of shape (2,), default=None + Initial estimates of location and shape from which to run the c_step + procedure: + - initial_estimates[0]: an initial location estimate + - initial_estimates[1]: an initial covariance estimate + + verbose : bool, default=False + Verbose mode. + + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` + The function which will be used to compute the covariance. + Must return array of shape (n_features, n_features). + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + location : ndarray of shape (n_features,) + Robust location estimates. + + covariance : ndarray of shape (n_features, n_features) + Robust covariance estimates. + + support : ndarray of shape (n_samples,) + A mask for the `n_support` observations whose scatter matrix has + minimum determinant. + + References + ---------- + .. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant + Estimator, 1999, American Statistical Association and the American + Society for Quality, TECHNOMETRICS + """ + X = np.asarray(X) + random_state = check_random_state(random_state) + return _c_step( + X, + n_support, + remaining_iterations=remaining_iterations, + initial_estimates=initial_estimates, + verbose=verbose, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + + +def _c_step( + X, + n_support, + random_state, + remaining_iterations=30, + initial_estimates=None, + verbose=False, + cov_computation_method=empirical_covariance, +): + n_samples, n_features = X.shape + dist = np.inf + + # Initialisation + if initial_estimates is None: + # compute initial robust estimates from a random subset + support_indices = random_state.permutation(n_samples)[:n_support] + else: + # get initial robust estimates from the function parameters + location = initial_estimates[0] + covariance = initial_estimates[1] + # run a special iteration for that case (to get an initial support_indices) + precision = linalg.pinvh(covariance) + X_centered = X - location + dist = (np.dot(X_centered, precision) * X_centered).sum(1) + # compute new estimates + support_indices = np.argpartition(dist, n_support - 1)[:n_support] + + X_support = X[support_indices] + location = X_support.mean(0) + covariance = cov_computation_method(X_support) + + # Iterative procedure for Minimum Covariance Determinant computation + det = fast_logdet(covariance) + # If the data already has singular covariance, calculate the precision, + # as the loop below will not be entered. 
+ if np.isinf(det): + precision = linalg.pinvh(covariance) + + previous_det = np.inf + while det < previous_det and remaining_iterations > 0 and not np.isinf(det): + # save old estimates values + previous_location = location + previous_covariance = covariance + previous_det = det + previous_support_indices = support_indices + # compute a new support_indices from the full data set mahalanobis distances + precision = linalg.pinvh(covariance) + X_centered = X - location + dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1) + # compute new estimates + support_indices = np.argpartition(dist, n_support - 1)[:n_support] + X_support = X[support_indices] + location = X_support.mean(axis=0) + covariance = cov_computation_method(X_support) + det = fast_logdet(covariance) + # update remaining iterations for early stopping + remaining_iterations -= 1 + + previous_dist = dist + dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1) + # Check if best fit already found (det => 0, logdet => -inf) + if np.isinf(det): + results = location, covariance, det, support_indices, dist + # Check convergence + if np.allclose(det, previous_det): + # c_step procedure converged + if verbose: + print( + "Optimal couple (location, covariance) found before" + " ending iterations (%d left)" % (remaining_iterations) + ) + results = location, covariance, det, support_indices, dist + elif det > previous_det: + # determinant has increased (should not happen) + warnings.warn( + "Determinant has increased; this should not happen: " + "log(det) > log(previous_det) (%.15f > %.15f). " + "You may want to try with a higher value of " + "support_fraction (current value: %.3f)." + % (det, previous_det, n_support / n_samples), + RuntimeWarning, + ) + results = ( + previous_location, + previous_covariance, + previous_det, + previous_support_indices, + previous_dist, + ) + + # Check early stopping + if remaining_iterations == 0: + if verbose: + print("Maximum number of iterations reached") + results = location, covariance, det, support_indices, dist + + location, covariance, det, support_indices, dist = results + # Convert from list of indices to boolean mask. + support = np.bincount(support_indices, minlength=n_samples).astype(bool) + return location, covariance, det, support, dist + + +def select_candidates( + X, + n_support, + n_trials, + select=1, + n_iter=30, + verbose=False, + cov_computation_method=empirical_covariance, + random_state=None, +): + """Finds the best pure subset of observations to compute MCD from it. + + The purpose of this function is to find the best sets of n_support + observations with respect to a minimization of their covariance + matrix determinant. Equivalently, it removes n_samples-n_support + observations to construct what we call a pure data set (i.e. not + containing outliers). The list of the observations of the pure + data set is referred to as the `support`. + + Starting from a random support, the pure data set is found by the + c_step procedure introduced by Rousseeuw and Van Driessen in + [RV]_. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data (sub)set in which we look for the n_support purest observations. + + n_support : int + The number of samples the pure data set must contain. + This parameter must be in the range `[(n + p + 1)/2] < n_support < n`. + + n_trials : int or tuple of shape (2,) + Number of different initial sets of observations from which to + run the algorithm. This parameter should be a strictly positive + integer. 
+ Instead of giving a number of trials to perform, one can provide a + list of initial estimates that will be used to iteratively run + c_step procedures. In this case: + - n_trials[0]: array-like, shape (n_trials, n_features) + is the list of `n_trials` initial location estimates + - n_trials[1]: array-like, shape (n_trials, n_features, n_features) + is the list of `n_trials` initial covariances estimates + + select : int, default=1 + Number of best candidates results to return. This parameter must be + a strictly positive integer. + + n_iter : int, default=30 + Maximum number of iterations for the c_step procedure. + (2 is enough to be close to the final solution. "Never" exceeds 20). + This parameter must be a strictly positive integer. + + verbose : bool, default=False + Control the output verbosity. + + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` + The function which will be used to compute the covariance. + Must return an array of shape (n_features, n_features). + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + See Also + --------- + c_step + + Returns + ------- + best_locations : ndarray of shape (select, n_features) + The `select` location estimates computed from the `select` best + supports found in the data set (`X`). + + best_covariances : ndarray of shape (select, n_features, n_features) + The `select` covariance estimates computed from the `select` + best supports found in the data set (`X`). + + best_supports : ndarray of shape (select, n_samples) + The `select` best supports found in the data set (`X`). + + References + ---------- + .. 
[RV] A Fast Algorithm for the Minimum Covariance Determinant + Estimator, 1999, American Statistical Association and the American + Society for Quality, TECHNOMETRICS + """ + random_state = check_random_state(random_state) + + if isinstance(n_trials, Integral): + run_from_estimates = False + elif isinstance(n_trials, tuple): + run_from_estimates = True + estimates_list = n_trials + n_trials = estimates_list[0].shape[0] + else: + raise TypeError( + "Invalid 'n_trials' parameter, expected tuple or integer, got %s (%s)" + % (n_trials, type(n_trials)) + ) + + # compute `n_trials` location and shape estimates candidates in the subset + all_estimates = [] + if not run_from_estimates: + # perform `n_trials` computations from random initial supports + for j in range(n_trials): + all_estimates.append( + _c_step( + X, + n_support, + remaining_iterations=n_iter, + verbose=verbose, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + ) + else: + # perform computations from every given initial estimates + for j in range(n_trials): + initial_estimates = (estimates_list[0][j], estimates_list[1][j]) + all_estimates.append( + _c_step( + X, + n_support, + remaining_iterations=n_iter, + initial_estimates=initial_estimates, + verbose=verbose, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + ) + all_locs_sub, all_covs_sub, all_dets_sub, all_supports_sub, all_ds_sub = zip( + *all_estimates + ) + # find the `n_best` best results among the `n_trials` ones + index_best = np.argsort(all_dets_sub)[:select] + best_locations = np.asarray(all_locs_sub)[index_best] + best_covariances = np.asarray(all_covs_sub)[index_best] + best_supports = np.asarray(all_supports_sub)[index_best] + best_ds = np.asarray(all_ds_sub)[index_best] + + return best_locations, best_covariances, best_supports, best_ds + + +def fast_mcd( + X, + support_fraction=None, + cov_computation_method=empirical_covariance, + random_state=None, +): + """Estimate the Minimum Covariance Determinant matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix, with p features and n samples. + + support_fraction : float, default=None + The proportion of points to be included in the support of the raw + MCD estimate. Default is `None`, which implies that the minimum + value of `support_fraction` will be used within the algorithm: + `(n_samples + n_features + 1) / 2 * n_samples`. This parameter must be + in the range (0, 1). + + cov_computation_method : callable, \ + default=:func:`sklearn.covariance.empirical_covariance` + The function which will be used to compute the covariance. + Must return an array of shape (n_features, n_features). + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + location : ndarray of shape (n_features,) + Robust location of the data. + + covariance : ndarray of shape (n_features, n_features) + Robust covariance of the features. + + support : ndarray of shape (n_samples,), dtype=bool + A mask of the observations that have been used to compute + the robust location and covariance estimates of the data set. 
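A short sketch of calling `fast_mcd` directly on synthetic, contaminated data (illustrative only, not part of the patched file); as the Notes below point out, only the raw estimates are returned, without the correction and reweighting steps applied by `MinCovDet`.

import numpy as np
from sklearn.covariance import fast_mcd

rng = np.random.RandomState(42)
X = rng.randn(100, 2)
X[:10] += 8  # plant a few gross outliers

location, covariance, support, dist = fast_mcd(X, random_state=0)
print(int(support.sum()), "observations kept in the raw support")
print(location)  # raw robust location, barely affected by the shifted points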
+ + Notes + ----- + The FastMCD algorithm has been introduced by Rousseuw and Van Driessen + in "A Fast Algorithm for the Minimum Covariance Determinant Estimator, + 1999, American Statistical Association and the American Society + for Quality, TECHNOMETRICS". + The principle is to compute robust estimates and random subsets before + pooling them into a larger subsets, and finally into the full data set. + Depending on the size of the initial sample, we have one, two or three + such computation levels. + + Note that only raw estimates are returned. If one is interested in + the correction and reweighting steps described in [RouseeuwVan]_, + see the MinCovDet object. + + References + ---------- + + .. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance + Determinant Estimator, 1999, American Statistical Association + and the American Society for Quality, TECHNOMETRICS + + .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun, + Asymptotics For The Minimum Covariance Determinant Estimator, + The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400 + """ + random_state = check_random_state(random_state) + + X = check_array(X, ensure_min_samples=2, estimator="fast_mcd") + n_samples, n_features = X.shape + + # minimum breakdown value + if support_fraction is None: + n_support = min(int(np.ceil(0.5 * (n_samples + n_features + 1))), n_samples) + else: + n_support = int(support_fraction * n_samples) + + # 1-dimensional case quick computation + # (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust + # Regression and Outlier Detection, John Wiley & Sons, chapter 4) + if n_features == 1: + if n_support < n_samples: + # find the sample shortest halves + X_sorted = np.sort(np.ravel(X)) + diff = X_sorted[n_support:] - X_sorted[: (n_samples - n_support)] + halves_start = np.where(diff == np.min(diff))[0] + # take the middle points' mean to get the robust location estimate + location = ( + 0.5 + * (X_sorted[n_support + halves_start] + X_sorted[halves_start]).mean() + ) + support = np.zeros(n_samples, dtype=bool) + X_centered = X - location + support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True + covariance = np.asarray([[np.var(X[support])]]) + location = np.array([location]) + # get precision matrix in an optimized way + precision = linalg.pinvh(covariance) + dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1) + else: + support = np.ones(n_samples, dtype=bool) + covariance = np.asarray([[np.var(X)]]) + location = np.asarray([np.mean(X)]) + X_centered = X - location + # get precision matrix in an optimized way + precision = linalg.pinvh(covariance) + dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1) + # Starting FastMCD algorithm for p-dimensional case + if (n_samples > 500) and (n_features > 1): + # 1. Find candidate supports on subsets + # a. split the set in subsets of size ~ 300 + n_subsets = n_samples // 300 + n_samples_subsets = n_samples // n_subsets + samples_shuffle = random_state.permutation(n_samples) + h_subset = int(np.ceil(n_samples_subsets * (n_support / float(n_samples)))) + # b. perform a total of 500 trials + n_trials_tot = 500 + # c. select 10 best (location, covariance) for each subset + n_best_sub = 10 + n_trials = max(10, n_trials_tot // n_subsets) + n_best_tot = n_subsets * n_best_sub + all_best_locations = np.zeros((n_best_tot, n_features)) + try: + all_best_covariances = np.zeros((n_best_tot, n_features, n_features)) + except MemoryError: + # The above is too big. 
Let's try with something much small + # (and less optimal) + n_best_tot = 10 + all_best_covariances = np.zeros((n_best_tot, n_features, n_features)) + n_best_sub = 2 + for i in range(n_subsets): + low_bound = i * n_samples_subsets + high_bound = low_bound + n_samples_subsets + current_subset = X[samples_shuffle[low_bound:high_bound]] + best_locations_sub, best_covariances_sub, _, _ = select_candidates( + current_subset, + h_subset, + n_trials, + select=n_best_sub, + n_iter=2, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub) + all_best_locations[subset_slice] = best_locations_sub + all_best_covariances[subset_slice] = best_covariances_sub + # 2. Pool the candidate supports into a merged set + # (possibly the full dataset) + n_samples_merged = min(1500, n_samples) + h_merged = int(np.ceil(n_samples_merged * (n_support / float(n_samples)))) + if n_samples > 1500: + n_best_merged = 10 + else: + n_best_merged = 1 + # find the best couples (location, covariance) on the merged set + selection = random_state.permutation(n_samples)[:n_samples_merged] + locations_merged, covariances_merged, supports_merged, d = select_candidates( + X[selection], + h_merged, + n_trials=(all_best_locations, all_best_covariances), + select=n_best_merged, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + # 3. Finally get the overall best (locations, covariance) couple + if n_samples < 1500: + # directly get the best couple (location, covariance) + location = locations_merged[0] + covariance = covariances_merged[0] + support = np.zeros(n_samples, dtype=bool) + dist = np.zeros(n_samples) + support[selection] = supports_merged[0] + dist[selection] = d[0] + else: + # select the best couple on the full dataset + locations_full, covariances_full, supports_full, d = select_candidates( + X, + n_support, + n_trials=(locations_merged, covariances_merged), + select=1, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + location = locations_full[0] + covariance = covariances_full[0] + support = supports_full[0] + dist = d[0] + elif n_features > 1: + # 1. Find the 10 best couples (location, covariance) + # considering two iterations + n_trials = 30 + n_best = 10 + locations_best, covariances_best, _, _ = select_candidates( + X, + n_support, + n_trials=n_trials, + select=n_best, + n_iter=2, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + # 2. Select the best couple on the full dataset amongst the 10 + locations_full, covariances_full, supports_full, d = select_candidates( + X, + n_support, + n_trials=(locations_best, covariances_best), + select=1, + cov_computation_method=cov_computation_method, + random_state=random_state, + ) + location = locations_full[0] + covariance = covariances_full[0] + support = supports_full[0] + dist = d[0] + + return location, covariance, support, dist + + +class MinCovDet(EmpiricalCovariance): + """Minimum Covariance Determinant (MCD): robust estimator of covariance. + + The Minimum Covariance Determinant covariance estimator is to be applied + on Gaussian-distributed data, but could still be relevant on data + drawn from a unimodal, symmetric distribution. It is not meant to be used + with multi-modal data (the algorithm used to fit a MinCovDet object is + likely to fail in such a case). + One should consider projection pursuit methods to deal with multi-modal + datasets. 
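A small sketch of using the fitted attributes of `MinCovDet` for outlier flagging on synthetic data (illustrative only, not part of the patched file); it relies on the `support_` and `dist_` attributes documented below.

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[0.8, 0.3], [0.3, 0.4]], size=500)
X[:25] += 6  # contaminate roughly 5% of the samples

mcd = MinCovDet(random_state=0).fit(X)
outliers = ~mcd.support_          # observations excluded from the reweighted estimate
print(outliers[:25].mean())       # most of the planted outliers are rejected
print(mcd.dist_[:5])              # robust Mahalanobis distances on the training set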
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, the support of the robust location and the covariance + estimates is computed, and a covariance estimate is recomputed from + it, without centering the data. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, the robust location and covariance are directly computed + with the FastMCD algorithm without additional treatment. + + support_fraction : float, default=None + The proportion of points to be included in the support of the raw + MCD estimate. Default is None, which implies that the minimum + value of support_fraction will be used within the algorithm: + `(n_samples + n_features + 1) / 2 * n_samples`. The parameter must be + in the range (0, 1]. + + random_state : int, RandomState instance or None, default=None + Determines the pseudo random number generator for shuffling the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + raw_location_ : ndarray of shape (n_features,) + The raw robust estimated location before correction and re-weighting. + + raw_covariance_ : ndarray of shape (n_features, n_features) + The raw robust estimated covariance before correction and re-weighting. + + raw_support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the raw robust estimates of location and shape, before correction + and re-weighting. + + location_ : ndarray of shape (n_features,) + Estimated robust location. + + For an example of comparing raw robust estimates with + the true location and covariance, refer to + :ref:`sphx_glr_auto_examples_covariance_plot_robust_vs_empirical_covariance.py`. + + covariance_ : ndarray of shape (n_features, n_features) + Estimated robust covariance matrix. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + support_ : ndarray of shape (n_samples,) + A mask of the observations that have been used to compute + the robust estimates of location and shape. + + dist_ : ndarray of shape (n_samples,) + Mahalanobis distances of the training set (on which :meth:`fit` is + called) observations. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + LedoitWolf : LedoitWolf Estimator. + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. + + References + ---------- + + .. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression. + J. Am Stat Ass, 79:871, 1984. + .. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant + Estimator, 1999, American Statistical Association and the American + Society for Quality, TECHNOMETRICS + .. 
[ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun, + Asymptotics For The Minimum Covariance Determinant Estimator, + The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import MinCovDet + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> cov = MinCovDet(random_state=0).fit(X) + >>> cov.covariance_ + array([[0.7411, 0.2535], + [0.2535, 0.3053]]) + >>> cov.location_ + array([0.0813 , 0.0427]) + """ + + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "support_fraction": [Interval(Real, 0, 1, closed="right"), None], + "random_state": ["random_state"], + } + _nonrobust_covariance = staticmethod(empirical_covariance) + + def __init__( + self, + *, + store_precision=True, + assume_centered=False, + support_fraction=None, + random_state=None, + ): + self.store_precision = store_precision + self.assume_centered = assume_centered + self.support_fraction = support_fraction + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit a Minimum Covariance Determinant with the FastMCD algorithm. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, ensure_min_samples=2, estimator="MinCovDet") + random_state = check_random_state(self.random_state) + n_samples, n_features = X.shape + # check that the empirical covariance is full rank + if (linalg.svdvals(np.dot(X.T, X)) > 1e-8).sum() != n_features: + warnings.warn( + "The covariance matrix associated to your dataset is not full rank" + ) + # compute and store raw estimates + raw_location, raw_covariance, raw_support, raw_dist = fast_mcd( + X, + support_fraction=self.support_fraction, + cov_computation_method=self._nonrobust_covariance, + random_state=random_state, + ) + if self.assume_centered: + raw_location = np.zeros(n_features) + raw_covariance = self._nonrobust_covariance( + X[raw_support], assume_centered=True + ) + # get precision matrix in an optimized way + precision = linalg.pinvh(raw_covariance) + raw_dist = np.sum(np.dot(X, precision) * X, 1) + self.raw_location_ = raw_location + self.raw_covariance_ = raw_covariance + self.raw_support_ = raw_support + self.location_ = raw_location + self.support_ = raw_support + self.dist_ = raw_dist + # obtain consistency at normal models + self.correct_covariance(X) + # re-weight estimator + self.reweight_covariance(X) + + return self + + def correct_covariance(self, data): + """Apply a correction to raw Minimum Covariance Determinant estimates. + + Correction using the empirical correction factor suggested + by Rousseeuw and Van Driessen in [RVD]_. + + Parameters + ---------- + data : array-like of shape (n_samples, n_features) + The data matrix, with p features and n samples. + The data set must be the one which was used to compute + the raw estimates. + + Returns + ------- + covariance_corrected : ndarray of shape (n_features, n_features) + Corrected robust covariance estimate. + + References + ---------- + + .. 
[RVD] A Fast Algorithm for the Minimum Covariance + Determinant Estimator, 1999, American Statistical Association + and the American Society for Quality, TECHNOMETRICS + """ + + # Check that the covariance of the support data is not equal to 0. + # Otherwise self.dist_ = 0 and thus correction = 0. + n_samples = len(self.dist_) + n_support = np.sum(self.support_) + if n_support < n_samples and np.allclose(self.raw_covariance_, 0): + raise ValueError( + "The covariance matrix of the support data " + "is equal to 0, try to increase support_fraction" + ) + correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5) + covariance_corrected = self.raw_covariance_ * correction + self.dist_ /= correction + return covariance_corrected + + def reweight_covariance(self, data): + """Re-weight raw Minimum Covariance Determinant estimates. + + Re-weight observations using Rousseeuw's method (equivalent to + deleting outlying observations from the data set before + computing location and covariance estimates) described + in [RVDriessen]_. + + Parameters + ---------- + data : array-like of shape (n_samples, n_features) + The data matrix, with p features and n samples. + The data set must be the one which was used to compute + the raw estimates. + + Returns + ------- + location_reweighted : ndarray of shape (n_features,) + Re-weighted robust location estimate. + + covariance_reweighted : ndarray of shape (n_features, n_features) + Re-weighted robust covariance estimate. + + support_reweighted : ndarray of shape (n_samples,), dtype=bool + A mask of the observations that have been used to compute + the re-weighted robust location and covariance estimates. + + References + ---------- + + .. [RVDriessen] A Fast Algorithm for the Minimum Covariance + Determinant Estimator, 1999, American Statistical Association + and the American Society for Quality, TECHNOMETRICS + """ + n_samples, n_features = data.shape + mask = self.dist_ < chi2(n_features).isf(0.025) + if self.assume_centered: + location_reweighted = np.zeros(n_features) + else: + location_reweighted = data[mask].mean(0) + covariance_reweighted = self._nonrobust_covariance( + data[mask], assume_centered=self.assume_centered + ) + support_reweighted = np.zeros(n_samples, dtype=bool) + support_reweighted[mask] = True + self._set_covariance(covariance_reweighted) + self.location_ = location_reweighted + self.support_ = support_reweighted + X_centered = data - self.location_ + self.dist_ = np.sum(np.dot(X_centered, self.get_precision()) * X_centered, 1) + return location_reweighted, covariance_reweighted, support_reweighted diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/_shrunk_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/_shrunk_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..99d6f70f57d6eee24fc442bd42f496bf8ae9a9a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/_shrunk_covariance.py @@ -0,0 +1,822 @@ +""" +Covariance estimators using shrinkage. + +Shrinkage corresponds to regularising `cov` using a convex combination: +shrunk_cov = (1-shrinkage)*cov + shrinkage*structured_estimate. 
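A tiny numeric check of the convex combination above (illustrative only, not part of the patched file), taking the structured estimate as `mu * identity` with `mu = trace(cov) / n_features`, as used throughout this module.

import numpy as np
from sklearn.covariance import empirical_covariance, shrunk_covariance

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[0.8, 0.3], [0.3, 0.4]], size=500)

cov = empirical_covariance(X)
shrinkage = 0.2
mu = np.trace(cov) / cov.shape[0]
manual = (1 - shrinkage) * cov + shrinkage * mu * np.eye(cov.shape[0])
# Matches the `shrunk_covariance` helper defined later in this file.
assert np.allclose(manual, shrunk_covariance(cov, shrinkage=shrinkage))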
+ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# avoid division truncation +import warnings +from numbers import Integral, Real + +import numpy as np + +from ..base import _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, validate_params +from ..utils.validation import validate_data +from . import EmpiricalCovariance, empirical_covariance + + +def _ledoit_wolf(X, *, assume_centered, block_size): + """Estimate the shrunk Ledoit-Wolf covariance matrix.""" + # for only one feature, the result is the same whatever the shrinkage + if len(X.shape) == 2 and X.shape[1] == 1: + if not assume_centered: + X = X - X.mean() + return np.atleast_2d((X**2).mean()), 0.0 + n_features = X.shape[1] + + # get Ledoit-Wolf shrinkage + shrinkage = ledoit_wolf_shrinkage( + X, assume_centered=assume_centered, block_size=block_size + ) + emp_cov = empirical_covariance(X, assume_centered=assume_centered) + mu = np.sum(np.trace(emp_cov)) / n_features + shrunk_cov = (1.0 - shrinkage) * emp_cov + shrunk_cov.flat[:: n_features + 1] += shrinkage * mu + + return shrunk_cov, shrinkage + + +def _oas(X, *, assume_centered=False): + """Estimate covariance with the Oracle Approximating Shrinkage algorithm. + + The formulation is based on [1]_. + [1] "Shrinkage algorithms for MMSE covariance estimation.", + Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. + IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010. + https://arxiv.org/pdf/0907.4698.pdf + """ + if len(X.shape) == 2 and X.shape[1] == 1: + # for only one feature, the result is the same whatever the shrinkage + if not assume_centered: + X = X - X.mean() + return np.atleast_2d((X**2).mean()), 0.0 + + n_samples, n_features = X.shape + + emp_cov = empirical_covariance(X, assume_centered=assume_centered) + + # The shrinkage is defined as: + # shrinkage = min( + # trace(S @ S.T) + trace(S)**2) / ((n + 1) (trace(S @ S.T) - trace(S)**2 / p), 1 + # ) + # where n and p are n_samples and n_features, respectively (cf. Eq. 23 in [1]). + # The factor 2 / p is omitted since it does not impact the value of the estimator + # for large p. + + # Instead of computing trace(S)**2, we can compute the average of the squared + # elements of S that is equal to trace(S)**2 / p**2. + # See the definition of the Frobenius norm: + # https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm + alpha = np.mean(emp_cov**2) + mu = np.trace(emp_cov) / n_features + mu_squared = mu**2 + + # The factor 1 / p**2 will cancel out since it is in both the numerator and + # denominator + num = alpha + mu_squared + den = (n_samples + 1) * (alpha - mu_squared / n_features) + shrinkage = 1.0 if den == 0 else min(num / den, 1.0) + + # The shrunk covariance is defined as: + # (1 - shrinkage) * S + shrinkage * F (cf. Eq. 4 in [1]) + # where S is the empirical covariance and F is the shrinkage target defined as + # F = trace(S) / n_features * np.identity(n_features) (cf. Eq. 3 in [1]) + shrunk_cov = (1.0 - shrinkage) * emp_cov + shrunk_cov.flat[:: n_features + 1] += shrinkage * mu + + return shrunk_cov, shrinkage + + +############################################################################### +# Public API +# ShrunkCovariance estimator + + +@validate_params( + { + "emp_cov": ["array-like"], + "shrinkage": [Interval(Real, 0, 1, closed="both")], + }, + prefer_skip_nested_validation=True, +) +def shrunk_covariance(emp_cov, shrinkage=0.1): + """Calculate covariance matrices shrunk on the diagonal. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + emp_cov : array-like of shape (..., n_features, n_features) + Covariance matrices to be shrunk, at least 2D ndarray. + + shrinkage : float, default=0.1 + Coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + Returns + ------- + shrunk_cov : ndarray of shape (..., n_features, n_features) + Shrunk covariance matrices. + + Notes + ----- + The regularized (shrunk) covariance is given by:: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where `mu = trace(cov) / n_features`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_gaussian_quantiles + >>> from sklearn.covariance import empirical_covariance, shrunk_covariance + >>> real_cov = np.array([[.8, .3], [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500) + >>> shrunk_covariance(empirical_covariance(X)) + array([[0.739, 0.254], + [0.254, 0.411]]) + """ + emp_cov = check_array(emp_cov, allow_nd=True) + n_features = emp_cov.shape[-1] + + shrunk_cov = (1.0 - shrinkage) * emp_cov + mu = np.trace(emp_cov, axis1=-2, axis2=-1) / n_features + mu = np.expand_dims(mu, axis=tuple(range(mu.ndim, emp_cov.ndim))) + shrunk_cov += shrinkage * mu * np.eye(n_features) + + return shrunk_cov + + +class ShrunkCovariance(EmpiricalCovariance): + """Covariance estimator with shrinkage. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False, data will be centered before computation. + + shrinkage : float, default=0.1 + Coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix + + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + LedoitWolf : LedoitWolf Estimator. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. 
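The `shrunk_covariance` helper defined above also broadcasts over leading axes, as its `emp_cov` parameter of shape `(..., n_features, n_features)` indicates; a brief sketch (illustrative only, not part of the patched file):

import numpy as np
from sklearn.covariance import shrunk_covariance

# A stack of three identical 2x2 covariance matrices, shrunk in a single call.
covs = np.repeat(np.ones((2, 2))[np.newaxis, ...], 3, axis=0)
print(shrunk_covariance(covs, shrinkage=0.5))  # every slice becomes [[1. , 0.5], [0.5, 1. ]]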
+ + Notes + ----- + The regularized covariance is given by: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import ShrunkCovariance + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> cov = ShrunkCovariance().fit(X) + >>> cov.covariance_ + array([[0.7387, 0.2536], + [0.2536, 0.4110]]) + >>> cov.location_ + array([0.0622, 0.0193]) + """ + + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "shrinkage": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1): + super().__init__( + store_precision=store_precision, assume_centered=assume_centered + ) + self.shrinkage = shrinkage + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the shrunk covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X) + # Not calling the parent object to fit, to avoid a potential + # matrix inversion when setting the precision + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + covariance = empirical_covariance(X, assume_centered=self.assume_centered) + covariance = shrunk_covariance(covariance, self.shrinkage) + self._set_covariance(covariance) + + return self + + +# Ledoit-Wolf estimator + + +@validate_params( + { + "X": ["array-like"], + "assume_centered": ["boolean"], + "block_size": [Interval(Integral, 1, None, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000): + """Estimate the shrunk Ledoit-Wolf covariance matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, data will be centered before computation. + + block_size : int, default=1000 + Size of blocks into which the covariance matrix will be split. + + Returns + ------- + shrinkage : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. 
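A consistency sketch tying `ledoit_wolf_shrinkage` to the other helpers (illustrative only, not part of the patched file): applying the returned coefficient with `shrunk_covariance` reproduces the covariance returned by `ledoit_wolf`, mirroring the checks in the test suite further below.

import numpy as np
from sklearn.covariance import (
    empirical_covariance,
    ledoit_wolf,
    ledoit_wolf_shrinkage,
    shrunk_covariance,
)

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[0.4, 0.2], [0.2, 0.8]], size=50)

shrinkage = ledoit_wolf_shrinkage(X)
manual = shrunk_covariance(empirical_covariance(X), shrinkage=shrinkage)
cov_lw, shrinkage_lw = ledoit_wolf(X)
assert np.isclose(shrinkage, shrinkage_lw)
assert np.allclose(manual, cov_lw)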
+ + Notes + ----- + The regularized (shrunk) covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import ledoit_wolf_shrinkage + >>> real_cov = np.array([[.4, .2], [.2, .8]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50) + >>> shrinkage_coefficient = ledoit_wolf_shrinkage(X) + >>> shrinkage_coefficient + np.float64(0.23) + """ + X = check_array(X) + # for only one feature, the result is the same whatever the shrinkage + if len(X.shape) == 2 and X.shape[1] == 1: + return 0.0 + if X.ndim == 1: + X = np.reshape(X, (1, -1)) + + if X.shape[0] == 1: + warnings.warn( + "Only one sample available. You may want to reshape your data array" + ) + n_samples, n_features = X.shape + + # optionally center data + if not assume_centered: + X = X - X.mean(0) + + # A non-blocked version of the computation is present in the tests + # in tests/test_covariance.py + + # number of blocks to split the covariance matrix into + n_splits = int(n_features / block_size) + X2 = X**2 + emp_cov_trace = np.sum(X2, axis=0) / n_samples + mu = np.sum(emp_cov_trace) / n_features + beta_ = 0.0 # sum of the coefficients of + delta_ = 0.0 # sum of the *squared* coefficients of + # starting block computation + for i in range(n_splits): + for j in range(n_splits): + rows = slice(block_size * i, block_size * (i + 1)) + cols = slice(block_size * j, block_size * (j + 1)) + beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols])) + delta_ += np.sum(np.dot(X.T[rows], X[:, cols]) ** 2) + rows = slice(block_size * i, block_size * (i + 1)) + beta_ += np.sum(np.dot(X2.T[rows], X2[:, block_size * n_splits :])) + delta_ += np.sum(np.dot(X.T[rows], X[:, block_size * n_splits :]) ** 2) + for j in range(n_splits): + cols = slice(block_size * j, block_size * (j + 1)) + beta_ += np.sum(np.dot(X2.T[block_size * n_splits :], X2[:, cols])) + delta_ += np.sum(np.dot(X.T[block_size * n_splits :], X[:, cols]) ** 2) + delta_ += np.sum( + np.dot(X.T[block_size * n_splits :], X[:, block_size * n_splits :]) ** 2 + ) + delta_ /= n_samples**2 + beta_ += np.sum( + np.dot(X2.T[block_size * n_splits :], X2[:, block_size * n_splits :]) + ) + # use delta_ to compute beta + beta = 1.0 / (n_features * n_samples) * (beta_ / n_samples - delta_) + # delta is the sum of the squared coefficients of ( - mu*Id) / p + delta = delta_ - 2.0 * mu * emp_cov_trace.sum() + n_features * mu**2 + delta /= n_features + # get final beta as the min between beta and delta + # We do this to prevent shrinking more than "1", which would invert + # the value of covariances + beta = min(beta, delta) + # finally get shrinkage + shrinkage = 0 if beta == 0 else beta / delta + return shrinkage + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def ledoit_wolf(X, *, assume_centered=False, block_size=1000): + """Estimate the shrunk Ledoit-Wolf covariance matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, data will be centered before computation. 
+ + block_size : int, default=1000 + Size of blocks into which the covariance matrix will be split. + This is purely a memory optimization and does not affect results. + + Returns + ------- + shrunk_cov : ndarray of shape (n_features, n_features) + Shrunk covariance. + + shrinkage : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. + + Notes + ----- + The regularized (shrunk) covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import empirical_covariance, ledoit_wolf + >>> real_cov = np.array([[.4, .2], [.2, .8]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50) + >>> covariance, shrinkage = ledoit_wolf(X) + >>> covariance + array([[0.44, 0.16], + [0.16, 0.80]]) + >>> shrinkage + np.float64(0.23) + """ + estimator = LedoitWolf( + assume_centered=assume_centered, + block_size=block_size, + store_precision=False, + ).fit(X) + + return estimator.covariance_, estimator.shrinkage_ + + +class LedoitWolf(EmpiricalCovariance): + """LedoitWolf Estimator. + + Ledoit-Wolf is a particular form of shrinkage, where the shrinkage + coefficient is computed using O. Ledoit and M. Wolf's formula as + described in "A Well-Conditioned Estimator for Large-Dimensional + Covariance Matrices", Ledoit and Wolf, Journal of Multivariate + Analysis, Volume 88, Issue 2, February 2004, pages 365-411. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False (default), data will be centered before computation. + + block_size : int, default=1000 + Size of blocks into which the covariance matrix will be split + during its Ledoit-Wolf estimation. This is purely a memory + optimization and does not affect results. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + shrinkage_ : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + OAS : Oracle Approximating Shrinkage Estimator. + ShrunkCovariance : Covariance estimator with shrinkage. 
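A side-by-side sketch of the two automatic-shrinkage estimators on synthetic data (illustrative only, not part of the patched file); both shrink toward `mu * identity` with `mu = trace(cov) / n_features`, and differ only in how the shrinkage coefficient is chosen.

import numpy as np
from sklearn.covariance import OAS, LedoitWolf

rng = np.random.RandomState(0)
real_cov = np.array([[0.8, 0.3], [0.3, 0.4]])
X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500)

lw = LedoitWolf().fit(X)
oas = OAS().fit(X)
print(lw.shrinkage_, oas.shrinkage_)   # data-driven shrinkage coefficients
print(lw.covariance_)                  # both estimates stay close to real_cov here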
+ + Notes + ----- + The regularised covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features) + + where mu = trace(cov) / n_features + and shrinkage is given by the Ledoit and Wolf formula (see References) + + References + ---------- + "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices", + Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, + February 2004, pages 365-411. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import LedoitWolf + >>> real_cov = np.array([[.4, .2], + ... [.2, .8]]) + >>> np.random.seed(0) + >>> X = np.random.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=50) + >>> cov = LedoitWolf().fit(X) + >>> cov.covariance_ + array([[0.4406, 0.1616], + [0.1616, 0.8022]]) + >>> cov.location_ + array([ 0.0595 , -0.0075]) + + See also :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` + and :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py` + for more detailed examples. + """ + + _parameter_constraints: dict = { + **EmpiricalCovariance._parameter_constraints, + "block_size": [Interval(Integral, 1, None, closed="left")], + } + + def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000): + super().__init__( + store_precision=store_precision, assume_centered=assume_centered + ) + self.block_size = block_size + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the Ledoit-Wolf shrunk covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + # Not calling the parent object to fit, to avoid computing the + # covariance matrix (and potentially the precision) + X = validate_data(self, X) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + covariance, shrinkage = _ledoit_wolf( + X - self.location_, assume_centered=True, block_size=self.block_size + ) + self.shrinkage_ = shrinkage + self._set_covariance(covariance) + + return self + + +# OAS estimator +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def oas(X, *, assume_centered=False): + """Estimate covariance with the Oracle Approximating Shrinkage. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data from which to compute the covariance estimate. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful to work with data whose mean is significantly equal to + zero but is not exactly zero. + If False, data will be centered before computation. + + Returns + ------- + shrunk_cov : array-like of shape (n_features, n_features) + Shrunk covariance. + + shrinkage : float + Coefficient in the convex combination used for the computation + of the shrunk estimate. + + Notes + ----- + The regularised covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features), + + where mu = trace(cov) / n_features and shrinkage is given by the OAS formula + (see [1]_). + + The shrinkage formulation implemented here differs from Eq. 23 in [1]_. 
In + the original article, formula (23) states that 2/p (p being the number of + features) is multiplied by Trace(cov*cov) in both the numerator and + denominator, but this operation is omitted because for a large p, the value + of 2/p is so small that it doesn't affect the value of the estimator. + + References + ---------- + .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.", + Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. + IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010. + <0907.4698>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import oas + >>> rng = np.random.RandomState(0) + >>> real_cov = [[.8, .3], [.3, .4]] + >>> X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=500) + >>> shrunk_cov, shrinkage = oas(X) + >>> shrunk_cov + array([[0.7533, 0.2763], + [0.2763, 0.3964]]) + >>> shrinkage + np.float64(0.0195) + """ + estimator = OAS( + assume_centered=assume_centered, + ).fit(X) + return estimator.covariance_, estimator.shrinkage_ + + +class OAS(EmpiricalCovariance): + """Oracle Approximating Shrinkage Estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + store_precision : bool, default=True + Specify if the estimated precision is stored. + + assume_centered : bool, default=False + If True, data will not be centered before computation. + Useful when working with data whose mean is almost, but not exactly + zero. + If False (default), data will be centered before computation. + + Attributes + ---------- + covariance_ : ndarray of shape (n_features, n_features) + Estimated covariance matrix. + + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + + precision_ : ndarray of shape (n_features, n_features) + Estimated pseudo inverse matrix. + (stored only if store_precision is True) + + shrinkage_ : float + coefficient in the convex combination used for the computation + of the shrunk estimate. Range is [0, 1]. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + EllipticEnvelope : An object for detecting outliers in + a Gaussian distributed dataset. + EmpiricalCovariance : Maximum likelihood covariance estimator. + GraphicalLasso : Sparse inverse covariance estimation + with an l1-penalized estimator. + GraphicalLassoCV : Sparse inverse covariance with cross-validated + choice of the l1 penalty. + LedoitWolf : LedoitWolf Estimator. + MinCovDet : Minimum Covariance Determinant + (robust estimator of covariance). + ShrunkCovariance : Covariance estimator with shrinkage. + + Notes + ----- + The regularised covariance is: + + (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features), + + where mu = trace(cov) / n_features and shrinkage is given by the OAS formula + (see [1]_). + + The shrinkage formulation implemented here differs from Eq. 23 in [1]_. In + the original article, formula (23) states that 2/p (p being the number of + features) is multiplied by Trace(cov*cov) in both the numerator and + denominator, but this operation is omitted because for a large p, the value + of 2/p is so small that it doesn't affect the value of the estimator. + + References + ---------- + .. [1] :arxiv:`"Shrinkage algorithms for MMSE covariance estimation.", + Chen, Y., Wiesel, A., Eldar, Y. 
C., & Hero, A. O. + IEEE Transactions on Signal Processing, 58(10), 5016-5029, 2010. + <0907.4698>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.covariance import OAS + >>> from sklearn.datasets import make_gaussian_quantiles + >>> real_cov = np.array([[.8, .3], + ... [.3, .4]]) + >>> rng = np.random.RandomState(0) + >>> X = rng.multivariate_normal(mean=[0, 0], + ... cov=real_cov, + ... size=500) + >>> oas = OAS().fit(X) + >>> oas.covariance_ + array([[0.7533, 0.2763], + [0.2763, 0.3964]]) + >>> oas.precision_ + array([[ 1.7833, -1.2431 ], + [-1.2431, 3.3889]]) + >>> oas.shrinkage_ + np.float64(0.0195) + + See also :ref:`sphx_glr_auto_examples_covariance_plot_covariance_estimation.py` + and :ref:`sphx_glr_auto_examples_covariance_plot_lw_vs_oas.py` + for more detailed examples. + """ + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the Oracle Approximating Shrinkage covariance model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X) + # Not calling the parent object to fit, to avoid computing the + # covariance matrix (and potentially the precision) + if self.assume_centered: + self.location_ = np.zeros(X.shape[1]) + else: + self.location_ = X.mean(0) + + covariance, shrinkage = _oas(X - self.location_, assume_centered=True) + self.shrinkage_ = shrinkage + self._set_covariance(covariance) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..9c55012c158e19df20e4c4770867fc19398213d0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_covariance.py @@ -0,0 +1,374 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.covariance import ( + OAS, + EmpiricalCovariance, + LedoitWolf, + ShrunkCovariance, + empirical_covariance, + ledoit_wolf, + ledoit_wolf_shrinkage, + oas, + shrunk_covariance, +) +from sklearn.covariance._shrunk_covariance import _ledoit_wolf +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from .._shrunk_covariance import _oas + +X, _ = datasets.load_diabetes(return_X_y=True) +X_1d = X[:, 0] +n_samples, n_features = X.shape + + +def test_covariance(): + # Tests Covariance module on a simple dataset. 
+ # test covariance fit from data + cov = EmpiricalCovariance() + cov.fit(X) + emp_cov = empirical_covariance(X) + assert_array_almost_equal(emp_cov, cov.covariance_, 4) + assert_almost_equal(cov.error_norm(emp_cov), 0) + assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0) + assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0) + assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0) + assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0) + with pytest.raises(NotImplementedError): + cov.error_norm(emp_cov, norm="foo") + # Mahalanobis distances computation test + mahal_dist = cov.mahalanobis(X) + assert np.amin(mahal_dist) > 0 + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + cov = EmpiricalCovariance() + cov.fit(X_1d) + assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) + assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0) + assert_almost_equal(cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0) + + # test with one sample + # Create X with 1 sample and 5 features + X_1sample = np.arange(5).reshape(1, 5) + cov = EmpiricalCovariance() + warn_msg = "Only one sample available. You may want to reshape your data array" + with pytest.warns(UserWarning, match=warn_msg): + cov.fit(X_1sample) + + assert_array_almost_equal(cov.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) + + # test integer type + X_integer = np.asarray([[0, 1], [1, 0]]) + result = np.asarray([[0.25, -0.25], [-0.25, 0.25]]) + assert_array_almost_equal(empirical_covariance(X_integer), result) + + # test centered case + cov = EmpiricalCovariance(assume_centered=True) + cov.fit(X) + assert_array_equal(cov.location_, np.zeros(X.shape[1])) + + +@pytest.mark.parametrize("n_matrices", [1, 3]) +def test_shrunk_covariance_func(n_matrices): + """Check `shrunk_covariance` function.""" + + n_features = 2 + cov = np.ones((n_features, n_features)) + cov_target = np.array([[1, 0.5], [0.5, 1]]) + + if n_matrices > 1: + cov = np.repeat(cov[np.newaxis, ...], n_matrices, axis=0) + cov_target = np.repeat(cov_target[np.newaxis, ...], n_matrices, axis=0) + + cov_shrunk = shrunk_covariance(cov, 0.5) + assert_allclose(cov_shrunk, cov_target) + + +def test_shrunk_covariance(): + """Check consistency between `ShrunkCovariance` and `shrunk_covariance`.""" + + # Tests ShrunkCovariance module on a simple dataset. + # compare shrunk covariance obtained from data and from MLE estimate + cov = ShrunkCovariance(shrinkage=0.5) + cov.fit(X) + assert_array_almost_equal( + shrunk_covariance(empirical_covariance(X), shrinkage=0.5), cov.covariance_, 4 + ) + + # same test with shrinkage not provided + cov = ShrunkCovariance() + cov.fit(X) + assert_array_almost_equal( + shrunk_covariance(empirical_covariance(X)), cov.covariance_, 4 + ) + + # same test with shrinkage = 0 (<==> empirical_covariance) + cov = ShrunkCovariance(shrinkage=0.0) + cov.fit(X) + assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + cov = ShrunkCovariance(shrinkage=0.3) + cov.fit(X_1d) + assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) + + # test shrinkage coeff on a simple data set (without saving precision) + cov = ShrunkCovariance(shrinkage=0.5, store_precision=False) + cov.fit(X) + assert cov.precision_ is None + + +def test_ledoit_wolf(): + # Tests LedoitWolf module on a simple dataset. 
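
The `shrunk_covariance` helper exercised above applies the convex combination `(1 - shrinkage) * cov + shrinkage * mu * identity` to an already-computed covariance matrix; the hard-coded `cov_target` in `test_shrunk_covariance_func` can be reproduced by hand (a small sketch restating the formula, not an additional test):

    import numpy as np
    from sklearn.covariance import shrunk_covariance

    cov = np.ones((2, 2))                       # same input as in the test above
    shrinkage = 0.5
    mu = np.trace(cov) / cov.shape[0]           # average variance, here 1.0
    expected = (1 - shrinkage) * cov + shrinkage * mu * np.eye(2)
    assert np.allclose(shrunk_covariance(cov, shrinkage), expected)
    # expected == [[1.0, 0.5], [0.5, 1.0]], i.e. cov_target from the test
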
+ # test shrinkage coeff on a simple data set + X_centered = X - X.mean(axis=0) + lw = LedoitWolf(assume_centered=True) + lw.fit(X_centered) + shrinkage_ = lw.shrinkage_ + + score_ = lw.score(X_centered) + assert_almost_equal( + ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_ + ) + assert_almost_equal( + ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6), + shrinkage_, + ) + # compare shrunk covariance obtained from data and from MLE estimate + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf( + X_centered, assume_centered=True + ) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + # compare estimates given by LW and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True) + scov.fit(X_centered) + assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + lw = LedoitWolf(assume_centered=True) + lw.fit(X_1d) + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d, assume_centered=True) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + assert_array_almost_equal((X_1d**2).sum() / n_samples, lw.covariance_, 4) + + # test shrinkage coeff on a simple data set (without saving precision) + lw = LedoitWolf(store_precision=False, assume_centered=True) + lw.fit(X_centered) + assert_almost_equal(lw.score(X_centered), score_, 4) + assert lw.precision_ is None + + # Same tests without assuming centered data + # test shrinkage coeff on a simple data set + lw = LedoitWolf() + lw.fit(X) + assert_almost_equal(lw.shrinkage_, shrinkage_, 4) + assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X)) + assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1]) + assert_almost_equal( + lw.shrinkage_, _ledoit_wolf(X=X, assume_centered=False, block_size=10000)[1] + ) + assert_almost_equal(lw.score(X), score_, 4) + # compare shrunk covariance obtained from data and from MLE estimate + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + # compare estimates given by LW and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=lw.shrinkage_) + scov.fit(X) + assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + lw = LedoitWolf() + lw.fit(X_1d) + assert_allclose( + X_1d.var(ddof=0), + _ledoit_wolf(X=X_1d, assume_centered=False, block_size=10000)[0], + ) + lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d) + assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) + assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) + assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) + + # test with one sample + # warning should be raised when using only 1 sample + X_1sample = np.arange(5).reshape(1, 5) + lw = LedoitWolf() + + warn_msg = "Only one sample available. 
You may want to reshape your data array" + with pytest.warns(UserWarning, match=warn_msg): + lw.fit(X_1sample) + + assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) + + # test shrinkage coeff on a simple data set (without saving precision) + lw = LedoitWolf(store_precision=False) + lw.fit(X) + assert_almost_equal(lw.score(X), score_, 4) + assert lw.precision_ is None + + +def _naive_ledoit_wolf_shrinkage(X): + # A simple implementation of the formulas from Ledoit & Wolf + + # The computation below achieves the following computations of the + # "O. Ledoit and M. Wolf, A Well-Conditioned Estimator for + # Large-Dimensional Covariance Matrices" + # beta and delta are given in the beginning of section 3.2 + n_samples, n_features = X.shape + emp_cov = empirical_covariance(X, assume_centered=False) + mu = np.trace(emp_cov) / n_features + delta_ = emp_cov.copy() + delta_.flat[:: n_features + 1] -= mu + delta = (delta_**2).sum() / n_features + X2 = X**2 + beta_ = ( + 1.0 + / (n_features * n_samples) + * np.sum(np.dot(X2.T, X2) / n_samples - emp_cov**2) + ) + + beta = min(beta_, delta) + shrinkage = beta / delta + return shrinkage + + +def test_ledoit_wolf_small(): + # Compare our blocked implementation to the naive implementation + X_small = X[:, :4] + lw = LedoitWolf() + lw.fit(X_small) + shrinkage_ = lw.shrinkage_ + + assert_almost_equal(shrinkage_, _naive_ledoit_wolf_shrinkage(X_small)) + + +def test_ledoit_wolf_large(): + # test that ledoit_wolf doesn't error on data that is wider than block_size + rng = np.random.RandomState(0) + # use a number of features that is larger than the block-size + X = rng.normal(size=(10, 20)) + lw = LedoitWolf(block_size=10).fit(X) + # check that covariance is about diagonal (random normal noise) + assert_almost_equal(lw.covariance_, np.eye(20), 0) + cov = lw.covariance_ + + # check that the result is consistent with not splitting data into blocks. + lw = LedoitWolf(block_size=25).fit(X) + assert_almost_equal(lw.covariance_, cov) + + +@pytest.mark.parametrize( + "ledoit_wolf_fitting_function", [LedoitWolf().fit, ledoit_wolf_shrinkage] +) +def test_ledoit_wolf_empty_array(ledoit_wolf_fitting_function): + """Check that we validate X and raise proper error with 0-sample array.""" + X_empty = np.zeros((0, 2)) + with pytest.raises(ValueError, match="Found array with 0 sample"): + ledoit_wolf_fitting_function(X_empty) + + +def test_oas(): + # Tests OAS module on a simple dataset. 
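
For reference alongside the Ledoit-Wolf tests above: the estimator class, the `ledoit_wolf` function and the `ledoit_wolf_shrinkage` helper are three views of the same computation and agree on a given dataset. A short illustrative sketch (not part of the test suite):

    import numpy as np
    from sklearn.covariance import LedoitWolf, ledoit_wolf, ledoit_wolf_shrinkage

    rng = np.random.RandomState(42)
    X_small = rng.randn(50, 3)

    lw = LedoitWolf().fit(X_small)                      # estimator API
    cov_func, shrinkage_func = ledoit_wolf(X_small)     # function API
    assert np.allclose(lw.covariance_, cov_func)
    assert np.isclose(lw.shrinkage_, shrinkage_func)
    assert np.isclose(lw.shrinkage_, ledoit_wolf_shrinkage(X_small))
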
+ # test shrinkage coeff on a simple data set + X_centered = X - X.mean(axis=0) + oa = OAS(assume_centered=True) + oa.fit(X_centered) + shrinkage_ = oa.shrinkage_ + score_ = oa.score(X_centered) + # compare shrunk covariance obtained from data and from MLE estimate + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_centered, assume_centered=True) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + # compare estimates given by OAS and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=oa.shrinkage_, assume_centered=True) + scov.fit(X_centered) + assert_array_almost_equal(scov.covariance_, oa.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0:1] + oa = OAS(assume_centered=True) + oa.fit(X_1d) + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + assert_array_almost_equal((X_1d**2).sum() / n_samples, oa.covariance_, 4) + + # test shrinkage coeff on a simple data set (without saving precision) + oa = OAS(store_precision=False, assume_centered=True) + oa.fit(X_centered) + assert_almost_equal(oa.score(X_centered), score_, 4) + assert oa.precision_ is None + + # Same tests without assuming centered data-------------------------------- + # test shrinkage coeff on a simple data set + oa = OAS() + oa.fit(X) + assert_almost_equal(oa.shrinkage_, shrinkage_, 4) + assert_almost_equal(oa.score(X), score_, 4) + # compare shrunk covariance obtained from data and from MLE estimate + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + # compare estimates given by OAS and ShrunkCovariance + scov = ShrunkCovariance(shrinkage=oa.shrinkage_) + scov.fit(X) + assert_array_almost_equal(scov.covariance_, oa.covariance_, 4) + + # test with n_features = 1 + X_1d = X[:, 0].reshape((-1, 1)) + oa = OAS() + oa.fit(X_1d) + oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d) + assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) + assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4) + + # test with one sample + # warning should be raised when using only 1 sample + X_1sample = np.arange(5).reshape(1, 5) + oa = OAS() + warn_msg = "Only one sample available. 
You may want to reshape your data array" + with pytest.warns(UserWarning, match=warn_msg): + oa.fit(X_1sample) + + assert_array_almost_equal(oa.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) + + # test shrinkage coeff on a simple data set (without saving precision) + oa = OAS(store_precision=False) + oa.fit(X) + assert_almost_equal(oa.score(X), score_, 4) + assert oa.precision_ is None + + # test function _oas without assuming centered data + X_1f = X[:, 0:1] + oa = OAS() + oa.fit(X_1f) + # compare shrunk covariance obtained from data and from MLE estimate + _oa_cov_from_mle, _oa_shrinkage_from_mle = _oas(X_1f) + assert_array_almost_equal(_oa_cov_from_mle, oa.covariance_, 4) + assert_almost_equal(_oa_shrinkage_from_mle, oa.shrinkage_) + assert_array_almost_equal((X_1f**2).sum() / n_samples, oa.covariance_, 4) + + +def test_EmpiricalCovariance_validates_mahalanobis(): + """Checks that EmpiricalCovariance validates data with mahalanobis.""" + cov = EmpiricalCovariance().fit(X) + + msg = f"X has 2 features, but \\w+ is expecting {X.shape[1]} features as input" + with pytest.raises(ValueError, match=msg): + cov.mahalanobis(X[:, :2]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_elliptic_envelope.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_elliptic_envelope.py new file mode 100644 index 0000000000000000000000000000000000000000..ca85717fb378243ff8dcb75db1adade9a6c50c18 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_elliptic_envelope.py @@ -0,0 +1,52 @@ +""" +Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope). +""" + +import numpy as np +import pytest + +from sklearn.covariance import EllipticEnvelope +from sklearn.exceptions import NotFittedError +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + + +def test_elliptic_envelope(global_random_seed): + rnd = np.random.RandomState(global_random_seed) + X = rnd.randn(100, 10) + clf = EllipticEnvelope(contamination=0.1) + with pytest.raises(NotFittedError): + clf.predict(X) + with pytest.raises(NotFittedError): + clf.decision_function(X) + clf.fit(X) + y_pred = clf.predict(X) + scores = clf.score_samples(X) + decisions = clf.decision_function(X) + + assert_array_almost_equal(scores, -clf.mahalanobis(X)) + assert_array_almost_equal(clf.mahalanobis(X), clf.dist_) + assert_almost_equal( + clf.score(X, np.ones(100)), (100 - y_pred[y_pred == -1].size) / 100.0 + ) + assert sum(y_pred == -1) == sum(decisions < 0) + + +def test_score_samples(): + X_train = [[1, 1], [1, 2], [2, 1]] + clf1 = EllipticEnvelope(contamination=0.2).fit(X_train) + clf2 = EllipticEnvelope().fit(X_train) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), + clf1.decision_function([[2.0, 2.0]]) + clf1.offset_, + ) + assert_array_equal( + clf2.score_samples([[2.0, 2.0]]), + clf2.decision_function([[2.0, 2.0]]) + clf2.offset_, + ) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]]) + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_graphical_lasso.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_graphical_lasso.py new file mode 100644 index 0000000000000000000000000000000000000000..9698b64bf4407e216229b9e55fa4cd19896af823 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_graphical_lasso.py @@ -0,0 +1,318 @@ +"""Test the graphical_lasso module.""" + +import sys +from 
io import StringIO + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import linalg + +from sklearn import config_context, datasets +from sklearn.covariance import ( + GraphicalLasso, + GraphicalLassoCV, + empirical_covariance, + graphical_lasso, +) +from sklearn.datasets import make_sparse_spd_matrix +from sklearn.model_selection import GroupKFold +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + _convert_container, + assert_array_almost_equal, + assert_array_less, +) + + +def test_graphical_lassos(random_state=1): + """Test the graphical lasso solvers. + + This checks is unstable for some random seeds where the covariance found with "cd" + and "lars" solvers are different (4 cases / 100 tries). + """ + # Sample data from a sparse multivariate normal + dim = 20 + n_samples = 100 + random_state = check_random_state(random_state) + prec = make_sparse_spd_matrix(dim, alpha=0.95, random_state=random_state) + cov = linalg.inv(prec) + X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) + emp_cov = empirical_covariance(X) + + for alpha in (0.0, 0.1, 0.25): + covs = dict() + icovs = dict() + for method in ("cd", "lars"): + cov_, icov_, costs = graphical_lasso( + emp_cov, return_costs=True, alpha=alpha, mode=method + ) + covs[method] = cov_ + icovs[method] = icov_ + costs, dual_gap = np.array(costs).T + # Check that the costs always decrease (doesn't hold if alpha == 0) + if not alpha == 0: + # use 1e-12 since the cost can be exactly 0 + assert_array_less(np.diff(costs), 1e-12) + # Check that the 2 approaches give similar results + assert_allclose(covs["cd"], covs["lars"], atol=5e-4) + assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4) + + # Smoke test the estimator + model = GraphicalLasso(alpha=0.25).fit(X) + model.score(X) + assert_array_almost_equal(model.covariance_, covs["cd"], decimal=4) + assert_array_almost_equal(model.covariance_, covs["lars"], decimal=4) + + # For a centered matrix, assume_centered could be chosen True or False + # Check that this returns indeed the same result for centered data + Z = X - X.mean(0) + precs = list() + for assume_centered in (False, True): + prec_ = GraphicalLasso(assume_centered=assume_centered).fit(Z).precision_ + precs.append(prec_) + assert_array_almost_equal(precs[0], precs[1]) + + +def test_graphical_lasso_when_alpha_equals_0(): + """Test graphical_lasso's early return condition when alpha=0.""" + X = np.random.randn(100, 10) + emp_cov = empirical_covariance(X, assume_centered=True) + + model = GraphicalLasso(alpha=0, covariance="precomputed").fit(emp_cov) + assert_allclose(model.precision_, np.linalg.inv(emp_cov)) + + _, precision = graphical_lasso(emp_cov, alpha=0) + assert_allclose(precision, np.linalg.inv(emp_cov)) + + +@pytest.mark.parametrize("mode", ["cd", "lars"]) +def test_graphical_lasso_n_iter(mode): + X, _ = datasets.make_classification(n_samples=5_000, n_features=20, random_state=0) + emp_cov = empirical_covariance(X) + + _, _, n_iter = graphical_lasso( + emp_cov, 0.2, mode=mode, max_iter=2, return_n_iter=True + ) + assert n_iter == 2 + + +def test_graphical_lasso_iris(): + # Hard-coded solution from R glasso package for alpha=1.0 + # (need to set penalize.diagonal to FALSE) + cov_R = np.array( + [ + [0.68112222, 0.0000000, 0.265820, 0.02464314], + [0.00000000, 0.1887129, 0.000000, 0.00000000], + [0.26582000, 0.0000000, 3.095503, 0.28697200], + [0.02464314, 0.0000000, 0.286972, 0.57713289], + ] + ) + icov_R = np.array( + [ + [1.5190747, 
0.000000, -0.1304475, 0.0000000], + [0.0000000, 5.299055, 0.0000000, 0.0000000], + [-0.1304475, 0.000000, 0.3498624, -0.1683946], + [0.0000000, 0.000000, -0.1683946, 1.8164353], + ] + ) + X = datasets.load_iris().data + emp_cov = empirical_covariance(X) + for method in ("cd", "lars"): + cov, icov = graphical_lasso(emp_cov, alpha=1.0, return_costs=False, mode=method) + assert_array_almost_equal(cov, cov_R) + assert_array_almost_equal(icov, icov_R) + + +def test_graph_lasso_2D(): + # Hard-coded solution from Python skggm package + # obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)` + cov_skggm = np.array([[3.09550269, 1.186972], [1.186972, 0.57713289]]) + + icov_skggm = np.array([[1.52836773, -3.14334831], [-3.14334831, 8.19753385]]) + X = datasets.load_iris().data[:, 2:] + emp_cov = empirical_covariance(X) + for method in ("cd", "lars"): + cov, icov = graphical_lasso(emp_cov, alpha=0.1, return_costs=False, mode=method) + assert_array_almost_equal(cov, cov_skggm) + assert_array_almost_equal(icov, icov_skggm) + + +def test_graphical_lasso_iris_singular(): + # Small subset of rows to test the rank-deficient case + # Need to choose samples such that none of the variances are zero + indices = np.arange(10, 13) + + # Hard-coded solution from R glasso package for alpha=0.01 + cov_R = np.array( + [ + [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149], + [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222], + [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009], + [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222], + ] + ) + icov_R = np.array( + [ + [24.42244057, -16.831679593, 0.0, 0.0], + [-16.83168201, 24.351841681, -6.206896552, -12.5], + [0.0, -6.206896171, 153.103448276, 0.0], + [0.0, -12.499999143, 0.0, 462.5], + ] + ) + X = datasets.load_iris().data[indices, :] + emp_cov = empirical_covariance(X) + for method in ("cd", "lars"): + cov, icov = graphical_lasso( + emp_cov, alpha=0.01, return_costs=False, mode=method + ) + assert_array_almost_equal(cov, cov_R, decimal=5) + assert_array_almost_equal(icov, icov_R, decimal=5) + + +def test_graphical_lasso_cv(random_state=1): + # Sample data from a sparse multivariate normal + dim = 5 + n_samples = 6 + random_state = check_random_state(random_state) + prec = make_sparse_spd_matrix(dim, alpha=0.96, random_state=random_state) + cov = linalg.inv(prec) + X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) + # Capture stdout, to smoke test the verbose mode + orig_stdout = sys.stdout + try: + sys.stdout = StringIO() + # We need verbose very high so that Parallel prints on stdout + GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X) + finally: + sys.stdout = orig_stdout + + +@pytest.mark.parametrize("alphas_container_type", ["list", "tuple", "array"]) +def test_graphical_lasso_cv_alphas_iterable(alphas_container_type): + """Check that we can pass an array-like to `alphas`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/22489 + """ + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(0) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) + alphas = _convert_container([0.02, 0.03], alphas_container_type) + GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X) + + +@pytest.mark.parametrize( + "alphas,err_type,err_msg", + [ + ([-0.02, 0.03], ValueError, "must be > 0"), + ([0, 0.03], ValueError, "must be > 0"), + (["not_number", 0.03], TypeError, "must be an instance of float"), + ], +) +def test_graphical_lasso_cv_alphas_invalid_array(alphas, err_type, err_msg): + """Check that if an array-like containing a value + outside of (0, inf] is passed to `alphas`, a ValueError is raised. + Check if a string is passed, a TypeError is raised. + """ + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(0) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) + + with pytest.raises(err_type, match=err_msg): + GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X) + + +def test_graphical_lasso_cv_scores(): + splits = 4 + n_alphas = 5 + n_refinements = 3 + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(0) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) + cov = GraphicalLassoCV(cv=splits, alphas=n_alphas, n_refinements=n_refinements).fit( + X + ) + + _assert_graphical_lasso_cv_scores( + cov=cov, + n_splits=splits, + n_refinements=n_refinements, + n_alphas=n_alphas, + ) + + +@config_context(enable_metadata_routing=True) +def test_graphical_lasso_cv_scores_with_routing(global_random_seed): + """Check that `GraphicalLassoCV` internally dispatches metadata to + the splitter. 
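
The cross-validation bookkeeping these tests assert on can be seen end to end on the same four-feature covariance; a small illustrative sketch (not part of the test suite) that fits `GraphicalLassoCV` and reads back the selected penalty and the CV results:

    import numpy as np
    from sklearn.covariance import GraphicalLassoCV

    true_cov = np.array([
        [0.8, 0.0, 0.2, 0.0],
        [0.0, 0.4, 0.0, 0.0],
        [0.2, 0.0, 0.3, 0.1],
        [0.0, 0.0, 0.1, 0.7],
    ])
    rng = np.random.RandomState(0)
    X_sim = rng.multivariate_normal(mean=np.zeros(4), cov=true_cov, size=200)

    model = GraphicalLassoCV(alphas=4, cv=3).fit(X_sim)
    print(model.alpha_)                   # penalty selected by cross-validation
    print(list(model.cv_results_))        # 'alphas', per-split scores, mean/std scores
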
+ """ + splits = 5 + n_alphas = 5 + n_refinements = 3 + true_cov = np.array( + [ + [0.8, 0.0, 0.2, 0.0], + [0.0, 0.4, 0.0, 0.0], + [0.2, 0.0, 0.3, 0.1], + [0.0, 0.0, 0.1, 0.7], + ] + ) + rng = np.random.RandomState(global_random_seed) + X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=300) + n_samples = X.shape[0] + groups = rng.randint(0, 5, n_samples) + params = {"groups": groups} + cv = GroupKFold(n_splits=splits) + cv.set_split_request(groups=True) + + cov = GraphicalLassoCV(cv=cv, alphas=n_alphas, n_refinements=n_refinements).fit( + X, **params + ) + + _assert_graphical_lasso_cv_scores( + cov=cov, + n_splits=splits, + n_refinements=n_refinements, + n_alphas=n_alphas, + ) + + +def _assert_graphical_lasso_cv_scores(cov, n_splits, n_refinements, n_alphas): + cv_results = cov.cv_results_ + # alpha and one for each split + + total_alphas = n_refinements * n_alphas + 1 + keys = ["alphas"] + split_keys = [f"split{i}_test_score" for i in range(n_splits)] + for key in keys + split_keys: + assert key in cv_results + assert len(cv_results[key]) == total_alphas + + cv_scores = np.asarray([cov.cv_results_[key] for key in split_keys]) + expected_mean = cv_scores.mean(axis=0) + expected_std = cv_scores.std(axis=0) + + assert_allclose(cov.cv_results_["mean_test_score"], expected_mean) + assert_allclose(cov.cv_results_["std_test_score"], expected_std) diff --git a/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_robust_covariance.py b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_robust_covariance.py new file mode 100644 index 0000000000000000000000000000000000000000..a7bd3996b9e4bdc39af0f961976eb8b727c9a130 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/covariance/tests/test_robust_covariance.py @@ -0,0 +1,171 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd +from sklearn.utils._testing import assert_array_almost_equal + +X = datasets.load_iris().data +X_1d = X[:, 0] +n_samples, n_features = X.shape + + +def test_mcd(global_random_seed): + # Tests the FastMCD algorithm implementation + # Small data set + # test without outliers (random independent normal data) + launch_mcd_on_dataset(100, 5, 0, 0.02, 0.1, 75, global_random_seed) + # test with a contaminated data set (medium contamination) + launch_mcd_on_dataset(100, 5, 20, 0.3, 0.3, 65, global_random_seed) + # test with a contaminated data set (strong contamination) + launch_mcd_on_dataset(100, 5, 40, 0.1, 0.1, 50, global_random_seed) + + # Medium data set + launch_mcd_on_dataset(1000, 5, 450, 0.1, 0.1, 540, global_random_seed) + + # Large data set + launch_mcd_on_dataset(1700, 5, 800, 0.1, 0.1, 870, global_random_seed) + + # 1D data set + launch_mcd_on_dataset(500, 1, 100, 0.02, 0.02, 350, global_random_seed) + + # n_samples == n_features + launch_mcd_on_dataset(20, 20, 0, 0.1, 0.1, 15, global_random_seed) + + +def test_fast_mcd_on_invalid_input(): + X = np.arange(100) + msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=msg): + fast_mcd(X) + + +def test_mcd_class_on_invalid_input(): + X = np.arange(100) + mcd = MinCovDet() + msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=msg): + mcd.fit(X) + + +def launch_mcd_on_dataset( + n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed +): + rand_gen = 
np.random.RandomState(seed) + data = rand_gen.randn(n_samples, n_features) + # add some outliers + outliers_index = rand_gen.permutation(n_samples)[:n_outliers] + outliers_offset = 10.0 * (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5) + data[outliers_index] += outliers_offset + inliers_mask = np.ones(n_samples).astype(bool) + inliers_mask[outliers_index] = False + + pure_data = data[inliers_mask] + # compute MCD by fitting an object + mcd_fit = MinCovDet(random_state=seed).fit(data) + T = mcd_fit.location_ + S = mcd_fit.covariance_ + H = mcd_fit.support_ + # compare with the estimates learnt from the inliers + error_location = np.mean((pure_data.mean(0) - T) ** 2) + assert error_location < tol_loc + error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2) + assert error_cov < tol_cov + assert np.sum(H) >= tol_support + assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_) + + +def test_mcd_issue1127(): + # Check that the code does not break with X.shape = (3, 1) + # (i.e. n_support = n_samples) + rnd = np.random.RandomState(0) + X = rnd.normal(size=(3, 1)) + mcd = MinCovDet() + mcd.fit(X) + + +def test_mcd_issue3367(global_random_seed): + # Check that MCD completes when the covariance matrix is singular + # i.e. one of the rows and columns are all zeros + rand_gen = np.random.RandomState(global_random_seed) + + # Think of these as the values for X and Y -> 10 values between -5 and 5 + data_values = np.linspace(-5, 5, 10).tolist() + # Get the cartesian product of all possible coordinate pairs from above set + data = np.array(list(itertools.product(data_values, data_values))) + + # Add a third column that's all zeros to make our data a set of point + # within a plane, which means that the covariance matrix will be singular + data = np.hstack((data, np.zeros((data.shape[0], 1)))) + + # The below line of code should raise an exception if the covariance matrix + # is singular. As a further test, since we have points in XYZ, the + # principle components (Eigenvectors) of these directly relate to the + # geometry of the points. Since it's a plane, we should be able to test + # that the Eigenvector that corresponds to the smallest Eigenvalue is the + # plane normal, specifically [0, 0, 1], since everything is in the XY plane + # (as I've set it up above). To do this one would start by: + # + # evals, evecs = np.linalg.eigh(mcd_fit.covariance_) + # normal = evecs[:, np.argmin(evals)] + # + # After which we need to assert that our `normal` is equal to [0, 0, 1]. + # Do note that there is floating point error associated with this, so it's + # best to subtract the two and then compare some small tolerance (e.g. + # 1e-12). + MinCovDet(random_state=rand_gen).fit(data) + + +def test_mcd_support_covariance_is_zero(): + # Check that MCD returns a ValueError with informative message when the + # covariance of the support data is equal to 0. + X_1 = np.array([0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1]) + X_1 = X_1.reshape(-1, 1) + X_2 = np.array([0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3]) + X_2 = X_2.reshape(-1, 1) + msg = ( + "The covariance matrix of the support data is equal to 0, try to " + "increase support_fraction" + ) + for X in [X_1, X_2]: + with pytest.raises(ValueError, match=msg): + MinCovDet().fit(X) + + +def test_mcd_increasing_det_warning(global_random_seed): + # Check that a warning is raised if we observe increasing determinants + # during the c_step. In theory the sequence of determinants should be + # decreasing. 
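
Stepping back from the individual regression tests: the pattern used by `launch_mcd_on_dataset` above, fitting MinCovDet on contaminated data and comparing it with the plain empirical estimate, can be condensed into a small illustrative sketch (not part of the test suite):

    import numpy as np
    from sklearn.covariance import EmpiricalCovariance, MinCovDet

    rng = np.random.RandomState(0)
    X_mix = rng.randn(100, 2)
    X_mix[:10] += 10.0                     # shift 10 samples far away as outliers

    robust = MinCovDet(random_state=0).fit(X_mix)
    mle = EmpiricalCovariance().fit(X_mix)

    print(robust.location_)                # stays close to the inlier mean [0, 0]
    print(mle.location_)                   # pulled towards the outliers
    print(int(robust.support_.sum()))      # samples used for the robust estimate
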
Increasing determinants are likely due to ill-conditioned + # covariance matrices that result in poor precision matrices. + + X = [ + [5.1, 3.5, 1.4, 0.2], + [4.9, 3.0, 1.4, 0.2], + [4.7, 3.2, 1.3, 0.2], + [4.6, 3.1, 1.5, 0.2], + [5.0, 3.6, 1.4, 0.2], + [4.6, 3.4, 1.4, 0.3], + [5.0, 3.4, 1.5, 0.2], + [4.4, 2.9, 1.4, 0.2], + [4.9, 3.1, 1.5, 0.1], + [5.4, 3.7, 1.5, 0.2], + [4.8, 3.4, 1.6, 0.2], + [4.8, 3.0, 1.4, 0.1], + [4.3, 3.0, 1.1, 0.1], + [5.1, 3.5, 1.4, 0.3], + [5.7, 3.8, 1.7, 0.3], + [5.4, 3.4, 1.7, 0.2], + [4.6, 3.6, 1.0, 0.2], + [5.0, 3.0, 1.6, 0.2], + [5.2, 3.5, 1.5, 0.2], + ] + + mcd = MinCovDet(support_fraction=0.5, random_state=global_random_seed) + warn_msg = "Determinant has increased" + with pytest.warns(RuntimeWarning, match=warn_msg): + mcd.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f78f33811e5c7bfd26fac6dda83022e4d8719191 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/__init__.py @@ -0,0 +1,8 @@ +"""Algorithms for cross decomposition.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._pls import CCA, PLSSVD, PLSCanonical, PLSRegression + +__all__ = ["CCA", "PLSSVD", "PLSCanonical", "PLSRegression"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/_pls.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/_pls.py new file mode 100644 index 0000000000000000000000000000000000000000..0bf6ec8f01d065f7f170f278c3ba87a0b0ce9823 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/_pls.py @@ -0,0 +1,1097 @@ +""" +The :mod:`sklearn.pls` module implements Partial Least Squares (PLS). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +from scipy.linalg import pinv, svd + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_consistent_length +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import svd_flip +from ..utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data + +__all__ = ["PLSSVD", "PLSCanonical", "PLSRegression"] + + +def _pinv2_old(a): + # Used previous scipy pinv2 that was updated in: + # https://github.com/scipy/scipy/pull/10067 + # We can not set `cond` or `rcond` for pinv2 in scipy >= 1.3 to keep the + # same behavior of pinv2 for scipy < 1.3, because the condition used to + # determine the rank is dependent on the output of svd. + u, s, vh = svd(a, full_matrices=False, check_finite=False) + + t = u.dtype.char.lower() + factor = {"f": 1e3, "d": 1e6} + cond = np.max(s) * factor[t] * np.finfo(t).eps + rank = np.sum(s > cond) + + u = u[:, :rank] + u /= s[:rank] + return np.transpose(np.conjugate(np.dot(u, vh[:rank]))) + + +def _get_first_singular_vectors_power_method( + X, y, mode="A", max_iter=500, tol=1e-06, norm_y_weights=False +): + """Return the first left and right singular vectors of X'y. + + Provides an alternative to the svd(X'y) and uses the power method instead. 
+ With norm_y_weights to True and in mode A, this corresponds to the + algorithm section 11.3 of the Wegelin's review, except this starts at the + "update saliences" part. + """ + + eps = np.finfo(X.dtype).eps + try: + y_score = next(col for col in y.T if np.any(np.abs(col) > eps)) + except StopIteration as e: + raise StopIteration("y residual is constant") from e + + x_weights_old = 100 # init to big value for first convergence check + + if mode == "B": + # Precompute pseudo inverse matrices + # Basically: X_pinv = (X.T X)^-1 X.T + # Which requires inverting a (n_features, n_features) matrix. + # As a result, and as detailed in the Wegelin's review, CCA (i.e. mode + # B) will be unstable if n_features > n_samples or n_targets > + # n_samples + X_pinv, y_pinv = _pinv2_old(X), _pinv2_old(y) + + for i in range(max_iter): + if mode == "B": + x_weights = np.dot(X_pinv, y_score) + else: + x_weights = np.dot(X.T, y_score) / np.dot(y_score, y_score) + + x_weights /= np.sqrt(np.dot(x_weights, x_weights)) + eps + x_score = np.dot(X, x_weights) + + if mode == "B": + y_weights = np.dot(y_pinv, x_score) + else: + y_weights = np.dot(y.T, x_score) / np.dot(x_score.T, x_score) + + if norm_y_weights: + y_weights /= np.sqrt(np.dot(y_weights, y_weights)) + eps + + y_score = np.dot(y, y_weights) / (np.dot(y_weights, y_weights) + eps) + + x_weights_diff = x_weights - x_weights_old + if np.dot(x_weights_diff, x_weights_diff) < tol or y.shape[1] == 1: + break + x_weights_old = x_weights + + n_iter = i + 1 + if n_iter == max_iter: + warnings.warn("Maximum number of iterations reached", ConvergenceWarning) + + return x_weights, y_weights, n_iter + + +def _get_first_singular_vectors_svd(X, y): + """Return the first left and right singular vectors of X'y. + + Here the whole SVD is computed. + """ + C = np.dot(X.T, y) + U, _, Vt = svd(C, full_matrices=False) + return U[:, 0], Vt[0, :] + + +def _center_scale_xy(X, y, scale=True): + """Center X, y and scale if the scale parameter==True + + Returns + ------- + X, y, x_mean, y_mean, x_std, y_std + """ + # center + x_mean = X.mean(axis=0) + X -= x_mean + y_mean = y.mean(axis=0) + y -= y_mean + # scale + if scale: + x_std = X.std(axis=0, ddof=1) + x_std[x_std == 0.0] = 1.0 + X /= x_std + y_std = y.std(axis=0, ddof=1) + y_std[y_std == 0.0] = 1.0 + y /= y_std + else: + x_std = np.ones(X.shape[1]) + y_std = np.ones(y.shape[1]) + return X, y, x_mean, y_mean, x_std, y_std + + +def _svd_flip_1d(u, v): + """Same as svd_flip but works on 1d arrays, and is inplace""" + # svd_flip would force us to convert to 2d array and would also return 2d + # arrays. We don't want that. + biggest_abs_val_idx = np.argmax(np.abs(u)) + sign = np.sign(u[biggest_abs_val_idx]) + u *= sign + v *= sign + + +class _PLS( + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + RegressorMixin, + MultiOutputMixin, + BaseEstimator, + metaclass=ABCMeta, +): + """Partial Least Squares (PLS) + + This class implements the generic PLS algorithm. 
+ + Main ref: Wegelin, a survey of Partial Least Squares (PLS) methods, + with emphasis on the two-block case + https://stat.uw.edu/sites/default/files/files/reports/2000/tr371.pdf + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "scale": ["boolean"], + "deflation_mode": [StrOptions({"regression", "canonical"})], + "mode": [StrOptions({"A", "B"})], + "algorithm": [StrOptions({"svd", "nipals"})], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "copy": ["boolean"], + } + + @abstractmethod + def __init__( + self, + n_components=2, + *, + scale=True, + deflation_mode="regression", + mode="A", + algorithm="nipals", + max_iter=500, + tol=1e-06, + copy=True, + ): + self.n_components = n_components + self.deflation_mode = deflation_mode + self.mode = mode + self.scale = scale + self.algorithm = algorithm + self.max_iter = max_iter + self.tol = tol + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit model to data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target vectors, where `n_samples` is the number of samples and + `n_targets` is the number of response variables. + + Returns + ------- + self : object + Fitted model. + """ + check_consistent_length(X, y) + X = validate_data( + self, + X, + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_min_samples=2, + ) + y = check_array( + y, + input_name="y", + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_2d=False, + ) + if y.ndim == 1: + self._predict_1d = True + y = y.reshape(-1, 1) + else: + self._predict_1d = False + + n = X.shape[0] + p = X.shape[1] + q = y.shape[1] + + n_components = self.n_components + # With PLSRegression n_components is bounded by the rank of (X.T X) see + # Wegelin page 25. With CCA and PLSCanonical, n_components is bounded + # by the rank of X and the rank of y: see Wegelin page 12 + rank_upper_bound = ( + min(n, p) if self.deflation_mode == "regression" else min(n, p, q) + ) + if n_components > rank_upper_bound: + raise ValueError( + f"`n_components` upper bound is {rank_upper_bound}. " + f"Got {n_components} instead. Reduce `n_components`." + ) + + self._norm_y_weights = self.deflation_mode == "canonical" # 1.1 + norm_y_weights = self._norm_y_weights + + # Scale (in place) + Xk, yk, self._x_mean, self._y_mean, self._x_std, self._y_std = _center_scale_xy( + X, y, self.scale + ) + + self.x_weights_ = np.zeros((p, n_components)) # U + self.y_weights_ = np.zeros((q, n_components)) # V + self._x_scores = np.zeros((n, n_components)) # Xi + self._y_scores = np.zeros((n, n_components)) # Omega + self.x_loadings_ = np.zeros((p, n_components)) # Gamma + self.y_loadings_ = np.zeros((q, n_components)) # Delta + self.n_iter_ = [] + + # This whole thing corresponds to the algorithm in section 4.1 of the + # review from Wegelin. See above for a notation mapping from code to + # paper. + y_eps = np.finfo(yk.dtype).eps + for k in range(n_components): + # Find first left and right singular vectors of the X.T.dot(y) + # cross-covariance matrix. 
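
Both branches below estimate the same pair of singular vectors, the power method iteratively and `_get_first_singular_vectors_svd` via a full SVD, so with default settings `PLSCanonical(algorithm='nipals')` and `PLSCanonical(algorithm='svd')` produce essentially the same weights. A quick illustrative sketch (not part of this module):

    import numpy as np
    from sklearn.cross_decomposition import PLSCanonical

    rng = np.random.RandomState(0)
    X_demo = rng.randn(30, 5)
    Y_demo = X_demo @ rng.randn(5, 3) + 0.1 * rng.randn(30, 3)

    nipals = PLSCanonical(n_components=2, algorithm="nipals").fit(X_demo, Y_demo)
    exact = PLSCanonical(n_components=2, algorithm="svd").fit(X_demo, Y_demo)

    # Weight columns are unit-norm and sign-aligned by _svd_flip_1d, so matching
    # components should have cosine similarity close to 1.
    cosines = np.sum(nipals.x_weights_ * exact.x_weights_, axis=0)
    assert np.all(cosines > 0.99)
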
+ if self.algorithm == "nipals": + # Replace columns that are all close to zero with zeros + yk_mask = np.all(np.abs(yk) < 10 * y_eps, axis=0) + yk[:, yk_mask] = 0.0 + + try: + ( + x_weights, + y_weights, + n_iter_, + ) = _get_first_singular_vectors_power_method( + Xk, + yk, + mode=self.mode, + max_iter=self.max_iter, + tol=self.tol, + norm_y_weights=norm_y_weights, + ) + except StopIteration as e: + if str(e) != "y residual is constant": + raise + warnings.warn(f"y residual is constant at iteration {k}") + break + + self.n_iter_.append(n_iter_) + + elif self.algorithm == "svd": + x_weights, y_weights = _get_first_singular_vectors_svd(Xk, yk) + + # inplace sign flip for consistency across solvers and archs + _svd_flip_1d(x_weights, y_weights) + + # compute scores, i.e. the projections of X and y + x_scores = np.dot(Xk, x_weights) + if norm_y_weights: + y_ss = 1 + else: + y_ss = np.dot(y_weights, y_weights) + y_scores = np.dot(yk, y_weights) / y_ss + + # Deflation: subtract rank-one approx to obtain Xk+1 and yk+1 + x_loadings = np.dot(x_scores, Xk) / np.dot(x_scores, x_scores) + Xk -= np.outer(x_scores, x_loadings) + + if self.deflation_mode == "canonical": + # regress yk on y_score + y_loadings = np.dot(y_scores, yk) / np.dot(y_scores, y_scores) + yk -= np.outer(y_scores, y_loadings) + if self.deflation_mode == "regression": + # regress yk on x_score + y_loadings = np.dot(x_scores, yk) / np.dot(x_scores, x_scores) + yk -= np.outer(x_scores, y_loadings) + + self.x_weights_[:, k] = x_weights + self.y_weights_[:, k] = y_weights + self._x_scores[:, k] = x_scores + self._y_scores[:, k] = y_scores + self.x_loadings_[:, k] = x_loadings + self.y_loadings_[:, k] = y_loadings + + # X was approximated as Xi . Gamma.T + X_(R+1) + # Xi . Gamma.T is a sum of n_components rank-1 matrices. X_(R+1) is + # whatever is left to fully reconstruct X, and can be 0 if X is of rank + # n_components. + # Similarly, y was approximated as Omega . Delta.T + y_(R+1) + + # Compute transformation matrices (rotations_). See User Guide. + self.x_rotations_ = np.dot( + self.x_weights_, + pinv(np.dot(self.x_loadings_.T, self.x_weights_), check_finite=False), + ) + self.y_rotations_ = np.dot( + self.y_weights_, + pinv(np.dot(self.y_loadings_.T, self.y_weights_), check_finite=False), + ) + self.coef_ = np.dot(self.x_rotations_, self.y_loadings_.T) + self.coef_ = (self.coef_ * self._y_std).T / self._x_std + self.intercept_ = self._y_mean + self._n_features_out = self.x_rotations_.shape[1] + return self + + def transform(self, X, y=None, copy=True): + """Apply the dimension reduction. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples to transform. + + y : array-like of shape (n_samples, n_targets), default=None + Target vectors. + + copy : bool, default=True + Whether to copy `X` and `y`, or perform in-place normalization. + + Returns + ------- + x_scores, y_scores : array-like or tuple of array-like + Return `x_scores` if `y` is not given, `(x_scores, y_scores)` otherwise. 
+ """ + check_is_fitted(self) + X = validate_data(self, X, copy=copy, dtype=FLOAT_DTYPES, reset=False) + # Normalize + X -= self._x_mean + X /= self._x_std + # Apply rotation + x_scores = np.dot(X, self.x_rotations_) + if y is not None: + y = check_array( + y, input_name="y", ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES + ) + if y.ndim == 1: + y = y.reshape(-1, 1) + y -= self._y_mean + y /= self._y_std + y_scores = np.dot(y, self.y_rotations_) + return x_scores, y_scores + + return x_scores + + def inverse_transform(self, X, y=None): + """Transform data back to its original space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + New data, where `n_samples` is the number of samples + and `n_components` is the number of pls components. + + y : array-like of shape (n_samples,) or (n_samples, n_components) + New target, where `n_samples` is the number of samples + and `n_components` is the number of pls components. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Return the reconstructed `X` data. + + y_original : ndarray of shape (n_samples, n_targets) + Return the reconstructed `X` target. Only returned when `y` is given. + + Notes + ----- + This transformation will only be exact if `n_components=n_features`. + """ + check_is_fitted(self) + X = check_array(X, input_name="X", dtype=FLOAT_DTYPES) + # From pls space to original space + X_reconstructed = np.matmul(X, self.x_loadings_.T) + # Denormalize + X_reconstructed *= self._x_std + X_reconstructed += self._x_mean + + if y is not None: + y = check_array(y, input_name="y", dtype=FLOAT_DTYPES) + # From pls space to original space + y_reconstructed = np.matmul(y, self.y_loadings_.T) + # Denormalize + y_reconstructed *= self._y_std + y_reconstructed += self._y_mean + return X_reconstructed, y_reconstructed + + return X_reconstructed + + def predict(self, X, copy=True): + """Predict targets of given samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples. + + copy : bool, default=True + Whether to copy `X` or perform in-place normalization. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_targets) + Returns predicted values. + + Notes + ----- + This call requires the estimation of a matrix of shape + `(n_features, n_targets)`, which may be an issue in high dimensional + space. + """ + check_is_fitted(self) + X = validate_data(self, X, copy=copy, dtype=FLOAT_DTYPES, reset=False) + # Only center X but do not scale it since the coefficients are already scaled + X -= self._x_mean + y_pred = X @ self.coef_.T + self.intercept_ + return y_pred.ravel() if self._predict_1d else y_pred + + def fit_transform(self, X, y=None): + """Learn and apply the dimension reduction on the train data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples, n_targets), default=None + Target vectors, where `n_samples` is the number of samples and + `n_targets` is the number of response variables. + + Returns + ------- + self : ndarray of shape (n_samples, n_components) + Return `x_scores` if `y` is not given, `(x_scores, y_scores)` otherwise. 
+ """ + return self.fit(X, y).transform(X, y) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.regressor_tags.poor_score = True + tags.target_tags.required = False + return tags + + +class PLSRegression(_PLS): + """PLS regression. + + PLSRegression is also known as PLS2 or PLS1, depending on the number of + targets. + + For a comparison between other cross decomposition algorithms, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.8 + + Parameters + ---------- + n_components : int, default=2 + Number of components to keep. Should be in `[1, n_features]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + max_iter : int, default=500 + The maximum number of iterations of the power method when + `algorithm='nipals'`. Ignored otherwise. + + tol : float, default=1e-06 + The tolerance used as convergence criteria in the power method: the + algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less + than `tol`, where `u` corresponds to the left singular vector. + + copy : bool, default=True + Whether to copy `X` and `y` in :term:`fit` before applying centering, + and potentially scaling. If `False`, these operations will be done + inplace, modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the cross-covariance matrices of each + iteration. + + y_weights_ : ndarray of shape (n_targets, n_components) + The right singular vectors of the cross-covariance matrices of each + iteration. + + x_loadings_ : ndarray of shape (n_features, n_components) + The loadings of `X`. + + y_loadings_ : ndarray of shape (n_targets, n_components) + The loadings of `y`. + + x_scores_ : ndarray of shape (n_samples, n_components) + The transformed training samples. + + y_scores_ : ndarray of shape (n_samples, n_components) + The transformed training targets. + + x_rotations_ : ndarray of shape (n_features, n_components) + The projection matrix used to transform `X`. + + y_rotations_ : ndarray of shape (n_targets, n_components) + The projection matrix used to transform `y`. + + coef_ : ndarray of shape (n_target, n_features) + The coefficients of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + intercept_ : ndarray of shape (n_targets,) + The intercepts of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + .. versionadded:: 1.1 + + n_iter_ : list of shape (n_components,) + Number of iterations of the power method, for each + component. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PLSCanonical : Partial Least Squares transformer and regressor. + + Examples + -------- + >>> from sklearn.cross_decomposition import PLSRegression + >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]] + >>> y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] + >>> pls2 = PLSRegression(n_components=2) + >>> pls2.fit(X, y) + PLSRegression() + >>> y_pred = pls2.predict(X) + + For a comparison between PLS Regression and :class:`~sklearn.decomposition.PCA`, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_pcr_vs_pls.py`. 
+ """ + + _parameter_constraints: dict = {**_PLS._parameter_constraints} + for param in ("deflation_mode", "mode", "algorithm"): + _parameter_constraints.pop(param) + + # This implementation provides the same results that 3 PLS packages + # provided in the R language (R-project): + # - "mixOmics" with function pls(X, y, mode = "regression") + # - "plspm " with function plsreg2(X, y) + # - "pls" with function oscorespls.fit(X, y) + + def __init__( + self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True + ): + super().__init__( + n_components=n_components, + scale=scale, + deflation_mode="regression", + mode="A", + algorithm="nipals", + max_iter=max_iter, + tol=tol, + copy=copy, + ) + + def fit(self, X, y): + """Fit model to data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target vectors, where `n_samples` is the number of samples and + `n_targets` is the number of response variables. + + Returns + ------- + self : object + Fitted model. + """ + super().fit(X, y) + # expose the fitted attributes `x_scores_` and `y_scores_` + self.x_scores_ = self._x_scores + self.y_scores_ = self._y_scores + return self + + +class PLSCanonical(_PLS): + """Partial Least Squares transformer and regressor. + + For a comparison between other cross decomposition algorithms, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.8 + + Parameters + ---------- + n_components : int, default=2 + Number of components to keep. Should be in `[1, min(n_samples, + n_features, n_targets)]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + algorithm : {'nipals', 'svd'}, default='nipals' + The algorithm used to estimate the first singular vectors of the + cross-covariance matrix. 'nipals' uses the power method while 'svd' + will compute the whole SVD. + + max_iter : int, default=500 + The maximum number of iterations of the power method when + `algorithm='nipals'`. Ignored otherwise. + + tol : float, default=1e-06 + The tolerance used as convergence criteria in the power method: the + algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less + than `tol`, where `u` corresponds to the left singular vector. + + copy : bool, default=True + Whether to copy `X` and `y` in fit before applying centering, and + potentially scaling. If False, these operations will be done inplace, + modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the cross-covariance matrices of each + iteration. + + y_weights_ : ndarray of shape (n_targets, n_components) + The right singular vectors of the cross-covariance matrices of each + iteration. + + x_loadings_ : ndarray of shape (n_features, n_components) + The loadings of `X`. + + y_loadings_ : ndarray of shape (n_targets, n_components) + The loadings of `y`. + + x_rotations_ : ndarray of shape (n_features, n_components) + The projection matrix used to transform `X`. + + y_rotations_ : ndarray of shape (n_targets, n_components) + The projection matrix used to transform `y`. + + coef_ : ndarray of shape (n_targets, n_features) + The coefficients of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. 
+ + intercept_ : ndarray of shape (n_targets,) + The intercepts of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + .. versionadded:: 1.1 + + n_iter_ : list of shape (n_components,) + Number of iterations of the power method, for each + component. Empty if `algorithm='svd'`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + CCA : Canonical Correlation Analysis. + PLSSVD : Partial Least Square SVD. + + Examples + -------- + >>> from sklearn.cross_decomposition import PLSCanonical + >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]] + >>> y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] + >>> plsca = PLSCanonical(n_components=2) + >>> plsca.fit(X, y) + PLSCanonical() + >>> X_c, y_c = plsca.transform(X, y) + """ + + _parameter_constraints: dict = {**_PLS._parameter_constraints} + for param in ("deflation_mode", "mode"): + _parameter_constraints.pop(param) + + # This implementation provides the same results that the "plspm" package + # provided in the R language (R-project), using the function plsca(X, y). + # Results are equal or collinear with the function + # ``pls(..., mode = "canonical")`` of the "mixOmics" package. The + # difference relies in the fact that mixOmics implementation does not + # exactly implement the Wold algorithm since it does not normalize + # y_weights to one. + + def __init__( + self, + n_components=2, + *, + scale=True, + algorithm="nipals", + max_iter=500, + tol=1e-06, + copy=True, + ): + super().__init__( + n_components=n_components, + scale=scale, + deflation_mode="canonical", + mode="A", + algorithm=algorithm, + max_iter=max_iter, + tol=tol, + copy=copy, + ) + + +class CCA(_PLS): + """Canonical Correlation Analysis, also known as "Mode B" PLS. + + For a comparison between other cross decomposition algorithms, see + :ref:`sphx_glr_auto_examples_cross_decomposition_plot_compare_cross_decomposition.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Number of components to keep. Should be in `[1, min(n_samples, + n_features, n_targets)]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + max_iter : int, default=500 + The maximum number of iterations of the power method. + + tol : float, default=1e-06 + The tolerance used as convergence criteria in the power method: the + algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less + than `tol`, where `u` corresponds to the left singular vector. + + copy : bool, default=True + Whether to copy `X` and `y` in fit before applying centering, and + potentially scaling. If False, these operations will be done inplace, + modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the cross-covariance matrices of each + iteration. + + y_weights_ : ndarray of shape (n_targets, n_components) + The right singular vectors of the cross-covariance matrices of each + iteration. + + x_loadings_ : ndarray of shape (n_features, n_components) + The loadings of `X`. + + y_loadings_ : ndarray of shape (n_targets, n_components) + The loadings of `y`. + + x_rotations_ : ndarray of shape (n_features, n_components) + The projection matrix used to transform `X`. 
+ + y_rotations_ : ndarray of shape (n_targets, n_components) + The projection matrix used to transform `y`. + + coef_ : ndarray of shape (n_targets, n_features) + The coefficients of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + intercept_ : ndarray of shape (n_targets,) + The intercepts of the linear model such that `y` is approximated as + `y = X @ coef_.T + intercept_`. + + .. versionadded:: 1.1 + + n_iter_ : list of shape (n_components,) + Number of iterations of the power method, for each + component. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PLSCanonical : Partial Least Squares transformer and regressor. + PLSSVD : Partial Least Square SVD. + + Examples + -------- + >>> from sklearn.cross_decomposition import CCA + >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]] + >>> y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] + >>> cca = CCA(n_components=1) + >>> cca.fit(X, y) + CCA(n_components=1) + >>> X_c, y_c = cca.transform(X, y) + """ + + _parameter_constraints: dict = {**_PLS._parameter_constraints} + for param in ("deflation_mode", "mode", "algorithm"): + _parameter_constraints.pop(param) + + def __init__( + self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True + ): + super().__init__( + n_components=n_components, + scale=scale, + deflation_mode="canonical", + mode="B", + algorithm="nipals", + max_iter=max_iter, + tol=tol, + copy=copy, + ) + + +class PLSSVD(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Partial Least Square SVD. + + This transformer simply performs a SVD on the cross-covariance matrix + `X'y`. It is able to project both the training data `X` and the targets + `y`. The training data `X` is projected on the left singular vectors, while + the targets are projected on the right singular vectors. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.8 + + Parameters + ---------- + n_components : int, default=2 + The number of components to keep. Should be in `[1, + min(n_samples, n_features, n_targets)]`. + + scale : bool, default=True + Whether to scale `X` and `y`. + + copy : bool, default=True + Whether to copy `X` and `y` in fit before applying centering, and + potentially scaling. If `False`, these operations will be done inplace, + modifying both arrays. + + Attributes + ---------- + x_weights_ : ndarray of shape (n_features, n_components) + The left singular vectors of the SVD of the cross-covariance matrix. + Used to project `X` in :meth:`transform`. + + y_weights_ : ndarray of (n_targets, n_components) + The right singular vectors of the SVD of the cross-covariance matrix. + Used to project `X` in :meth:`transform`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PLSCanonical : Partial Least Squares transformer and regressor. + CCA : Canonical Correlation Analysis. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.cross_decomposition import PLSSVD + >>> X = np.array([[0., 0., 1.], + ... [1., 0., 0.], + ... [2., 2., 2.], + ... 
[2., 5., 4.]]) + >>> y = np.array([[0.1, -0.2], + ... [0.9, 1.1], + ... [6.2, 5.9], + ... [11.9, 12.3]]) + >>> pls = PLSSVD(n_components=2).fit(X, y) + >>> X_c, y_c = pls.transform(X, y) + >>> X_c.shape, y_c.shape + ((4, 2), (4, 2)) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "scale": ["boolean"], + "copy": ["boolean"], + } + + def __init__(self, n_components=2, *, scale=True, copy=True): + self.n_components = n_components + self.scale = scale + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit model to data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training samples. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Targets. + + Returns + ------- + self : object + Fitted estimator. + """ + check_consistent_length(X, y) + X = validate_data( + self, + X, + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_min_samples=2, + ) + y = check_array( + y, + input_name="y", + dtype=np.float64, + force_writeable=True, + copy=self.copy, + ensure_2d=False, + ) + if y.ndim == 1: + y = y.reshape(-1, 1) + + # we'll compute the SVD of the cross-covariance matrix = X.T.dot(y) + # This matrix rank is at most min(n_samples, n_features, n_targets) so + # n_components cannot be bigger than that. + n_components = self.n_components + rank_upper_bound = min(X.shape[0], X.shape[1], y.shape[1]) + if n_components > rank_upper_bound: + raise ValueError( + f"`n_components` upper bound is {rank_upper_bound}. " + f"Got {n_components} instead. Reduce `n_components`." + ) + + X, y, self._x_mean, self._y_mean, self._x_std, self._y_std = _center_scale_xy( + X, y, self.scale + ) + + # Compute SVD of cross-covariance matrix + C = np.dot(X.T, y) + U, s, Vt = svd(C, full_matrices=False) + U = U[:, :n_components] + Vt = Vt[:n_components] + U, Vt = svd_flip(U, Vt) + V = Vt.T + + self.x_weights_ = U + self.y_weights_ = V + self._n_features_out = self.x_weights_.shape[1] + return self + + def transform(self, X, y=None): + """ + Apply the dimensionality reduction. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples to be transformed. + + y : array-like of shape (n_samples,) or (n_samples, n_targets), \ + default=None + Targets. + + Returns + ------- + x_scores : array-like or tuple of array-like + The transformed data `X_transformed` if `y is not None`, + `(X_transformed, y_transformed)` otherwise. + """ + check_is_fitted(self) + X = validate_data(self, X, dtype=np.float64, reset=False) + Xr = (X - self._x_mean) / self._x_std + x_scores = np.dot(Xr, self.x_weights_) + if y is not None: + y = check_array(y, input_name="y", ensure_2d=False, dtype=np.float64) + if y.ndim == 1: + y = y.reshape(-1, 1) + yr = (y - self._y_mean) / self._y_std + y_scores = np.dot(yr, self.y_weights_) + return x_scores, y_scores + return x_scores + + def fit_transform(self, X, y=None): + """Learn and apply the dimensionality reduction. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training samples. + + y : array-like of shape (n_samples,) or (n_samples, n_targets), \ + default=None + Targets. + + Returns + ------- + out : array-like or tuple of array-like + The transformed data `X_transformed` if `y is not None`, + `(X_transformed, y_transformed)` otherwise. 
+ """ + return self.fit(X, y).transform(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/test_pls.py b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/test_pls.py new file mode 100644 index 0000000000000000000000000000000000000000..7e516d71b6f988710b71c2b8d575a80e42e87d65 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/cross_decomposition/tests/test_pls.py @@ -0,0 +1,677 @@ +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal + +from sklearn.cross_decomposition import CCA, PLSSVD, PLSCanonical, PLSRegression +from sklearn.cross_decomposition._pls import ( + _center_scale_xy, + _get_first_singular_vectors_power_method, + _get_first_singular_vectors_svd, + _svd_flip_1d, +) +from sklearn.datasets import load_linnerud, make_regression +from sklearn.ensemble import VotingRegressor +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LinearRegression +from sklearn.utils import check_random_state +from sklearn.utils.extmath import svd_flip + + +def assert_matrix_orthogonal(M): + K = np.dot(M.T, M) + assert_array_almost_equal(K, np.diag(np.diag(K))) + + +def test_pls_canonical_basics(): + # Basic checks for PLSCanonical + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSCanonical(n_components=X.shape[1]) + pls.fit(X, y) + + assert_matrix_orthogonal(pls.x_weights_) + assert_matrix_orthogonal(pls.y_weights_) + assert_matrix_orthogonal(pls._x_scores) + assert_matrix_orthogonal(pls._y_scores) + + # Check X = TP' and y = UQ' + T = pls._x_scores + P = pls.x_loadings_ + U = pls._y_scores + Q = pls.y_loadings_ + # Need to scale first + Xc, yc, x_mean, y_mean, x_std, y_std = _center_scale_xy( + X.copy(), y.copy(), scale=True + ) + assert_array_almost_equal(Xc, np.dot(T, P.T)) + assert_array_almost_equal(yc, np.dot(U, Q.T)) + + # Check that rotations on training data lead to scores + Xt = pls.transform(X) + assert_array_almost_equal(Xt, pls._x_scores) + Xt, yt = pls.transform(X, y) + assert_array_almost_equal(Xt, pls._x_scores) + assert_array_almost_equal(yt, pls._y_scores) + + # Check that inverse_transform works + X_back = pls.inverse_transform(Xt) + assert_array_almost_equal(X_back, X) + _, y_back = pls.inverse_transform(Xt, yt) + assert_array_almost_equal(y_back, y) + + +def test_sanity_check_pls_regression(): + # Sanity check for PLSRegression + # The results were checked against the R-packages plspm, misOmics and pls + + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSRegression(n_components=X.shape[1]) + X_trans, _ = pls.fit_transform(X, y) + + # FIXME: one would expect y_trans == pls.y_scores_ but this is not + # the case. 
+ # xref: https://github.com/scikit-learn/scikit-learn/issues/22420 + assert_allclose(X_trans, pls.x_scores_) + + expected_x_weights = np.array( + [ + [-0.61330704, -0.00443647, 0.78983213], + [-0.74697144, -0.32172099, -0.58183269], + [-0.25668686, 0.94682413, -0.19399983], + ] + ) + + expected_x_loadings = np.array( + [ + [-0.61470416, -0.24574278, 0.78983213], + [-0.65625755, -0.14396183, -0.58183269], + [-0.51733059, 1.00609417, -0.19399983], + ] + ) + + expected_y_weights = np.array( + [ + [+0.32456184, 0.29892183, 0.20316322], + [+0.42439636, 0.61970543, 0.19320542], + [-0.13143144, -0.26348971, -0.17092916], + ] + ) + + expected_y_loadings = np.array( + [ + [+0.32456184, 0.29892183, 0.20316322], + [+0.42439636, 0.61970543, 0.19320542], + [-0.13143144, -0.26348971, -0.17092916], + ] + ) + + assert_array_almost_equal(np.abs(pls.x_loadings_), np.abs(expected_x_loadings)) + assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights)) + assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights)) + + # The R / Python difference in the signs should be consistent across + # loadings, weights, etc. + x_loadings_sign_flip = np.sign(pls.x_loadings_ / expected_x_loadings) + x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights) + y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights) + y_loadings_sign_flip = np.sign(pls.y_loadings_ / expected_y_loadings) + assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip) + assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip) + + +def test_sanity_check_pls_regression_constant_column_y(): + # Check behavior when the first column of y is constant + # The results are checked against a modified version of plsreg2 + # from the R-package plsdepot + d = load_linnerud() + X = d.data + y = d.target + y[:, 0] = 1 + pls = PLSRegression(n_components=X.shape[1]) + pls.fit(X, y) + + expected_x_weights = np.array( + [ + [-0.6273573, 0.007081799, 0.7786994], + [-0.7493417, -0.277612681, -0.6011807], + [-0.2119194, 0.960666981, -0.1794690], + ] + ) + + expected_x_loadings = np.array( + [ + [-0.6273512, -0.22464538, 0.7786994], + [-0.6643156, -0.09871193, -0.6011807], + [-0.5125877, 1.01407380, -0.1794690], + ] + ) + + expected_y_loadings = np.array( + [ + [0.0000000, 0.0000000, 0.0000000], + [0.4357300, 0.5828479, 0.2174802], + [-0.1353739, -0.2486423, -0.1810386], + ] + ) + + assert_array_almost_equal(np.abs(expected_x_weights), np.abs(pls.x_weights_)) + assert_array_almost_equal(np.abs(expected_x_loadings), np.abs(pls.x_loadings_)) + # For the PLSRegression with default parameters, y_loadings == y_weights + assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_loadings)) + + x_loadings_sign_flip = np.sign(expected_x_loadings / pls.x_loadings_) + x_weights_sign_flip = np.sign(expected_x_weights / pls.x_weights_) + # we ignore the first full-zeros row for y + y_loadings_sign_flip = np.sign(expected_y_loadings[1:] / pls.y_loadings_[1:]) + + assert_array_equal(x_loadings_sign_flip, x_weights_sign_flip) + assert_array_equal(x_loadings_sign_flip[1:], y_loadings_sign_flip) + + +def test_sanity_check_pls_canonical(): + # Sanity check for PLSCanonical + # The results were checked against the R-package plspm + + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSCanonical(n_components=X.shape[1]) + pls.fit(X, 
y) + + expected_x_weights = np.array( + [ + [-0.61330704, 0.25616119, -0.74715187], + [-0.74697144, 0.11930791, 0.65406368], + [-0.25668686, -0.95924297, -0.11817271], + ] + ) + + expected_x_rotations = np.array( + [ + [-0.61330704, 0.41591889, -0.62297525], + [-0.74697144, 0.31388326, 0.77368233], + [-0.25668686, -0.89237972, -0.24121788], + ] + ) + + expected_y_weights = np.array( + [ + [+0.58989127, 0.7890047, 0.1717553], + [+0.77134053, -0.61351791, 0.16920272], + [-0.23887670, -0.03267062, 0.97050016], + ] + ) + + expected_y_rotations = np.array( + [ + [+0.58989127, 0.7168115, 0.30665872], + [+0.77134053, -0.70791757, 0.19786539], + [-0.23887670, -0.00343595, 0.94162826], + ] + ) + + assert_array_almost_equal(np.abs(pls.x_rotations_), np.abs(expected_x_rotations)) + assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights)) + assert_array_almost_equal(np.abs(pls.y_rotations_), np.abs(expected_y_rotations)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights)) + + x_rotations_sign_flip = np.sign(pls.x_rotations_ / expected_x_rotations) + x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights) + y_rotations_sign_flip = np.sign(pls.y_rotations_ / expected_y_rotations) + y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights) + assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip) + assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip) + + assert_matrix_orthogonal(pls.x_weights_) + assert_matrix_orthogonal(pls.y_weights_) + + assert_matrix_orthogonal(pls._x_scores) + assert_matrix_orthogonal(pls._y_scores) + + +def test_sanity_check_pls_canonical_random(): + # Sanity check for PLSCanonical on random data + # The results were checked against the R-package plspm + n = 500 + p_noise = 10 + q_noise = 5 + # 2 latents vars: + rng = check_random_state(11) + l1 = rng.normal(size=n) + l2 = rng.normal(size=n) + latents = np.array([l1, l1, l2, l2]).T + X = latents + rng.normal(size=4 * n).reshape((n, 4)) + y = latents + rng.normal(size=4 * n).reshape((n, 4)) + X = np.concatenate((X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1) + y = np.concatenate((y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1) + + pls = PLSCanonical(n_components=3) + pls.fit(X, y) + + expected_x_weights = np.array( + [ + [0.65803719, 0.19197924, 0.21769083], + [0.7009113, 0.13303969, -0.15376699], + [0.13528197, -0.68636408, 0.13856546], + [0.16854574, -0.66788088, -0.12485304], + [-0.03232333, -0.04189855, 0.40690153], + [0.1148816, -0.09643158, 0.1613305], + [0.04792138, -0.02384992, 0.17175319], + [-0.06781, -0.01666137, -0.18556747], + [-0.00266945, -0.00160224, 0.11893098], + [-0.00849528, -0.07706095, 0.1570547], + [-0.00949471, -0.02964127, 0.34657036], + [-0.03572177, 0.0945091, 0.3414855], + [0.05584937, -0.02028961, -0.57682568], + [0.05744254, -0.01482333, -0.17431274], + ] + ) + + expected_x_loadings = np.array( + [ + [0.65649254, 0.1847647, 0.15270699], + [0.67554234, 0.15237508, -0.09182247], + [0.19219925, -0.67750975, 0.08673128], + [0.2133631, -0.67034809, -0.08835483], + [-0.03178912, -0.06668336, 0.43395268], + [0.15684588, -0.13350241, 0.20578984], + [0.03337736, -0.03807306, 0.09871553], + [-0.06199844, 0.01559854, -0.1881785], + [0.00406146, -0.00587025, 0.16413253], + [-0.00374239, -0.05848466, 0.19140336], + [0.00139214, -0.01033161, 0.32239136], + [-0.05292828, 0.0953533, 0.31916881], + [0.04031924, -0.01961045, -0.65174036], + [0.06172484, -0.06597366, -0.1244497], + ] + ) + + 
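+ # Note: the sign of each singular vector is arbitrary, which is why the
+ # comparisons below use np.abs() and then only require the sign flips to
+ # be consistent between weights and loadings.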
expected_y_weights = np.array( + [ + [0.66101097, 0.18672553, 0.22826092], + [0.69347861, 0.18463471, -0.23995597], + [0.14462724, -0.66504085, 0.17082434], + [0.22247955, -0.6932605, -0.09832993], + [0.07035859, 0.00714283, 0.67810124], + [0.07765351, -0.0105204, -0.44108074], + [-0.00917056, 0.04322147, 0.10062478], + [-0.01909512, 0.06182718, 0.28830475], + [0.01756709, 0.04797666, 0.32225745], + ] + ) + + expected_y_loadings = np.array( + [ + [0.68568625, 0.1674376, 0.0969508], + [0.68782064, 0.20375837, -0.1164448], + [0.11712173, -0.68046903, 0.12001505], + [0.17860457, -0.6798319, -0.05089681], + [0.06265739, -0.0277703, 0.74729584], + [0.0914178, 0.00403751, -0.5135078], + [-0.02196918, -0.01377169, 0.09564505], + [-0.03288952, 0.09039729, 0.31858973], + [0.04287624, 0.05254676, 0.27836841], + ] + ) + + assert_array_almost_equal(np.abs(pls.x_loadings_), np.abs(expected_x_loadings)) + assert_array_almost_equal(np.abs(pls.x_weights_), np.abs(expected_x_weights)) + assert_array_almost_equal(np.abs(pls.y_loadings_), np.abs(expected_y_loadings)) + assert_array_almost_equal(np.abs(pls.y_weights_), np.abs(expected_y_weights)) + + x_loadings_sign_flip = np.sign(pls.x_loadings_ / expected_x_loadings) + x_weights_sign_flip = np.sign(pls.x_weights_ / expected_x_weights) + y_weights_sign_flip = np.sign(pls.y_weights_ / expected_y_weights) + y_loadings_sign_flip = np.sign(pls.y_loadings_ / expected_y_loadings) + assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip) + assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip) + + assert_matrix_orthogonal(pls.x_weights_) + assert_matrix_orthogonal(pls.y_weights_) + + assert_matrix_orthogonal(pls._x_scores) + assert_matrix_orthogonal(pls._y_scores) + + +def test_convergence_fail(): + # Make sure ConvergenceWarning is raised if max_iter is too small + d = load_linnerud() + X = d.data + y = d.target + pls_nipals = PLSCanonical(n_components=X.shape[1], max_iter=2) + with pytest.warns(ConvergenceWarning): + pls_nipals.fit(X, y) + + +@pytest.mark.parametrize("Est", (PLSSVD, PLSRegression, PLSCanonical)) +def test_attibutes_shapes(Est): + # Make sure attributes are of the correct shape depending on n_components + d = load_linnerud() + X = d.data + y = d.target + n_components = 2 + pls = Est(n_components=n_components) + pls.fit(X, y) + assert all( + attr.shape[1] == n_components for attr in (pls.x_weights_, pls.y_weights_) + ) + + +@pytest.mark.parametrize("Est", (PLSRegression, PLSCanonical, CCA)) +def test_univariate_equivalence(Est): + # Ensure 2D y with 1 column is equivalent to 1D y + d = load_linnerud() + X = d.data + y = d.target + + est = Est(n_components=1) + one_d_coeff = est.fit(X, y[:, 0]).coef_ + two_d_coeff = est.fit(X, y[:, :1]).coef_ + + assert one_d_coeff.shape == two_d_coeff.shape + assert_array_almost_equal(one_d_coeff, two_d_coeff) + + +@pytest.mark.parametrize("Est", (PLSRegression, PLSCanonical, CCA, PLSSVD)) +def test_copy(Est): + # check that the "copy" keyword works + d = load_linnerud() + X = d.data + y = d.target + X_orig = X.copy() + + # copy=True won't modify inplace + pls = Est(copy=True).fit(X, y) + assert_array_equal(X, X_orig) + + # copy=False will modify inplace + with pytest.raises(AssertionError): + Est(copy=False).fit(X, y) + assert_array_almost_equal(X, X_orig) + + if Est is PLSSVD: + return # PLSSVD does not support copy param in predict or transform + + X_orig = X.copy() + with pytest.raises(AssertionError): + pls.transform(X, y, copy=False) + assert_array_almost_equal(X, X_orig) + + X_orig 
= X.copy() + with pytest.raises(AssertionError): + pls.predict(X, copy=False) + assert_array_almost_equal(X, X_orig) + + # Make sure copy=True gives same transform and predictions as predict=False + assert_array_almost_equal( + pls.transform(X, y, copy=True), pls.transform(X.copy(), y.copy(), copy=False) + ) + assert_array_almost_equal( + pls.predict(X, copy=True), pls.predict(X.copy(), copy=False) + ) + + +def _generate_test_scale_and_stability_datasets(): + """Generate dataset for test_scale_and_stability""" + # dataset for non-regression 7818 + rng = np.random.RandomState(0) + n_samples = 1000 + n_targets = 5 + n_features = 10 + Q = rng.randn(n_targets, n_features) + y = rng.randn(n_samples, n_targets) + X = np.dot(y, Q) + 2 * rng.randn(n_samples, n_features) + 1 + X *= 1000 + yield X, y + + # Data set where one of the features is constraint + X, y = load_linnerud(return_X_y=True) + # causes X[:, -1].std() to be zero + X[:, -1] = 1.0 + yield X, y + + X = np.array([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [2.0, 2.0, 2.0], [3.0, 5.0, 4.0]]) + y = np.array([[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]) + yield X, y + + # Seeds that provide a non-regression test for #18746, where CCA fails + seeds = [530, 741] + for seed in seeds: + rng = np.random.RandomState(seed) + X = rng.randn(4, 3) + y = rng.randn(4, 2) + yield X, y + + +@pytest.mark.parametrize("Est", (CCA, PLSCanonical, PLSRegression, PLSSVD)) +@pytest.mark.parametrize("X, y", _generate_test_scale_and_stability_datasets()) +def test_scale_and_stability(Est, X, y): + """scale=True is equivalent to scale=False on centered/scaled data + This allows to check numerical stability over platforms as well""" + + X_s, y_s, *_ = _center_scale_xy(X, y) + + X_score, y_score = Est(scale=True).fit_transform(X, y) + X_s_score, y_s_score = Est(scale=False).fit_transform(X_s, y_s) + + assert_allclose(X_s_score, X_score, atol=1e-4) + assert_allclose(y_s_score, y_score, atol=1e-4) + + +@pytest.mark.parametrize("Estimator", (PLSSVD, PLSRegression, PLSCanonical, CCA)) +def test_n_components_upper_bounds(Estimator): + """Check the validation of `n_components` upper bounds for `PLS` regressors.""" + rng = np.random.RandomState(0) + X = rng.randn(10, 5) + y = rng.randn(10, 3) + est = Estimator(n_components=10) + err_msg = "`n_components` upper bound is .*. Got 10 instead. Reduce `n_components`." + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +def test_n_components_upper_PLSRegression(): + """Check the validation of `n_components` upper bounds for PLSRegression.""" + rng = np.random.RandomState(0) + X = rng.randn(20, 64) + y = rng.randn(20, 3) + est = PLSRegression(n_components=30) + err_msg = "`n_components` upper bound is 20. Got 30 instead. Reduce `n_components`." 
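+ # For the regression deflation mode the rank bound is
+ # min(n_samples, n_features) = min(20, 64) = 20; the 3 targets do not
+ # tighten it, hence the message expected above.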
+ with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +@pytest.mark.parametrize("n_samples, n_features", [(100, 10), (100, 200)]) +def test_singular_value_helpers(n_samples, n_features, global_random_seed): + # Make sure SVD and power method give approximately the same results + X, y = make_regression( + n_samples, n_features, n_targets=5, random_state=global_random_seed + ) + u1, v1, _ = _get_first_singular_vectors_power_method(X, y, norm_y_weights=True) + u2, v2 = _get_first_singular_vectors_svd(X, y) + + _svd_flip_1d(u1, v1) + _svd_flip_1d(u2, v2) + + rtol = 1e-3 + # Setting atol because some coordinates are very close to zero + assert_allclose(u1, u2, atol=u2.max() * rtol) + assert_allclose(v1, v2, atol=v2.max() * rtol) + + +def test_one_component_equivalence(global_random_seed): + # PLSSVD, PLSRegression and PLSCanonical should all be equivalent when + # n_components is 1 + X, y = make_regression(100, 10, n_targets=5, random_state=global_random_seed) + svd = PLSSVD(n_components=1).fit(X, y).transform(X) + reg = PLSRegression(n_components=1).fit(X, y).transform(X) + canonical = PLSCanonical(n_components=1).fit(X, y).transform(X) + + rtol = 1e-3 + # Setting atol because some entries are very close to zero + assert_allclose(svd, reg, atol=reg.max() * rtol) + assert_allclose(svd, canonical, atol=canonical.max() * rtol) + + +def test_svd_flip_1d(): + # Make sure svd_flip_1d is equivalent to svd_flip + u = np.array([1, -4, 2]) + v = np.array([1, 2, 3]) + + u_expected, v_expected = svd_flip(u.reshape(-1, 1), v.reshape(1, -1)) + _svd_flip_1d(u, v) # inplace + + assert_allclose(u, u_expected.ravel()) + assert_allclose(u, [-1, 4, -2]) + + assert_allclose(v, v_expected.ravel()) + assert_allclose(v, [-1, -2, -3]) + + +def test_loadings_converges(global_random_seed): + """Test that CCA converges. Non-regression test for #19549.""" + X, y = make_regression( + n_samples=200, n_features=20, n_targets=20, random_state=global_random_seed + ) + + cca = CCA(n_components=10, max_iter=500) + + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + + cca.fit(X, y) + + # Loadings converges to reasonable values + assert np.all(np.abs(cca.x_loadings_) < 1) + + +def test_pls_constant_y(): + """Checks warning when y is constant. Non-regression test for #19831""" + rng = np.random.RandomState(42) + x = rng.rand(100, 3) + y = np.zeros(100) + + pls = PLSRegression() + + msg = "y residual is constant at iteration" + with pytest.warns(UserWarning, match=msg): + pls.fit(x, y) + + assert_allclose(pls.x_rotations_, 0) + + +@pytest.mark.parametrize("PLSEstimator", [PLSRegression, PLSCanonical, CCA]) +def test_pls_coef_shape(PLSEstimator): + """Check the shape of `coef_` attribute. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/12410 + """ + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSEstimator(copy=True).fit(X, y) + + n_targets, n_features = y.shape[1], X.shape[1] + assert pls.coef_.shape == (n_targets, n_features) + + +@pytest.mark.parametrize("scale", [True, False]) +@pytest.mark.parametrize("PLSEstimator", [PLSRegression, PLSCanonical, CCA]) +def test_pls_prediction(PLSEstimator, scale): + """Check the behaviour of the prediction function.""" + d = load_linnerud() + X = d.data + y = d.target + + pls = PLSEstimator(copy=True, scale=scale).fit(X, y) + y_pred = pls.predict(X, copy=True) + + y_mean = y.mean(axis=0) + X_trans = X - X.mean(axis=0) + + assert_allclose(pls.intercept_, y_mean) + assert_allclose(y_pred, X_trans @ pls.coef_.T + pls.intercept_) + + +@pytest.mark.parametrize("Klass", [CCA, PLSSVD, PLSRegression, PLSCanonical]) +def test_pls_feature_names_out(Klass): + """Check `get_feature_names_out` cross_decomposition module.""" + X, y = load_linnerud(return_X_y=True) + + est = Klass().fit(X, y) + names_out = est.get_feature_names_out() + + class_name_lower = Klass.__name__.lower() + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(est.x_weights_.shape[1])], + dtype=object, + ) + assert_array_equal(names_out, expected_names_out) + + +@pytest.mark.parametrize("Klass", [CCA, PLSSVD, PLSRegression, PLSCanonical]) +def test_pls_set_output(Klass): + """Check `set_output` in cross_decomposition module.""" + pd = pytest.importorskip("pandas") + X, y = load_linnerud(return_X_y=True, as_frame=True) + + est = Klass().set_output(transform="pandas").fit(X, y) + X_trans, y_trans = est.transform(X, y) + assert isinstance(y_trans, np.ndarray) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, est.get_feature_names_out()) + + +def test_pls_regression_fit_1d_y(): + """Check that when fitting with 1d `y`, prediction should also be 1d. + + Non-regression test for Issue #26549. + """ + X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]]) + y = np.array([2, 6, 12, 20, 30, 42]) + expected = y.copy() + + plsr = PLSRegression().fit(X, y) + y_pred = plsr.predict(X) + assert y_pred.shape == expected.shape + + # Check that it works in VotingRegressor + lr = LinearRegression().fit(X, y) + vr = VotingRegressor([("lr", lr), ("plsr", plsr)]) + y_pred = vr.fit(X, y).predict(X) + assert y_pred.shape == expected.shape + assert_allclose(y_pred, expected) + + +def test_pls_regression_scaling_coef(): + """Check that when using `scale=True`, the coefficients are using the std. dev. from + both `X` and `y`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27964 + """ + # handcrafted data where we can predict y from X with an additional scaling factor + rng = np.random.RandomState(0) + coef = rng.uniform(size=(3, 5)) + X = rng.normal(scale=10, size=(30, 5)) # add a std of 10 + y = X @ coef.T + + # we need to make sure that the dimension of the latent space is large enough to + # perfectly predict `y` from `X` (no information loss) + pls = PLSRegression(n_components=5, scale=True).fit(X, y) + assert_allclose(pls.coef_, coef) + + # we therefore should be able to predict `y` from `X` + assert_allclose(pls.predict(X), y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8863fe489f3b62740757c3801ee55d7e1e406703 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/__init__.py @@ -0,0 +1,166 @@ +"""Utilities to load popular datasets and artificial data generators.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import textwrap + +from ._base import ( + clear_data_home, + fetch_file, + get_data_home, + load_breast_cancer, + load_diabetes, + load_digits, + load_files, + load_iris, + load_linnerud, + load_sample_image, + load_sample_images, + load_wine, +) +from ._california_housing import fetch_california_housing +from ._covtype import fetch_covtype +from ._kddcup99 import fetch_kddcup99 +from ._lfw import fetch_lfw_pairs, fetch_lfw_people +from ._olivetti_faces import fetch_olivetti_faces +from ._openml import fetch_openml +from ._rcv1 import fetch_rcv1 +from ._samples_generator import ( + make_biclusters, + make_blobs, + make_checkerboard, + make_circles, + make_classification, + make_friedman1, + make_friedman2, + make_friedman3, + make_gaussian_quantiles, + make_hastie_10_2, + make_low_rank_matrix, + make_moons, + make_multilabel_classification, + make_regression, + make_s_curve, + make_sparse_coded_signal, + make_sparse_spd_matrix, + make_sparse_uncorrelated, + make_spd_matrix, + make_swiss_roll, +) +from ._species_distributions import fetch_species_distributions +from ._svmlight_format_io import ( + dump_svmlight_file, + load_svmlight_file, + load_svmlight_files, +) +from ._twenty_newsgroups import fetch_20newsgroups, fetch_20newsgroups_vectorized + +__all__ = [ + "clear_data_home", + "dump_svmlight_file", + "fetch_20newsgroups", + "fetch_20newsgroups_vectorized", + "fetch_california_housing", + "fetch_covtype", + "fetch_file", + "fetch_kddcup99", + "fetch_lfw_pairs", + "fetch_lfw_people", + "fetch_olivetti_faces", + "fetch_openml", + "fetch_rcv1", + "fetch_species_distributions", + "get_data_home", + "load_breast_cancer", + "load_diabetes", + "load_digits", + "load_files", + "load_iris", + "load_linnerud", + "load_sample_image", + "load_sample_images", + "load_svmlight_file", + "load_svmlight_files", + "load_wine", + "make_biclusters", + "make_blobs", + "make_checkerboard", + "make_circles", + "make_classification", + "make_friedman1", + "make_friedman2", + "make_friedman3", + "make_gaussian_quantiles", + "make_hastie_10_2", + "make_low_rank_matrix", + "make_moons", + "make_multilabel_classification", + "make_regression", + "make_s_curve", + "make_sparse_coded_signal", + "make_sparse_spd_matrix", + "make_sparse_uncorrelated", + "make_spd_matrix", + "make_swiss_roll", +] + + +def __getattr__(name): + if name == "load_boston": + msg = 
textwrap.dedent( + """ + `load_boston` has been removed from scikit-learn since version 1.2. + + The Boston housing prices dataset has an ethical problem: as + investigated in [1], the authors of this dataset engineered a + non-invertible variable "B" assuming that racial self-segregation had a + positive impact on house prices [2]. Furthermore the goal of the + research that led to the creation of this dataset was to study the + impact of air quality but it did not give adequate demonstration of the + validity of this assumption. + + The scikit-learn maintainers therefore strongly discourage the use of + this dataset unless the purpose of the code is to study and educate + about ethical issues in data science and machine learning. + + In this special case, you can fetch the dataset from the original + source:: + + import pandas as pd + import numpy as np + + data_url = "http://lib.stat.cmu.edu/datasets/boston" + raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None) + data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]) + target = raw_df.values[1::2, 2] + + Alternative datasets include the California housing dataset and the + Ames housing dataset. You can load the datasets as follows:: + + from sklearn.datasets import fetch_california_housing + housing = fetch_california_housing() + + for the California housing dataset and:: + + from sklearn.datasets import fetch_openml + housing = fetch_openml(name="house_prices", as_frame=True) + + for the Ames housing dataset. + + [1] M Carlisle. + "Racist data destruction?" + + + [2] Harrison Jr, David, and Daniel L. Rubinfeld. + "Hedonic housing prices and the demand for clean air." + Journal of environmental economics and management 5.1 (1978): 81-102. + + """ + ) + raise ImportError(msg) + try: + return globals()[name] + except KeyError: + # This is turned into the appropriate ImportError + raise AttributeError diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..fb6e629a73c8d509ef8bc00404311b6c1bdcbb8f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py @@ -0,0 +1,543 @@ +"""Implementation of ARFF parsers: via LIAC-ARFF and pandas.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import re +from collections import OrderedDict +from collections.abc import Generator +from typing import List + +import numpy as np +import scipy as sp + +from ..externals import _arff +from ..externals._arff import ArffSparseDataType +from ..utils._chunking import chunk_generator, get_chunk_n_rows +from ..utils._optional_dependencies import check_pandas_support +from ..utils.fixes import pd_fillna + + +def _split_sparse_columns( + arff_data: ArffSparseDataType, include_columns: List +) -> ArffSparseDataType: + """Obtains several columns from sparse ARFF representation. Additionally, + the column indices are re-labelled, given the columns that are not + included. (e.g., when including [1, 2, 3], the columns will be relabelled + to [0, 1, 2]). + + Parameters + ---------- + arff_data : tuple + A tuple of three lists of equal size; first list indicating the value, + second the x coordinate and the third the y coordinate. + + include_columns : list + A list of columns to include. 
+ + Returns + ------- + arff_data_new : tuple + Subset of arff data with only the include columns indicated by the + include_columns argument. + """ + arff_data_new: ArffSparseDataType = (list(), list(), list()) + reindexed_columns = { + column_idx: array_idx for array_idx, column_idx in enumerate(include_columns) + } + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + arff_data_new[0].append(val) + arff_data_new[1].append(row_idx) + arff_data_new[2].append(reindexed_columns[col_idx]) + return arff_data_new + + +def _sparse_data_to_array( + arff_data: ArffSparseDataType, include_columns: List +) -> np.ndarray: + # turns the sparse data back into an array (can't use toarray() function, + # as this does only work on numeric data) + num_obs = max(arff_data[1]) + 1 + y_shape = (num_obs, len(include_columns)) + reindexed_columns = { + column_idx: array_idx for array_idx, column_idx in enumerate(include_columns) + } + # TODO: improve for efficiency + y = np.empty(y_shape, dtype=np.float64) + for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): + if col_idx in include_columns: + y[row_idx, reindexed_columns[col_idx]] = val + return y + + +def _post_process_frame(frame, feature_names, target_names): + """Post process a dataframe to select the desired columns in `X` and `y`. + + Parameters + ---------- + frame : dataframe + The dataframe to split into `X` and `y`. + + feature_names : list of str + The list of feature names to populate `X`. + + target_names : list of str + The list of target names to populate `y`. + + Returns + ------- + X : dataframe + The dataframe containing the features. + + y : {series, dataframe} or None + The series or dataframe containing the target. + """ + X = frame[feature_names] + if len(target_names) >= 2: + y = frame[target_names] + elif len(target_names) == 1: + y = frame[target_names[0]] + else: + y = None + return X, y + + +def _liac_arff_parser( + gzip_file, + output_arrays_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + shape=None, +): + """ARFF parser using the LIAC-ARFF library coded purely in Python. + + This parser is quite slow but consumes a generator. Currently it is needed + to parse sparse datasets. For dense datasets, it is recommended to instead + use the pandas-based parser, although it does not always handles the + dtypes exactly the same. + + Parameters + ---------- + gzip_file : GzipFile instance + The file compressed to be read. + + output_arrays_type : {"numpy", "sparse", "pandas"} + The type of the arrays that will be returned. The possibilities ara: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + A list of the feature names to be selected. + + target_names_to_select : list of str + A list of the target names to be selected. + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. 
+ """ + + def _io_to_generator(gzip_file): + for line in gzip_file: + yield line.decode("utf-8") + + stream = _io_to_generator(gzip_file) + + # find which type (dense or sparse) ARFF type we will have to deal with + return_type = _arff.COO if output_arrays_type == "sparse" else _arff.DENSE_GEN + # we should not let LIAC-ARFF to encode the nominal attributes with NumPy + # arrays to have only numerical values. + encode_nominal = not (output_arrays_type == "pandas") + arff_container = _arff.load( + stream, return_type=return_type, encode_nominal=encode_nominal + ) + columns_to_select = feature_names_to_select + target_names_to_select + + categories = { + name: cat + for name, cat in arff_container["attributes"] + if isinstance(cat, list) and name in columns_to_select + } + if output_arrays_type == "pandas": + pd = check_pandas_support("fetch_openml with as_frame=True") + + columns_info = OrderedDict(arff_container["attributes"]) + columns_names = list(columns_info.keys()) + + # calculate chunksize + first_row = next(arff_container["data"]) + first_df = pd.DataFrame([first_row], columns=columns_names, copy=False) + + row_bytes = first_df.memory_usage(deep=True).sum() + chunksize = get_chunk_n_rows(row_bytes) + + # read arff data with chunks + columns_to_keep = [col for col in columns_names if col in columns_to_select] + dfs = [first_df[columns_to_keep]] + for data in chunk_generator(arff_container["data"], chunksize): + dfs.append( + pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep] + ) + # dfs[0] contains only one row, which may not have enough data to infer to + # column's dtype. Here we use `dfs[1]` to configure the dtype in dfs[0] + if len(dfs) >= 2: + dfs[0] = dfs[0].astype(dfs[1].dtypes) + + # liac-arff parser does not depend on NumPy and uses None to represent + # missing values. To be consistent with the pandas parser, we replace + # None with np.nan. + frame = pd.concat(dfs, ignore_index=True) + frame = pd_fillna(pd, frame) + del dfs, first_df + + # cast the columns frame + dtypes = {} + for name in frame.columns: + column_dtype = openml_columns_info[name]["data_type"] + if column_dtype.lower() == "integer": + # Use a pandas extension array instead of np.int64 to be able + # to support missing values. 
+ dtypes[name] = "Int64" + elif column_dtype.lower() == "nominal": + dtypes[name] = "category" + else: + dtypes[name] = frame.dtypes[name] + frame = frame.astype(dtypes) + + X, y = _post_process_frame( + frame, feature_names_to_select, target_names_to_select + ) + else: + arff_data = arff_container["data"] + + feature_indices_to_select = [ + int(openml_columns_info[col_name]["index"]) + for col_name in feature_names_to_select + ] + target_indices_to_select = [ + int(openml_columns_info[col_name]["index"]) + for col_name in target_names_to_select + ] + + if isinstance(arff_data, Generator): + if shape is None: + raise ValueError( + "shape must be provided when arr['data'] is a Generator" + ) + if shape[0] == -1: + count = -1 + else: + count = shape[0] * shape[1] + data = np.fromiter( + itertools.chain.from_iterable(arff_data), + dtype="float64", + count=count, + ) + data = data.reshape(*shape) + X = data[:, feature_indices_to_select] + y = data[:, target_indices_to_select] + elif isinstance(arff_data, tuple): + arff_data_X = _split_sparse_columns(arff_data, feature_indices_to_select) + num_obs = max(arff_data[1]) + 1 + X_shape = (num_obs, len(feature_indices_to_select)) + X = sp.sparse.coo_matrix( + (arff_data_X[0], (arff_data_X[1], arff_data_X[2])), + shape=X_shape, + dtype=np.float64, + ) + X = X.tocsr() + y = _sparse_data_to_array(arff_data, target_indices_to_select) + else: + # This should never happen + raise ValueError( + f"Unexpected type for data obtained from arff: {type(arff_data)}" + ) + + is_classification = { + col_name in categories for col_name in target_names_to_select + } + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack( + [ + np.take( + np.asarray(categories.pop(col_name), dtype="O"), + y[:, i : i + 1].astype(int, copy=False), + ) + for i, col_name in enumerate(target_names_to_select) + ] + ) + elif any(is_classification): + raise ValueError( + "Mix of nominal and non-nominal targets is not currently supported" + ) + + # reshape y back to 1-D array, if there is only 1 target column; + # back to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + + if output_arrays_type == "pandas": + return X, y, frame, None + return X, y, None, categories + + +def _pandas_arff_parser( + gzip_file, + output_arrays_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + read_csv_kwargs=None, +): + """ARFF parser using `pandas.read_csv`. + + This parser uses the metadata fetched directly from OpenML and skips the metadata + headers of ARFF file itself. The data is loaded as a CSV file. + + Parameters + ---------- + gzip_file : GzipFile instance + The GZip compressed file with the ARFF formatted payload. + + output_arrays_type : {"numpy", "sparse", "pandas"} + The type of the arrays that will be returned. The possibilities are: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + openml_columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + A list of the feature names to be selected to build `X`. + + target_names_to_select : list of str + A list of the target names to be selected to build `y`. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. 
It allows to overwrite + the default options. + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + import pandas as pd + + # read the file until the data section to skip the ARFF metadata headers + for line in gzip_file: + if line.decode("utf-8").lower().startswith("@data"): + break + + dtypes = {} + for name in openml_columns_info: + column_dtype = openml_columns_info[name]["data_type"] + if column_dtype.lower() == "integer": + # Use Int64 to infer missing values from data + # XXX: this line is not covered by our tests. Is this really needed? + dtypes[name] = "Int64" + elif column_dtype.lower() == "nominal": + dtypes[name] = "category" + # since we will not pass `names` when reading the ARFF file, we need to translate + # `dtypes` from column names to column indices to pass to `pandas.read_csv` + dtypes_positional = { + col_idx: dtypes[name] + for col_idx, name in enumerate(openml_columns_info) + if name in dtypes + } + + default_read_csv_kwargs = { + "header": None, + "index_col": False, # always force pandas to not use the first column as index + "na_values": ["?"], # missing values are represented by `?` + "keep_default_na": False, # only `?` is a missing value given the ARFF specs + "comment": "%", # skip line starting by `%` since they are comments + "quotechar": '"', # delimiter to use for quoted strings + "skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs + "escapechar": "\\", + "dtype": dtypes_positional, + } + read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})} + frame = pd.read_csv(gzip_file, **read_csv_kwargs) + try: + # Setting the columns while reading the file will select the N first columns + # and not raise a ParserError. Instead, we set the columns after reading the + # file and raise a ParserError if the number of columns does not match the + # number of columns in the metadata given by OpenML. + frame.columns = [name for name in openml_columns_info] + except ValueError as exc: + raise pd.errors.ParserError( + "The number of columns provided by OpenML does not match the number of " + "columns inferred by pandas when reading the file." + ) from exc + + columns_to_select = feature_names_to_select + target_names_to_select + columns_to_keep = [col for col in frame.columns if col in columns_to_select] + frame = frame[columns_to_keep] + + # `pd.read_csv` automatically handles double quotes for quoting non-numeric + # CSV cell values. Contrary to LIAC-ARFF, `pd.read_csv` cannot be configured to + # consider either single quotes and double quotes as valid quoting chars at + # the same time since this case does not occur in regular (non-ARFF) CSV files. + # To mimic the behavior of LIAC-ARFF parser, we manually strip single quotes + # on categories as a post-processing steps if needed. + # + # Note however that we intentionally do not attempt to do this kind of manual + # post-processing of (non-categorical) string-typed columns because we cannot + # resolve the ambiguity of the case of CSV cell with nesting quoting such as + # `"'some string value'"` with pandas. 
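+ # For example, a category stored as 'blue' is stripped to blue, while an
+ # already unquoted category such as blue is returned unchanged.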
+ single_quote_pattern = re.compile(r"^'(?P.*)'$") + + def strip_single_quotes(input_string): + match = re.search(single_quote_pattern, input_string) + if match is None: + return input_string + + return match.group("contents") + + categorical_columns = [ + name + for name, dtype in frame.dtypes.items() + if isinstance(dtype, pd.CategoricalDtype) + ] + for col in categorical_columns: + frame[col] = frame[col].cat.rename_categories(strip_single_quotes) + + X, y = _post_process_frame(frame, feature_names_to_select, target_names_to_select) + + if output_arrays_type == "pandas": + return X, y, frame, None + else: + X, y = X.to_numpy(), y.to_numpy() + + categories = { + name: dtype.categories.tolist() + for name, dtype in frame.dtypes.items() + if isinstance(dtype, pd.CategoricalDtype) + } + return X, y, None, categories + + +def load_arff_from_gzip_file( + gzip_file, + parser, + output_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + shape=None, + read_csv_kwargs=None, +): + """Load a compressed ARFF file using a given parser. + + Parameters + ---------- + gzip_file : GzipFile instance + The file compressed to be read. + + parser : {"pandas", "liac-arff"} + The parser used to parse the ARFF file. "pandas" is recommended + but only supports loading dense datasets. + + output_type : {"numpy", "sparse", "pandas"} + The type of the arrays that will be returned. The possibilities ara: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + openml_columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + A list of the feature names to be selected. + + target_names_to_select : list of str + A list of the target names to be selected. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite + the default options. + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + if parser == "liac-arff": + return _liac_arff_parser( + gzip_file, + output_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + shape, + ) + elif parser == "pandas": + return _pandas_arff_parser( + gzip_file, + output_type, + openml_columns_info, + feature_names_to_select, + target_names_to_select, + read_csv_kwargs, + ) + else: + raise ValueError( + f"Unknown parser: '{parser}'. Should be 'liac-arff' or 'pandas'." 
+ ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_base.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e6939ddbc193ebab2022ecad56e23516b7e8a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_base.py @@ -0,0 +1,1636 @@ +""" +Base IO code for all datasets +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import csv +import gzip +import hashlib +import os +import re +import shutil +import time +import unicodedata +import warnings +from collections import namedtuple +from importlib import resources +from numbers import Integral +from os import environ, listdir, makedirs +from os.path import expanduser, isdir, join, splitext +from pathlib import Path +from tempfile import NamedTemporaryFile +from urllib.error import URLError +from urllib.parse import urlparse +from urllib.request import urlretrieve + +import numpy as np + +from ..preprocessing import scale +from ..utils import Bunch, check_random_state +from ..utils._optional_dependencies import check_pandas_support +from ..utils._param_validation import Interval, StrOptions, validate_params + +DATA_MODULE = "sklearn.datasets.data" +DESCR_MODULE = "sklearn.datasets.descr" +IMAGES_MODULE = "sklearn.datasets.images" + +RemoteFileMetadata = namedtuple("RemoteFileMetadata", ["filename", "url", "checksum"]) + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + }, + prefer_skip_nested_validation=True, +) +def get_data_home(data_home=None) -> str: + """Return the path of the scikit-learn data directory. + + This folder is used by some large dataset loaders to avoid downloading the + data several times. + + By default the data directory is set to a folder named 'scikit_learn_data' in the + user home folder. + + Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment + variable or programmatically by giving an explicit folder path. The '~' + symbol is expanded to the user home folder. + + If the folder does not already exist, it is automatically created. + + Parameters + ---------- + data_home : str or path-like, default=None + The path to scikit-learn data directory. If `None`, the default path + is `~/scikit_learn_data`. + + Returns + ------- + data_home: str + The path to scikit-learn data directory. + + Examples + -------- + >>> import os + >>> from sklearn.datasets import get_data_home + >>> data_home_path = get_data_home() + >>> os.path.exists(data_home_path) + True + """ + if data_home is None: + data_home = environ.get("SCIKIT_LEARN_DATA", join("~", "scikit_learn_data")) + data_home = expanduser(data_home) + makedirs(data_home, exist_ok=True) + return data_home + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + }, + prefer_skip_nested_validation=True, +) +def clear_data_home(data_home=None): + """Delete all the content of the data home cache. + + Parameters + ---------- + data_home : str or path-like, default=None + The path to scikit-learn data directory. If `None`, the default path + is `~/scikit_learn_data`. 
+ + Examples + -------- + >>> from sklearn.datasets import clear_data_home + >>> clear_data_home() # doctest: +SKIP + """ + data_home = get_data_home(data_home) + shutil.rmtree(data_home) + + +def _convert_data_dataframe( + caller_name, data, target, feature_names, target_names, sparse_data=False +): + pd = check_pandas_support("{} with as_frame=True".format(caller_name)) + if not sparse_data: + data_df = pd.DataFrame(data, columns=feature_names, copy=False) + else: + data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names) + + target_df = pd.DataFrame(target, columns=target_names) + combined_df = pd.concat([data_df, target_df], axis=1) + X = combined_df[feature_names] + y = combined_df[target_names] + if y.shape[1] == 1: + y = y.iloc[:, 0] + return combined_df, X, y + + +@validate_params( + { + "container_path": [str, os.PathLike], + "description": [str, None], + "categories": [list, None], + "load_content": ["boolean"], + "shuffle": ["boolean"], + "encoding": [str, None], + "decode_error": [StrOptions({"strict", "ignore", "replace"})], + "random_state": ["random_state"], + "allowed_extensions": [list, None], + }, + prefer_skip_nested_validation=True, +) +def load_files( + container_path, + *, + description=None, + categories=None, + load_content=True, + shuffle=True, + encoding=None, + decode_error="strict", + random_state=0, + allowed_extensions=None, +): + """Load text files with categories as subfolder names. + + Individual samples are assumed to be files stored a two levels folder + structure such as the following: + + .. code-block:: text + + container_folder/ + category_1_folder/ + file_1.txt + file_2.txt + ... + file_42.txt + category_2_folder/ + file_43.txt + file_44.txt + ... + + The folder names are used as supervised signal label names. The individual + file names are not important. + + This function does not try to extract features into a numpy array or scipy + sparse matrix. In addition, if load_content is false it does not try to + load the files in memory. + + To use text files in a scikit-learn classification or clustering algorithm, + you will need to use the :mod:`~sklearn.feature_extraction.text` module to + build a feature extraction transformer that suits your problem. + + If you set load_content=True, you should also specify the encoding of the + text using the 'encoding' parameter. For many modern text files, 'utf-8' + will be the correct encoding. If you leave encoding equal to None, then the + content will be made of bytes instead of Unicode, and you will not be able + to use most functions in :mod:`~sklearn.feature_extraction.text`. + + Similar feature extractors should be built for other kind of unstructured + data input such as images, audio, video, ... + + If you want files with a specific file extension (e.g. `.txt`) then you + can pass a list of those file extensions to `allowed_extensions`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + container_path : str + Path to the main folder holding one subfolder per category. + + description : str, default=None + A paragraph describing the characteristic of the dataset: its source, + reference, etc. + + categories : list of str, default=None + If None (default), load all the categories. If not None, list of + category names to load (other categories ignored). + + load_content : bool, default=True + Whether to load or not the content of the different files. If true a + 'data' attribute containing the text information is present in the data + structure returned. 
If not, a filenames attribute gives the path to the + files. + + shuffle : bool, default=True + Whether or not to shuffle the data: might be important for models that + make the assumption that the samples are independent and identically + distributed (i.i.d.), such as stochastic gradient descent. + + encoding : str, default=None + If None, do not try to decode the content of the files (e.g. for images + or other non-text content). If not None, encoding to use to decode text + files to Unicode if load_content is True. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. Passed as keyword + argument 'errors' to bytes.decode. + + random_state : int, RandomState instance or None, default=0 + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + allowed_extensions : list of str, default=None + List of desired file extensions to filter the files to be loaded. + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : list of str + Only present when `load_content=True`. + The raw text data to learn. + target : ndarray + The target labels (integer index). + target_names : list + The names of target classes. + DESCR : str + The full description of the dataset. + filenames: ndarray + The filenames holding the dataset. + + Examples + -------- + >>> from sklearn.datasets import load_files + >>> container_path = "./" + >>> load_files(container_path) # doctest: +SKIP + """ + + target = [] + target_names = [] + filenames = [] + + folders = [ + f for f in sorted(listdir(container_path)) if isdir(join(container_path, f)) + ] + + if categories is not None: + folders = [f for f in folders if f in categories] + + if allowed_extensions is not None: + allowed_extensions = frozenset(allowed_extensions) + + for label, folder in enumerate(folders): + target_names.append(folder) + folder_path = join(container_path, folder) + files = sorted(listdir(folder_path)) + if allowed_extensions is not None: + documents = [ + join(folder_path, file) + for file in files + if os.path.splitext(file)[1] in allowed_extensions + ] + else: + documents = [join(folder_path, file) for file in files] + target.extend(len(documents) * [label]) + filenames.extend(documents) + + # convert to array for fancy indexing + filenames = np.array(filenames) + target = np.array(target) + + if shuffle: + random_state = check_random_state(random_state) + indices = np.arange(filenames.shape[0]) + random_state.shuffle(indices) + filenames = filenames[indices] + target = target[indices] + + if load_content: + data = [] + for filename in filenames: + data.append(Path(filename).read_bytes()) + if encoding is not None: + data = [d.decode(encoding, decode_error) for d in data] + return Bunch( + data=data, + filenames=filenames, + target_names=target_names, + target=target, + DESCR=description, + ) + + return Bunch( + filenames=filenames, target_names=target_names, target=target, DESCR=description + ) + + +def load_csv_data( + data_file_name, + *, + data_module=DATA_MODULE, + descr_file_name=None, + descr_module=DESCR_MODULE, + encoding="utf-8", +): + """Loads `data_file_name` from `data_module with `importlib.resources`. + + Parameters + ---------- + data_file_name : str + Name of csv file to be loaded from `data_module/data_file_name`. 
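# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# End-to-end use of load_files (defined above) on a throwaway two-category
# directory tree; the folder layout and file names are hypothetical.
import os
import tempfile

from sklearn.datasets import load_files

root = tempfile.mkdtemp()
for category, text in [("neg", "dull and slow"), ("pos", "sharp and fun")]:
    os.makedirs(os.path.join(root, category), exist_ok=True)
    with open(os.path.join(root, category, "review.txt"), "w", encoding="utf-8") as fh:
        fh.write(text)

reviews = load_files(root, encoding="utf-8", allowed_extensions=[".txt"])
print(reviews.target_names)   # folder names become the class labels: ['neg', 'pos']
print(reviews.data[0])        # decoded text of the first (shuffled) sample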
+ For example `'wine_data.csv'`. + + data_module : str or module, default='sklearn.datasets.data' + Module where data lives. The default is `'sklearn.datasets.data'`. + + descr_file_name : str, default=None + Name of rst file to be loaded from `descr_module/descr_file_name`. + For example `'wine_data.rst'`. See also :func:`load_descr`. + If not None, also returns the corresponding description of + the dataset. + + descr_module : str or module, default='sklearn.datasets.descr' + Module where `descr_file_name` lives. See also :func:`load_descr`. + The default is `'sklearn.datasets.descr'`. + + Returns + ------- + data : ndarray of shape (n_samples, n_features) + A 2D array with each row representing one sample and each column + representing the features of a given sample. + + target : ndarry of shape (n_samples,) + A 1D array holding target variables for all the samples in `data`. + For example target[0] is the target variable for data[0]. + + target_names : ndarry of shape (n_samples,) + A 1D array containing the names of the classifications. For example + target_names[0] is the name of the target[0] class. + + descr : str, optional + Description of the dataset (the content of `descr_file_name`). + Only returned if `descr_file_name` is not None. + + encoding : str, optional + Text encoding of the CSV file. + + .. versionadded:: 1.4 + """ + data_path = resources.files(data_module) / data_file_name + with data_path.open("r", encoding="utf-8") as csv_file: + data_file = csv.reader(csv_file) + temp = next(data_file) + n_samples = int(temp[0]) + n_features = int(temp[1]) + target_names = np.array(temp[2:]) + data = np.empty((n_samples, n_features)) + target = np.empty((n_samples,), dtype=int) + + for i, ir in enumerate(data_file): + data[i] = np.asarray(ir[:-1], dtype=np.float64) + target[i] = np.asarray(ir[-1], dtype=int) + + if descr_file_name is None: + return data, target, target_names + else: + assert descr_module is not None + descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name) + return data, target, target_names, descr + + +def load_gzip_compressed_csv_data( + data_file_name, + *, + data_module=DATA_MODULE, + descr_file_name=None, + descr_module=DESCR_MODULE, + encoding="utf-8", + **kwargs, +): + """Loads gzip-compressed with `importlib.resources`. + + 1) Open resource file with `importlib.resources.open_binary` + 2) Decompress file obj with `gzip.open` + 3) Load decompressed data with `np.loadtxt` + + Parameters + ---------- + data_file_name : str + Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from + `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`. + + data_module : str or module, default='sklearn.datasets.data' + Module where data lives. The default is `'sklearn.datasets.data'`. + + descr_file_name : str, default=None + Name of rst file to be loaded from `descr_module/descr_file_name`. + For example `'wine_data.rst'`. See also :func:`load_descr`. + If not None, also returns the corresponding description of + the dataset. + + descr_module : str or module, default='sklearn.datasets.descr' + Module where `descr_file_name` lives. See also :func:`load_descr`. + The default is `'sklearn.datasets.descr'`. + + encoding : str, default="utf-8" + Name of the encoding that the gzip-decompressed file will be + decoded with. The default is 'utf-8'. + + **kwargs : dict, optional + Keyword arguments to be passed to `np.loadtxt`; + e.g. delimiter=','. 
+ + Returns + ------- + data : ndarray of shape (n_samples, n_features) + A 2D array with each row representing one sample and each column + representing the features and/or target of a given sample. + + descr : str, optional + Description of the dataset (the content of `descr_file_name`). + Only returned if `descr_file_name` is not None. + """ + data_path = resources.files(data_module) / data_file_name + with data_path.open("rb") as compressed_file: + compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding) + data = np.loadtxt(compressed_file, **kwargs) + + if descr_file_name is None: + return data + else: + assert descr_module is not None + descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name) + return data, descr + + +def load_descr(descr_file_name, *, descr_module=DESCR_MODULE, encoding="utf-8"): + """Load `descr_file_name` from `descr_module` with `importlib.resources`. + + Parameters + ---------- + descr_file_name : str, default=None + Name of rst file to be loaded from `descr_module/descr_file_name`. + For example `'wine_data.rst'`. See also :func:`load_descr`. + If not None, also returns the corresponding description of + the dataset. + + descr_module : str or module, default='sklearn.datasets.descr' + Module where `descr_file_name` lives. See also :func:`load_descr`. + The default is `'sklearn.datasets.descr'`. + + encoding : str, default="utf-8" + Name of the encoding that `descr_file_name` will be decoded with. + The default is 'utf-8'. + + .. versionadded:: 1.4 + + Returns + ------- + fdescr : str + Content of `descr_file_name`. + """ + path = resources.files(descr_module) / descr_file_name + return path.read_text(encoding=encoding) + + +@validate_params( + { + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_wine(*, return_X_y=False, as_frame=False): + """Load and return the wine dataset (classification). + + .. versionadded:: 0.18 + + The wine dataset is a classic and very easy multi-class classification + dataset. + + ================= ============== + Classes 3 + Samples per class [59,71,48] + Samples total 178 + Dimensionality 13 + Features real, positive + ================= ============== + + The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit + standard format from: + https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (178, 13) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, Series} of shape (178,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + target_names: list + The names of target classes. 
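# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# The three access patterns offered by load_wine (and the other small loaders
# in this module); the as_frame variant assumes pandas is installed.
from sklearn.datasets import load_wine

wine = load_wine()                            # Bunch: .data, .target, .DESCR, ...
X, y = load_wine(return_X_y=True)             # plain NumPy arrays
X_df, y_ser = load_wine(return_X_y=True, as_frame=True)  # pandas DataFrame / Series
print(X.shape, y.shape)                       # (178, 13) (178,)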
+ frame: DataFrame of shape (178, 14) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR: str + The full description of the dataset. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarrays by default. The first contains a 2D array of shape + (178, 13) with each row representing one sample and each column representing + the features. The second array of shape (178,) contains the target samples. + + Examples + -------- + Let's say you are interested in the samples 10, 80, and 140, and want to + know their class name. + + >>> from sklearn.datasets import load_wine + >>> data = load_wine() + >>> data.target[[10, 80, 140]] + array([0, 1, 2]) + >>> list(data.target_names) + [np.str_('class_0'), np.str_('class_1'), np.str_('class_2')] + """ + + data, target, target_names, fdescr = load_csv_data( + data_file_name="wine_data.csv", descr_file_name="wine_data.rst" + ) + + feature_names = [ + "alcohol", + "malic_acid", + "ash", + "alcalinity_of_ash", + "magnesium", + "total_phenols", + "flavanoids", + "nonflavanoid_phenols", + "proanthocyanins", + "color_intensity", + "hue", + "od280/od315_of_diluted_wines", + "proline", + ] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_wine", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + DESCR=fdescr, + feature_names=feature_names, + ) + + +@validate_params( + {"return_X_y": ["boolean"], "as_frame": ["boolean"]}, + prefer_skip_nested_validation=True, +) +def load_iris(*, return_X_y=False, as_frame=False): + """Load and return the iris dataset (classification). + + The iris dataset is a classic and very easy multi-class classification + dataset. + + ================= ============== + Classes 3 + Samples per class 50 + Samples total 150 + Dimensionality 4 + Features real, positive + ================= ============== + + Read more in the :ref:`User Guide `. + + .. versionchanged:: 0.20 + Fixed two wrong data points according to Fisher's paper. + The new version is the same as in R, but not as in the UCI + Machine Learning Repository. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (150, 4) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, Series} of shape (150,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + target_names: ndarray of shape (3, ) + The names of target classes. + frame: DataFrame of shape (150, 5) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. 
versionadded:: 0.23 + DESCR: str + The full description of the dataset. + filename: str + The path to the location of the data. + + .. versionadded:: 0.20 + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of shape + (n_samples, n_features) with each row representing one sample and + each column representing the features. The second ndarray of shape + (n_samples,) containing the target samples. + + .. versionadded:: 0.18 + + Examples + -------- + Let's say you are interested in the samples 10, 25, and 50, and want to + know their class name. + + >>> from sklearn.datasets import load_iris + >>> data = load_iris() + >>> data.target[[10, 25, 50]] + array([0, 0, 1]) + >>> list(data.target_names) + [np.str_('setosa'), np.str_('versicolor'), np.str_('virginica')] + + See :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` for a more + detailed example of how to work with the iris dataset. + """ + data_file_name = "iris.csv" + data, target, target_names, fdescr = load_csv_data( + data_file_name=data_file_name, descr_file_name="iris.rst" + ) + + feature_names = [ + "sepal length (cm)", + "sepal width (cm)", + "petal length (cm)", + "petal width (cm)", + ] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_iris", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + DESCR=fdescr, + feature_names=feature_names, + filename=data_file_name, + data_module=DATA_MODULE, + ) + + +@validate_params( + {"return_X_y": ["boolean"], "as_frame": ["boolean"]}, + prefer_skip_nested_validation=True, +) +def load_breast_cancer(*, return_X_y=False, as_frame=False): + """Load and return the breast cancer Wisconsin dataset (classification). + + The breast cancer dataset is a classic and very easy binary classification + dataset. + + ================= ============== + Classes 2 + Samples per class 212(M),357(B) + Samples total 569 + Dimensionality 30 + Features real, positive + ================= ============== + + The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is + downloaded from: + https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (569, 30) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target : {ndarray, Series} of shape (569,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names : ndarray of shape (30,) + The names of the dataset columns. + target_names : ndarray of shape (2,) + The names of target classes. 
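# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# Mapping integer targets back to class names with the iris loader defined above.
from sklearn.datasets import load_iris

iris = load_iris()
print(iris.feature_names[:2])                                # first two column names
print([str(iris.target_names[t]) for t in iris.target[:3]])  # ['setosa', 'setosa', 'setosa']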
+ frame : DataFrame of shape (569, 31) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR : str + The full description of the dataset. + filename : str + The path to the location of the data. + + .. versionadded:: 0.20 + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarrays by default. The first contains a 2D ndarray of + shape (569, 30) with each row representing one sample and each column + representing the features. The second ndarray of shape (569,) contains + the target samples. If `as_frame=True`, both arrays are pandas objects, + i.e. `X` a dataframe and `y` a series. + + .. versionadded:: 0.18 + + Examples + -------- + Let's say you are interested in the samples 10, 50, and 85, and want to + know their class name. + + >>> from sklearn.datasets import load_breast_cancer + >>> data = load_breast_cancer() + >>> data.target[[10, 50, 85]] + array([0, 1, 0]) + >>> list(data.target_names) + [np.str_('malignant'), np.str_('benign')] + """ + data_file_name = "breast_cancer.csv" + data, target, target_names, fdescr = load_csv_data( + data_file_name=data_file_name, descr_file_name="breast_cancer.rst" + ) + + feature_names = np.array( + [ + "mean radius", + "mean texture", + "mean perimeter", + "mean area", + "mean smoothness", + "mean compactness", + "mean concavity", + "mean concave points", + "mean symmetry", + "mean fractal dimension", + "radius error", + "texture error", + "perimeter error", + "area error", + "smoothness error", + "compactness error", + "concavity error", + "concave points error", + "symmetry error", + "fractal dimension error", + "worst radius", + "worst texture", + "worst perimeter", + "worst area", + "worst smoothness", + "worst compactness", + "worst concavity", + "worst concave points", + "worst symmetry", + "worst fractal dimension", + ] + ) + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_breast_cancer", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + DESCR=fdescr, + feature_names=feature_names, + filename=data_file_name, + data_module=DATA_MODULE, + ) + + +@validate_params( + { + "n_class": [Interval(Integral, 1, 10, closed="both")], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_digits(*, n_class=10, return_X_y=False, as_frame=False): + """Load and return the digits dataset (classification). + + Each datapoint is a 8x8 image of a digit. + + ================= ============== + Classes 10 + Samples per class ~180 + Samples total 1797 + Dimensionality 64 + Features integers 0-16 + ================= ============== + + This is a copy of the test set of the UCI ML hand-written digits datasets + https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_class : int, default=10 + The number of classes to return. Between 0 and 10. + + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). 
The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (1797, 64) + The flattened data matrix. If `as_frame=True`, `data` will be + a pandas DataFrame. + target: {ndarray, Series} of shape (1797,) + The classification target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + target_names: list + The names of target classes. + + .. versionadded:: 0.20 + + frame: DataFrame of shape (1797, 65) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + images: {ndarray} of shape (1797, 8, 8) + The raw image data. + DESCR: str + The full description of the dataset. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarrays by default. The first contains a 2D ndarray of + shape (1797, 64) with each row representing one sample and each column + representing the features. The second ndarray of shape (1797) contains + the target samples. If `as_frame=True`, both arrays are pandas objects, + i.e. `X` a dataframe and `y` a series. + + .. versionadded:: 0.18 + + Examples + -------- + To load the data and visualize the images:: + + >>> from sklearn.datasets import load_digits + >>> digits = load_digits() + >>> print(digits.data.shape) + (1797, 64) + >>> import matplotlib.pyplot as plt + >>> plt.matshow(digits.images[0], cmap="gray") + <...> + >>> plt.show() + """ + + data, fdescr = load_gzip_compressed_csv_data( + data_file_name="digits.csv.gz", descr_file_name="digits.rst", delimiter="," + ) + + target = data[:, -1].astype(int, copy=False) + flat_data = data[:, :-1] + images = flat_data.view() + images.shape = (-1, 8, 8) + + if n_class < 10: + idx = target < n_class + flat_data, target = flat_data[idx], target[idx] + images = images[idx] + + feature_names = [ + "pixel_{}_{}".format(row_idx, col_idx) + for row_idx in range(8) + for col_idx in range(8) + ] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, flat_data, target = _convert_data_dataframe( + "load_digits", flat_data, target, feature_names, target_columns + ) + + if return_X_y: + return flat_data, target + + return Bunch( + data=flat_data, + target=target, + frame=frame, + feature_names=feature_names, + target_names=np.arange(10), + images=images, + DESCR=fdescr, + ) + + +@validate_params( + {"return_X_y": ["boolean"], "as_frame": ["boolean"], "scaled": ["boolean"]}, + prefer_skip_nested_validation=True, +) +def load_diabetes(*, return_X_y=False, as_frame=False, scaled=True): + """Load and return the diabetes dataset (regression). + + ============== ================== + Samples total 442 + Dimensionality 10 + Features real, -.2 < x < .2 + Targets integer 25 - 346 + ============== ================== + + .. note:: + The meaning of each feature (i.e. `feature_names`) might be unclear + (especially for `ltg`) as the documentation of the original dataset is + not explicit. We provide information that seems correct in regard with + the scientific literature in this field of research. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. 
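# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# For load_digits above: the flattened rows in `data` hold the same pixels as
# the 8x8 arrays in `images`, and `n_class` filters both consistently.
import numpy as np

from sklearn.datasets import load_digits

digits = load_digits(n_class=3)               # keep only digits 0, 1 and 2
print(digits.data.shape)                      # (n_samples, 64)
print(np.array_equal(digits.images[0], digits.data[0].reshape(8, 8)))  # True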
+ See below for more information about the `data` and `target` object. + + .. versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + scaled : bool, default=True + If True, the feature variables are mean centered and scaled by the + standard deviation times the square root of `n_samples`. + If False, raw data is returned for the feature variables. + + .. versionadded:: 1.1 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (442, 10) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, Series} of shape (442,) + The regression target. If `as_frame=True`, `target` will be + a pandas Series. + feature_names: list + The names of the dataset columns. + frame: DataFrame of shape (442, 11) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR: str + The full description of the dataset. + data_filename: str + The path to the location of the data. + target_filename: str + The path to the location of the target. + + (data, target) : tuple if ``return_X_y`` is True + Returns a tuple of two ndarray of shape (n_samples, n_features) + A 2D array with each row representing one sample and each column + representing the features and/or target of a given sample. + + .. versionadded:: 0.18 + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> diabetes = load_diabetes() + >>> diabetes.target[:3] + array([151., 75., 141.]) + >>> diabetes.data.shape + (442, 10) + """ + data_filename = "diabetes_data_raw.csv.gz" + target_filename = "diabetes_target.csv.gz" + data = load_gzip_compressed_csv_data(data_filename) + target = load_gzip_compressed_csv_data(target_filename) + + if scaled: + data = scale(data, copy=False) + data /= data.shape[0] ** 0.5 + + fdescr = load_descr("diabetes.rst") + + feature_names = ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"] + + frame = None + target_columns = [ + "target", + ] + if as_frame: + frame, data, target = _convert_data_dataframe( + "load_diabetes", data, target, feature_names, target_columns + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + DESCR=fdescr, + feature_names=feature_names, + data_filename=data_filename, + target_filename=target_filename, + data_module=DATA_MODULE, + ) + + +@validate_params( + { + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def load_linnerud(*, return_X_y=False, as_frame=False): + """Load and return the physical exercise Linnerud dataset. + + This dataset is suitable for multi-output regression tasks. + + ============== ============================ + Samples total 20 + Dimensionality 3 (for both data and target) + Features integer + Targets integer + ============== ============================ + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object. + + .. 
versionadded:: 0.18 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target columns. + If `return_X_y` is True, then (`data`, `target`) will be pandas + DataFrames or Series as described below. + + .. versionadded:: 0.23 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (20, 3) + The data matrix. If `as_frame=True`, `data` will be a pandas + DataFrame. + target: {ndarray, dataframe} of shape (20, 3) + The regression targets. If `as_frame=True`, `target` will be + a pandas DataFrame. + feature_names: list + The names of the dataset columns. + target_names: list + The names of the target columns. + frame: DataFrame of shape (20, 6) + Only present when `as_frame=True`. DataFrame with `data` and + `target`. + + .. versionadded:: 0.23 + DESCR: str + The full description of the dataset. + data_filename: str + The path to the location of the data. + target_filename: str + The path to the location of the target. + + .. versionadded:: 0.20 + + (data, target) : tuple if ``return_X_y`` is True + Returns a tuple of two ndarrays or dataframe of shape + `(20, 3)`. Each row represents one sample and each column represents the + features in `X` and a target in `y` of a given sample. + + .. versionadded:: 0.18 + + Examples + -------- + >>> from sklearn.datasets import load_linnerud + >>> linnerud = load_linnerud() + >>> linnerud.data.shape + (20, 3) + >>> linnerud.target.shape + (20, 3) + """ + data_filename = "linnerud_exercise.csv" + target_filename = "linnerud_physiological.csv" + + data_module_path = resources.files(DATA_MODULE) + # Read header and data + data_path = data_module_path / data_filename + with data_path.open("r", encoding="utf-8") as f: + header_exercise = f.readline().split() + f.seek(0) # reset file obj + data_exercise = np.loadtxt(f, skiprows=1) + + target_path = data_module_path / target_filename + with target_path.open("r", encoding="utf-8") as f: + header_physiological = f.readline().split() + f.seek(0) # reset file obj + data_physiological = np.loadtxt(f, skiprows=1) + + fdescr = load_descr("linnerud.rst") + + frame = None + if as_frame: + (frame, data_exercise, data_physiological) = _convert_data_dataframe( + "load_linnerud", + data_exercise, + data_physiological, + header_exercise, + header_physiological, + ) + if return_X_y: + return data_exercise, data_physiological + + return Bunch( + data=data_exercise, + feature_names=header_exercise, + target=data_physiological, + target_names=header_physiological, + frame=frame, + DESCR=fdescr, + data_filename=data_filename, + target_filename=target_filename, + data_module=DATA_MODULE, + ) + + +def load_sample_images(): + """Load sample images for image manipulation. + + Loads both, ``china`` and ``flower``. + + Read more in the :ref:`User Guide `. + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + images : list of ndarray of shape (427, 640, 3) + The two sample image. + filenames : list + The filenames for the images. + DESCR : str + The full description of the dataset. 
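# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# load_diabetes(scaled=False) returns the raw measurements, while the default
# applies the centring/scaling shown above; load_linnerud (also defined above)
# is the small multi-output regression set.
import numpy as np

from sklearn.datasets import load_diabetes, load_linnerud

raw = load_diabetes(scaled=False)
std = load_diabetes()                                   # default: scaled=True
print(raw.data[:, 0].min(), raw.data[:, 0].max())       # age column in raw units
print(np.allclose(std.data.mean(axis=0), 0.0))          # True: columns are centred

linnerud = load_linnerud()
print(linnerud.data.shape, linnerud.target.shape)       # (20, 3) (20, 3)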
+ + Examples + -------- + To load the data and visualize the images: + + >>> from sklearn.datasets import load_sample_images + >>> dataset = load_sample_images() #doctest: +SKIP + >>> len(dataset.images) #doctest: +SKIP + 2 + >>> first_img_data = dataset.images[0] #doctest: +SKIP + >>> first_img_data.shape #doctest: +SKIP + (427, 640, 3) + >>> first_img_data.dtype #doctest: +SKIP + dtype('uint8') + """ + try: + from PIL import Image + except ImportError: + raise ImportError( + "The Python Imaging Library (PIL) is required to load data " + "from jpeg files. Please refer to " + "https://pillow.readthedocs.io/en/stable/installation.html " + "for installing PIL." + ) + + descr = load_descr("README.txt", descr_module=IMAGES_MODULE) + + filenames, images = [], [] + + jpg_paths = sorted( + resource + for resource in resources.files(IMAGES_MODULE).iterdir() + if resource.is_file() and resource.match("*.jpg") + ) + + for path in jpg_paths: + filenames.append(str(path)) + with path.open("rb") as image_file: + pil_image = Image.open(image_file) + image = np.asarray(pil_image) + images.append(image) + + return Bunch(images=images, filenames=filenames, DESCR=descr) + + +@validate_params( + { + "image_name": [StrOptions({"china.jpg", "flower.jpg"})], + }, + prefer_skip_nested_validation=True, +) +def load_sample_image(image_name): + """Load the numpy array of a single sample image. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + image_name : {`china.jpg`, `flower.jpg`} + The name of the sample image loaded. + + Returns + ------- + img : 3D array + The image as a numpy array: height x width x color. + + Examples + -------- + + >>> from sklearn.datasets import load_sample_image + >>> china = load_sample_image('china.jpg') # doctest: +SKIP + >>> china.dtype # doctest: +SKIP + dtype('uint8') + >>> china.shape # doctest: +SKIP + (427, 640, 3) + >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP + >>> flower.dtype # doctest: +SKIP + dtype('uint8') + >>> flower.shape # doctest: +SKIP + (427, 640, 3) + """ + images = load_sample_images() + index = None + for i, filename in enumerate(images.filenames): + if filename.endswith(image_name): + index = i + break + if index is None: + raise AttributeError("Cannot find sample image: %s" % image_name) + return images.images[index] + + +def _pkl_filepath(*args, **kwargs): + """Return filename for Python 3 pickles + + args[-1] is expected to be the ".pkl" filename. For compatibility with + older scikit-learn versions, a suffix is inserted before the extension. + + _pkl_filepath('/path/to/folder', 'filename.pkl') returns + '/path/to/folder/filename_py3.pkl' + + """ + py3_suffix = kwargs.get("py3_suffix", "_py3") + basename, ext = splitext(args[-1]) + basename += py3_suffix + new_args = args[:-1] + (basename + ext,) + return join(*new_args) + + +def _sha256(path): + """Calculate the sha256 hash of the file at path.""" + sha256hash = hashlib.sha256() + chunk_size = 8192 + with open(path, "rb") as f: + while True: + buffer = f.read(chunk_size) + if not buffer: + break + sha256hash.update(buffer) + return sha256hash.hexdigest() + + +def _fetch_remote(remote, dirname=None, n_retries=3, delay=1): + """Helper function to download a remote dataset. + + Fetch a dataset pointed by remote's url, save into path using remote's + filename and ensure its integrity based on the SHA256 checksum of the + downloaded file. + + .. 
versionchanged:: 1.6 + + If the file already exists locally and the SHA256 checksums match, the + path to the local file is returned without re-downloading. + + Parameters + ---------- + remote : RemoteFileMetadata + Named tuple containing remote dataset meta information: url, filename + and checksum. + + dirname : str or Path, default=None + Directory to save the file to. If None, the current working directory + is used. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : int, default=1 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + file_path: Path + Full path of the created file. + """ + if dirname is None: + folder_path = Path(".") + else: + folder_path = Path(dirname) + + file_path = folder_path / remote.filename + + if file_path.exists(): + if remote.checksum is None: + return file_path + + checksum = _sha256(file_path) + if checksum == remote.checksum: + return file_path + else: + warnings.warn( + f"SHA256 checksum of existing local file {file_path.name} " + f"({checksum}) differs from expected ({remote.checksum}): " + f"re-downloading from {remote.url} ." + ) + + # We create a temporary file dedicated to this particular download to avoid + # conflicts with parallel downloads. If the download is successful, the + # temporary file is atomically renamed to the final file path (with + # `shutil.move`). We therefore pass `delete=False` to `NamedTemporaryFile`. + # Otherwise, garbage collecting temp_file would raise an error when + # attempting to delete a file that was already renamed. If the download + # fails or the result does not match the expected SHA256 digest, the + # temporary file is removed manually in the except block. + temp_file = NamedTemporaryFile( + prefix=remote.filename + ".part_", dir=folder_path, delete=False + ) + # Note that Python 3.12's `delete_on_close=True` is ignored as we set + # `delete=False` explicitly. So after this line the empty temporary file still + # exists on disk to make sure that it's uniquely reserved for this specific call of + # `_fetch_remote` and therefore it protects against any corruption by parallel + # calls. + temp_file.close() + try: + temp_file_path = Path(temp_file.name) + while True: + try: + urlretrieve(remote.url, temp_file_path) + break + except (URLError, TimeoutError): + if n_retries == 0: + # If no more retries are left, re-raise the caught exception. + raise + warnings.warn(f"Retry downloading from url: {remote.url}") + n_retries -= 1 + time.sleep(delay) + + checksum = _sha256(temp_file_path) + if remote.checksum is not None and remote.checksum != checksum: + raise OSError( + f"The SHA256 checksum of {remote.filename} ({checksum}) " + f"differs from expected ({remote.checksum})." + ) + except (Exception, KeyboardInterrupt): + os.unlink(temp_file.name) + raise + + # The following renaming is atomic whenever temp_file_path and + # file_path are on the same filesystem. This should be the case most of + # the time, but we still use shutil.move instead of os.rename in case + # they are not. + shutil.move(temp_file_path, file_path) + + return file_path + + +def _filter_filename(value, filter_dots=True): + """Derive a name that is safe to use as filename from the given string. + + Adapted from the `slugify` function of django: + https://github.com/django/django/blob/master/django/utils/text.py + + Convert spaces or repeated dashes to single dashes. 
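# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# Loading one of the two bundled JPEGs via load_sample_image defined earlier;
# this assumes Pillow is installed.
from sklearn.datasets import load_sample_image

china = load_sample_image("china.jpg")
print(china.shape, china.dtype)   # (427, 640, 3) uint8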
Replace characters that + aren't alphanumerics, underscores, hyphens or dots by underscores. Convert + to lowercase. Also strip leading and trailing whitespace, dashes, and + underscores. + """ + value = unicodedata.normalize("NFKD", value).lower() + if filter_dots: + value = re.sub(r"[^\w\s-]+", "_", value) + else: + value = re.sub(r"[^.\w\s-]+", "_", value) + value = re.sub(r"[\s-]+", "-", value) + return value.strip("-_.") + + +def _derive_folder_and_filename_from_url(url): + parsed_url = urlparse(url) + if not parsed_url.hostname: + raise ValueError(f"Invalid URL: {url}") + folder_components = [_filter_filename(parsed_url.hostname, filter_dots=False)] + path = parsed_url.path + + if "/" in path: + base_folder, raw_filename = path.rsplit("/", 1) + + base_folder = _filter_filename(base_folder) + if base_folder: + folder_components.append(base_folder) + else: + raw_filename = path + + filename = _filter_filename(raw_filename, filter_dots=False) + if not filename: + filename = "downloaded_file" + + return "/".join(folder_components), filename + + +def fetch_file( + url, folder=None, local_filename=None, sha256=None, n_retries=3, delay=1 +): + """Fetch a file from the web if not already present in the local folder. + + If the file already exists locally (and the SHA256 checksums match when + provided), the path to the local file is returned without re-downloading. + + .. versionadded:: 1.6 + + Parameters + ---------- + url : str + URL of the file to download. + + folder : str or Path, default=None + Directory to save the file to. If None, the file is downloaded in a + folder with a name derived from the URL host name and path under + scikit-learn data home folder. + + local_filename : str, default=None + Name of the file to save. If None, the filename is inferred from the + URL. + + sha256 : str, default=None + SHA256 checksum of the file. If None, no checksum is verified. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + delay : int, default=1 + Number of seconds between retries. + + Returns + ------- + file_path : Path + Full path of the downloaded file. + """ + folder_from_url, filename_from_url = _derive_folder_and_filename_from_url(url) + + if local_filename is None: + local_filename = filename_from_url + + if folder is None: + folder = Path(get_data_home()) / folder_from_url + makedirs(folder, exist_ok=True) + + remote_metadata = RemoteFileMetadata( + filename=local_filename, url=url, checksum=sha256 + ) + return _fetch_remote( + remote_metadata, dirname=folder, n_retries=n_retries, delay=delay + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_california_housing.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_california_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..749f8528da338010a70cfdb59c6ee91d060a3441 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_california_housing.py @@ -0,0 +1,248 @@ +"""California housing dataset. + +The original database is available from StatLib + + http://lib.stat.cmu.edu/datasets/ + +The data contains 20,640 observations on 9 variables. + +This dataset contains the average house value as target variable +and the following input variables (features): average income, +housing average age, average rooms, average bedrooms, population, +average occupation, latitude, and longitude in that order. + +References +---------- + +Pace, R. 
Kelley and Ronald Barry, Sparse Spatial Autoregressions, +Statistics and Probability Letters, 33:291-297, 1997. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +import tarfile +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists + +import joblib +import numpy as np + +from ..utils import Bunch +from ..utils._param_validation import Interval, validate_params +from . import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) + +# The original data can be found at: +# https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz +ARCHIVE = RemoteFileMetadata( + filename="cal_housing.tgz", + url="https://ndownloader.figshare.com/files/5976036", + checksum="aaa5c9a6afe2225cc2aed2723682ae403280c4a3695a2ddda4ffb5d8215ea681", +) + +logger = logging.getLogger(__name__) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_california_housing( + *, + data_home=None, + download_if_missing=True, + return_X_y=False, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load the California housing dataset (regression). + + ============== ============== + Samples total 20640 + Dimensionality 8 + Features real + Target real 0.15 - 5. + ============== ============== + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(data.data, data.target)`` instead of a Bunch + object. + + .. versionadded:: 0.20 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target_columns. + + .. versionadded:: 0.23 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray, shape (20640, 8) + Each row corresponding to the 8 feature values in order. + If ``as_frame`` is True, ``data`` is a pandas object. + target : numpy array of shape (20640,) + Each value corresponds to the average + house value in units of 100,000. + If ``as_frame`` is True, ``target`` is a pandas object. + feature_names : list of length 8 + Array of ordered feature names used in the dataset. + DESCR : str + Description of the California housing dataset. + frame : pandas DataFrame + Only present when `as_frame=True`. DataFrame with ``data`` and + ``target``. + + .. versionadded:: 0.23 + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. 
The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + + Notes + ----- + + This dataset consists of 20,640 samples and 9 features. + + Examples + -------- + >>> from sklearn.datasets import fetch_california_housing + >>> housing = fetch_california_housing() + >>> print(housing.data.shape, housing.target.shape) + (20640, 8) (20640,) + >>> print(housing.feature_names[0:6]) + ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup'] + """ + data_home = get_data_home(data_home=data_home) + if not exists(data_home): + makedirs(data_home) + + filepath = _pkl_filepath(data_home, "cal_housing.pkz") + if not exists(filepath): + if not download_if_missing: + raise OSError("Data not found and `download_if_missing` is False") + + logger.info( + "Downloading Cal. housing from {} to {}".format(ARCHIVE.url, data_home) + ) + + archive_path = _fetch_remote( + ARCHIVE, + dirname=data_home, + n_retries=n_retries, + delay=delay, + ) + + with tarfile.open(mode="r:gz", name=archive_path) as f: + cal_housing = np.loadtxt( + f.extractfile("CaliforniaHousing/cal_housing.data"), delimiter="," + ) + # Columns are not in the same order compared to the previous + # URL resource on lib.stat.cmu.edu + columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0] + cal_housing = cal_housing[:, columns_index] + + joblib.dump(cal_housing, filepath, compress=6) + remove(archive_path) + + else: + cal_housing = joblib.load(filepath) + + feature_names = [ + "MedInc", + "HouseAge", + "AveRooms", + "AveBedrms", + "Population", + "AveOccup", + "Latitude", + "Longitude", + ] + + target, data = cal_housing[:, 0], cal_housing[:, 1:] + + # avg rooms = total rooms / households + data[:, 2] /= data[:, 5] + + # avg bed rooms = total bed rooms / households + data[:, 3] /= data[:, 5] + + # avg occupancy = population / households + data[:, 5] = data[:, 4] / data[:, 5] + + # target in units of 100,000 + target = target / 100000.0 + + descr = load_descr("california_housing.rst") + + X = data + y = target + + frame = None + target_names = [ + "MedHouseVal", + ] + if as_frame: + frame, X, y = _convert_data_dataframe( + "fetch_california_housing", data, target, feature_names, target_names + ) + + if return_X_y: + return X, y + + return Bunch( + data=X, + target=y, + frame=frame, + target_names=target_names, + feature_names=feature_names, + DESCR=descr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_covtype.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_covtype.py new file mode 100644 index 0000000000000000000000000000000000000000..6a0138bafa9c5b7bc902883572d3715d8a297c94 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_covtype.py @@ -0,0 +1,252 @@ +"""Forest covertype dataset. + +A classic dataset for classification benchmarks, featuring categorical and +real-valued features. + +The dataset page is available from UCI Machine Learning Repository + + https://archive.ics.uci.edu/ml/datasets/Covertype + +Courtesy of Jock A. Blackard and Colorado State University. 
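# --- Editorial usage sketch; not part of the vendored scikit-learn file. ---
# fetch_california_housing (defined above) downloads and caches the archive on
# first use under the scikit-learn data home; as_frame=True assumes pandas.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(as_frame=True)
print(housing.frame.shape)                    # (20640, 9): 8 features + MedHouseVal
print(housing.frame["MedHouseVal"].head())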
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +import os +from gzip import GzipFile +from numbers import Integral, Real +from os.path import exists, join +from tempfile import TemporaryDirectory + +import joblib +import numpy as np + +from ..utils import Bunch, check_random_state +from ..utils._param_validation import Interval, validate_params +from . import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) + +# The original data can be found in: +# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz +ARCHIVE = RemoteFileMetadata( + filename="covtype.data.gz", + url="https://ndownloader.figshare.com/files/5976039", + checksum="614360d0257557dd1792834a85a1cdebfadc3c4f30b011d56afee7ffb5b15771", +) + +logger = logging.getLogger(__name__) + +# Column names reference: +# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info +FEATURE_NAMES = [ + "Elevation", + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points", +] +FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)] +FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)] +TARGET_NAMES = ["Cover_Type"] + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + "download_if_missing": ["boolean"], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_covtype( + *, + data_home=None, + download_if_missing=True, + random_state=None, + shuffle=False, + return_X_y=False, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load the covertype dataset (classification). + + Download it if necessary. + + ================= ============ + Classes 7 + Samples total 581012 + Dimensionality 54 + Features int + ================= ============ + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=False + Whether to shuffle dataset. + + return_X_y : bool, default=False + If True, returns ``(data.data, data.target)`` instead of a Bunch + object. + + .. versionadded:: 0.20 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric). The target is a pandas DataFrame or + Series depending on the number of target columns. If `return_X_y` is + True, then (`data`, `target`) will be pandas DataFrames or Series as + described below. + + .. versionadded:: 0.24 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. 
versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (581012, 54) + Each row corresponds to the 54 features in the dataset. + target : ndarray of shape (581012,) + Each value corresponds to one of + the 7 forest covertypes with values + ranging between 1 to 7. + frame : dataframe of shape (581012, 55) + Only present when `as_frame=True`. Contains `data` and `target`. + DESCR : str + Description of the forest covertype dataset. + feature_names : list + The names of the dataset columns. + target_names: list + The names of the target columns. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_covtype + >>> cov_type = fetch_covtype() + >>> cov_type.data.shape + (581012, 54) + >>> cov_type.target.shape + (581012,) + >>> # Let's check the 4 first feature names + >>> cov_type.feature_names[:4] + ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology'] + """ + data_home = get_data_home(data_home=data_home) + covtype_dir = join(data_home, "covertype") + samples_path = _pkl_filepath(covtype_dir, "samples") + targets_path = _pkl_filepath(covtype_dir, "targets") + available = exists(samples_path) and exists(targets_path) + + if download_if_missing and not available: + os.makedirs(covtype_dir, exist_ok=True) + + # Creating temp_dir as a direct subdirectory of the target directory + # guarantees that both reside on the same filesystem, so that we can use + # os.rename to atomically move the data files to their target location. 
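# --- Editorial usage sketch; not part of the vendored scikit-learn file, and
# placed here only as an aside (fetch_covtype's implementation continues below).
# The first call downloads the archive and caches pickles under the data home.
import numpy as np

from sklearn.datasets import fetch_covtype

cov = fetch_covtype()
values, counts = np.unique(cov.target, return_counts=True)
print(values)                                 # the 7 cover types, labelled 1..7
print(cov.data.shape)                         # (581012, 54)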
+ with TemporaryDirectory(dir=covtype_dir) as temp_dir: + logger.info(f"Downloading {ARCHIVE.url}") + archive_path = _fetch_remote( + ARCHIVE, dirname=temp_dir, n_retries=n_retries, delay=delay + ) + Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",") + + X = Xy[:, :-1] + y = Xy[:, -1].astype(np.int32, copy=False) + + samples_tmp_path = _pkl_filepath(temp_dir, "samples") + joblib.dump(X, samples_tmp_path, compress=9) + os.rename(samples_tmp_path, samples_path) + + targets_tmp_path = _pkl_filepath(temp_dir, "targets") + joblib.dump(y, targets_tmp_path, compress=9) + os.rename(targets_tmp_path, targets_path) + + elif not available and not download_if_missing: + raise OSError("Data not found and `download_if_missing` is False") + try: + X, y + except NameError: + X = joblib.load(samples_path) + y = joblib.load(targets_path) + + if shuffle: + ind = np.arange(X.shape[0]) + rng = check_random_state(random_state) + rng.shuffle(ind) + X = X[ind] + y = y[ind] + + fdescr = load_descr("covtype.rst") + + frame = None + if as_frame: + frame, X, y = _convert_data_dataframe( + caller_name="fetch_covtype", + data=X, + target=y, + feature_names=FEATURE_NAMES, + target_names=TARGET_NAMES, + ) + if return_X_y: + return X, y + + return Bunch( + data=X, + target=y, + frame=frame, + target_names=TARGET_NAMES, + feature_names=FEATURE_NAMES, + DESCR=fdescr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py new file mode 100644 index 0000000000000000000000000000000000000000..f379da42eb9dfe8877529bb7f8c8d12df39cb812 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py @@ -0,0 +1,429 @@ +"""KDDCUP 99 dataset. + +A classic dataset for anomaly detection. + +The dataset page is available from UCI Machine Learning Repository + +https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import errno +import logging +import os +from gzip import GzipFile +from numbers import Integral, Real +from os.path import exists, join + +import joblib +import numpy as np + +from ..utils import Bunch, check_random_state +from ..utils import shuffle as shuffle_method +from ..utils._param_validation import Interval, StrOptions, validate_params +from . 
import get_data_home +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + load_descr, +) + +# The original data can be found at: +# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz +ARCHIVE = RemoteFileMetadata( + filename="kddcup99_data", + url="https://ndownloader.figshare.com/files/5976045", + checksum="3b6c942aa0356c0ca35b7b595a26c89d343652c9db428893e7494f837b274292", +) + +# The original data can be found at: +# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz +ARCHIVE_10_PERCENT = RemoteFileMetadata( + filename="kddcup99_10_data", + url="https://ndownloader.figshare.com/files/5976042", + checksum="8045aca0d84e70e622d1148d7df782496f6333bf6eb979a1b0837c42a9fd9561", +) + +logger = logging.getLogger(__name__) + + +@validate_params( + { + "subset": [StrOptions({"SA", "SF", "http", "smtp"}), None], + "data_home": [str, os.PathLike, None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "percent10": ["boolean"], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_kddcup99( + *, + subset=None, + data_home=None, + shuffle=False, + random_state=None, + percent10=True, + download_if_missing=True, + return_X_y=False, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load the kddcup99 dataset (classification). + + Download it if necessary. + + ================= ==================================== + Classes 23 + Samples total 4898431 + Dimensionality 41 + Features discrete (int) or continuous (float) + ================= ==================================== + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + subset : {'SA', 'SF', 'http', 'smtp'}, default=None + To return the corresponding classical subsets of kddcup 99. + If None, return the entire kddcup 99 dataset. + + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + .. versionadded:: 0.19 + + shuffle : bool, default=False + Whether to shuffle dataset. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling and for + selection of abnormal samples if `subset='SA'`. Pass an int for + reproducible output across multiple function calls. + See :term:`Glossary `. + + percent10 : bool, default=True + Whether to load only 10 percent of the data. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` object. + + .. versionadded:: 0.20 + + as_frame : bool, default=False + If `True`, returns a pandas Dataframe for the ``data`` and ``target`` + objects in the `Bunch` returned object; `Bunch` return object will also + have a ``frame`` member. + + .. versionadded:: 0.24 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. 
versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : {ndarray, dataframe} of shape (494021, 41) + The data matrix to learn. If `as_frame=True`, `data` will be a + pandas DataFrame. + target : {ndarray, series} of shape (494021,) + The regression target for each sample. If `as_frame=True`, `target` + will be a pandas Series. + frame : dataframe of shape (494021, 42) + Only present when `as_frame=True`. Contains `data` and `target`. + DESCR : str + The full description of the dataset. + feature_names : list + The names of the dataset columns + target_names: list + The names of the target columns + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + """ + data_home = get_data_home(data_home=data_home) + kddcup99 = _fetch_brute_kddcup99( + data_home=data_home, + percent10=percent10, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + + data = kddcup99.data + target = kddcup99.target + feature_names = kddcup99.feature_names + target_names = kddcup99.target_names + + if subset == "SA": + s = target == b"normal." + t = np.logical_not(s) + normal_samples = data[s, :] + normal_targets = target[s] + abnormal_samples = data[t, :] + abnormal_targets = target[t] + + n_samples_abnormal = abnormal_samples.shape[0] + # selected abnormal samples: + random_state = check_random_state(random_state) + r = random_state.randint(0, n_samples_abnormal, 3377) + abnormal_samples = abnormal_samples[r] + abnormal_targets = abnormal_targets[r] + + data = np.r_[normal_samples, abnormal_samples] + target = np.r_[normal_targets, abnormal_targets] + + if subset == "SF" or subset == "http" or subset == "smtp": + # select all samples with positive logged_in attribute: + s = data[:, 11] == 1 + data = np.c_[data[s, :11], data[s, 12:]] + feature_names = feature_names[:11] + feature_names[12:] + target = target[s] + + data[:, 0] = np.log((data[:, 0] + 0.1).astype(float, copy=False)) + data[:, 4] = np.log((data[:, 4] + 0.1).astype(float, copy=False)) + data[:, 5] = np.log((data[:, 5] + 0.1).astype(float, copy=False)) + + if subset == "http": + s = data[:, 2] == b"http" + data = data[s] + target = target[s] + data = np.c_[data[:, 0], data[:, 4], data[:, 5]] + feature_names = [feature_names[0], feature_names[4], feature_names[5]] + + if subset == "smtp": + s = data[:, 2] == b"smtp" + data = data[s] + target = target[s] + data = np.c_[data[:, 0], data[:, 4], data[:, 5]] + feature_names = [feature_names[0], feature_names[4], feature_names[5]] + + if subset == "SF": + data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]] + feature_names = [ + feature_names[0], + feature_names[2], + feature_names[4], + feature_names[5], + ] + + if shuffle: + data, target = shuffle_method(data, target, random_state=random_state) + + fdescr = load_descr("kddcup99.rst") + + frame = None + if as_frame: + frame, data, target = _convert_data_dataframe( + "fetch_kddcup99", data, target, feature_names, target_names + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + feature_names=feature_names, + DESCR=fdescr, + ) + + +def _fetch_brute_kddcup99( + data_home=None, 
download_if_missing=True, percent10=True, n_retries=3, delay=1.0 +): + """Load the kddcup99 dataset, downloading it if necessary. + + Parameters + ---------- + data_home : str, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + percent10 : bool, default=True + Whether to load only 10 percent of the data. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (494021, 41) + Each row corresponds to the 41 features in the dataset. + target : ndarray of shape (494021,) + Each value corresponds to one of the 21 attack types or to the + label 'normal.'. + feature_names : list + The names of the dataset columns + target_names: list + The names of the target columns + DESCR : str + Description of the kddcup99 dataset. + + """ + + data_home = get_data_home(data_home=data_home) + dir_suffix = "-py3" + + if percent10: + kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix) + archive = ARCHIVE_10_PERCENT + else: + kddcup_dir = join(data_home, "kddcup99" + dir_suffix) + archive = ARCHIVE + + samples_path = join(kddcup_dir, "samples") + targets_path = join(kddcup_dir, "targets") + available = exists(samples_path) + + dt = [ + ("duration", int), + ("protocol_type", "S4"), + ("service", "S11"), + ("flag", "S6"), + ("src_bytes", int), + ("dst_bytes", int), + ("land", int), + ("wrong_fragment", int), + ("urgent", int), + ("hot", int), + ("num_failed_logins", int), + ("logged_in", int), + ("num_compromised", int), + ("root_shell", int), + ("su_attempted", int), + ("num_root", int), + ("num_file_creations", int), + ("num_shells", int), + ("num_access_files", int), + ("num_outbound_cmds", int), + ("is_host_login", int), + ("is_guest_login", int), + ("count", int), + ("srv_count", int), + ("serror_rate", float), + ("srv_serror_rate", float), + ("rerror_rate", float), + ("srv_rerror_rate", float), + ("same_srv_rate", float), + ("diff_srv_rate", float), + ("srv_diff_host_rate", float), + ("dst_host_count", int), + ("dst_host_srv_count", int), + ("dst_host_same_srv_rate", float), + ("dst_host_diff_srv_rate", float), + ("dst_host_same_src_port_rate", float), + ("dst_host_srv_diff_host_rate", float), + ("dst_host_serror_rate", float), + ("dst_host_srv_serror_rate", float), + ("dst_host_rerror_rate", float), + ("dst_host_srv_rerror_rate", float), + ("labels", "S16"), + ] + + column_names = [c[0] for c in dt] + target_names = column_names[-1] + feature_names = column_names[:-1] + + if available: + try: + X = joblib.load(samples_path) + y = joblib.load(targets_path) + except Exception as e: + raise OSError( + "The cache for fetch_kddcup99 is invalid, please delete " + f"{kddcup_dir} and run the fetch_kddcup99 again" + ) from e + + elif download_if_missing: + _mkdirp(kddcup_dir) + logger.info("Downloading %s" % archive.url) + _fetch_remote(archive, dirname=kddcup_dir, n_retries=n_retries, delay=delay) + DT = np.dtype(dt) + logger.debug("extracting archive") + archive_path = join(kddcup_dir, archive.filename) + file_ = GzipFile(filename=archive_path, mode="r") + Xy = [] + for line in 
file_.readlines(): + line = line.decode() + Xy.append(line.replace("\n", "").split(",")) + file_.close() + logger.debug("extraction done") + os.remove(archive_path) + + Xy = np.asarray(Xy, dtype=object) + for j in range(42): + Xy[:, j] = Xy[:, j].astype(DT[j]) + + X = Xy[:, :-1] + y = Xy[:, -1] + # XXX bug when compress!=0: + # (error: 'Incorrect data length while decompressing[...] the file + # could be corrupted.') + + joblib.dump(X, samples_path, compress=0) + joblib.dump(y, targets_path, compress=0) + else: + raise OSError("Data not found and `download_if_missing` is False") + + return Bunch( + data=X, + target=y, + feature_names=feature_names, + target_names=[target_names], + ) + + +def _mkdirp(d): + """Ensure directory d exists (like mkdir -p on Unix) + No guarantee that the directory is writable. + """ + try: + os.makedirs(d) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py new file mode 100644 index 0000000000000000000000000000000000000000..4f725b9250cc5e325659612f0c83c7724288828b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_lfw.py @@ -0,0 +1,648 @@ +"""Labeled Faces in the Wild (LFW) dataset + +This dataset is a collection of JPEG pictures of famous people collected +over the internet, all details are available on the official website: + + http://vis-www.cs.umass.edu/lfw/ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +from numbers import Integral, Real +from os import PathLike, listdir, makedirs, remove +from os.path import exists, isdir, join + +import numpy as np +from joblib import Memory + +from ..utils import Bunch +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ..utils.fixes import tarfile_extractall +from ._base import ( + RemoteFileMetadata, + _fetch_remote, + get_data_home, + load_descr, +) + +logger = logging.getLogger(__name__) + +# The original data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw.tgz +ARCHIVE = RemoteFileMetadata( + filename="lfw.tgz", + url="https://ndownloader.figshare.com/files/5976018", + checksum="055f7d9c632d7370e6fb4afc7468d40f970c34a80d4c6f50ffec63f5a8d536c0", +) + +# The original funneled data can be found in: +# http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz +FUNNELED_ARCHIVE = RemoteFileMetadata( + filename="lfw-funneled.tgz", + url="https://ndownloader.figshare.com/files/5976015", + checksum="b47c8422c8cded889dc5a13418c4bc2abbda121092b3533a83306f90d900100a", +) + +# The original target data can be found in: +# http://vis-www.cs.umass.edu/lfw/pairsDevTrain.txt', +# http://vis-www.cs.umass.edu/lfw/pairsDevTest.txt', +# http://vis-www.cs.umass.edu/lfw/pairs.txt', +TARGETS = ( + RemoteFileMetadata( + filename="pairsDevTrain.txt", + url="https://ndownloader.figshare.com/files/5976012", + checksum="1d454dada7dfeca0e7eab6f65dc4e97a6312d44cf142207be28d688be92aabfa", + ), + RemoteFileMetadata( + filename="pairsDevTest.txt", + url="https://ndownloader.figshare.com/files/5976009", + checksum="7cb06600ea8b2814ac26e946201cdb304296262aad67d046a16a7ec85d0ff87c", + ), + RemoteFileMetadata( + filename="pairs.txt", + url="https://ndownloader.figshare.com/files/5976006", + checksum="ea42330c62c92989f9d7c03237ed5d591365e89b3e649747777b70e692dc1592", + ), +) + + +# +# Common private utilities for data fetching from the original LFW website +# local disk caching, and image 
decoding. +# + + +def _check_fetch_lfw( + data_home=None, funneled=True, download_if_missing=True, n_retries=3, delay=1.0 +): + """Helper function to download any missing LFW data""" + + data_home = get_data_home(data_home=data_home) + lfw_home = join(data_home, "lfw_home") + + if not exists(lfw_home): + makedirs(lfw_home) + + for target in TARGETS: + target_filepath = join(lfw_home, target.filename) + if not exists(target_filepath): + if download_if_missing: + logger.info("Downloading LFW metadata: %s", target.url) + _fetch_remote( + target, dirname=lfw_home, n_retries=n_retries, delay=delay + ) + else: + raise OSError("%s is missing" % target_filepath) + + if funneled: + data_folder_path = join(lfw_home, "lfw_funneled") + archive = FUNNELED_ARCHIVE + else: + data_folder_path = join(lfw_home, "lfw") + archive = ARCHIVE + + if not exists(data_folder_path): + archive_path = join(lfw_home, archive.filename) + if not exists(archive_path): + if download_if_missing: + logger.info("Downloading LFW data (~200MB): %s", archive.url) + _fetch_remote( + archive, dirname=lfw_home, n_retries=n_retries, delay=delay + ) + else: + raise OSError("%s is missing" % archive_path) + + import tarfile + + logger.debug("Decompressing the data archive to %s", data_folder_path) + with tarfile.open(archive_path, "r:gz") as fp: + tarfile_extractall(fp, path=lfw_home) + + remove(archive_path) + + return lfw_home, data_folder_path + + +def _load_imgs(file_paths, slice_, color, resize): + """Internally used to load images""" + try: + from PIL import Image + except ImportError: + raise ImportError( + "The Python Imaging Library (PIL) is required to load data " + "from jpeg files. Please refer to " + "https://pillow.readthedocs.io/en/stable/installation.html " + "for installing PIL." + ) + + # compute the portion of the images to load to respect the slice_ parameter + # given by the caller + default_slice = (slice(0, 250), slice(0, 250)) + if slice_ is None: + slice_ = default_slice + else: + slice_ = tuple(s or ds for s, ds in zip(slice_, default_slice)) + + h_slice, w_slice = slice_ + h = (h_slice.stop - h_slice.start) // (h_slice.step or 1) + w = (w_slice.stop - w_slice.start) // (w_slice.step or 1) + + if resize is not None: + resize = float(resize) + h = int(resize * h) + w = int(resize * w) + + # allocate some contiguous memory to host the decoded image slices + n_faces = len(file_paths) + if not color: + faces = np.zeros((n_faces, h, w), dtype=np.float32) + else: + faces = np.zeros((n_faces, h, w, 3), dtype=np.float32) + + # iterate over the collected file path to load the jpeg files as numpy + # arrays + for i, file_path in enumerate(file_paths): + if i % 1000 == 0: + logger.debug("Loading face #%05d / %05d", i + 1, n_faces) + + # Checks if jpeg reading worked. Refer to issue #3594 for more + # details. + pil_img = Image.open(file_path) + pil_img = pil_img.crop( + (w_slice.start, h_slice.start, w_slice.stop, h_slice.stop) + ) + if resize is not None: + pil_img = pil_img.resize((w, h)) + face = np.asarray(pil_img, dtype=np.float32) + + if face.ndim == 0: + raise RuntimeError( + "Failed to read the image file %s, " + "Please make sure that libjpeg is installed" % file_path + ) + + face /= 255.0 # scale uint8 coded colors to the [0.0, 1.0] floats + if not color: + # average the color channels to compute a gray levels + # representation + face = face.mean(axis=2) + + faces[i, ...] 
= face + + return faces + + +# +# Task #1: Face Identification on picture with names +# + + +def _fetch_lfw_people( + data_folder_path, slice_=None, color=False, resize=None, min_faces_per_person=0 +): + """Perform the actual data loading for the lfw people dataset + + This operation is meant to be cached by a joblib wrapper. + """ + # scan the data folder content to retain people with more that + # `min_faces_per_person` face pictures + person_names, file_paths = [], [] + for person_name in sorted(listdir(data_folder_path)): + folder_path = join(data_folder_path, person_name) + if not isdir(folder_path): + continue + paths = [join(folder_path, f) for f in sorted(listdir(folder_path))] + n_pictures = len(paths) + if n_pictures >= min_faces_per_person: + person_name = person_name.replace("_", " ") + person_names.extend([person_name] * n_pictures) + file_paths.extend(paths) + + n_faces = len(file_paths) + if n_faces == 0: + raise ValueError( + "min_faces_per_person=%d is too restrictive" % min_faces_per_person + ) + + target_names = np.unique(person_names) + target = np.searchsorted(target_names, person_names) + + faces = _load_imgs(file_paths, slice_, color, resize) + + # shuffle the faces with a deterministic RNG scheme to avoid having + # all faces of the same person in a row, as it would break some + # cross validation and learning algorithms such as SGD and online + # k-means that make an IID assumption + + indices = np.arange(n_faces) + np.random.RandomState(42).shuffle(indices) + faces, target = faces[indices], target[indices] + return faces, target, target_names + + +@validate_params( + { + "data_home": [str, PathLike, None], + "funneled": ["boolean"], + "resize": [Interval(Real, 0, None, closed="neither"), None], + "min_faces_per_person": [Interval(Integral, 0, None, closed="left"), None], + "color": ["boolean"], + "slice_": [tuple, Hidden(None)], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_lfw_people( + *, + data_home=None, + funneled=True, + resize=0.5, + min_faces_per_person=0, + color=False, + slice_=(slice(70, 195), slice(78, 172)), + download_if_missing=True, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the Labeled Faces in the Wild (LFW) people dataset \ +(classification). + + Download it if necessary. + + ================= ======================= + Classes 5749 + Samples total 13233 + Dimensionality 5828 + Features real, between 0 and 255 + ================= ======================= + + For a usage example of this dataset, see + :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + funneled : bool, default=True + Download and use the funneled variant of the dataset. + + resize : float or None, default=0.5 + Ratio used to resize the each face picture. If `None`, no resizing is + performed. + + min_faces_per_person : int, default=None + The extracted dataset will only retain pictures of people that have at + least `min_faces_per_person` different pictures. + + color : bool, default=False + Keep the 3 RGB channels instead of averaging them to a single + gray level channel. 
If color is True the shape of the data has + one more dimension than the shape with color = False. + + slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172)) + Provide a custom 2D slice (height, width) to extract the + 'interesting' part of the jpeg files and avoid use statistical + correlation from the background. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch + object. See below for more information about the `dataset.data` and + `dataset.target` object. + + .. versionadded:: 0.20 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : numpy array of shape (13233, 2914) + Each row corresponds to a ravelled face image + of original size 62 x 47 pixels. + Changing the ``slice_`` or resize parameters will change the + shape of the output. + images : numpy array of shape (13233, 62, 47) + Each row is a face image corresponding to one of the 5749 people in + the dataset. Changing the ``slice_`` + or resize parameters will change the shape of the output. + target : numpy array of shape (13233,) + Labels associated to each face image. + Those labels range from 0-5748 and correspond to the person IDs. + target_names : numpy array of shape (5749,) + Names of all persons in the dataset. + Position in array corresponds to the person ID in the target array. + DESCR : str + Description of the Labeled Faces in the Wild (LFW) dataset. + + (data, target) : tuple if ``return_X_y`` is True + A tuple of two ndarray. The first containing a 2D array of + shape (n_samples, n_features) with each row representing one + sample and each column representing the features. The second + ndarray of shape (n_samples,) containing the target samples. + + .. versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_lfw_people + >>> lfw_people = fetch_lfw_people() + >>> lfw_people.data.shape + (13233, 2914) + >>> lfw_people.target.shape + (13233,) + >>> for name in lfw_people.target_names[:5]: + ... 
print(name) + AJ Cook + AJ Lamas + Aaron Eckhart + Aaron Guiel + Aaron Patterson + """ + lfw_home, data_folder_path = _check_fetch_lfw( + data_home=data_home, + funneled=funneled, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + logger.debug("Loading LFW people faces from %s", lfw_home) + + # wrap the loader in a memoizing function that will return memmaped data + # arrays for optimal memory usage + m = Memory(location=lfw_home, compress=6, verbose=0) + load_func = m.cache(_fetch_lfw_people) + + # load and memoize the pairs as np arrays + faces, target, target_names = load_func( + data_folder_path, + resize=resize, + min_faces_per_person=min_faces_per_person, + color=color, + slice_=slice_, + ) + + X = faces.reshape(len(faces), -1) + + fdescr = load_descr("lfw.rst") + + if return_X_y: + return X, target + + # pack the results as a Bunch instance + return Bunch( + data=X, images=faces, target=target, target_names=target_names, DESCR=fdescr + ) + + +# +# Task #2: Face Verification on pairs of face pictures +# + + +def _fetch_lfw_pairs( + index_file_path, data_folder_path, slice_=None, color=False, resize=None +): + """Perform the actual data loading for the LFW pairs dataset + + This operation is meant to be cached by a joblib wrapper. + """ + # parse the index file to find the number of pairs to be able to allocate + # the right amount of memory before starting to decode the jpeg files + with open(index_file_path, "rb") as index_file: + split_lines = [ln.decode().strip().split("\t") for ln in index_file] + pair_specs = [sl for sl in split_lines if len(sl) > 2] + n_pairs = len(pair_specs) + + # iterating over the metadata lines for each pair to find the filename to + # decode and load in memory + target = np.zeros(n_pairs, dtype=int) + file_paths = list() + for i, components in enumerate(pair_specs): + if len(components) == 3: + target[i] = 1 + pair = ( + (components[0], int(components[1]) - 1), + (components[0], int(components[2]) - 1), + ) + elif len(components) == 4: + target[i] = 0 + pair = ( + (components[0], int(components[1]) - 1), + (components[2], int(components[3]) - 1), + ) + else: + raise ValueError("invalid line %d: %r" % (i + 1, components)) + for j, (name, idx) in enumerate(pair): + try: + person_folder = join(data_folder_path, name) + except TypeError: + person_folder = join(data_folder_path, str(name, "UTF-8")) + filenames = list(sorted(listdir(person_folder))) + file_path = join(person_folder, filenames[idx]) + file_paths.append(file_path) + + pairs = _load_imgs(file_paths, slice_, color, resize) + shape = list(pairs.shape) + n_faces = shape.pop(0) + shape.insert(0, 2) + shape.insert(0, n_faces // 2) + pairs.shape = shape + + return pairs, target, np.array(["Different persons", "Same person"]) + + +@validate_params( + { + "subset": [StrOptions({"train", "test", "10_folds"})], + "data_home": [str, PathLike, None], + "funneled": ["boolean"], + "resize": [Interval(Real, 0, None, closed="neither"), None], + "color": ["boolean"], + "slice_": [tuple, Hidden(None)], + "download_if_missing": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_lfw_pairs( + *, + subset="train", + data_home=None, + funneled=True, + resize=0.5, + color=False, + slice_=(slice(70, 195), slice(78, 172)), + download_if_missing=True, + n_retries=3, + delay=1.0, +): + """Load the Labeled Faces in the Wild (LFW) pairs dataset 
(classification). + + Download it if necessary. + + ================= ======================= + Classes 2 + Samples total 13233 + Dimensionality 5828 + Features real, between 0 and 255 + ================= ======================= + + In the `original paper `_ + the "pairs" version corresponds to the "restricted task", where + the experimenter should not use the name of a person to infer + the equivalence or non-equivalence of two face images that + are not explicitly given in the training set. + + The original images are 250 x 250 pixels, but the default slice and resize + arguments reduce them to 62 x 47. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + subset : {'train', 'test', '10_folds'}, default='train' + Select the dataset to load: 'train' for the development training + set, 'test' for the development test set, and '10_folds' for the + official evaluation set that is meant to be used with a 10-folds + cross validation. + + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By + default all scikit-learn data is stored in '~/scikit_learn_data' + subfolders. + + funneled : bool, default=True + Download and use the funneled variant of the dataset. + + resize : float, default=0.5 + Ratio used to resize the each face picture. + + color : bool, default=False + Keep the 3 RGB channels instead of averaging them to a single + gray level channel. If color is True the shape of the data has + one more dimension than the shape with color = False. + + slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172)) + Provide a custom 2D slice (height, width) to extract the + 'interesting' part of the jpeg files and avoid use statistical + correlation from the background. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : ndarray of shape (2200, 5828). Shape depends on ``subset``. + Each row corresponds to 2 ravel'd face images + of original size 62 x 47 pixels. + Changing the ``slice_``, ``resize`` or ``subset`` parameters + will change the shape of the output. + pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset`` + Each row has 2 face images corresponding + to same or different person from the dataset + containing 5749 people. Changing the ``slice_``, + ``resize`` or ``subset`` parameters will change the shape of the + output. + target : numpy array of shape (2200,). Shape depends on ``subset``. + Labels associated to each pair of images. + The two label values being different persons or the same person. + target_names : numpy array of shape (2,) + Explains the target values of the target array. + 0 corresponds to "Different person", 1 corresponds to "same person". + DESCR : str + Description of the Labeled Faces in the Wild (LFW) dataset. 
+ + Examples + -------- + >>> from sklearn.datasets import fetch_lfw_pairs + >>> lfw_pairs_train = fetch_lfw_pairs(subset='train') + >>> list(lfw_pairs_train.target_names) + [np.str_('Different persons'), np.str_('Same person')] + >>> lfw_pairs_train.pairs.shape + (2200, 2, 62, 47) + >>> lfw_pairs_train.data.shape + (2200, 5828) + >>> lfw_pairs_train.target.shape + (2200,) + """ + lfw_home, data_folder_path = _check_fetch_lfw( + data_home=data_home, + funneled=funneled, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + logger.debug("Loading %s LFW pairs from %s", subset, lfw_home) + + # wrap the loader in a memoizing function that will return memmaped data + # arrays for optimal memory usage + m = Memory(location=lfw_home, compress=6, verbose=0) + load_func = m.cache(_fetch_lfw_pairs) + + # select the right metadata file according to the requested subset + label_filenames = { + "train": "pairsDevTrain.txt", + "test": "pairsDevTest.txt", + "10_folds": "pairs.txt", + } + if subset not in label_filenames: + raise ValueError( + "subset='%s' is invalid: should be one of %r" + % (subset, list(sorted(label_filenames.keys()))) + ) + index_file_path = join(lfw_home, label_filenames[subset]) + + # load and memoize the pairs as np arrays + pairs, target, target_names = load_func( + index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_ + ) + + fdescr = load_descr("lfw.rst") + + # pack the results as a Bunch instance + return Bunch( + data=pairs.reshape(len(pairs), -1), + pairs=pairs, + target=target, + target_names=target_names, + DESCR=fdescr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_olivetti_faces.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_olivetti_faces.py new file mode 100644 index 0000000000000000000000000000000000000000..efb382b1dcdda0bd3dadc2216da9be21d40dddd2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_olivetti_faces.py @@ -0,0 +1,184 @@ +"""Modified Olivetti faces dataset. + +The original database was available from (now defunct) + + https://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html + +The version retrieved here comes in MATLAB format from the personal +web page of Sam Roweis: + + https://cs.nyu.edu/~roweis/ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists + +import joblib +import numpy as np +from scipy.io import loadmat + +from ..utils import Bunch, check_random_state +from ..utils._param_validation import Interval, validate_params +from . 
import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath, load_descr + +# The original data can be found at: +# https://cs.nyu.edu/~roweis/data/olivettifaces.mat +FACES = RemoteFileMetadata( + filename="olivettifaces.mat", + url="https://ndownloader.figshare.com/files/5976027", + checksum="b612fb967f2dc77c9c62d3e1266e0c73d5fca46a4b8906c18e454d41af987794", +) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_olivetti_faces( + *, + data_home=None, + shuffle=False, + random_state=0, + download_if_missing=True, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the Olivetti faces data-set from AT&T (classification). + + Download it if necessary. + + ================= ===================== + Classes 40 + Samples total 400 + Dimensionality 4096 + Features real, between 0 and 1 + ================= ===================== + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + shuffle : bool, default=False + If True the order of the dataset is shuffled to avoid having + images of the same person grouped. + + random_state : int, RandomState instance or None, default=0 + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns `(data, target)` instead of a `Bunch` object. See + below for more information about the `data` and `target` object. + + .. versionadded:: 0.22 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data: ndarray, shape (400, 4096) + Each row corresponds to a ravelled + face image of original size 64 x 64 pixels. + images : ndarray, shape (400, 64, 64) + Each row is a face image + corresponding to one of the 40 subjects of the dataset. + target : ndarray, shape (400,) + Labels associated to each face image. + Those labels are ranging from 0-39 and correspond to the + Subject IDs. + DESCR : str + Description of the modified Olivetti Faces Dataset. + + (data, target) : tuple if `return_X_y=True` + Tuple with the `data` and `target` objects described above. + + .. 
versionadded:: 0.22 + + Examples + -------- + >>> from sklearn.datasets import fetch_olivetti_faces + >>> olivetti_faces = fetch_olivetti_faces() + >>> olivetti_faces.data.shape + (400, 4096) + >>> olivetti_faces.target.shape + (400,) + >>> olivetti_faces.images.shape + (400, 64, 64) + """ + data_home = get_data_home(data_home=data_home) + if not exists(data_home): + makedirs(data_home) + filepath = _pkl_filepath(data_home, "olivetti.pkz") + if not exists(filepath): + if not download_if_missing: + raise OSError("Data not found and `download_if_missing` is False") + + print("downloading Olivetti faces from %s to %s" % (FACES.url, data_home)) + mat_path = _fetch_remote( + FACES, dirname=data_home, n_retries=n_retries, delay=delay + ) + mfile = loadmat(file_name=mat_path) + # delete raw .mat data + remove(mat_path) + + faces = mfile["faces"].T.copy() + joblib.dump(faces, filepath, compress=6) + del mfile + else: + faces = joblib.load(filepath) + + # We want floating point data, but float32 is enough (there is only + # one byte of precision in the original uint8s anyway) + faces = np.float32(faces) + faces = faces - faces.min() + faces /= faces.max() + faces = faces.reshape((400, 64, 64)).transpose(0, 2, 1) + # 10 images per class, 400 images total, each class is contiguous. + target = np.array([i // 10 for i in range(400)]) + if shuffle: + random_state = check_random_state(random_state) + order = random_state.permutation(len(faces)) + faces = faces[order] + target = target[order] + faces_vectorized = faces.reshape(len(faces), -1) + + fdescr = load_descr("olivetti_faces.rst") + + if return_X_y: + return faces_vectorized, target + + return Bunch(data=faces_vectorized, images=faces, target=target, DESCR=fdescr) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_openml.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_openml.py new file mode 100644 index 0000000000000000000000000000000000000000..537f6cde499a2384ba08032af4efe004e9321bce --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_openml.py @@ -0,0 +1,1164 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import gzip +import hashlib +import json +import os +import shutil +import time +from contextlib import closing +from functools import wraps +from os.path import join +from tempfile import TemporaryDirectory +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from urllib.error import HTTPError, URLError +from urllib.parse import urlparse +from urllib.request import Request, urlopen +from warnings import warn + +import numpy as np + +from ..utils import Bunch +from ..utils._optional_dependencies import check_pandas_support +from ..utils._param_validation import ( + Integral, + Interval, + Real, + StrOptions, + validate_params, +) +from . 
import get_data_home +from ._arff_parser import load_arff_from_gzip_file + +__all__ = ["fetch_openml"] + +_SEARCH_NAME = "https://api.openml.org/api/v1/json/data/list/data_name/{}/limit/2" +_DATA_INFO = "https://api.openml.org/api/v1/json/data/{}" +_DATA_FEATURES = "https://api.openml.org/api/v1/json/data/features/{}" +_DATA_QUALITIES = "https://api.openml.org/api/v1/json/data/qualities/{}" + +OpenmlQualitiesType = List[Dict[str, str]] +OpenmlFeaturesType = List[Dict[str, str]] + + +def _get_local_path(openml_path: str, data_home: str) -> str: + return os.path.join(data_home, "openml.org", openml_path + ".gz") + + +def _retry_with_clean_cache( + openml_path: str, + data_home: Optional[str], + no_retry_exception: Optional[Exception] = None, +) -> Callable: + """If the first call to the decorated function fails, the local cached + file is removed, and the function is called again. If ``data_home`` is + ``None``, then the function is called once. We can provide a specific + exception to not retry on using `no_retry_exception` parameter. + """ + + def decorator(f): + @wraps(f) + def wrapper(*args, **kw): + if data_home is None: + return f(*args, **kw) + try: + return f(*args, **kw) + except URLError: + raise + except Exception as exc: + if no_retry_exception is not None and isinstance( + exc, no_retry_exception + ): + raise + warn("Invalid cache, redownloading file", RuntimeWarning) + local_path = _get_local_path(openml_path, data_home) + if os.path.exists(local_path): + os.unlink(local_path) + return f(*args, **kw) + + return wrapper + + return decorator + + +def _retry_on_network_error( + n_retries: int = 3, delay: float = 1.0, url: str = "" +) -> Callable: + """If the function call results in a network error, call the function again + up to ``n_retries`` times with a ``delay`` between each call. If the error + has a 412 status code, don't call the function again as this is a specific + OpenML error. + The url parameter is used to give more information to the user about the + error. + """ + + def decorator(f): + @wraps(f) + def wrapper(*args, **kwargs): + retry_counter = n_retries + while True: + try: + return f(*args, **kwargs) + except (URLError, TimeoutError) as e: + # 412 is a specific OpenML error code. + if isinstance(e, HTTPError) and e.code == 412: + raise + if retry_counter == 0: + raise + warn( + f"A network error occurred while downloading {url}. Retrying..." + ) + # Avoid a ResourceWarning on Python 3.14 and later. + if isinstance(e, HTTPError): + e.close() + + retry_counter -= 1 + time.sleep(delay) + + return wrapper + + return decorator + + +def _open_openml_url( + url: str, data_home: Optional[str], n_retries: int = 3, delay: float = 1.0 +): + """ + Returns a resource from OpenML.org. Caches it to data_home if required. + + Parameters + ---------- + url : str + OpenML URL that will be downloaded and cached locally. The path component + of the URL is used to replicate the tree structure as sub-folders of the local + cache folder. + + data_home : str + Directory to which the files will be cached. If None, no caching will + be applied. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + result : stream + A stream to the OpenML resource. 
+ """ + + def is_gzip_encoded(_fsrc): + return _fsrc.info().get("Content-Encoding", "") == "gzip" + + req = Request(url) + req.add_header("Accept-encoding", "gzip") + + if data_home is None: + fsrc = _retry_on_network_error(n_retries, delay, req.full_url)(urlopen)(req) + if is_gzip_encoded(fsrc): + return gzip.GzipFile(fileobj=fsrc, mode="rb") + return fsrc + + openml_path = urlparse(url).path.lstrip("/") + local_path = _get_local_path(openml_path, data_home) + dir_name, file_name = os.path.split(local_path) + if not os.path.exists(local_path): + os.makedirs(dir_name, exist_ok=True) + try: + # Create a tmpdir as a subfolder of dir_name where the final file will + # be moved to if the download is successful. This guarantees that the + # renaming operation to the final location is atomic to ensure the + # concurrence safety of the dataset caching mechanism. + with TemporaryDirectory(dir=dir_name) as tmpdir: + with closing( + _retry_on_network_error(n_retries, delay, req.full_url)(urlopen)( + req + ) + ) as fsrc: + opener: Callable + if is_gzip_encoded(fsrc): + opener = open + else: + opener = gzip.GzipFile + with opener(os.path.join(tmpdir, file_name), "wb") as fdst: + shutil.copyfileobj(fsrc, fdst) + shutil.move(fdst.name, local_path) + except Exception: + if os.path.exists(local_path): + os.unlink(local_path) + raise + + # XXX: First time, decompression will not be necessary (by using fsrc), but + # it will happen nonetheless + return gzip.GzipFile(local_path, "rb") + + +class OpenMLError(ValueError): + """HTTP 412 is a specific OpenML error code, indicating a generic error""" + + pass + + +def _get_json_content_from_openml_api( + url: str, + error_message: Optional[str], + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> Dict: + """ + Loads json data from the openml api. + + Parameters + ---------- + url : str + The URL to load from. Should be an official OpenML endpoint. + + error_message : str or None + The error message to raise if an acceptable OpenML error is thrown + (acceptable error is, e.g., data id not found. Other errors, like 404's + will throw the native error message). + + data_home : str or None + Location to cache the response. None if no cache is required. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + json_data : json + the json result from the OpenML server if the call was successful. + An exception otherwise. 
+ """ + + @_retry_with_clean_cache(url, data_home=data_home) + def _load_json(): + with closing( + _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) + ) as response: + return json.loads(response.read().decode("utf-8")) + + try: + return _load_json() + except HTTPError as error: + # 412 is an OpenML specific error code, indicating a generic error + # (e.g., data not found) + if error.code != 412: + raise error + + # 412 error, not in except for nicer traceback + raise OpenMLError(error_message) + + +def _get_data_info_by_name( + name: str, + version: Union[int, str], + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +): + """ + Utilizes the openml dataset listing api to find a dataset by + name/version + OpenML api function: + https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name + + Parameters + ---------- + name : str + name of the dataset + + version : int or str + If version is an integer, the exact name/version will be obtained from + OpenML. If version is a string (value: "active") it will take the first + version from OpenML that is annotated as active. Any other string + values except "active" are treated as integer. + + data_home : str or None + Location to cache the response. None if no cache is required. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + Returns + ------- + first_dataset : json + json representation of the first dataset object that adhired to the + search criteria + + """ + if version == "active": + # situation in which we return the oldest active version + url = _SEARCH_NAME.format(name) + "/status/active/" + error_msg = "No active dataset {} found.".format(name) + json_data = _get_json_content_from_openml_api( + url, + error_msg, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + res = json_data["data"]["dataset"] + if len(res) > 1: + first_version = version = res[0]["version"] + warning_msg = ( + "Multiple active versions of the dataset matching the name" + f" {name} exist. Versions may be fundamentally different, " + f"returning version {first_version}. " + "Available versions:\n" + ) + for r in res: + warning_msg += f"- version {r['version']}, status: {r['status']}\n" + warning_msg += ( + f" url: https://www.openml.org/search?type=data&id={r['did']}\n" + ) + warn(warning_msg) + return res[0] + + # an integer version has been provided + url = (_SEARCH_NAME + "/data_version/{}").format(name, version) + try: + json_data = _get_json_content_from_openml_api( + url, + error_message=None, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + except OpenMLError: + # we can do this in 1 function call if OpenML does not require the + # specification of the dataset status (i.e., return datasets with a + # given name / version regardless of active, deactivated, etc. ) + # TODO: feature request OpenML. 
+ url += "/status/deactivated" + error_msg = "Dataset {} with version {} not found.".format(name, version) + json_data = _get_json_content_from_openml_api( + url, + error_msg, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + + return json_data["data"]["dataset"][0] + + +def _get_data_description_by_id( + data_id: int, + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> Dict[str, Any]: + # OpenML API function: https://www.openml.org/api_docs#!/data/get_data_id + url = _DATA_INFO.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api( + url, + error_message, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + return json_data["data_set_description"] + + +def _get_data_features( + data_id: int, + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> OpenmlFeaturesType: + # OpenML function: + # https://www.openml.org/api_docs#!/data/get_data_features_id + url = _DATA_FEATURES.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api( + url, + error_message, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + return json_data["data_features"]["feature"] + + +def _get_data_qualities( + data_id: int, + data_home: Optional[str], + n_retries: int = 3, + delay: float = 1.0, +) -> OpenmlQualitiesType: + # OpenML API function: + # https://www.openml.org/api_docs#!/data/get_data_qualities_id + url = _DATA_QUALITIES.format(data_id) + error_message = "Dataset with data_id {} not found.".format(data_id) + json_data = _get_json_content_from_openml_api( + url, + error_message, + data_home=data_home, + n_retries=n_retries, + delay=delay, + ) + # the qualities might not be available, but we still try to process + # the data + return json_data.get("data_qualities", {}).get("quality", []) + + +def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: + """Get the number of samples from data qualities. + + Parameters + ---------- + data_qualities : list of dict + Used to retrieve the number of instances (samples) in the dataset. + + Returns + ------- + n_samples : int + The number of samples in the dataset or -1 if data qualities are + unavailable. + """ + # If the data qualities are unavailable, we return -1 + default_n_samples = -1 + + qualities = {d["name"]: d["value"] for d in data_qualities} + return int(float(qualities.get("NumberOfInstances", default_n_samples))) + + +def _load_arff_response( + url: str, + data_home: Optional[str], + parser: str, + output_type: str, + openml_columns_info: dict, + feature_names_to_select: List[str], + target_names_to_select: List[str], + shape: Optional[Tuple[int, int]], + md5_checksum: str, + n_retries: int = 3, + delay: float = 1.0, + read_csv_kwargs: Optional[Dict] = None, +): + """Load the ARFF data associated with the OpenML URL. + + In addition of loading the data, this function will also check the + integrity of the downloaded file from OpenML using MD5 checksum. + + Parameters + ---------- + url : str + The URL of the ARFF file on OpenML. + + data_home : str + The location where to cache the data. + + parser : {"liac-arff", "pandas"} + The parser used to parse the ARFF file. + + output_type : {"numpy", "pandas", "sparse"} + The type of the arrays that will be returned. 
The possibilities are: + + - `"numpy"`: both `X` and `y` will be NumPy arrays; + - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; + - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a + pandas Series or DataFrame. + + openml_columns_info : dict + The information provided by OpenML regarding the columns of the ARFF + file. + + feature_names_to_select : list of str + The list of the features to be selected. + + target_names_to_select : list of str + The list of the target variables to be selected. + + shape : tuple or None + With `parser="liac-arff"`, when using a generator to load the data, + one needs to provide the shape of the data beforehand. + + md5_checksum : str + The MD5 checksum provided by OpenML to check the data integrity. + + n_retries : int, default=3 + The number of times to retry downloading the data if it fails. + + delay : float, default=1.0 + The delay between two consecutive downloads in seconds. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. + It allows to overwrite the default options. + + .. versionadded:: 1.3 + + Returns + ------- + X : {ndarray, sparse matrix, dataframe} + The data matrix. + + y : {ndarray, dataframe, series} + The target. + + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) + with closing(gzip_file): + md5 = hashlib.md5() + for chunk in iter(lambda: gzip_file.read(4096), b""): + md5.update(chunk) + actual_md5_checksum = md5.hexdigest() + + if actual_md5_checksum != md5_checksum: + raise ValueError( + f"md5 checksum of local file for {url} does not match description: " + f"expected: {md5_checksum} but got {actual_md5_checksum}. " + "Downloaded file could have been modified / corrupted, clean cache " + "and retry..." + ) + + def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params): + gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) + with closing(gzip_file): + return load_arff_from_gzip_file(gzip_file, **arff_params) + + arff_params: Dict = dict( + parser=parser, + output_type=output_type, + openml_columns_info=openml_columns_info, + feature_names_to_select=feature_names_to_select, + target_names_to_select=target_names_to_select, + shape=shape, + read_csv_kwargs=read_csv_kwargs or {}, + ) + try: + X, y, frame, categories = _open_url_and_load_gzip_file( + url, data_home, n_retries, delay, arff_params + ) + except Exception as exc: + if parser != "pandas": + raise + + from pandas.errors import ParserError + + if not isinstance(exc, ParserError): + raise + + # A parsing error could come from providing the wrong quotechar + # to pandas. By default, we use a double quote. Thus, we retry + # with a single quote before to raise the error. 
+ arff_params["read_csv_kwargs"].update(quotechar="'") + X, y, frame, categories = _open_url_and_load_gzip_file( + url, data_home, n_retries, delay, arff_params + ) + + return X, y, frame, categories + + +def _download_data_to_bunch( + url: str, + sparse: bool, + data_home: Optional[str], + *, + as_frame: bool, + openml_columns_info: List[dict], + data_columns: List[str], + target_columns: List[str], + shape: Optional[Tuple[int, int]], + md5_checksum: str, + n_retries: int = 3, + delay: float = 1.0, + parser: str, + read_csv_kwargs: Optional[Dict] = None, +): + """Download ARFF data, load it to a specific container and create to Bunch. + + This function has a mechanism to retry/cache/clean the data. + + Parameters + ---------- + url : str + The URL of the ARFF file on OpenML. + + sparse : bool + Whether the dataset is expected to use the sparse ARFF format. + + data_home : str + The location where to cache the data. + + as_frame : bool + Whether or not to return the data into a pandas DataFrame. + + openml_columns_info : list of dict + The information regarding the columns provided by OpenML for the + ARFF dataset. The information is stored as a list of dictionaries. + + data_columns : list of str + The list of the features to be selected. + + target_columns : list of str + The list of the target variables to be selected. + + shape : tuple or None + With `parser="liac-arff"`, when using a generator to load the data, + one needs to provide the shape of the data beforehand. + + md5_checksum : str + The MD5 checksum provided by OpenML to check the data integrity. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. Error with status + code 412 won't be retried as they represent OpenML generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + parser : {"liac-arff", "pandas"} + The parser used to parse the ARFF file. + + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv` when using the pandas parser. + It allows to overwrite the default options. + + .. versionadded:: 1.3 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + X : {ndarray, sparse matrix, dataframe} + The data matrix. + y : {ndarray, dataframe, series} + The target. + frame : dataframe or None + A dataframe containing both `X` and `y`. `None` if + `output_array_type != "pandas"`. + categories : list of str or None + The names of the features that are categorical. `None` if + `output_array_type == "pandas"`. + """ + # Prepare which columns and data types should be returned for the X and y + features_dict = {feature["name"]: feature for feature in openml_columns_info} + + if sparse: + output_type = "sparse" + elif as_frame: + output_type = "pandas" + else: + output_type = "numpy" + + # XXX: target columns should all be categorical or all numeric + _verify_target_data_type(features_dict, target_columns) + for name in target_columns: + column_info = features_dict[name] + n_missing_values = int(column_info["number_of_missing_values"]) + if n_missing_values > 0: + raise ValueError( + f"Target column '{column_info['name']}' has {n_missing_values} missing " + "values. Missing values are not supported for target columns." + ) + + no_retry_exception = None + if parser == "pandas": + # If we get a ParserError with pandas, then we don't want to retry and we raise + # early. 
+ from pandas.errors import ParserError + + no_retry_exception = ParserError + + X, y, frame, categories = _retry_with_clean_cache( + url, data_home, no_retry_exception + )(_load_arff_response)( + url, + data_home, + parser=parser, + output_type=output_type, + openml_columns_info=features_dict, + feature_names_to_select=data_columns, + target_names_to_select=target_columns, + shape=shape, + md5_checksum=md5_checksum, + n_retries=n_retries, + delay=delay, + read_csv_kwargs=read_csv_kwargs, + ) + + return Bunch( + data=X, + target=y, + frame=frame, + categories=categories, + feature_names=data_columns, + target_names=target_columns, + ) + + +def _verify_target_data_type(features_dict, target_columns): + # verifies the data type of the y array in case there are multiple targets + # (throws an error if these targets do not comply with sklearn support) + if not isinstance(target_columns, list): + raise ValueError("target_column should be list, got: %s" % type(target_columns)) + found_types = set() + for target_column in target_columns: + if target_column not in features_dict: + raise KeyError(f"Could not find target_column='{target_column}'") + if features_dict[target_column]["data_type"] == "numeric": + found_types.add(np.float64) + else: + found_types.add(object) + + # note: we compare to a string, not boolean + if features_dict[target_column]["is_ignore"] == "true": + warn(f"target_column='{target_column}' has flag is_ignore.") + if features_dict[target_column]["is_row_identifier"] == "true": + warn(f"target_column='{target_column}' has flag is_row_identifier.") + if len(found_types) > 1: + raise ValueError( + "Can only handle homogeneous multi-target datasets, " + "i.e., all targets are either numeric or " + "categorical." + ) + + +def _valid_data_column_names(features_list, target_columns): + # logic for determining on which columns can be learned. Note that from the + # OpenML guide follows that columns that have the `is_row_identifier` or + # `is_ignore` flag, these can not be learned on. Also target columns are + # excluded. + valid_data_column_names = [] + for feature in features_list: + if ( + feature["name"] not in target_columns + and feature["is_ignore"] != "true" + and feature["is_row_identifier"] != "true" + ): + valid_data_column_names.append(feature["name"]) + return valid_data_column_names + + +@validate_params( + { + "name": [str, None], + "version": [Interval(Integral, 1, None, closed="left"), StrOptions({"active"})], + "data_id": [Interval(Integral, 1, None, closed="left"), None], + "data_home": [str, os.PathLike, None], + "target_column": [str, list, None], + "cache": [bool], + "return_X_y": [bool], + "as_frame": [bool, StrOptions({"auto"})], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + "parser": [ + StrOptions({"auto", "pandas", "liac-arff"}), + ], + "read_csv_kwargs": [dict, None], + }, + prefer_skip_nested_validation=True, +) +def fetch_openml( + name: Optional[str] = None, + *, + version: Union[str, int] = "active", + data_id: Optional[int] = None, + data_home: Optional[Union[str, os.PathLike]] = None, + target_column: Optional[Union[str, List]] = "default-target", + cache: bool = True, + return_X_y: bool = False, + as_frame: Union[str, bool] = "auto", + n_retries: int = 3, + delay: float = 1.0, + parser: str = "auto", + read_csv_kwargs: Optional[Dict] = None, +): + """Fetch dataset from openml by name or dataset id. 
+ + Datasets are uniquely identified by either an integer ID or by a + combination of name and version (i.e. there might be multiple + versions of the 'iris' dataset). Please give either name or data_id + (not both). In case a name is given, a version can also be + provided. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + .. note:: EXPERIMENTAL + + The API is experimental (particularly the return value structure), + and might have small backward-incompatible changes without notice + or warning in future releases. + + Parameters + ---------- + name : str, default=None + String identifier of the dataset. Note that OpenML can have multiple + datasets with the same name. + + version : int or 'active', default='active' + Version of the dataset. Can only be provided if also ``name`` is given. + If 'active' the oldest version that's still active is used. Since + there may be more than one active version of a dataset, and those + versions may fundamentally be different from one another, setting an + exact version is highly recommended. + + data_id : int, default=None + OpenML ID of the dataset. The most specific way of retrieving a + dataset. If data_id is not given, name (and potential version) are + used to obtain a dataset. + + data_home : str or path-like, default=None + Specify another download and cache folder for the data sets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + target_column : str, list or None, default='default-target' + Specify the column name in the data to use as target. If + 'default-target', the standard target column a stored on the server + is used. If ``None``, all columns are returned as data and the + target is ``None``. If list (of strings), all columns with these names + are returned as multi-target (Note: not all scikit-learn classifiers + can handle all types of multi-output combinations). + + cache : bool, default=True + Whether to cache the downloaded datasets into `data_home`. + + return_X_y : bool, default=False + If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` objects. + + as_frame : bool or 'auto', default='auto' + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string or categorical). The target is + a pandas DataFrame or Series depending on the number of target_columns. + The Bunch will contain a ``frame`` attribute with the target and the + data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas + DataFrames or Series as describe above. + + If `as_frame` is 'auto', the data and target will be converted to + DataFrame or Series as if `as_frame` is set to True, unless the dataset + is stored in sparse format. + + If `as_frame` is False, the data and target will be NumPy arrays and + the `data` will only contain numerical values when `parser="liac-arff"` + where the categories are provided in the attribute `categories` of the + `Bunch` instance. When `parser="pandas"`, no ordinal encoding is made. + + .. versionchanged:: 0.24 + The default value of `as_frame` changed from `False` to `'auto'` + in 0.24. + + n_retries : int, default=3 + Number of retries when HTTP errors or network timeouts are encountered. + Error with status code 412 won't be retried as they represent OpenML + generic errors. + + delay : float, default=1.0 + Number of seconds between retries. + + parser : {"auto", "pandas", "liac-arff"}, default="auto" + Parser used to load the ARFF file. 
Two parsers are implemented: + + - `"pandas"`: this is the most efficient parser. However, it requires + pandas to be installed and can only open dense datasets. + - `"liac-arff"`: this is a pure Python ARFF parser that is much less + memory- and CPU-efficient. It deals with sparse ARFF datasets. + + If `"auto"`, the parser is chosen automatically such that `"liac-arff"` + is selected for sparse ARFF datasets, otherwise `"pandas"` is selected. + + .. versionadded:: 1.2 + .. versionchanged:: 1.4 + The default value of `parser` changes from `"liac-arff"` to + `"auto"`. + + read_csv_kwargs : dict, default=None + Keyword arguments passed to :func:`pandas.read_csv` when loading the data + from a ARFF file and using the pandas parser. It can allow to + overwrite some default parameters. + + .. versionadded:: 1.3 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame + The feature matrix. Categorical features are encoded as ordinals. + target : np.array, pandas Series or DataFrame + The regression target or classification labels, if applicable. + Dtype is float if numeric, and object if categorical. If + ``as_frame`` is True, ``target`` is a pandas object. + DESCR : str + The full description of the dataset. + feature_names : list + The names of the dataset columns. + target_names: list + The names of the target columns. + + .. versionadded:: 0.22 + + categories : dict or None + Maps each categorical feature name to a list of values, such + that the value encoded as i is ith in the list. If ``as_frame`` + is True, this is None. + details : dict + More metadata from OpenML. + frame : pandas DataFrame + Only present when `as_frame=True`. DataFrame with ``data`` and + ``target``. + + (data, target) : tuple if ``return_X_y`` is True + + .. note:: EXPERIMENTAL + + This interface is **experimental** and subsequent releases may + change attributes without notice (although there should only be + minor changes to ``data`` and ``target``). + + Missing values in the 'data' are represented as NaN's. Missing values + in 'target' are represented as NaN's (numerical target) or None + (categorical target). + + Notes + ----- + The `"pandas"` and `"liac-arff"` parsers can lead to different data types + in the output. The notable differences are the following: + + - The `"liac-arff"` parser always encodes categorical features as `str` objects. + To the contrary, the `"pandas"` parser instead infers the type while + reading and numerical categories will be casted into integers whenever + possible. + - The `"liac-arff"` parser uses float64 to encode numerical features + tagged as 'REAL' and 'NUMERICAL' in the metadata. The `"pandas"` + parser instead infers if these numerical features corresponds + to integers and uses panda's Integer extension dtype. + - In particular, classification datasets with integer categories are + typically loaded as such `(0, 1, ...)` with the `"pandas"` parser while + `"liac-arff"` will force the use of string encoded class labels such as + `"0"`, `"1"` and so on. + - The `"pandas"` parser will not strip single quotes - i.e. `'` - from + string columns. For instance, a string `'my string'` will be kept as is + while the `"liac-arff"` parser will strip the single quotes. For + categorical columns, the single quotes are stripped from the values. 
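+
+    As an illustration (assuming network access; OpenML dataset id 61 is a
+    copy of iris), the two parsers can be compared on the same dataset::
+
+        from sklearn.datasets import fetch_openml
+
+        X_pd, y_pd = fetch_openml(data_id=61, parser="pandas", return_X_y=True)
+        X_liac, y_liac = fetch_openml(
+            data_id=61, parser="liac-arff", return_X_y=True
+        )
+        X_pd.dtypes.equals(X_liac.dtypes)  # may differ, see the points above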
+ + In addition, when `as_frame=False` is used, the `"liac-arff"` parser + returns ordinally encoded data where the categories are provided in the + attribute `categories` of the `Bunch` instance. Instead, `"pandas"` returns + a NumPy array were the categories are not encoded. + + Examples + -------- + >>> from sklearn.datasets import fetch_openml + >>> adult = fetch_openml("adult", version=2) # doctest: +SKIP + >>> adult.frame.info() # doctest: +SKIP + + RangeIndex: 48842 entries, 0 to 48841 + Data columns (total 15 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 age 48842 non-null int64 + 1 workclass 46043 non-null category + 2 fnlwgt 48842 non-null int64 + 3 education 48842 non-null category + 4 education-num 48842 non-null int64 + 5 marital-status 48842 non-null category + 6 occupation 46033 non-null category + 7 relationship 48842 non-null category + 8 race 48842 non-null category + 9 sex 48842 non-null category + 10 capital-gain 48842 non-null int64 + 11 capital-loss 48842 non-null int64 + 12 hours-per-week 48842 non-null int64 + 13 native-country 47985 non-null category + 14 class 48842 non-null category + dtypes: category(9), int64(6) + memory usage: 2.7 MB + """ + if cache is False: + # no caching will be applied + data_home = None + else: + data_home = get_data_home(data_home=data_home) + data_home = join(str(data_home), "openml") + + # check valid function arguments. data_id XOR (name, version) should be + # provided + if name is not None: + # OpenML is case-insensitive, but the caching mechanism is not + # convert all data names (str) to lower case + name = name.lower() + if data_id is not None: + raise ValueError( + "Dataset data_id={} and name={} passed, but you can only " + "specify a numeric data_id or a name, not " + "both.".format(data_id, name) + ) + data_info = _get_data_info_by_name( + name, version, data_home, n_retries=n_retries, delay=delay + ) + data_id = data_info["did"] + elif data_id is not None: + # from the previous if statement, it is given that name is None + if version != "active": + raise ValueError( + "Dataset data_id={} and version={} passed, but you can only " + "specify a numeric data_id or a version, not " + "both.".format(data_id, version) + ) + else: + raise ValueError( + "Neither name nor data_id are provided. Please provide name or data_id." + ) + + data_description = _get_data_description_by_id(data_id, data_home) + if data_description["status"] != "active": + warn( + "Version {} of dataset {} is inactive, meaning that issues have " + "been found in the dataset. Try using a newer version from " + "this URL: {}".format( + data_description["version"], + data_description["name"], + data_description["url"], + ) + ) + if "error" in data_description: + warn( + "OpenML registered a problem with the dataset. It might be " + "unusable. Error: {}".format(data_description["error"]) + ) + if "warning" in data_description: + warn( + "OpenML raised a warning on the dataset. It might be " + "unusable. Warning: {}".format(data_description["warning"]) + ) + + return_sparse = data_description["format"].lower() == "sparse_arff" + as_frame = not return_sparse if as_frame == "auto" else as_frame + if parser == "auto": + parser_ = "liac-arff" if return_sparse else "pandas" + else: + parser_ = parser + + if parser_ == "pandas": + try: + check_pandas_support("`fetch_openml`") + except ImportError as exc: + if as_frame: + err_msg = ( + "Returning pandas objects requires pandas to be installed. 
" + "Alternatively, explicitly set `as_frame=False` and " + "`parser='liac-arff'`." + ) + else: + err_msg = ( + f"Using `parser={parser!r}` with dense data requires pandas to be " + "installed. Alternatively, explicitly set `parser='liac-arff'`." + ) + raise ImportError(err_msg) from exc + + if return_sparse: + if as_frame: + raise ValueError( + "Sparse ARFF datasets cannot be loaded with as_frame=True. " + "Use as_frame=False or as_frame='auto' instead." + ) + if parser_ == "pandas": + raise ValueError( + f"Sparse ARFF datasets cannot be loaded with parser={parser!r}. " + "Use parser='liac-arff' or parser='auto' instead." + ) + + # download data features, meta-info about column types + features_list = _get_data_features(data_id, data_home) + + if not as_frame: + for feature in features_list: + if "true" in (feature["is_ignore"], feature["is_row_identifier"]): + continue + if feature["data_type"] == "string": + raise ValueError( + "STRING attributes are not supported for " + "array representation. Try as_frame=True" + ) + + if target_column == "default-target": + # determines the default target based on the data feature results + # (which is currently more reliable than the data description; + # see issue: https://github.com/openml/OpenML/issues/768) + target_columns = [ + feature["name"] + for feature in features_list + if feature["is_target"] == "true" + ] + elif isinstance(target_column, str): + # for code-simplicity, make target_column by default a list + target_columns = [target_column] + elif target_column is None: + target_columns = [] + else: + # target_column already is of type list + target_columns = target_column + data_columns = _valid_data_column_names(features_list, target_columns) + + shape: Optional[Tuple[int, int]] + # determine arff encoding to return + if not return_sparse: + # The shape must include the ignored features to keep the right indexes + # during the arff data conversion. + data_qualities = _get_data_qualities(data_id, data_home) + shape = _get_num_samples(data_qualities), len(features_list) + else: + shape = None + + # obtain the data + url = data_description["url"] + bunch = _download_data_to_bunch( + url, + return_sparse, + data_home, + as_frame=bool(as_frame), + openml_columns_info=features_list, + shape=shape, + target_columns=target_columns, + data_columns=data_columns, + md5_checksum=data_description["md5_checksum"], + n_retries=n_retries, + delay=delay, + parser=parser_, + read_csv_kwargs=read_csv_kwargs, + ) + + if return_X_y: + return bunch.data, bunch.target + + description = "{}\n\nDownloaded from openml.org.".format( + data_description.pop("description") + ) + + bunch.update( + DESCR=description, + details=data_description, + url="https://www.openml.org/d/{}".format(data_id), + ) + + return bunch diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_rcv1.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_rcv1.py new file mode 100644 index 0000000000000000000000000000000000000000..b673f938f0e46f180e6cbd9235cc79b21fde1154 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_rcv1.py @@ -0,0 +1,334 @@ +"""RCV1 dataset. 
+ +The dataset page is available at + + http://jmlr.csail.mit.edu/papers/volume5/lewis04a/ +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +from gzip import GzipFile +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists, join + +import joblib +import numpy as np +import scipy.sparse as sp + +from ..utils import Bunch +from ..utils import shuffle as shuffle_ +from ..utils._param_validation import Interval, StrOptions, validate_params +from . import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath, load_descr +from ._svmlight_format_io import load_svmlight_files + +# The original vectorized data can be found at: +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt0.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt1.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt2.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_test_pt3.dat.gz +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a13-vector-files/lyrl2004_vectors_train.dat.gz +# while the original stemmed token files can be found +# in the README, section B.12.i.: +# http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm +XY_METADATA = ( + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976069", + checksum="ed40f7e418d10484091b059703eeb95ae3199fe042891dcec4be6696b9968374", + filename="lyrl2004_vectors_test_pt0.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976066", + checksum="87700668ae45d45d5ca1ef6ae9bd81ab0f5ec88cc95dcef9ae7838f727a13aa6", + filename="lyrl2004_vectors_test_pt1.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976063", + checksum="48143ac703cbe33299f7ae9f4995db49a258690f60e5debbff8995c34841c7f5", + filename="lyrl2004_vectors_test_pt2.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976060", + checksum="dfcb0d658311481523c6e6ca0c3f5a3e1d3d12cde5d7a8ce629a9006ec7dbb39", + filename="lyrl2004_vectors_test_pt3.dat.gz", + ), + RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976057", + checksum="5468f656d0ba7a83afc7ad44841cf9a53048a5c083eedc005dcdb5cc768924ae", + filename="lyrl2004_vectors_train.dat.gz", + ), +) + +# The original data can be found at: +# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz +TOPICS_METADATA = RemoteFileMetadata( + url="https://ndownloader.figshare.com/files/5976048", + checksum="2a98e5e5d8b770bded93afc8930d88299474317fe14181aee1466cc754d0d1c1", + filename="rcv1v2.topics.qrels.gz", +) + +logger = logging.getLogger(__name__) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "subset": [StrOptions({"train", "test", "all"})], + "download_if_missing": ["boolean"], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_rcv1( + *, + data_home=None, + subset="all", + download_if_missing=True, + random_state=None, + shuffle=False, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the 
RCV1 multilabel dataset (classification). + + Download it if necessary. + + Version: RCV1-v2, vectors, full sets, topics multilabels. + + ================= ===================== + Classes 103 + Samples total 804414 + Dimensionality 47236 + Features real, between 0 and 1 + ================= ===================== + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + subset : {'train', 'test', 'all'}, default='all' + Select the dataset to load: 'train' for the training set + (23149 samples), 'test' for the test set (781265 samples), + 'all' for both, with the training samples first if shuffle is False. + This follows the official LYRL2004 chronological split. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=False + Whether to shuffle dataset. + + return_X_y : bool, default=False + If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch + object. See below for more information about the `dataset.data` and + `dataset.target` object. + + .. versionadded:: 0.20 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + dataset : :class:`~sklearn.utils.Bunch` + Dictionary-like object. Returned only if `return_X_y` is False. + `dataset` has the following attributes: + + - data : sparse matrix of shape (804414, 47236), dtype=np.float64 + The array has 0.16% of non zero values. Will be of CSR format. + - target : sparse matrix of shape (804414, 103), dtype=np.uint8 + Each sample has a value of 1 in its categories, and 0 in others. + The array has 3.15% of non zero values. Will be of CSR format. + - sample_id : ndarray of shape (804414,), dtype=np.uint32, + Identification number of each sample, as ordered in dataset.data. + - target_names : ndarray of shape (103,), dtype=object + Names of each target (RCV1 topics), as ordered in dataset.target. + - DESCR : str + Description of the RCV1 dataset. + + (data, target) : tuple + A tuple consisting of `dataset.data` and `dataset.target`, as + described above. Returned only if `return_X_y` is True. + + .. 
versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_rcv1 + >>> rcv1 = fetch_rcv1() + >>> rcv1.data.shape + (804414, 47236) + >>> rcv1.target.shape + (804414, 103) + """ + N_SAMPLES = 804414 + N_FEATURES = 47236 + N_CATEGORIES = 103 + N_TRAIN = 23149 + + data_home = get_data_home(data_home=data_home) + rcv1_dir = join(data_home, "RCV1") + if download_if_missing: + if not exists(rcv1_dir): + makedirs(rcv1_dir) + + samples_path = _pkl_filepath(rcv1_dir, "samples.pkl") + sample_id_path = _pkl_filepath(rcv1_dir, "sample_id.pkl") + sample_topics_path = _pkl_filepath(rcv1_dir, "sample_topics.pkl") + topics_path = _pkl_filepath(rcv1_dir, "topics_names.pkl") + + # load data (X) and sample_id + if download_if_missing and (not exists(samples_path) or not exists(sample_id_path)): + files = [] + for each in XY_METADATA: + logger.info("Downloading %s" % each.url) + file_path = _fetch_remote( + each, dirname=rcv1_dir, n_retries=n_retries, delay=delay + ) + files.append(GzipFile(filename=file_path)) + + Xy = load_svmlight_files(files, n_features=N_FEATURES) + + # Training data is before testing data + X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr() + sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7])) + sample_id = sample_id.astype(np.uint32, copy=False) + + joblib.dump(X, samples_path, compress=9) + joblib.dump(sample_id, sample_id_path, compress=9) + + # delete archives + for f in files: + f.close() + remove(f.name) + else: + X = joblib.load(samples_path) + sample_id = joblib.load(sample_id_path) + + # load target (y), categories, and sample_id_bis + if download_if_missing and ( + not exists(sample_topics_path) or not exists(topics_path) + ): + logger.info("Downloading %s" % TOPICS_METADATA.url) + topics_archive_path = _fetch_remote( + TOPICS_METADATA, dirname=rcv1_dir, n_retries=n_retries, delay=delay + ) + + # parse the target file + n_cat = -1 + n_doc = -1 + doc_previous = -1 + y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8) + sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32) + category_names = {} + with GzipFile(filename=topics_archive_path, mode="rb") as f: + for line in f: + line_components = line.decode("ascii").split(" ") + if len(line_components) == 3: + cat, doc, _ = line_components + if cat not in category_names: + n_cat += 1 + category_names[cat] = n_cat + + doc = int(doc) + if doc != doc_previous: + doc_previous = doc + n_doc += 1 + sample_id_bis[n_doc] = doc + y[n_doc, category_names[cat]] = 1 + + # delete archive + remove(topics_archive_path) + + # Samples in X are ordered with sample_id, + # whereas in y, they are ordered with sample_id_bis. + permutation = _find_permutation(sample_id_bis, sample_id) + y = y[permutation, :] + + # save category names in a list, with same order than y + categories = np.empty(N_CATEGORIES, dtype=object) + for k in category_names.keys(): + categories[category_names[k]] = k + + # reorder categories in lexicographic order + order = np.argsort(categories) + categories = categories[order] + y = sp.csr_matrix(y[:, order]) + + joblib.dump(y, sample_topics_path, compress=9) + joblib.dump(categories, topics_path, compress=9) + else: + y = joblib.load(sample_topics_path) + categories = joblib.load(topics_path) + + if subset == "all": + pass + elif subset == "train": + X = X[:N_TRAIN, :] + y = y[:N_TRAIN, :] + sample_id = sample_id[:N_TRAIN] + elif subset == "test": + X = X[N_TRAIN:, :] + y = y[N_TRAIN:, :] + sample_id = sample_id[N_TRAIN:] + else: + raise ValueError( + "Unknown subset parameter. 
Got '%s' instead of one" + " of ('all', 'train', test')" % subset + ) + + if shuffle: + X, y, sample_id = shuffle_(X, y, sample_id, random_state=random_state) + + fdescr = load_descr("rcv1.rst") + + if return_X_y: + return X, y + + return Bunch( + data=X, target=y, sample_id=sample_id, target_names=categories, DESCR=fdescr + ) + + +def _inverse_permutation(p): + """Inverse permutation p.""" + n = p.size + s = np.zeros(n, dtype=np.int32) + i = np.arange(n, dtype=np.int32) + np.put(s, p, i) # s[p] = i + return s + + +def _find_permutation(a, b): + """Find the permutation from a to b.""" + t = np.argsort(a) + u = np.argsort(b) + u_ = _inverse_permutation(u) + return t[u_] diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_samples_generator.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_samples_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..c3b4622d6a91bc579b505fa5dd8dd429de563198 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_samples_generator.py @@ -0,0 +1,2383 @@ +""" +Generate samples of synthetic data sets. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import numbers +from collections.abc import Iterable +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy import linalg + +from sklearn.utils import Bunch + +from ..preprocessing import MultiLabelBinarizer +from ..utils import check_array, check_random_state +from ..utils import shuffle as util_shuffle +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.random import sample_without_replacement + + +def _generate_hypercube(samples, dimensions, rng): + """Returns distinct binary samples of length dimensions.""" + if dimensions > 30: + return np.hstack( + [ + rng.randint(2, size=(samples, dimensions - 30)), + _generate_hypercube(samples, 30, rng), + ] + ) + out = sample_without_replacement(2**dimensions, samples, random_state=rng).astype( + dtype=">u4", copy=False + ) + out = np.unpackbits(out.view(">u1")).reshape((-1, 32))[:, -dimensions:] + return out + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_informative": [Interval(Integral, 1, None, closed="left")], + "n_redundant": [Interval(Integral, 0, None, closed="left")], + "n_repeated": [Interval(Integral, 0, None, closed="left")], + "n_classes": [Interval(Integral, 1, None, closed="left")], + "n_clusters_per_class": [Interval(Integral, 1, None, closed="left")], + "weights": ["array-like", None], + "flip_y": [Interval(Real, 0, 1, closed="both")], + "class_sep": [Interval(Real, 0, None, closed="neither")], + "hypercube": ["boolean"], + "shift": [Interval(Real, None, None, closed="neither"), "array-like", None], + "scale": [Interval(Real, 0, None, closed="neither"), "array-like", None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "return_X_y": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_classification( + n_samples=100, + n_features=20, + *, + n_informative=2, + n_redundant=2, + n_repeated=0, + n_classes=2, + n_clusters_per_class=2, + weights=None, + flip_y=0.01, + class_sep=1.0, + hypercube=True, + shift=0.0, + scale=1.0, + shuffle=True, + random_state=None, + return_X_y=True, +): + """Generate a random n-class classification problem. 
+ + This initially creates clusters of points normally distributed (std=1) + about vertices of an ``n_informative``-dimensional hypercube with sides of + length ``2*class_sep`` and assigns an equal number of clusters to each + class. It introduces interdependence between these features and adds + various types of further noise to the data. + + Without shuffling, ``X`` horizontally stacks features in the following + order: the primary ``n_informative`` features, followed by ``n_redundant`` + linear combinations of the informative features, followed by ``n_repeated`` + duplicates, drawn randomly with replacement from the informative and + redundant features. The remaining features are filled with random noise. + Thus, without shuffling, all useful features are contained in the columns + ``X[:, :n_informative + n_redundant + n_repeated]``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=20 + The total number of features. These comprise ``n_informative`` + informative features, ``n_redundant`` redundant features, + ``n_repeated`` duplicated features and + ``n_features-n_informative-n_redundant-n_repeated`` useless features + drawn at random. + + n_informative : int, default=2 + The number of informative features. Each class is composed of a number + of gaussian clusters each located around the vertices of a hypercube + in a subspace of dimension ``n_informative``. For each cluster, + informative features are drawn independently from N(0, 1) and then + randomly linearly combined within each cluster in order to add + covariance. The clusters are then placed on the vertices of the + hypercube. + + n_redundant : int, default=2 + The number of redundant features. These features are generated as + random linear combinations of the informative features. + + n_repeated : int, default=0 + The number of duplicated features, drawn randomly from the informative + and the redundant features. + + n_classes : int, default=2 + The number of classes (or labels) of the classification problem. + + n_clusters_per_class : int, default=2 + The number of clusters per class. + + weights : array-like of shape (n_classes,) or (n_classes - 1,),\ + default=None + The proportions of samples assigned to each class. If None, then + classes are balanced. Note that if ``len(weights) == n_classes - 1``, + then the last class weight is automatically inferred. + More than ``n_samples`` samples may be returned if the sum of + ``weights`` exceeds 1. Note that the actual class proportions will + not exactly match ``weights`` when ``flip_y`` isn't 0. + + flip_y : float, default=0.01 + The fraction of samples whose class is assigned randomly. Larger + values introduce noise in the labels and make the classification + task harder. Note that the default setting flip_y > 0 might lead + to less than ``n_classes`` in y in some cases. + + class_sep : float, default=1.0 + The factor multiplying the hypercube size. Larger values spread + out the clusters/classes and make the classification task easier. + + hypercube : bool, default=True + If True, the clusters are put on the vertices of a hypercube. If + False, the clusters are put on the vertices of a random polytope. + + shift : float, ndarray of shape (n_features,) or None, default=0.0 + Shift features by the specified value. If None, then features + are shifted by a random value drawn in [-class_sep, class_sep]. 
+ + scale : float, ndarray of shape (n_features,) or None, default=1.0 + Multiply features by the specified value. If None, then features + are scaled by a random value drawn in [1, 100]. Note that scaling + happens after shifting. + + shuffle : bool, default=True + Shuffle the samples and the features. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + return_X_y : bool, default=True + If True, a tuple ``(X, y)`` instead of a Bunch object is returned. + + .. versionadded:: 1.7 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` if `return_X_y` is `False`. + Dictionary-like object, with the following attributes. + + DESCR : str + A description of the function that generated the dataset. + parameter : dict + A dictionary that stores the values of the arguments passed to the + generator function. + feature_info : list of len(n_features) + A description for each generated feature. + X : ndarray of shape (n_samples, n_features) + The generated samples. + y : ndarray of shape (n_samples,) + An integer label for class membership of each sample. + + .. versionadded:: 1.7 + + (X, y) : tuple if ``return_X_y`` is True + A tuple of generated samples and labels. + + See Also + -------- + make_blobs : Simplified variant. + make_multilabel_classification : Unrelated generator for multilabel tasks. + + Notes + ----- + The algorithm is adapted from Guyon [1] and was designed to generate + the "Madelon" dataset. + + References + ---------- + .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable + selection benchmark", 2003. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(random_state=42) + >>> X.shape + (100, 20) + >>> y.shape + (100,) + >>> list(y[:5]) + [np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0)] + """ + generator = check_random_state(random_state) + + # Count features, clusters and samples + if n_informative + n_redundant + n_repeated > n_features: + raise ValueError( + "Number of informative, redundant and repeated " + "features must sum to less than the number of total" + " features" + ) + # Use log2 to avoid overflow errors + if n_informative < np.log2(n_classes * n_clusters_per_class): + msg = "n_classes({}) * n_clusters_per_class({}) must be" + msg += " smaller or equal 2**n_informative({})={}" + raise ValueError( + msg.format(n_classes, n_clusters_per_class, n_informative, 2**n_informative) + ) + + if weights is not None: + # we define new variable, weight_, instead of modifying user defined parameter. + if len(weights) not in [n_classes, n_classes - 1]: + raise ValueError( + "Weights specified but incompatible with number of classes." 
+ ) + if len(weights) == n_classes - 1: + if isinstance(weights, list): + weights_ = weights + [1.0 - sum(weights)] + else: + weights_ = np.resize(weights, n_classes) + weights_[-1] = 1.0 - sum(weights_[:-1]) + else: + weights_ = weights.copy() + else: + weights_ = [1.0 / n_classes] * n_classes + + n_random = n_features - n_informative - n_redundant - n_repeated + n_clusters = n_classes * n_clusters_per_class + + # Distribute samples among clusters by weight + n_samples_per_cluster = [ + int(n_samples * weights_[k % n_classes] / n_clusters_per_class) + for k in range(n_clusters) + ] + + for i in range(n_samples - sum(n_samples_per_cluster)): + n_samples_per_cluster[i % n_clusters] += 1 + + # Initialize X and y + X = np.zeros((n_samples, n_features)) + y = np.zeros(n_samples, dtype=int) + + # Build the polytope whose vertices become cluster centroids + centroids = _generate_hypercube(n_clusters, n_informative, generator).astype( + float, copy=False + ) + centroids *= 2 * class_sep + centroids -= class_sep + if not hypercube: + centroids *= generator.uniform(size=(n_clusters, 1)) + centroids *= generator.uniform(size=(1, n_informative)) + + # Initially draw informative features from the standard normal + X[:, :n_informative] = generator.standard_normal(size=(n_samples, n_informative)) + + # Create each cluster; a variant of make_blobs + stop = 0 + for k, centroid in enumerate(centroids): + start, stop = stop, stop + n_samples_per_cluster[k] + y[start:stop] = k % n_classes # assign labels + X_k = X[start:stop, :n_informative] # slice a view of the cluster + + A = 2 * generator.uniform(size=(n_informative, n_informative)) - 1 + X_k[...] = np.dot(X_k, A) # introduce random covariance + + X_k += centroid # shift the cluster to a vertex + + # Create redundant features + if n_redundant > 0: + B = 2 * generator.uniform(size=(n_informative, n_redundant)) - 1 + X[:, n_informative : n_informative + n_redundant] = np.dot( + X[:, :n_informative], B + ) + + # Repeat some features + n = n_informative + n_redundant + if n_repeated > 0: + indices = ((n - 1) * generator.uniform(size=n_repeated) + 0.5).astype(np.intp) + X[:, n : n + n_repeated] = X[:, indices] + + # Fill useless features + if n_random > 0: + X[:, -n_random:] = generator.standard_normal(size=(n_samples, n_random)) + + # Randomly replace labels + if flip_y >= 0.0: + flip_mask = generator.uniform(size=n_samples) < flip_y + y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum()) + + # Randomly shift and scale + if shift is None: + shift = (2 * generator.uniform(size=n_features) - 1) * class_sep + X += shift + + if scale is None: + scale = 1 + 100 * generator.uniform(size=n_features) + X *= scale + + indices = np.arange(n_features) + if shuffle: + # Randomly permute samples + X, y = util_shuffle(X, y, random_state=generator) + + # Randomly permute features + generator.shuffle(indices) + X[:, :] = X[:, indices] + + if return_X_y: + return X, y + + # feat_desc describes features in X + feat_desc = ["random"] * n_features + for i, index in enumerate(indices): + if index < n_informative: + feat_desc[i] = "informative" + elif n_informative <= index < n_informative + n_redundant: + feat_desc[i] = "redundant" + elif n <= index < n + n_repeated: + feat_desc[i] = "repeated" + + parameters = { + "n_samples": n_samples, + "n_features": n_features, + "n_informative": n_informative, + "n_redundant": n_redundant, + "n_repeated": n_repeated, + "n_classes": n_classes, + "n_clusters_per_class": n_clusters_per_class, + "weights": weights, + "flip_y": 
flip_y, + "class_sep": class_sep, + "hypercube": hypercube, + "shift": shift, + "scale": scale, + "shuffle": shuffle, + "random_state": random_state, + "return_X_y": return_X_y, + } + + bunch = Bunch( + DESCR=make_classification.__doc__, + parameters=parameters, + feature_info=feat_desc, + X=X, + y=y, + ) + + return bunch + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_classes": [Interval(Integral, 1, None, closed="left")], + "n_labels": [Interval(Integral, 0, None, closed="left")], + "length": [Interval(Integral, 1, None, closed="left")], + "allow_unlabeled": ["boolean"], + "sparse": ["boolean"], + "return_indicator": [StrOptions({"dense", "sparse"}), "boolean"], + "return_distributions": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_multilabel_classification( + n_samples=100, + n_features=20, + *, + n_classes=5, + n_labels=2, + length=50, + allow_unlabeled=True, + sparse=False, + return_indicator="dense", + return_distributions=False, + random_state=None, +): + """Generate a random multilabel classification problem. + + For each sample, the generative process is: + - pick the number of labels: n ~ Poisson(n_labels) + - n times, choose a class c: c ~ Multinomial(theta) + - pick the document length: k ~ Poisson(length) + - k times, choose a word: w ~ Multinomial(theta_c) + + In the above process, rejection sampling is used to make sure that + n is never zero or more than `n_classes`, and that the document length + is never zero. Likewise, we reject classes which have already been chosen. + + For an example of usage, see + :ref:`sphx_glr_auto_examples_datasets_plot_random_multilabel_dataset.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=20 + The total number of features. + + n_classes : int, default=5 + The number of classes of the classification problem. + + n_labels : int, default=2 + The average number of labels per instance. More precisely, the number + of labels per sample is drawn from a Poisson distribution with + ``n_labels`` as its expected value, but samples are bounded (using + rejection sampling) by ``n_classes``, and must be nonzero if + ``allow_unlabeled`` is False. + + length : int, default=50 + The sum of the features (number of words if documents) is drawn from + a Poisson distribution with this expected value. + + allow_unlabeled : bool, default=True + If ``True``, some instances might not belong to any class. + + sparse : bool, default=False + If ``True``, return a sparse feature matrix. + + .. versionadded:: 0.17 + parameter to allow *sparse* output. + + return_indicator : {'dense', 'sparse'} or False, default='dense' + If ``'dense'`` return ``Y`` in the dense binary indicator format. If + ``'sparse'`` return ``Y`` in the sparse binary indicator format. + ``False`` returns a list of lists of labels. + + return_distributions : bool, default=False + If ``True``, return the prior class probability and conditional + probabilities of features given classes, from which the data was + drawn. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The generated samples. 
+ + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + The label sets. Sparse matrix should be of CSR format. + + p_c : ndarray of shape (n_classes,) + The probability of each class being drawn. Only returned if + ``return_distributions=True``. + + p_w_c : ndarray of shape (n_features, n_classes) + The probability of each feature being drawn given each class. + Only returned if ``return_distributions=True``. + + Examples + -------- + >>> from sklearn.datasets import make_multilabel_classification + >>> X, y = make_multilabel_classification(n_labels=3, random_state=42) + >>> X.shape + (100, 20) + >>> y.shape + (100, 5) + >>> list(y[:3]) + [array([1, 1, 0, 1, 0]), array([0, 1, 1, 1, 0]), array([0, 1, 0, 0, 0])] + """ + + generator = check_random_state(random_state) + p_c = generator.uniform(size=n_classes) + p_c /= p_c.sum() + cumulative_p_c = np.cumsum(p_c) + p_w_c = generator.uniform(size=(n_features, n_classes)) + p_w_c /= np.sum(p_w_c, axis=0) + + def sample_example(): + _, n_classes = p_w_c.shape + + # pick a nonzero number of labels per document by rejection sampling + y_size = n_classes + 1 + while (not allow_unlabeled and y_size == 0) or y_size > n_classes: + y_size = generator.poisson(n_labels) + + # pick n classes + y = set() + while len(y) != y_size: + # pick a class with probability P(c) + c = np.searchsorted(cumulative_p_c, generator.uniform(size=y_size - len(y))) + y.update(c) + y = list(y) + + # pick a non-zero document length by rejection sampling + n_words = 0 + while n_words == 0: + n_words = generator.poisson(length) + + # generate a document of length n_words + if len(y) == 0: + # if sample does not belong to any class, generate noise word + words = generator.randint(n_features, size=n_words) + return words, y + + # sample words with replacement from selected classes + cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum() + cumulative_p_w_sample /= cumulative_p_w_sample[-1] + words = np.searchsorted(cumulative_p_w_sample, generator.uniform(size=n_words)) + return words, y + + X_indices = array.array("i") + X_indptr = array.array("i", [0]) + Y = [] + for i in range(n_samples): + words, y = sample_example() + X_indices.extend(words) + X_indptr.append(len(X_indices)) + Y.append(y) + X_data = np.ones(len(X_indices), dtype=np.float64) + X = sp.csr_matrix((X_data, X_indices, X_indptr), shape=(n_samples, n_features)) + X.sum_duplicates() + if not sparse: + X = X.toarray() + + # return_indicator can be True due to backward compatibility + if return_indicator in (True, "sparse", "dense"): + lb = MultiLabelBinarizer(sparse_output=(return_indicator == "sparse")) + Y = lb.fit([range(n_classes)]).transform(Y) + if return_distributions: + return X, Y, p_c, p_w_c + return X, Y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_hastie_10_2(n_samples=12000, *, random_state=None): + """Generate data for binary classification used in Hastie et al. 2009, Example 10.2. + + The ten features are standard independent Gaussian and + the target ``y`` is defined by:: + + y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=12000 + The number of samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. 
+ See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 10) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + See Also + -------- + make_gaussian_quantiles : A generalization of this dataset approach. + + References + ---------- + .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical + Learning Ed. 2", Springer, 2009. + + Examples + -------- + >>> from sklearn.datasets import make_hastie_10_2 + >>> X, y = make_hastie_10_2(n_samples=24000, random_state=42) + >>> X.shape + (24000, 10) + >>> y.shape + (24000,) + >>> list(y[:5]) + [np.float64(-1.0), np.float64(1.0), np.float64(-1.0), np.float64(1.0), + np.float64(-1.0)] + """ + rs = check_random_state(random_state) + + shape = (n_samples, 10) + X = rs.normal(size=shape).reshape(shape) + y = ((X**2.0).sum(axis=1) > 9.34).astype(np.float64, copy=False) + y[y == 0.0] = -1.0 + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_informative": [Interval(Integral, 0, None, closed="left")], + "n_targets": [Interval(Integral, 1, None, closed="left")], + "bias": [Interval(Real, None, None, closed="neither")], + "effective_rank": [Interval(Integral, 1, None, closed="left"), None], + "tail_strength": [Interval(Real, 0, 1, closed="both")], + "noise": [Interval(Real, 0, None, closed="left")], + "shuffle": ["boolean"], + "coef": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_regression( + n_samples=100, + n_features=100, + *, + n_informative=10, + n_targets=1, + bias=0.0, + effective_rank=None, + tail_strength=0.5, + noise=0.0, + shuffle=True, + coef=False, + random_state=None, +): + """Generate a random regression problem. + + The input set can either be well conditioned (by default) or have a low + rank-fat tail singular profile. See :func:`make_low_rank_matrix` for + more details. + + The output is generated by applying a (potentially biased) random linear + regression model with `n_informative` nonzero regressors to the previously + generated input and some gaussian centered noise with some adjustable + scale. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=100 + The number of features. + + n_informative : int, default=10 + The number of informative features, i.e., the number of features used + to build the linear model used to generate the output. + + n_targets : int, default=1 + The number of regression targets, i.e., the dimension of the y output + vector associated with a sample. By default, the output is a scalar. + + bias : float, default=0.0 + The bias term in the underlying linear model. + + effective_rank : int, default=None + If not None: + The approximate number of singular vectors required to explain most + of the input data by linear combinations. Using this kind of + singular spectrum in the input allows the generator to reproduce + the correlations often observed in practice. + If None: + The input set is well conditioned, centered and gaussian with + unit variance. + + tail_strength : float, default=0.5 + The relative importance of the fat noisy tail of the singular values + profile if `effective_rank` is not None. When a float, it should be + between 0 and 1. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. 
+ + shuffle : bool, default=True + Shuffle the samples and the features. + + coef : bool, default=False + If True, the coefficients of the underlying linear model are returned. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The input samples. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + The output values. + + coef : ndarray of shape (n_features,) or (n_features, n_targets) + The coefficient of the underlying linear model. It is returned only if + coef is True. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=5, n_features=2, noise=1, random_state=42) + >>> X + array([[ 0.4967, -0.1382 ], + [ 0.6476, 1.523], + [-0.2341, -0.2341], + [-0.4694, 0.5425], + [ 1.579, 0.7674]]) + >>> y + array([ 6.737, 37.79, -10.27, 0.4017, 42.22]) + """ + n_informative = min(n_features, n_informative) + generator = check_random_state(random_state) + + if effective_rank is None: + # Randomly generate a well conditioned input set + X = generator.standard_normal(size=(n_samples, n_features)) + + else: + # Randomly generate a low rank, fat tail input set + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=effective_rank, + tail_strength=tail_strength, + random_state=generator, + ) + + # Generate a ground truth model with only n_informative features being non + # zeros (the other features are not correlated to y and should be ignored + # by a sparsifying regularizers such as L1 or elastic net) + ground_truth = np.zeros((n_features, n_targets)) + ground_truth[:n_informative, :] = 100 * generator.uniform( + size=(n_informative, n_targets) + ) + + y = np.dot(X, ground_truth) + bias + + # Add noise + if noise > 0.0: + y += generator.normal(scale=noise, size=y.shape) + + # Randomly permute samples and features + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + indices = np.arange(n_features) + generator.shuffle(indices) + X[:, :] = X[:, indices] + ground_truth = ground_truth[indices] + + y = np.squeeze(y) + + if coef: + return X, y, np.squeeze(ground_truth) + + else: + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 0, None, closed="left"), tuple], + "shuffle": ["boolean"], + "noise": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + "factor": [Interval(Real, 0, 1, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def make_circles( + n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=0.8 +): + """Make a large circle containing a smaller circle in 2d. + + A simple toy dataset to visualize clustering and classification + algorithms. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or tuple of shape (2,), dtype=int, default=100 + If int, it is the total number of points generated. + For odd numbers, the inner circle will have one point more than the + outer circle. + If two-element tuple, number of points in outer circle and inner + circle. + + .. versionchanged:: 0.23 + Added two-element tuple. + + shuffle : bool, default=True + Whether to shuffle the samples. + + noise : float, default=None + Standard deviation of Gaussian noise added to the data. 
+ + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling and noise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + factor : float, default=.8 + Scale factor between inner and outer circle in the range `[0, 1)`. + + Returns + ------- + X : ndarray of shape (n_samples, 2) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels (0 or 1) for class membership of each sample. + + Examples + -------- + >>> from sklearn.datasets import make_circles + >>> X, y = make_circles(random_state=42) + >>> X.shape + (100, 2) + >>> y.shape + (100,) + >>> list(y[:5]) + [np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(0)] + """ + if isinstance(n_samples, numbers.Integral): + n_samples_out = n_samples // 2 + n_samples_in = n_samples - n_samples_out + else: # n_samples is a tuple + if len(n_samples) != 2: + raise ValueError("When a tuple, n_samples must have exactly two elements.") + n_samples_out, n_samples_in = n_samples + + generator = check_random_state(random_state) + # so as not to have the first point = last point, we set endpoint=False + linspace_out = np.linspace(0, 2 * np.pi, n_samples_out, endpoint=False) + linspace_in = np.linspace(0, 2 * np.pi, n_samples_in, endpoint=False) + outer_circ_x = np.cos(linspace_out) + outer_circ_y = np.sin(linspace_out) + inner_circ_x = np.cos(linspace_in) * factor + inner_circ_y = np.sin(linspace_in) * factor + + X = np.vstack( + [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)] + ).T + y = np.hstack( + [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)] + ) + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + if noise is not None: + X += generator.normal(scale=noise, size=X.shape) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left"), tuple], + "shuffle": ["boolean"], + "noise": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): + """Make two interleaving half circles. + + A simple toy dataset to visualize clustering and classification + algorithms. Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or tuple of shape (2,), dtype=int, default=100 + If int, the total number of points generated. + If two-element tuple, number of points in each of two moons. + + .. versionchanged:: 0.23 + Added two-element tuple. + + shuffle : bool, default=True + Whether to shuffle the samples. + + noise : float, default=None + Standard deviation of Gaussian noise added to the data. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset shuffling and noise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 2) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels (0 or 1) for class membership of each sample. 
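+
+    Notes
+    -----
+    Both half circles are sampled on ``[0, pi]``; the second moon is a
+    mirrored and vertically shifted copy of the first, roughly::
+
+        outer = (cos(t), sin(t))
+        inner = (1 - cos(t), 1 - sin(t) - 0.5)
+
+    which matches the construction in the function body below.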
+ + Examples + -------- + >>> from sklearn.datasets import make_moons + >>> X, y = make_moons(n_samples=200, noise=0.2, random_state=42) + >>> X.shape + (200, 2) + >>> y.shape + (200,) + """ + + if isinstance(n_samples, numbers.Integral): + n_samples_out = n_samples // 2 + n_samples_in = n_samples - n_samples_out + else: + try: + n_samples_out, n_samples_in = n_samples + except ValueError as e: + raise ValueError( + "`n_samples` can be either an int or a two-element tuple." + ) from e + + generator = check_random_state(random_state) + + outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out)) + outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out)) + inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in)) + inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5 + + X = np.vstack( + [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)] + ).T + y = np.hstack( + [np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)] + ) + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + if noise is not None: + X += generator.normal(scale=noise, size=X.shape) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left"), "array-like"], + "n_features": [Interval(Integral, 1, None, closed="left")], + "centers": [Interval(Integral, 1, None, closed="left"), "array-like", None], + "cluster_std": [Interval(Real, 0, None, closed="left"), "array-like"], + "center_box": [tuple], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "return_centers": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_blobs( + n_samples=100, + n_features=2, + *, + centers=None, + cluster_std=1.0, + center_box=(-10.0, 10.0), + shuffle=True, + random_state=None, + return_centers=False, +): + """Generate isotropic Gaussian blobs for clustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int or array-like, default=100 + If int, it is the total number of points equally divided among + clusters. + If array-like, each element of the sequence indicates + the number of samples per cluster. + + .. versionchanged:: v0.20 + one can now pass an array-like to the ``n_samples`` parameter + + n_features : int, default=2 + The number of features for each sample. + + centers : int or array-like of shape (n_centers, n_features), default=None + The number of centers to generate, or the fixed center locations. + If n_samples is an int and centers is None, 3 centers are generated. + If n_samples is array-like, centers must be + either None or an array of length equal to the length of n_samples. + + cluster_std : float or array-like of float, default=1.0 + The standard deviation of the clusters. + + center_box : tuple of float (min, max), default=(-10.0, 10.0) + The bounding box for each cluster center when centers are + generated at random. + + shuffle : bool, default=True + Shuffle the samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + return_centers : bool, default=False + If True, then return the centers of each cluster. + + .. versionadded:: 0.23 + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels for cluster membership of each sample. 
+ + centers : ndarray of shape (n_centers, n_features) + The centers of each cluster. Only returned if + ``return_centers=True``. + + See Also + -------- + make_classification : A more intricate variant. + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2, + ... random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0]) + >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2, + ... random_state=0) + >>> print(X.shape) + (10, 2) + >>> y + array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0]) + """ + generator = check_random_state(random_state) + + if isinstance(n_samples, numbers.Integral): + # Set n_centers by looking at centers arg + if centers is None: + centers = 3 + + if isinstance(centers, numbers.Integral): + n_centers = centers + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + + else: + centers = check_array(centers) + n_features = centers.shape[1] + n_centers = centers.shape[0] + + else: + # Set n_centers by looking at [n_samples] arg + n_centers = len(n_samples) + if centers is None: + centers = generator.uniform( + center_box[0], center_box[1], size=(n_centers, n_features) + ) + if not isinstance(centers, Iterable): + raise ValueError( + "Parameter `centers` must be array-like. Got {!r} instead".format( + centers + ) + ) + if len(centers) != n_centers: + raise ValueError( + "Length of `n_samples` not consistent with number of " + f"centers. Got n_samples = {n_samples} and centers = {centers}" + ) + centers = check_array(centers) + n_features = centers.shape[1] + + # stds: if cluster_std is given as list, it must be consistent + # with the n_centers + if hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers: + raise ValueError( + "Length of `clusters_std` not consistent with " + "number of centers. Got centers = {} " + "and cluster_std = {}".format(centers, cluster_std) + ) + + if isinstance(cluster_std, numbers.Real): + cluster_std = np.full(len(centers), cluster_std) + + if isinstance(n_samples, Iterable): + n_samples_per_center = n_samples + else: + n_samples_per_center = [int(n_samples // n_centers)] * n_centers + + for i in range(n_samples % n_centers): + n_samples_per_center[i] += 1 + + cum_sum_n_samples = np.cumsum(n_samples_per_center) + X = np.empty(shape=(sum(n_samples_per_center), n_features), dtype=np.float64) + y = np.empty(shape=(sum(n_samples_per_center),), dtype=int) + + for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)): + start_idx = cum_sum_n_samples[i - 1] if i > 0 else 0 + end_idx = cum_sum_n_samples[i] + X[start_idx:end_idx] = generator.normal( + loc=centers[i], scale=std, size=(n, n_features) + ) + y[start_idx:end_idx] = i + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + if return_centers: + return X, y, centers + else: + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 5, None, closed="left")], + "noise": [Interval(Real, 0.0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None): + """Generate the "Friedman #1" regression problem. + + This dataset is described in Friedman [1] and Breiman [2]. + + Inputs `X` are independent features uniformly distributed on the interval + [0, 1]. 
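A usage sketch of `make_blobs` exercising the array-like `n_samples`, explicit `centers`, per-cluster `cluster_std` and `return_centers` paths of the implementation above; all concrete values are arbitrary::

    import numpy as np
    from sklearn.datasets import make_blobs

    centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, 5.0]])
    X, y, returned_centers = make_blobs(
        n_samples=[50, 30, 20],          # one count per cluster
        centers=centers,                 # fixed center locations
        cluster_std=[0.5, 1.0, 2.0],     # one spread per cluster
        return_centers=True,
        random_state=0,
    )
    print(X.shape, np.bincount(y))               # (100, 2) [50 30 20]
    print(np.allclose(returned_centers, centers))  # True: centers returned as given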
The output `y` is created according to the formula:: + + y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \ ++ 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1). + + Out of the `n_features` features, only 5 are actually used to compute + `y`. The remaining features are independent of `y`. + + The number of features has to be >= 5. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=10 + The number of features. Should be at least 5. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset noise. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals + of Statistics 19 (1), pages 1-67, 1991. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning 24, + pages 123-140, 1996. + + Examples + -------- + >>> from sklearn.datasets import make_friedman1 + >>> X, y = make_friedman1(random_state=42) + >>> X.shape + (100, 10) + >>> y.shape + (100,) + >>> list(y[:3]) + [np.float64(16.8), np.float64(5.87), np.float64(9.46)] + """ + generator = check_random_state(random_state) + + X = generator.uniform(size=(n_samples, n_features)) + y = ( + 10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + + 20 * (X[:, 2] - 0.5) ** 2 + + 10 * X[:, 3] + + 5 * X[:, 4] + + noise * generator.standard_normal(size=(n_samples)) + ) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): + """Generate the "Friedman #2" regression problem. + + This dataset is described in Friedman [1] and Breiman [2]. + + Inputs `X` are 4 independent features uniformly distributed on the + intervals:: + + 0 <= X[:, 0] <= 100, + 40 * pi <= X[:, 1] <= 560 * pi, + 0 <= X[:, 2] <= 1, + 1 <= X[:, 3] <= 11. + + The output `y` is created according to the formula:: + + y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] \ + - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset noise. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 4) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals + of Statistics 19 (1), pages 1-67, 1991. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning 24, + pages 123-140, 1996. 
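Because `make_friedman1` draws `y` deterministically from the first five columns when `noise=0.0`, the docstring formula can be verified directly; the sizes in this sketch are arbitrary::

    import numpy as np
    from sklearn.datasets import make_friedman1

    X, y = make_friedman1(n_samples=500, n_features=10, noise=0.0, random_state=0)
    y_check = (
        10 * np.sin(np.pi * X[:, 0] * X[:, 1])
        + 20 * (X[:, 2] - 0.5) ** 2
        + 10 * X[:, 3]
        + 5 * X[:, 4]
    )
    print(np.allclose(y, y_check))  # True: columns 5..9 do not influence y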
+ + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> X, y = make_friedman2(random_state=42) + >>> X.shape + (100, 4) + >>> y.shape + (100,) + >>> list(y[:3]) + [np.float64(1229.4), np.float64(27.0), np.float64(65.6)] + """ + generator = check_random_state(random_state) + + X = generator.uniform(size=(n_samples, 4)) + X[:, 0] *= 100 + X[:, 1] *= 520 * np.pi + X[:, 1] += 40 * np.pi + X[:, 3] *= 10 + X[:, 3] += 1 + + y = ( + X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2 + ) ** 0.5 + noise * generator.standard_normal(size=(n_samples)) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): + """Generate the "Friedman #3" regression problem. + + This dataset is described in Friedman [1] and Breiman [2]. + + Inputs `X` are 4 independent features uniformly distributed on the + intervals:: + + 0 <= X[:, 0] <= 100, + 40 * pi <= X[:, 1] <= 560 * pi, + 0 <= X[:, 2] <= 1, + 1 <= X[:, 3] <= 11. + + The output `y` is created according to the formula:: + + y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) \ +/ X[:, 0]) + noise * N(0, 1). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + noise : float, default=0.0 + The standard deviation of the gaussian noise applied to the output. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset noise. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 4) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals + of Statistics 19 (1), pages 1-67, 1991. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning 24, + pages 123-140, 1996. + + Examples + -------- + >>> from sklearn.datasets import make_friedman3 + >>> X, y = make_friedman3(random_state=42) + >>> X.shape + (100, 4) + >>> y.shape + (100,) + >>> list(y[:3]) + [np.float64(1.54), np.float64(0.956), np.float64(0.414)] + """ + generator = check_random_state(random_state) + + X = generator.uniform(size=(n_samples, 4)) + X[:, 0] *= 100 + X[:, 1] *= 520 * np.pi + X[:, 1] += 40 * np.pi + X[:, 3] *= 10 + X[:, 3] += 1 + + y = np.arctan( + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0] + ) + noise * generator.standard_normal(size=(n_samples)) + + return X, y + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "effective_rank": [Interval(Integral, 1, None, closed="left")], + "tail_strength": [Interval(Real, 0, 1, closed="both")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_low_rank_matrix( + n_samples=100, + n_features=100, + *, + effective_rank=10, + tail_strength=0.5, + random_state=None, +): + """Generate a mostly low rank matrix with bell-shaped singular values. 
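The same kind of check works for `make_friedman2` and `make_friedman3` defined above: with `noise=0.0` the targets can be recomputed from the documented formulas (sample counts are arbitrary)::

    import numpy as np
    from sklearn.datasets import make_friedman2, make_friedman3

    X2, y2 = make_friedman2(n_samples=200, noise=0.0, random_state=0)
    X3, y3 = make_friedman3(n_samples=200, noise=0.0, random_state=0)

    y2_check = np.sqrt(
        X2[:, 0] ** 2 + (X2[:, 1] * X2[:, 2] - 1 / (X2[:, 1] * X2[:, 3])) ** 2
    )
    y3_check = np.arctan(
        (X3[:, 1] * X3[:, 2] - 1 / (X3[:, 1] * X3[:, 3])) / X3[:, 0]
    )
    print(np.allclose(y2, y2_check), np.allclose(y3, y3_check))  # True True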
+ + Most of the variance can be explained by a bell-shaped curve of width + effective_rank: the low rank part of the singular values profile is:: + + (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2) + + The remaining singular values' tail is fat, decreasing as:: + + tail_strength * exp(-0.1 * i / effective_rank). + + The low rank part of the profile can be considered the structured + signal part of the data while the tail can be considered the noisy + part of the data that cannot be summarized by a low number of linear + components (singular vectors). + + This kind of singular profiles is often seen in practice, for instance: + - gray level pictures of faces + - TF-IDF vectors of text documents crawled from the web + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=100 + The number of features. + + effective_rank : int, default=10 + The approximate number of singular vectors required to explain most of + the data by linear combinations. + + tail_strength : float, default=0.5 + The relative importance of the fat noisy tail of the singular values + profile. The value should be between 0 and 1. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The matrix. + + Examples + -------- + >>> from numpy.linalg import svd + >>> from sklearn.datasets import make_low_rank_matrix + >>> X = make_low_rank_matrix( + ... n_samples=50, + ... n_features=25, + ... effective_rank=5, + ... tail_strength=0.01, + ... random_state=0, + ... ) + >>> X.shape + (50, 25) + """ + generator = check_random_state(random_state) + n = min(n_samples, n_features) + + # Random (ortho normal) vectors + u, _ = linalg.qr( + generator.standard_normal(size=(n_samples, n)), + mode="economic", + check_finite=False, + ) + v, _ = linalg.qr( + generator.standard_normal(size=(n_features, n)), + mode="economic", + check_finite=False, + ) + + # Index of the singular values + singular_ind = np.arange(n, dtype=np.float64) + + # Build the singular profile by assembling signal and noise components + low_rank = (1 - tail_strength) * np.exp(-1.0 * (singular_ind / effective_rank) ** 2) + tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank) + s = np.identity(n) * (low_rank + tail) + + return np.dot(np.dot(u, s), v.T) + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_components": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_sparse_coded_signal( + n_samples, + *, + n_components, + n_features, + n_nonzero_coefs, + random_state=None, +): + """Generate a signal as a sparse combination of dictionary elements. + + Returns matrices `Y`, `D` and `X` such that `Y = XD` where `X` is of shape + `(n_samples, n_components)`, `D` is of shape `(n_components, n_features)`, and + each row of `X` has exactly `n_nonzero_coefs` non-zero elements. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int + Number of samples to generate. + + n_components : int + Number of components in the dictionary. 
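A quick way to see the bell-shaped singular profile produced by `make_low_rank_matrix` is to inspect the singular values of its output; the matrix size, `effective_rank` and `tail_strength` below are arbitrary::

    import numpy as np
    from sklearn.datasets import make_low_rank_matrix

    X = make_low_rank_matrix(
        n_samples=200, n_features=50, effective_rank=5,
        tail_strength=0.01, random_state=0,
    )
    s = np.linalg.svd(X, compute_uv=False)
    # With a weak tail, the first few singular values should carry
    # most of the spectral mass.
    print(s[:10] / s.sum())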
+ + n_features : int + Number of features of the dataset to generate. + + n_nonzero_coefs : int + Number of active (non-zero) coefficients in each sample. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + data : ndarray of shape (n_samples, n_features) + The encoded signal (Y). + + dictionary : ndarray of shape (n_components, n_features) + The dictionary with normalized components (D). + + code : ndarray of shape (n_samples, n_components) + The sparse code such that each column of this matrix has exactly + n_nonzero_coefs non-zero items (X). + + Examples + -------- + >>> from sklearn.datasets import make_sparse_coded_signal + >>> data, dictionary, code = make_sparse_coded_signal( + ... n_samples=50, + ... n_components=100, + ... n_features=10, + ... n_nonzero_coefs=4, + ... random_state=0 + ... ) + >>> data.shape + (50, 10) + >>> dictionary.shape + (100, 10) + >>> code.shape + (50, 100) + """ + generator = check_random_state(random_state) + + # generate dictionary + D = generator.standard_normal(size=(n_features, n_components)) + D /= np.sqrt(np.sum((D**2), axis=0)) + + # generate code + X = np.zeros((n_components, n_samples)) + for i in range(n_samples): + idx = np.arange(n_components) + generator.shuffle(idx) + idx = idx[:n_nonzero_coefs] + X[idx, i] = generator.standard_normal(size=n_nonzero_coefs) + + # encode signal + Y = np.dot(D, X) + + # Transpose to have shapes consistent with the rest of the API + Y, D, X = Y.T, D.T, X.T + + return map(np.squeeze, (Y, D, X)) + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None): + """Generate a random regression problem with sparse uncorrelated design. + + This dataset is described in Celeux et al [1]. as:: + + X ~ N(0, 1) + y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3] + + Only the first 4 features are informative. The remaining features are + useless. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of samples. + + n_features : int, default=10 + The number of features. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The input samples. + + y : ndarray of shape (n_samples,) + The output values. + + References + ---------- + .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert, + "Regularization in regression: comparing Bayesian and frequentist + methods in a poorly informative situation", 2009. 
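A usage sketch of `make_sparse_coded_signal` as returned above (`Y = XD` after the final transpose); the dimensions and sparsity level are arbitrary::

    import numpy as np
    from sklearn.datasets import make_sparse_coded_signal

    data, dictionary, code = make_sparse_coded_signal(
        n_samples=20, n_components=50, n_features=8,
        n_nonzero_coefs=3, random_state=0,
    )
    print(data.shape, dictionary.shape, code.shape)  # (20, 8) (50, 8) (20, 50)
    # Each sample is an exact combination of dictionary atoms,
    # with n_nonzero_coefs active atoms per row of `code`.
    print(np.allclose(data, code @ dictionary))      # True
    print(np.count_nonzero(code, axis=1))            # 3 for every row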
+ + Examples + -------- + >>> from sklearn.datasets import make_sparse_uncorrelated + >>> X, y = make_sparse_uncorrelated(random_state=0) + >>> X.shape + (100, 10) + >>> y.shape + (100,) + """ + generator = check_random_state(random_state) + + X = generator.normal(loc=0, scale=1, size=(n_samples, n_features)) + y = generator.normal( + loc=(X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]), + scale=np.ones(n_samples), + ) + + return X, y + + +@validate_params( + { + "n_dim": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_spd_matrix(n_dim, *, random_state=None): + """Generate a random symmetric, positive-definite matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_dim : int + The matrix dimension. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_dim, n_dim) + The random symmetric, positive-definite matrix. + + See Also + -------- + make_sparse_spd_matrix: Generate a sparse symmetric definite positive matrix. + + Examples + -------- + >>> from sklearn.datasets import make_spd_matrix + >>> make_spd_matrix(n_dim=2, random_state=42) + array([[2.093, 0.346], + [0.346, 0.218]]) + """ + generator = check_random_state(random_state) + + A = generator.uniform(size=(n_dim, n_dim)) + U, _, Vt = linalg.svd(np.dot(A.T, A), check_finite=False) + X = np.dot(np.dot(U, 1.0 + np.diag(generator.uniform(size=n_dim))), Vt) + + return X + + +@validate_params( + { + "n_dim": [Interval(Integral, 1, None, closed="left")], + "alpha": [Interval(Real, 0, 1, closed="both")], + "norm_diag": ["boolean"], + "smallest_coef": [Interval(Real, 0, 1, closed="both")], + "largest_coef": [Interval(Real, 0, 1, closed="both")], + "sparse_format": [ + StrOptions({"bsr", "coo", "csc", "csr", "dia", "dok", "lil"}), + None, + ], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_sparse_spd_matrix( + n_dim=1, + *, + alpha=0.95, + norm_diag=False, + smallest_coef=0.1, + largest_coef=0.9, + sparse_format=None, + random_state=None, +): + """Generate a sparse symmetric definite positive matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_dim : int, default=1 + The size of the random matrix to generate. + + .. versionchanged:: 1.4 + Renamed from ``dim`` to ``n_dim``. + + alpha : float, default=0.95 + The probability that a coefficient is zero (see notes). Larger values + enforce more sparsity. The value should be in the range 0 and 1. + + norm_diag : bool, default=False + Whether to normalize the output matrix to make the leading diagonal + elements all 1. + + smallest_coef : float, default=0.1 + The value of the smallest coefficient between 0 and 1. + + largest_coef : float, default=0.9 + The value of the largest coefficient between 0 and 1. + + sparse_format : str, default=None + String representing the output sparse format, such as 'csc', 'csr', etc. + If ``None``, return a dense numpy ndarray. + + .. versionadded:: 1.4 + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. 
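The symmetry and positive-definiteness promised by `make_spd_matrix` can be checked numerically; `n_dim=5` is an arbitrary choice::

    import numpy as np
    from sklearn.datasets import make_spd_matrix

    A = make_spd_matrix(n_dim=5, random_state=0)
    print(np.allclose(A, A.T))                 # True: symmetric
    print(np.all(np.linalg.eigvalsh(A) > 0))   # True: all eigenvalues positive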
+ + Returns + ------- + prec : ndarray or sparse matrix of shape (dim, dim) + The generated matrix. If ``sparse_format=None``, this would be an ndarray. + Otherwise, this will be a sparse matrix of the specified format. + + See Also + -------- + make_spd_matrix : Generate a random symmetric, positive-definite matrix. + + Notes + ----- + The sparsity is actually imposed on the cholesky factor of the matrix. + Thus alpha does not translate directly into the filling fraction of + the matrix itself. + + Examples + -------- + >>> from sklearn.datasets import make_sparse_spd_matrix + >>> make_sparse_spd_matrix(n_dim=4, norm_diag=False, random_state=42) + array([[1., 0., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 1., 0.], + [0., 0., 0., 1.]]) + """ + random_state = check_random_state(random_state) + + chol = -sp.eye(n_dim) + aux = sp.random( + m=n_dim, + n=n_dim, + density=1 - alpha, + data_rvs=lambda x: random_state.uniform( + low=smallest_coef, high=largest_coef, size=x + ), + random_state=random_state, + ) + # We need to avoid "coo" format because it does not support slicing + aux = sp.tril(aux, k=-1, format="csc") + + # Permute the lines: we don't want to have asymmetries in the final + # SPD matrix + permutation = random_state.permutation(n_dim) + aux = aux[permutation].T[permutation] + chol += aux + prec = chol.T @ chol + + if norm_diag: + # Form the diagonal vector into a row matrix + d = sp.diags(1.0 / np.sqrt(prec.diagonal())) + prec = d @ prec @ d + + if sparse_format is None: + return prec.toarray() + else: + return prec.asformat(sparse_format) + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + "hole": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None, hole=False): + """Generate a swiss roll dataset. + + Read more in the :ref:`User Guide `. + + Adapted with permission from Stephen Marsland's code [1]. + + Parameters + ---------- + n_samples : int, default=100 + The number of sample points on the Swiss Roll. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + hole : bool, default=False + If True generates the swiss roll with hole dataset. + + Returns + ------- + X : ndarray of shape (n_samples, 3) + The points. + + t : ndarray of shape (n_samples,) + The univariate position of the sample according to the main dimension + of the points in the manifold. + + Notes + ----- + The algorithm is from Marsland [1]. + + References + ---------- + .. [1] S. Marsland, "Machine Learning: An Algorithmic Perspective", 2nd edition, + Chapter 6, 2014. 
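A sketch of `make_sparse_spd_matrix` using the `sparse_format` output path added in 1.4; the dimension, `alpha` and chosen format are arbitrary::

    import numpy as np
    from sklearn.datasets import make_sparse_spd_matrix

    prec = make_sparse_spd_matrix(
        n_dim=10, alpha=0.9, norm_diag=True,
        sparse_format="csr", random_state=0,
    )
    dense = prec.toarray()
    print(prec.shape)                              # (10, 10), returned as a CSR matrix
    print(np.allclose(dense, dense.T))             # True: symmetric
    print(np.all(np.linalg.eigvalsh(dense) > 0))   # True: positive definite
    print(np.allclose(np.diag(dense), 1.0))        # True: norm_diag rescales the diagonal to 1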
+ https://homepages.ecs.vuw.ac.nz/~marslast/Code/Ch6/lle.py + + Examples + -------- + >>> from sklearn.datasets import make_swiss_roll + >>> X, t = make_swiss_roll(noise=0.05, random_state=0) + >>> X.shape + (100, 3) + >>> t.shape + (100,) + """ + generator = check_random_state(random_state) + + if not hole: + t = 1.5 * np.pi * (1 + 2 * generator.uniform(size=n_samples)) + y = 21 * generator.uniform(size=n_samples) + else: + corners = np.array( + [[np.pi * (1.5 + i), j * 7] for i in range(3) for j in range(3)] + ) + corners = np.delete(corners, 4, axis=0) + corner_index = generator.choice(8, n_samples) + parameters = generator.uniform(size=(2, n_samples)) * np.array([[np.pi], [7]]) + t, y = corners[corner_index].T + parameters + + x = t * np.cos(t) + z = t * np.sin(t) + + X = np.vstack((x, y, z)) + X += noise * generator.standard_normal(size=(3, n_samples)) + X = X.T + t = np.squeeze(t) + + return X, t + + +@validate_params( + { + "n_samples": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): + """Generate an S curve dataset. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_samples : int, default=100 + The number of sample points on the S curve. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, 3) + The points. + + t : ndarray of shape (n_samples,) + The univariate position of the sample according + to the main dimension of the points in the manifold. + + Examples + -------- + >>> from sklearn.datasets import make_s_curve + >>> X, t = make_s_curve(noise=0.05, random_state=0) + >>> X.shape + (100, 3) + >>> t.shape + (100,) + """ + generator = check_random_state(random_state) + + t = 3 * np.pi * (generator.uniform(size=(1, n_samples)) - 0.5) + X = np.empty(shape=(n_samples, 3), dtype=np.float64) + X[:, 0] = np.sin(t) + X[:, 1] = 2.0 * generator.uniform(size=n_samples) + X[:, 2] = np.sign(t) * (np.cos(t) - 1) + X += noise * generator.standard_normal(size=(3, n_samples)).T + t = np.squeeze(t) + + return X, t + + +@validate_params( + { + "mean": ["array-like", None], + "cov": [Interval(Real, 0, None, closed="left")], + "n_samples": [Interval(Integral, 1, None, closed="left")], + "n_features": [Interval(Integral, 1, None, closed="left")], + "n_classes": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_gaussian_quantiles( + *, + mean=None, + cov=1.0, + n_samples=100, + n_features=2, + n_classes=3, + shuffle=True, + random_state=None, +): + r"""Generate isotropic Gaussian and label samples by quantile. + + This classification dataset is constructed by taking a multi-dimensional + standard normal distribution and defining classes separated by nested + concentric multi-dimensional spheres such that roughly equal numbers of + samples are in each class (quantiles of the :math:`\chi^2` distribution). + + Read more in the :ref:`User Guide `. 
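Both manifold generators defined above return a 3-D point cloud plus the 1-D manifold coordinate; a short sketch with arbitrary sizes, including the `hole=True` variant of the swiss roll::

    from sklearn.datasets import make_s_curve, make_swiss_roll

    X_roll, t_roll = make_swiss_roll(n_samples=500, noise=0.05, hole=True, random_state=0)
    X_s, t_s = make_s_curve(n_samples=500, noise=0.05, random_state=0)
    print(X_roll.shape, t_roll.shape)  # (500, 3) (500,)
    print(X_s.shape, t_s.shape)        # (500, 3) (500,)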
+ + Parameters + ---------- + mean : array-like of shape (n_features,), default=None + The mean of the multi-dimensional normal distribution. + If None then use the origin (0, 0, ...). + + cov : float, default=1.0 + The covariance matrix will be this value times the unit matrix. This + dataset only produces symmetric normal distributions. + + n_samples : int, default=100 + The total number of points equally divided among classes. + + n_features : int, default=2 + The number of features for each sample. + + n_classes : int, default=3 + The number of classes. + + shuffle : bool, default=True + Shuffle the samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + The generated samples. + + y : ndarray of shape (n_samples,) + The integer labels for quantile membership of each sample. + + Notes + ----- + The dataset is from Zhu et al [1]. + + References + ---------- + .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009. + + Examples + -------- + >>> from sklearn.datasets import make_gaussian_quantiles + >>> X, y = make_gaussian_quantiles(random_state=42) + >>> X.shape + (100, 2) + >>> y.shape + (100,) + >>> list(y[:5]) + [np.int64(2), np.int64(0), np.int64(1), np.int64(0), np.int64(2)] + """ + if n_samples < n_classes: + raise ValueError("n_samples must be at least n_classes") + + generator = check_random_state(random_state) + + if mean is None: + mean = np.zeros(n_features) + else: + mean = np.array(mean) + + # Build multivariate normal distribution + X = generator.multivariate_normal(mean, cov * np.identity(n_features), (n_samples,)) + + # Sort by distance from origin + idx = np.argsort(np.sum((X - mean[np.newaxis, :]) ** 2, axis=1)) + X = X[idx, :] + + # Label by quantile + step = n_samples // n_classes + + y = np.hstack( + [ + np.repeat(np.arange(n_classes), step), + np.repeat(n_classes - 1, n_samples - step * n_classes), + ] + ) + + if shuffle: + X, y = util_shuffle(X, y, random_state=generator) + + return X, y + + +def _shuffle(data, random_state=None): + generator = check_random_state(random_state) + n_rows, n_cols = data.shape + row_idx = generator.permutation(n_rows) + col_idx = generator.permutation(n_cols) + result = data[row_idx][:, col_idx] + return result, row_idx, col_idx + + +@validate_params( + { + "shape": [tuple], + "n_clusters": [Interval(Integral, 1, None, closed="left")], + "noise": [Interval(Real, 0, None, closed="left")], + "minval": [Interval(Real, None, None, closed="neither")], + "maxval": [Interval(Real, None, None, closed="neither")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_biclusters( + shape, + n_clusters, + *, + noise=0.0, + minval=10, + maxval=100, + shuffle=True, + random_state=None, +): + """Generate a constant block diagonal structure array for biclustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + shape : tuple of shape (n_rows, n_cols) + The shape of the result. + + n_clusters : int + The number of biclusters. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + minval : float, default=10 + Minimum value of a bicluster. + + maxval : float, default=100 + Maximum value of a bicluster. + + shuffle : bool, default=True + Shuffle the samples. 
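A usage sketch of `make_gaussian_quantiles`: with `n_samples` divisible by `n_classes`, the quantile labelling above yields exactly equal class sizes (mean, covariance scale and sizes are arbitrary)::

    import numpy as np
    from sklearn.datasets import make_gaussian_quantiles

    X, y = make_gaussian_quantiles(
        mean=[1.0, -1.0], cov=2.0, n_samples=300,
        n_features=2, n_classes=3, random_state=0,
    )
    print(X.shape, np.bincount(y))  # (300, 2) [100 100 100]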
+ + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape `shape` + The generated array. + + rows : ndarray of shape (n_clusters, X.shape[0]) + The indicators for cluster membership of each row. + + cols : ndarray of shape (n_clusters, X.shape[1]) + The indicators for cluster membership of each column. + + See Also + -------- + make_checkerboard: Generate an array with block checkerboard structure for + biclustering. + + References + ---------- + + .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and + words using bipartite spectral graph partitioning. In Proceedings + of the seventh ACM SIGKDD international conference on Knowledge + discovery and data mining (pp. 269-274). ACM. + + Examples + -------- + >>> from sklearn.datasets import make_biclusters + >>> data, rows, cols = make_biclusters( + ... shape=(10, 20), n_clusters=2, random_state=42 + ... ) + >>> data.shape + (10, 20) + >>> rows.shape + (2, 10) + >>> cols.shape + (2, 20) + """ + generator = check_random_state(random_state) + n_rows, n_cols = shape + consts = generator.uniform(minval, maxval, n_clusters) + + # row and column clusters of approximately equal sizes + row_sizes = generator.multinomial(n_rows, np.repeat(1.0 / n_clusters, n_clusters)) + col_sizes = generator.multinomial(n_cols, np.repeat(1.0 / n_clusters, n_clusters)) + + row_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_clusters), row_sizes)] + ) + col_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_clusters), col_sizes)] + ) + + result = np.zeros(shape, dtype=np.float64) + for i in range(n_clusters): + selector = np.outer(row_labels == i, col_labels == i) + result[selector] += consts[i] + + if noise > 0: + result += generator.normal(scale=noise, size=result.shape) + + if shuffle: + result, row_idx, col_idx = _shuffle(result, random_state) + row_labels = row_labels[row_idx] + col_labels = col_labels[col_idx] + + rows = np.vstack([row_labels == c for c in range(n_clusters)]) + cols = np.vstack([col_labels == c for c in range(n_clusters)]) + + return result, rows, cols + + +@validate_params( + { + "shape": [tuple], + "n_clusters": [Interval(Integral, 1, None, closed="left"), "array-like"], + "noise": [Interval(Real, 0, None, closed="left")], + "minval": [Interval(Real, None, None, closed="neither")], + "maxval": [Interval(Real, None, None, closed="neither")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def make_checkerboard( + shape, + n_clusters, + *, + noise=0.0, + minval=10, + maxval=100, + shuffle=True, + random_state=None, +): + """Generate an array with block checkerboard structure for biclustering. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + shape : tuple of shape (n_rows, n_cols) + The shape of the result. + + n_clusters : int or array-like or shape (n_row_clusters, n_column_clusters) + The number of row and column clusters. + + noise : float, default=0.0 + The standard deviation of the gaussian noise. + + minval : float, default=10 + Minimum value of a bicluster. + + maxval : float, default=100 + Maximum value of a bicluster. + + shuffle : bool, default=True + Shuffle the samples. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for dataset creation. 
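A sketch of `make_biclusters` showing the shapes of the indicator arrays produced above; all numeric values are arbitrary::

    import numpy as np
    from sklearn.datasets import make_biclusters

    data, rows, cols = make_biclusters(
        shape=(30, 40), n_clusters=3, noise=1.0,
        minval=5, maxval=50, random_state=0,
    )
    print(data.shape, rows.shape, cols.shape)  # (30, 40) (3, 30) (3, 40)
    # Every row and every column belongs to exactly one bicluster.
    print(np.all(rows.sum(axis=0) == 1), np.all(cols.sum(axis=0) == 1))  # True True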
Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : ndarray of shape `shape` + The generated array. + + rows : ndarray of shape (n_clusters, X.shape[0]) + The indicators for cluster membership of each row. + + cols : ndarray of shape (n_clusters, X.shape[1]) + The indicators for cluster membership of each column. + + See Also + -------- + make_biclusters : Generate an array with constant block diagonal structure + for biclustering. + + References + ---------- + .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003). + Spectral biclustering of microarray data: coclustering genes + and conditions. Genome research, 13(4), 703-716. + + Examples + -------- + >>> from sklearn.datasets import make_checkerboard + >>> data, rows, columns = make_checkerboard(shape=(300, 300), n_clusters=10, + ... random_state=42) + >>> data.shape + (300, 300) + >>> rows.shape + (100, 300) + >>> columns.shape + (100, 300) + >>> print(rows[0][:5], columns[0][:5]) + [False False False True False] [False False False False False] + """ + generator = check_random_state(random_state) + + if hasattr(n_clusters, "__len__"): + n_row_clusters, n_col_clusters = n_clusters + else: + n_row_clusters = n_col_clusters = n_clusters + + # row and column clusters of approximately equal sizes + n_rows, n_cols = shape + row_sizes = generator.multinomial( + n_rows, np.repeat(1.0 / n_row_clusters, n_row_clusters) + ) + col_sizes = generator.multinomial( + n_cols, np.repeat(1.0 / n_col_clusters, n_col_clusters) + ) + + row_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_row_clusters), row_sizes)] + ) + col_labels = np.hstack( + [np.repeat(val, rep) for val, rep in zip(range(n_col_clusters), col_sizes)] + ) + + result = np.zeros(shape, dtype=np.float64) + for i in range(n_row_clusters): + for j in range(n_col_clusters): + selector = np.outer(row_labels == i, col_labels == j) + result[selector] += generator.uniform(minval, maxval) + + if noise > 0: + result += generator.normal(scale=noise, size=result.shape) + + if shuffle: + result, row_idx, col_idx = _shuffle(result, random_state) + row_labels = row_labels[row_idx] + col_labels = col_labels[col_idx] + + rows = np.vstack( + [ + row_labels == label + for label in range(n_row_clusters) + for _ in range(n_col_clusters) + ] + ) + cols = np.vstack( + [ + col_labels == label + for _ in range(n_row_clusters) + for label in range(n_col_clusters) + ] + ) + + return result, rows, cols diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_species_distributions.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_species_distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..e871949e41312b2600512551f0c3d2593ad8cf64 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_species_distributions.py @@ -0,0 +1,289 @@ +""" +============================= +Species distribution dataset +============================= + +This dataset represents the geographic distribution of species. +The dataset is provided by Phillips et. al. (2006). + +The two species are: + + - `"Bradypus variegatus" + `_ , + the Brown-throated Sloth. + + - `"Microryzomys minutus" + `_ , + also known as the Forest Small Rice Rat, a rodent that lives in Peru, + Colombia, Ecuador, Peru, and Venezuela. + +References +---------- + +`"Maximum entropy modeling of species geographic distributions" +`_ S. J. Phillips, +R. P. Anderson, R. E. 
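To round off the generators above, a sketch of `make_checkerboard` with different row and column cluster counts (all values arbitrary); note that one indicator row is emitted per (row-cluster, column-cluster) pair::

    from sklearn.datasets import make_checkerboard

    data, rows, cols = make_checkerboard(
        shape=(30, 30), n_clusters=(4, 3), noise=2.0, random_state=0
    )
    print(data.shape, rows.shape, cols.shape)  # (30, 30) (12, 30) (12, 30)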
Schapire - Ecological Modelling, 190:231-259, 2006. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import logging +from io import BytesIO +from numbers import Integral, Real +from os import PathLike, makedirs, remove +from os.path import exists + +import joblib +import numpy as np + +from ..utils import Bunch +from ..utils._param_validation import Interval, validate_params +from . import get_data_home +from ._base import RemoteFileMetadata, _fetch_remote, _pkl_filepath + +# The original data can be found at: +# https://biodiversityinformatics.amnh.org/open_source/maxent/samples.zip +SAMPLES = RemoteFileMetadata( + filename="samples.zip", + url="https://ndownloader.figshare.com/files/5976075", + checksum="abb07ad284ac50d9e6d20f1c4211e0fd3c098f7f85955e89d321ee8efe37ac28", +) + +# The original data can be found at: +# https://biodiversityinformatics.amnh.org/open_source/maxent/coverages.zip +COVERAGES = RemoteFileMetadata( + filename="coverages.zip", + url="https://ndownloader.figshare.com/files/5976078", + checksum="4d862674d72e79d6cee77e63b98651ec7926043ba7d39dcb31329cf3f6073807", +) + +DATA_ARCHIVE_NAME = "species_coverage.pkz" + + +logger = logging.getLogger(__name__) + + +def _load_coverage(F, header_length=6, dtype=np.int16): + """Load a coverage file from an open file object. + + This will return a numpy array of the given dtype + """ + header = [F.readline() for _ in range(header_length)] + make_tuple = lambda t: (t.split()[0], float(t.split()[1])) + header = dict([make_tuple(line) for line in header]) + + M = np.loadtxt(F, dtype=dtype) + nodata = int(header[b"NODATA_value"]) + if nodata != -9999: + M[nodata] = -9999 + return M + + +def _load_csv(F): + """Load csv file. + + Parameters + ---------- + F : file object + CSV file open in byte mode. + + Returns + ------- + rec : np.ndarray + record array representing the data + """ + names = F.readline().decode("ascii").strip().split(",") + + rec = np.loadtxt(F, skiprows=0, delimiter=",", dtype="S22,f4,f4") + rec.dtype.names = names + return rec + + +def construct_grids(batch): + """Construct the map grid from the batch object + + Parameters + ---------- + batch : Batch object + The object returned by :func:`fetch_species_distributions` + + Returns + ------- + (xgrid, ygrid) : 1-D arrays + The grid corresponding to the values in batch.coverages + """ + # x,y coordinates for corner cells + xmin = batch.x_left_lower_corner + batch.grid_size + xmax = xmin + (batch.Nx * batch.grid_size) + ymin = batch.y_left_lower_corner + batch.grid_size + ymax = ymin + (batch.Ny * batch.grid_size) + + # x coordinates of the grid cells + xgrid = np.arange(xmin, xmax, batch.grid_size) + # y coordinates of the grid cells + ygrid = np.arange(ymin, ymax, batch.grid_size) + + return (xgrid, ygrid) + + +@validate_params( + { + "data_home": [str, PathLike, None], + "download_if_missing": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_species_distributions( + *, + data_home=None, + download_if_missing=True, + n_retries=3, + delay=1.0, +): + """Loader for species distribution dataset from Phillips et. al. (2006). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify another download and cache folder for the datasets. By default + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. 
+ + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + coverages : array, shape = [14, 1592, 1212] + These represent the 14 features measured + at each point of the map grid. + The latitude/longitude values for the grid are discussed below. + Missing data is represented by the value -9999. + train : record array, shape = (1624,) + The training points for the data. Each point has three fields: + + - train['species'] is the species name + - train['dd long'] is the longitude, in degrees + - train['dd lat'] is the latitude, in degrees + test : record array, shape = (620,) + The test points for the data. Same format as the training data. + Nx, Ny : integers + The number of longitudes (x) and latitudes (y) in the grid + x_left_lower_corner, y_left_lower_corner : floats + The (x,y) position of the lower-left corner, in degrees + grid_size : float + The spacing between points of the grid, in degrees + + Notes + ----- + + This dataset represents the geographic distribution of species. + The dataset is provided by Phillips et. al. (2006). + + The two species are: + + - `"Bradypus variegatus" + `_ , + the Brown-throated Sloth. + + - `"Microryzomys minutus" + `_ , + also known as the Forest Small Rice Rat, a rodent that lives in Peru, + Colombia, Ecuador, Peru, and Venezuela. + + References + ---------- + + * `"Maximum entropy modeling of species geographic distributions" + `_ + S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling, + 190:231-259, 2006. + + Examples + -------- + >>> from sklearn.datasets import fetch_species_distributions + >>> species = fetch_species_distributions() + >>> species.train[:5] + array([(b'microryzomys_minutus', -64.7 , -17.85 ), + (b'microryzomys_minutus', -67.8333, -16.3333), + (b'microryzomys_minutus', -67.8833, -16.3 ), + (b'microryzomys_minutus', -67.8 , -16.2667), + (b'microryzomys_minutus', -67.9833, -15.9 )], + dtype=[('species', 'S22'), ('dd long', ' 0 else -1 + + # Special-case float32 but use float64 for everything else; + # the Python code will do further conversions. 
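A usage sketch for the species loader and the `construct_grids` helper defined in the `_species_distributions` module above. Note that the first call downloads the samples and coverages archives into the scikit-learn data home; the shapes shown come from the docstring::

    from sklearn.datasets import fetch_species_distributions
    from sklearn.datasets._species_distributions import construct_grids

    species = fetch_species_distributions()
    print(species.coverages.shape)   # (14, 1592, 1212): 14 features on the map grid
    print(species.train.shape)       # (1624,) training occurrence records

    xgrid, ygrid = construct_grids(species)
    print(xgrid.shape, ygrid.shape)  # 1-D longitude / latitude grids (Nx and Ny points)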
+ if dtype == np.float32: + data = array.array("f") + else: + dtype = np.float64 + data = array.array("d") + + indices = array.array("q") + indptr = array.array("q", [0]) + query = np.arange(0, dtype=np.int64) + + if multilabel: + labels = [] + else: + labels = array.array("d") + + if offset > 0: + f.seek(offset) + # drop the current line that might be truncated and is to be + # fetched by another call + f.readline() + + for line in f: + # skip comments + line_cstr = line + hash_ptr = strchr(line_cstr, 35) # ASCII value of '#' is 35 + if hash_ptr != NULL: + line = line[:hash_ptr - line_cstr] + + line_parts = line.split() + if len(line_parts) == 0: + continue + + target, features = line_parts[0], line_parts[1:] + if multilabel: + if COLON in target: + target, features = [], line_parts[0:] + else: + target = [float(y) for y in target.split(COMMA)] + target.sort() + labels.append(tuple(target)) + else: + array.resize_smart(labels, len(labels) + 1) + labels[len(labels) - 1] = float(target) + + prev_idx = -1 + n_features = len(features) + if n_features and features[0].startswith(qid_prefix): + _, value = features[0].split(COLON, 1) + if query_id: + query.resize(len(query) + 1) + query[len(query) - 1] = np.int64(value) + features.pop(0) + n_features -= 1 + + for i in range(0, n_features): + idx_s, value = features[i].split(COLON, 1) + idx = int(idx_s) + if idx < 0 or not zero_based and idx == 0: + raise ValueError( + "Invalid index %d in SVMlight/LibSVM data file." % idx) + if idx <= prev_idx: + raise ValueError("Feature indices in SVMlight/LibSVM data " + "file should be sorted and unique.") + + array.resize_smart(indices, len(indices) + 1) + indices[len(indices) - 1] = idx + + array.resize_smart(data, len(data) + 1) + data[len(data) - 1] = float(value) + + prev_idx = idx + + # increment index pointer array size + array.resize_smart(indptr, len(indptr) + 1) + indptr[len(indptr) - 1] = len(data) + + if offset_max != -1 and f.tell() > offset_max: + # Stop here and let another call deal with the following. + break + + return (dtype, data, indices, indptr, labels, query) + + +# Two fused types are defined to be able to +# use all possible combinations of parameters. 
+ctypedef fused int_or_float: + cython.integral + cython.floating + signed long long + +ctypedef fused double_or_longlong: + double + signed long long + +ctypedef fused int_or_longlong: + cython.integral + signed long long + + +def get_dense_row_string( + const int_or_float[:, :] X, + Py_ssize_t[:] x_inds, + double_or_longlong[:] x_vals, + Py_ssize_t row, + str value_pattern, + bint one_based, +): + cdef: + Py_ssize_t row_length = X.shape[1] + Py_ssize_t x_nz_used = 0 + Py_ssize_t k + int_or_float val + + for k in range(row_length): + val = X[row, k] + if val == 0: + continue + x_inds[x_nz_used] = k + x_vals[x_nz_used] = val + x_nz_used += 1 + + reprs = [ + value_pattern % (x_inds[i] + one_based, x_vals[i]) + for i in range(x_nz_used) + ] + + return " ".join(reprs) + + +def get_sparse_row_string( + int_or_float[:] X_data, + int[:] X_indptr, + int[:] X_indices, + Py_ssize_t row, + str value_pattern, + bint one_based, +): + cdef: + Py_ssize_t row_start = X_indptr[row] + Py_ssize_t row_end = X_indptr[row+1] + + reprs = [ + value_pattern % (X_indices[i] + one_based, X_data[i]) + for i in range(row_start, row_end) + ] + + return " ".join(reprs) + + +def _dump_svmlight_file( + X, + y, + f, + bint multilabel, + bint one_based, + int_or_longlong[:] query_id, + bint X_is_sp, + bint y_is_sp, +): + cdef bint X_is_integral + cdef bint query_id_is_not_empty = query_id.size > 0 + X_is_integral = X.dtype.kind == "i" + if X_is_integral: + value_pattern = "%d:%d" + else: + value_pattern = "%d:%.16g" + if y.dtype.kind == "i": + label_pattern = "%d" + else: + label_pattern = "%.16g" + + line_pattern = "%s" + if query_id_is_not_empty: + line_pattern += " qid:%d" + line_pattern += " %s\n" + + cdef: + Py_ssize_t num_labels = y.shape[1] + Py_ssize_t x_len = X.shape[0] + Py_ssize_t row_length = X.shape[1] + Py_ssize_t i + Py_ssize_t j + Py_ssize_t col_start + Py_ssize_t col_end + Py_ssize_t[:] x_inds = np.empty(row_length, dtype=np.intp) + signed long long[:] x_vals_int + double[:] x_vals_float + + if not X_is_sp: + if X_is_integral: + x_vals_int = np.zeros(row_length, dtype=np.longlong) + else: + x_vals_float = np.zeros(row_length, dtype=np.float64) + + for i in range(x_len): + if not X_is_sp: + if X_is_integral: + s = get_dense_row_string(X, x_inds, x_vals_int, i, value_pattern, one_based) + else: + s = get_dense_row_string(X, x_inds, x_vals_float, i, value_pattern, one_based) + else: + s = get_sparse_row_string(X.data, X.indptr, X.indices, i, value_pattern, one_based) + if multilabel: + if y_is_sp: + col_start = y.indptr[i] + col_end = y.indptr[i+1] + labels_str = ','.join(tuple(label_pattern % y.indices[j] for j in range(col_start, col_end) if y.data[j] != 0)) + else: + labels_str = ','.join(label_pattern % j for j in range(num_labels) if y[i, j] != 0) + else: + if y_is_sp: + labels_str = label_pattern % y.data[i] + else: + labels_str = label_pattern % y[i, 0] + + if query_id_is_not_empty: + feat = (labels_str, query_id[i], s) + else: + feat = (labels_str, s) + + f.write((line_pattern % feat).encode("utf-8")) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_svmlight_format_io.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_svmlight_format_io.py new file mode 100644 index 0000000000000000000000000000000000000000..e3a833efb86c02675a318fd09674e33ad5dfb526 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_svmlight_format_io.py @@ -0,0 +1,585 @@ +"""This module implements a loader and dumper for the svmlight format + +This format is a text-based format, with one 
sample per line. It does +not store zero valued features hence is suitable for sparse dataset. + +The first element of each line can be used to store a target variable to +predict. + +This format is used as the default format for both svmlight and the +libsvm command line programs. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os.path +from contextlib import closing +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from .. import __version__ +from ..utils import check_array +from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params +from ._svmlight_format_fast import ( + _dump_svmlight_file, + _load_svmlight_file, +) + + +@validate_params( + { + "f": [ + str, + Interval(Integral, 0, None, closed="left"), + os.PathLike, + HasMethods("read"), + ], + "n_features": [Interval(Integral, 1, None, closed="left"), None], + "dtype": "no_validation", # delegate validation to numpy + "multilabel": ["boolean"], + "zero_based": ["boolean", StrOptions({"auto"})], + "query_id": ["boolean"], + "offset": [Interval(Integral, 0, None, closed="left")], + "length": [Integral], + }, + prefer_skip_nested_validation=True, +) +def load_svmlight_file( + f, + *, + n_features=None, + dtype=np.float64, + multilabel=False, + zero_based="auto", + query_id=False, + offset=0, + length=-1, +): + """Load datasets in the svmlight / libsvm format into sparse CSR matrix. + + This format is a text-based format, with one sample per line. It does + not store zero valued features hence is suitable for sparse dataset. + + The first element of each line can be used to store a target variable + to predict. + + This format is used as the default format for both svmlight and the + libsvm command line programs. + + Parsing a text based source can be expensive. When repeatedly + working on the same dataset, it is recommended to wrap this + loader with joblib.Memory.cache to store a memmapped backup of the + CSR results of the first call and benefit from the near instantaneous + loading of memmapped structures for the subsequent calls. + + In case the file contains a pairwise preference constraint (known + as "qid" in the svmlight format) these are ignored unless the + query_id parameter is set to True. These pairwise preference + constraints can be used to constraint the combination of samples + when using pairwise loss functions (as is the case in some + learning to rank problems) so that only pairs with the same + query_id value are considered. + + This implementation is written in Cython and is reasonably fast. + However, a faster API-compatible loader is also available at: + https://github.com/mblondel/svmlight-loader + + Parameters + ---------- + f : str, path-like, file-like or int + (Path to) a file to load. If a path ends in ".gz" or ".bz2", it will + be uncompressed on the fly. If an integer is passed, it is assumed to + be a file descriptor. A file-like or file descriptor will not be closed + by this function. A file-like object must be opened in binary mode. + + .. versionchanged:: 1.2 + Path-like objects are now accepted. + + n_features : int, default=None + The number of features to use. If None, it will be inferred. This + argument is useful to load several files that are subsets of a + bigger sliced dataset: each subset might not have examples of + every feature, hence the inferred shape might vary from one + slice to another. 
+ n_features is only required if ``offset`` or ``length`` are passed a + non-default value. + + dtype : numpy data type, default=np.float64 + Data type of dataset to be loaded. This will be the data type of the + output numpy arrays ``X`` and ``y``. + + multilabel : bool, default=False + Samples may have several labels each (see + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html). + + zero_based : bool or "auto", default="auto" + Whether column indices in f are zero-based (True) or one-based + (False). If column indices are one-based, they are transformed to + zero-based to match Python/NumPy conventions. + If set to "auto", a heuristic check is applied to determine this from + the file contents. Both kinds of files occur "in the wild", but they + are unfortunately not self-identifying. Using "auto" or True should + always be safe when no ``offset`` or ``length`` is passed. + If ``offset`` or ``length`` are passed, the "auto" mode falls back + to ``zero_based=True`` to avoid having the heuristic check yield + inconsistent results on different segments of the file. + + query_id : bool, default=False + If True, will return the query_id array for each file. + + offset : int, default=0 + Ignore the offset first bytes by seeking forward, then + discarding the following bytes up until the next new line + character. + + length : int, default=-1 + If strictly positive, stop reading any new line of data once the + position in the file has reached the (offset + length) bytes threshold. + + Returns + ------- + X : scipy.sparse matrix of shape (n_samples, n_features) + The data matrix. + + y : ndarray of shape (n_samples,), or a list of tuples of length n_samples + The target. It is a list of tuples when ``multilabel=True``, else a + ndarray. + + query_id : array of shape (n_samples,) + The query_id for each sample. Only returned when query_id is set to + True. + + See Also + -------- + load_svmlight_files : Similar function for loading multiple files in this + format, enforcing the same number of features/columns on all of them. 
+ + Examples + -------- + To use joblib.Memory to cache the svmlight file:: + + from joblib import Memory + from sklearn.datasets import load_svmlight_file + mem = Memory("./mycache") + + @mem.cache + def get_data(): + data = load_svmlight_file("mysvmlightfile") + return data[0], data[1] + + X, y = get_data() + """ + return tuple( + load_svmlight_files( + [f], + n_features=n_features, + dtype=dtype, + multilabel=multilabel, + zero_based=zero_based, + query_id=query_id, + offset=offset, + length=length, + ) + ) + + +def _gen_open(f): + if isinstance(f, int): # file descriptor + return open(f, "rb", closefd=False) + elif isinstance(f, os.PathLike): + f = os.fspath(f) + elif not isinstance(f, str): + raise TypeError("expected {str, int, path-like, file-like}, got %s" % type(f)) + + _, ext = os.path.splitext(f) + if ext == ".gz": + import gzip + + return gzip.open(f, "rb") + elif ext == ".bz2": + from bz2 import BZ2File + + return BZ2File(f, "rb") + else: + return open(f, "rb") + + +def _open_and_load(f, dtype, multilabel, zero_based, query_id, offset=0, length=-1): + if hasattr(f, "read"): + actual_dtype, data, ind, indptr, labels, query = _load_svmlight_file( + f, dtype, multilabel, zero_based, query_id, offset, length + ) + else: + with closing(_gen_open(f)) as f: + actual_dtype, data, ind, indptr, labels, query = _load_svmlight_file( + f, dtype, multilabel, zero_based, query_id, offset, length + ) + + # convert from array.array, give data the right dtype + if not multilabel: + labels = np.frombuffer(labels, np.float64) + data = np.frombuffer(data, actual_dtype) + indices = np.frombuffer(ind, np.longlong) + indptr = np.frombuffer(indptr, dtype=np.longlong) # never empty + query = np.frombuffer(query, np.int64) + + data = np.asarray(data, dtype=dtype) # no-op for float{32,64} + return data, indices, indptr, labels, query + + +@validate_params( + { + "files": [ + "array-like", + str, + os.PathLike, + HasMethods("read"), + Interval(Integral, 0, None, closed="left"), + ], + "n_features": [Interval(Integral, 1, None, closed="left"), None], + "dtype": "no_validation", # delegate validation to numpy + "multilabel": ["boolean"], + "zero_based": ["boolean", StrOptions({"auto"})], + "query_id": ["boolean"], + "offset": [Interval(Integral, 0, None, closed="left")], + "length": [Integral], + }, + prefer_skip_nested_validation=True, +) +def load_svmlight_files( + files, + *, + n_features=None, + dtype=np.float64, + multilabel=False, + zero_based="auto", + query_id=False, + offset=0, + length=-1, +): + """Load dataset from multiple files in SVMlight format. + + This function is equivalent to mapping load_svmlight_file over a list of + files, except that the results are concatenated into a single, flat list + and the samples vectors are constrained to all have the same number of + features. + + In case the file contains a pairwise preference constraint (known + as "qid" in the svmlight format) these are ignored unless the + query_id parameter is set to True. These pairwise preference + constraints can be used to constraint the combination of samples + when using pairwise loss functions (as is the case in some + learning to rank problems) so that only pairs with the same + query_id value are considered. + + Parameters + ---------- + files : array-like, dtype=str, path-like, file-like or int + (Paths of) files to load. If a path ends in ".gz" or ".bz2", it will + be uncompressed on the fly. If an integer is passed, it is assumed to + be a file descriptor. 
File-likes and file descriptors will not be + closed by this function. File-like objects must be opened in binary + mode. + + .. versionchanged:: 1.2 + Path-like objects are now accepted. + + n_features : int, default=None + The number of features to use. If None, it will be inferred from the + maximum column index occurring in any of the files. + + This can be set to a higher value than the actual number of features + in any of the input files, but setting it to a lower value will cause + an exception to be raised. + + dtype : numpy data type, default=np.float64 + Data type of dataset to be loaded. This will be the data type of the + output numpy arrays ``X`` and ``y``. + + multilabel : bool, default=False + Samples may have several labels each (see + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html). + + zero_based : bool or "auto", default="auto" + Whether column indices in f are zero-based (True) or one-based + (False). If column indices are one-based, they are transformed to + zero-based to match Python/NumPy conventions. + If set to "auto", a heuristic check is applied to determine this from + the file contents. Both kinds of files occur "in the wild", but they + are unfortunately not self-identifying. Using "auto" or True should + always be safe when no offset or length is passed. + If offset or length are passed, the "auto" mode falls back + to zero_based=True to avoid having the heuristic check yield + inconsistent results on different segments of the file. + + query_id : bool, default=False + If True, will return the query_id array for each file. + + offset : int, default=0 + Ignore the offset first bytes by seeking forward, then + discarding the following bytes up until the next new line + character. + + length : int, default=-1 + If strictly positive, stop reading any new line of data once the + position in the file has reached the (offset + length) bytes threshold. + + Returns + ------- + [X1, y1, ..., Xn, yn] or [X1, y1, q1, ..., Xn, yn, qn]: list of arrays + Each (Xi, yi) pair is the result from load_svmlight_file(files[i]). + If query_id is set to True, this will return instead (Xi, yi, qi) + triplets. + + See Also + -------- + load_svmlight_file: Similar function for loading a single file in this + format. + + Notes + ----- + When fitting a model to a matrix X_train and evaluating it against a + matrix X_test, it is essential that X_train and X_test have the same + number of features (X_train.shape[1] == X_test.shape[1]). This may not + be the case if you load the files individually with load_svmlight_file. 
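+
+    A minimal sketch of loading a train / test pair together so that both
+    matrices share the same column space, assuming hypothetical files
+    ``train.svm`` and ``test.svm``::
+
+        from sklearn.datasets import load_svmlight_files
+
+        X_train, y_train, X_test, y_test = load_svmlight_files(
+            ["train.svm", "test.svm"]
+        )
+        assert X_train.shape[1] == X_test.shape[1]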
+ + Examples + -------- + To use joblib.Memory to cache the svmlight file:: + + from joblib import Memory + from sklearn.datasets import load_svmlight_file + mem = Memory("./mycache") + + @mem.cache + def get_data(): + data_train, target_train, data_test, target_test = load_svmlight_files( + ["svmlight_file_train", "svmlight_file_test"] + ) + return data_train, target_train, data_test, target_test + + X_train, y_train, X_test, y_test = get_data() + """ + if (offset != 0 or length > 0) and zero_based == "auto": + # disable heuristic search to avoid getting inconsistent results on + # different segments of the file + zero_based = True + + if (offset != 0 or length > 0) and n_features is None: + raise ValueError("n_features is required when offset or length is specified.") + + r = [ + _open_and_load( + f, + dtype, + multilabel, + bool(zero_based), + bool(query_id), + offset=offset, + length=length, + ) + for f in files + ] + + if zero_based is False or ( + zero_based == "auto" and all(len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r) + ): + for _, indices, _, _, _ in r: + indices -= 1 + + n_f = max(ind[1].max() if len(ind[1]) else 0 for ind in r) + 1 + + if n_features is None: + n_features = n_f + elif n_features < n_f: + raise ValueError( + "n_features was set to {}, but input file contains {} features".format( + n_features, n_f + ) + ) + + result = [] + for data, indices, indptr, y, query_values in r: + shape = (indptr.shape[0] - 1, n_features) + X = sp.csr_matrix((data, indices, indptr), shape) + X.sort_indices() + result += X, y + if query_id: + result.append(query_values) + + return result + + +def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id): + if comment: + f.write( + ( + "# Generated by dump_svmlight_file from scikit-learn %s\n" % __version__ + ).encode() + ) + f.write( + ("# Column indices are %s-based\n" % ["zero", "one"][one_based]).encode() + ) + + f.write(b"#\n") + f.writelines(b"# %s\n" % line for line in comment.splitlines()) + X_is_sp = sp.issparse(X) + y_is_sp = sp.issparse(y) + if not multilabel and not y_is_sp: + y = y[:, np.newaxis] + _dump_svmlight_file( + X, + y, + f, + multilabel, + one_based, + query_id, + X_is_sp, + y_is_sp, + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix"], + "f": [str, HasMethods(["write"])], + "zero_based": ["boolean"], + "comment": [str, bytes, None], + "query_id": ["array-like", None], + "multilabel": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def dump_svmlight_file( + X, + y, + f, + *, + zero_based=True, + comment=None, + query_id=None, + multilabel=False, +): + """Dump the dataset in svmlight / libsvm file format. + + This format is a text-based format, with one sample per line. It does + not store zero valued features hence is suitable for sparse dataset. + + The first element of each line can be used to store a target variable + to predict. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : {array-like, sparse matrix}, shape = (n_samples,) or (n_samples, n_labels) + Target values. Class labels must be an + integer or float, or array-like objects of integer or float for + multilabel classifications. + + f : str or file-like in binary mode + If string, specifies the path that will contain the data. + If file-like, data will be written to f. f should be opened in binary + mode. 
+ + zero_based : bool, default=True + Whether column indices should be written zero-based (True) or one-based + (False). + + comment : str or bytes, default=None + Comment to insert at the top of the file. This should be either a + Unicode string, which will be encoded as UTF-8, or an ASCII byte + string. + If a comment is given, then it will be preceded by one that identifies + the file as having been dumped by scikit-learn. Note that not all + tools grok comments in SVMlight files. + + query_id : array-like of shape (n_samples,), default=None + Array containing pairwise preference constraints (qid in svmlight + format). + + multilabel : bool, default=False + Samples may have several labels each (see + https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html). + + .. versionadded:: 0.17 + parameter `multilabel` to support multilabel datasets. + + Examples + -------- + >>> from sklearn.datasets import dump_svmlight_file, make_classification + >>> X, y = make_classification(random_state=0) + >>> output_file = "my_dataset.svmlight" + >>> dump_svmlight_file(X, y, output_file) # doctest: +SKIP + """ + if comment is not None: + # Convert comment string to list of lines in UTF-8. + # If a byte string is passed, then check whether it's ASCII; + # if a user wants to get fancy, they'll have to decode themselves. + if isinstance(comment, bytes): + comment.decode("ascii") # just for the exception + else: + comment = comment.encode("utf-8") + if b"\0" in comment: + raise ValueError("comment string contains NUL byte") + + yval = check_array(y, accept_sparse="csr", ensure_2d=False) + if sp.issparse(yval): + if yval.shape[1] != 1 and not multilabel: + raise ValueError( + "expected y of shape (n_samples, 1), got %r" % (yval.shape,) + ) + else: + if yval.ndim != 1 and not multilabel: + raise ValueError("expected y of shape (n_samples,), got %r" % (yval.shape,)) + + Xval = check_array(X, accept_sparse="csr") + if Xval.shape[0] != yval.shape[0]: + raise ValueError( + "X.shape[0] and y.shape[0] should be the same, got %r and %r instead." + % (Xval.shape[0], yval.shape[0]) + ) + + # We had some issues with CSR matrices with unsorted indices (e.g. #1501), + # so sort them here, but first make sure we don't modify the user's X. + # TODO We can do this cheaper; sorted_indices copies the whole matrix. + if yval is y and hasattr(yval, "sorted_indices"): + y = yval.sorted_indices() + else: + y = yval + if hasattr(y, "sort_indices"): + y.sort_indices() + + if Xval is X and hasattr(Xval, "sorted_indices"): + X = Xval.sorted_indices() + else: + X = Xval + if hasattr(X, "sort_indices"): + X.sort_indices() + + if query_id is None: + # NOTE: query_id is passed to Cython functions using a fused type on query_id. + # Yet as of Cython>=3.0, memory views can't be None otherwise the runtime + # would not known which concrete implementation to dispatch the Python call to. + # TODO: simplify interfaces and implementations in _svmlight_format_fast.pyx. 
+ query_id = np.array([], dtype=np.int32) + else: + query_id = np.asarray(query_id) + if query_id.shape[0] != y.shape[0]: + raise ValueError( + "expected query_id of shape (n_samples,), got %r" % (query_id.shape,) + ) + + one_based = not zero_based + + if hasattr(f, "write"): + _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id) + else: + with open(f, "wb") as f: + _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/_twenty_newsgroups.py b/.venv/lib/python3.12/site-packages/sklearn/datasets/_twenty_newsgroups.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc5fb6244f1b9411d9fa3147b4402bf2a68e559 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_twenty_newsgroups.py @@ -0,0 +1,625 @@ +"""Caching loader for the 20 newsgroups text classification dataset. + + +The description of the dataset is available on the official website at: + + http://people.csail.mit.edu/jrennie/20Newsgroups/ + +Quoting the introduction: + + The 20 Newsgroups data set is a collection of approximately 20,000 + newsgroup documents, partitioned (nearly) evenly across 20 different + newsgroups. To the best of my knowledge, it was originally collected + by Ken Lang, probably for his Newsweeder: Learning to filter netnews + paper, though he does not explicitly mention this collection. The 20 + newsgroups collection has become a popular data set for experiments + in text applications of machine learning techniques, such as text + classification and text clustering. + +This dataset loader will download the recommended "by date" variant of the +dataset and which features a point in time split between the train and +test sets. The compressed dataset size is around 14 Mb compressed. Once +uncompressed the train set is 52 MB and the test set is 34 MB. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import codecs +import logging +import os +import pickle +import re +import shutil +import tarfile +from contextlib import suppress +from numbers import Integral, Real + +import joblib +import numpy as np +import scipy.sparse as sp + +from .. import preprocessing +from ..feature_extraction.text import CountVectorizer +from ..utils import Bunch, check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.fixes import tarfile_extractall +from . 
import get_data_home, load_files +from ._base import ( + RemoteFileMetadata, + _convert_data_dataframe, + _fetch_remote, + _pkl_filepath, + load_descr, +) + +logger = logging.getLogger(__name__) + +# The original data can be found at: +# https://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz +ARCHIVE = RemoteFileMetadata( + filename="20news-bydate.tar.gz", + url="https://ndownloader.figshare.com/files/5975967", + checksum="8f1b2514ca22a5ade8fbb9cfa5727df95fa587f4c87b786e15c759fa66d95610", +) + +CACHE_NAME = "20news-bydate.pkz" +TRAIN_FOLDER = "20news-bydate-train" +TEST_FOLDER = "20news-bydate-test" + + +def _download_20newsgroups(target_dir, cache_path, n_retries, delay): + """Download the 20 newsgroups data and stored it as a zipped pickle.""" + train_path = os.path.join(target_dir, TRAIN_FOLDER) + test_path = os.path.join(target_dir, TEST_FOLDER) + + os.makedirs(target_dir, exist_ok=True) + + logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url) + archive_path = _fetch_remote( + ARCHIVE, dirname=target_dir, n_retries=n_retries, delay=delay + ) + + logger.debug("Decompressing %s", archive_path) + with tarfile.open(archive_path, "r:gz") as fp: + tarfile_extractall(fp, path=target_dir) + + with suppress(FileNotFoundError): + os.remove(archive_path) + + # Store a zipped pickle + cache = dict( + train=load_files(train_path, encoding="latin1"), + test=load_files(test_path, encoding="latin1"), + ) + compressed_content = codecs.encode(pickle.dumps(cache), "zlib_codec") + with open(cache_path, "wb") as f: + f.write(compressed_content) + + shutil.rmtree(target_dir) + return cache + + +def strip_newsgroup_header(text): + """ + Given text in "news" format, strip the headers, by removing everything + before the first blank line. + + Parameters + ---------- + text : str + The text from which to remove the signature block. + """ + _before, _blankline, after = text.partition("\n\n") + return after + + +_QUOTE_RE = re.compile( + r"(writes in|writes:|wrote:|says:|said:|^In article|^Quoted from|^\||^>)" +) + + +def strip_newsgroup_quoting(text): + """ + Given text in "news" format, strip lines beginning with the quote + characters > or |, plus lines that often introduce a quoted section + (for example, because they contain the string 'writes:'.) + + Parameters + ---------- + text : str + The text from which to remove the signature block. + """ + good_lines = [line for line in text.split("\n") if not _QUOTE_RE.search(line)] + return "\n".join(good_lines) + + +def strip_newsgroup_footer(text): + """ + Given text in "news" format, attempt to remove a signature block. + + As a rough heuristic, we assume that signatures are set apart by either + a blank line or a line made of hyphens, and that it is the last such line + in the file (disregarding blank lines at the end). + + Parameters + ---------- + text : str + The text from which to remove the signature block. 
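+
+    Notes
+    -----
+    A minimal sketch of the heuristic, assuming a post that ends with a
+    ``--`` signature separator::
+
+        post = "\\n".join(["thanks for the help!", "--", "Jane Doe"])
+        strip_newsgroup_footer(post)  # returns "thanks for the help!"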
+ """ + lines = text.strip().split("\n") + for line_num in range(len(lines) - 1, -1, -1): + line = lines[line_num] + if line.strip().strip("-") == "": + break + + if line_num > 0: + return "\n".join(lines[:line_num]) + else: + return text + + +@validate_params( + { + "data_home": [str, os.PathLike, None], + "subset": [StrOptions({"train", "test", "all"})], + "categories": ["array-like", None], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "remove": [tuple], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_20newsgroups( + *, + data_home=None, + subset="train", + categories=None, + shuffle=True, + random_state=42, + remove=(), + download_if_missing=True, + return_X_y=False, + n_retries=3, + delay=1.0, +): + """Load the filenames and data from the 20 newsgroups dataset \ +(classification). + + Download it if necessary. + + ================= ========== + Classes 20 + Samples total 18846 + Dimensionality 1 + Features text + ================= ========== + + Read more in the :ref:`User Guide <20newsgroups_dataset>`. + + Parameters + ---------- + data_home : str or path-like, default=None + Specify a download and cache folder for the datasets. If None, + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + subset : {'train', 'test', 'all'}, default='train' + Select the dataset to load: 'train' for the training set, 'test' + for the test set, 'all' for both, with shuffled ordering. + + categories : array-like, dtype=str, default=None + If None (default), load all the categories. + If not None, list of category names to load (other categories + ignored). + + shuffle : bool, default=True + Whether or not to shuffle the data: might be important for models that + make the assumption that the samples are independent and identically + distributed (i.i.d.), such as stochastic gradient descent. + + random_state : int, RandomState instance or None, default=42 + Determines random number generation for dataset shuffling. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. + + remove : tuple, default=() + May contain any subset of ('headers', 'footers', 'quotes'). Each of + these are kinds of text that will be detected and removed from the + newsgroup posts, preventing classifiers from overfitting on + metadata. + + 'headers' removes newsgroup headers, 'footers' removes blocks at the + ends of posts that look like signatures, and 'quotes' removes lines + that appear to be quoting another post. + + 'headers' follows an exact standard; the other filters are not always + correct. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns `(data.data, data.target)` instead of a Bunch + object. + + .. versionadded:: 0.22 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + bunch : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + data : list of shape (n_samples,) + The data list to learn. + target: ndarray of shape (n_samples,) + The target labels. 
+ filenames: list of shape (n_samples,) + The path to the location of the data. + DESCR: str + The full description of the dataset. + target_names: list of shape (n_classes,) + The names of target classes. + + (data, target) : tuple if `return_X_y=True` + A tuple of two ndarrays. The first contains a 2D array of shape + (n_samples, n_classes) with each row representing one sample and each + column representing the features. The second array of shape + (n_samples,) contains the target samples. + + .. versionadded:: 0.22 + + Examples + -------- + >>> from sklearn.datasets import fetch_20newsgroups + >>> cats = ['alt.atheism', 'sci.space'] + >>> newsgroups_train = fetch_20newsgroups(subset='train', categories=cats) + >>> list(newsgroups_train.target_names) + ['alt.atheism', 'sci.space'] + >>> newsgroups_train.filenames.shape + (1073,) + >>> newsgroups_train.target.shape + (1073,) + >>> newsgroups_train.target[:10] + array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0]) + """ + + data_home = get_data_home(data_home=data_home) + cache_path = _pkl_filepath(data_home, CACHE_NAME) + twenty_home = os.path.join(data_home, "20news_home") + cache = None + if os.path.exists(cache_path): + try: + with open(cache_path, "rb") as f: + compressed_content = f.read() + uncompressed_content = codecs.decode(compressed_content, "zlib_codec") + cache = pickle.loads(uncompressed_content) + except Exception as e: + print(80 * "_") + print("Cache loading failed") + print(80 * "_") + print(e) + + if cache is None: + if download_if_missing: + logger.info("Downloading 20news dataset. This may take a few minutes.") + cache = _download_20newsgroups( + target_dir=twenty_home, + cache_path=cache_path, + n_retries=n_retries, + delay=delay, + ) + else: + raise OSError("20Newsgroups dataset not found") + + if subset in ("train", "test"): + data = cache[subset] + elif subset == "all": + data_lst = list() + target = list() + filenames = list() + for subset in ("train", "test"): + data = cache[subset] + data_lst.extend(data.data) + target.extend(data.target) + filenames.extend(data.filenames) + + data.data = data_lst + data.target = np.array(target) + data.filenames = np.array(filenames) + + fdescr = load_descr("twenty_newsgroups.rst") + + data.DESCR = fdescr + + if "headers" in remove: + data.data = [strip_newsgroup_header(text) for text in data.data] + if "footers" in remove: + data.data = [strip_newsgroup_footer(text) for text in data.data] + if "quotes" in remove: + data.data = [strip_newsgroup_quoting(text) for text in data.data] + + if categories is not None: + labels = [(data.target_names.index(cat), cat) for cat in categories] + # Sort the categories to have the ordering of the labels + labels.sort() + labels, categories = zip(*labels) + mask = np.isin(data.target, labels) + data.filenames = data.filenames[mask] + data.target = data.target[mask] + # searchsorted to have continuous labels + data.target = np.searchsorted(labels, data.target) + data.target_names = list(categories) + # Use an object array to shuffle: avoids memory copy + data_lst = np.array(data.data, dtype=object) + data_lst = data_lst[mask] + data.data = data_lst.tolist() + + if shuffle: + random_state = check_random_state(random_state) + indices = np.arange(data.target.shape[0]) + random_state.shuffle(indices) + data.filenames = data.filenames[indices] + data.target = data.target[indices] + # Use an object array to shuffle: avoids memory copy + data_lst = np.array(data.data, dtype=object) + data_lst = data_lst[indices] + data.data = data_lst.tolist() + + if return_X_y: + 
return data.data, data.target + + return data + + +@validate_params( + { + "subset": [StrOptions({"train", "test", "all"})], + "remove": [tuple], + "data_home": [str, os.PathLike, None], + "download_if_missing": ["boolean"], + "return_X_y": ["boolean"], + "normalize": ["boolean"], + "as_frame": ["boolean"], + "n_retries": [Interval(Integral, 1, None, closed="left")], + "delay": [Interval(Real, 0.0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def fetch_20newsgroups_vectorized( + *, + subset="train", + remove=(), + data_home=None, + download_if_missing=True, + return_X_y=False, + normalize=True, + as_frame=False, + n_retries=3, + delay=1.0, +): + """Load and vectorize the 20 newsgroups dataset (classification). + + Download it if necessary. + + This is a convenience function; the transformation is done using the + default settings for + :class:`~sklearn.feature_extraction.text.CountVectorizer`. For more + advanced usage (stopword filtering, n-gram extraction, etc.), combine + fetch_20newsgroups with a custom + :class:`~sklearn.feature_extraction.text.CountVectorizer`, + :class:`~sklearn.feature_extraction.text.HashingVectorizer`, + :class:`~sklearn.feature_extraction.text.TfidfTransformer` or + :class:`~sklearn.feature_extraction.text.TfidfVectorizer`. + + The resulting counts are normalized using + :func:`sklearn.preprocessing.normalize` unless normalize is set to False. + + ================= ========== + Classes 20 + Samples total 18846 + Dimensionality 130107 + Features real + ================= ========== + + Read more in the :ref:`User Guide <20newsgroups_dataset>`. + + Parameters + ---------- + subset : {'train', 'test', 'all'}, default='train' + Select the dataset to load: 'train' for the training set, 'test' + for the test set, 'all' for both, with shuffled ordering. + + remove : tuple, default=() + May contain any subset of ('headers', 'footers', 'quotes'). Each of + these are kinds of text that will be detected and removed from the + newsgroup posts, preventing classifiers from overfitting on + metadata. + + 'headers' removes newsgroup headers, 'footers' removes blocks at the + ends of posts that look like signatures, and 'quotes' removes lines + that appear to be quoting another post. + + data_home : str or path-like, default=None + Specify an download and cache folder for the datasets. If None, + all scikit-learn data is stored in '~/scikit_learn_data' subfolders. + + download_if_missing : bool, default=True + If False, raise an OSError if the data is not locally available + instead of trying to download the data from the source site. + + return_X_y : bool, default=False + If True, returns ``(data.data, data.target)`` instead of a Bunch + object. + + .. versionadded:: 0.20 + + normalize : bool, default=True + If True, normalizes each document's feature vector to unit norm using + :func:`sklearn.preprocessing.normalize`. + + .. versionadded:: 0.22 + + as_frame : bool, default=False + If True, the data is a pandas DataFrame including columns with + appropriate dtypes (numeric, string, or categorical). The target is + a pandas DataFrame or Series depending on the number of + `target_columns`. + + .. versionadded:: 0.24 + + n_retries : int, default=3 + Number of retries when HTTP errors are encountered. + + .. versionadded:: 1.5 + + delay : float, default=1.0 + Number of seconds between retries. + + .. versionadded:: 1.5 + + Returns + ------- + bunch : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. 
+ + data: {sparse matrix, dataframe} of shape (n_samples, n_features) + The input data matrix. If ``as_frame`` is `True`, ``data`` is + a pandas DataFrame with sparse columns. + target: {ndarray, series} of shape (n_samples,) + The target labels. If ``as_frame`` is `True`, ``target`` is a + pandas Series. + target_names: list of shape (n_classes,) + The names of target classes. + DESCR: str + The full description of the dataset. + frame: dataframe of shape (n_samples, n_features + 1) + Only present when `as_frame=True`. Pandas DataFrame with ``data`` + and ``target``. + + .. versionadded:: 0.24 + + (data, target) : tuple if ``return_X_y`` is True + `data` and `target` would be of the format defined in the `Bunch` + description above. + + .. versionadded:: 0.20 + + Examples + -------- + >>> from sklearn.datasets import fetch_20newsgroups_vectorized + >>> newsgroups_vectorized = fetch_20newsgroups_vectorized(subset='test') + >>> newsgroups_vectorized.data.shape + (7532, 130107) + >>> newsgroups_vectorized.target.shape + (7532,) + """ + data_home = get_data_home(data_home=data_home) + filebase = "20newsgroup_vectorized" + if remove: + filebase += "remove-" + "-".join(remove) + target_file = _pkl_filepath(data_home, filebase + ".pkl") + + # we shuffle but use a fixed seed for the memoization + data_train = fetch_20newsgroups( + data_home=data_home, + subset="train", + categories=None, + shuffle=True, + random_state=12, + remove=remove, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + + data_test = fetch_20newsgroups( + data_home=data_home, + subset="test", + categories=None, + shuffle=True, + random_state=12, + remove=remove, + download_if_missing=download_if_missing, + n_retries=n_retries, + delay=delay, + ) + + if os.path.exists(target_file): + try: + X_train, X_test, feature_names = joblib.load(target_file) + except ValueError as e: + raise ValueError( + f"The cached dataset located in {target_file} was fetched " + "with an older scikit-learn version and it is not compatible " + "with the scikit-learn version imported. You need to " + f"manually delete the file: {target_file}." 
+ ) from e + else: + vectorizer = CountVectorizer(dtype=np.int16) + X_train = vectorizer.fit_transform(data_train.data).tocsr() + X_test = vectorizer.transform(data_test.data).tocsr() + feature_names = vectorizer.get_feature_names_out() + + joblib.dump((X_train, X_test, feature_names), target_file, compress=9) + + # the data is stored as int16 for compactness + # but normalize needs floats + if normalize: + X_train = X_train.astype(np.float64) + X_test = X_test.astype(np.float64) + preprocessing.normalize(X_train, copy=False) + preprocessing.normalize(X_test, copy=False) + + target_names = data_train.target_names + + if subset == "train": + data = X_train + target = data_train.target + elif subset == "test": + data = X_test + target = data_test.target + elif subset == "all": + data = sp.vstack((X_train, X_test)).tocsr() + target = np.concatenate((data_train.target, data_test.target)) + + fdescr = load_descr("twenty_newsgroups.rst") + + frame = None + target_name = ["category_class"] + + if as_frame: + frame, data, target = _convert_data_dataframe( + "fetch_20newsgroups_vectorized", + data, + target, + feature_names, + target_names=target_name, + sparse_data=True, + ) + + if return_X_y: + return data, target + + return Bunch( + data=data, + target=target, + frame=frame, + target_names=target_names, + feature_names=feature_names, + DESCR=fdescr, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/datasets/meson.build b/.venv/lib/python3.12/site-packages/sklearn/datasets/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..4efcd279315de3478eae4da682c9760c58a8f92b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/meson.build @@ -0,0 +1,7 @@ +py.extension_module( + '_svmlight_format_fast', + cython_gen.process('_svmlight_format_fast.pyx'), + dependencies: [np_dep], + subdir: 'sklearn/datasets', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d3fa9b42895a624ac2f3b50a14155c2c5fffd82 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/__init__.py @@ -0,0 +1,54 @@ +"""Matrix decomposition algorithms. + +These include PCA, NMF, ICA, and more. Most of the algorithms of this module can be +regarded as dimensionality reduction techniques. 
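+
+A minimal usage sketch, assuming a 2-D data array ``X``::
+
+    from sklearn.decomposition import PCA
+
+    pca = PCA(n_components=2)
+    X_reduced = pca.fit_transform(X)  # project X onto its top two components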
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..utils.extmath import randomized_svd +from ._dict_learning import ( + DictionaryLearning, + MiniBatchDictionaryLearning, + SparseCoder, + dict_learning, + dict_learning_online, + sparse_encode, +) +from ._factor_analysis import FactorAnalysis +from ._fastica import FastICA, fastica +from ._incremental_pca import IncrementalPCA +from ._kernel_pca import KernelPCA +from ._lda import LatentDirichletAllocation +from ._nmf import ( + NMF, + MiniBatchNMF, + non_negative_factorization, +) +from ._pca import PCA +from ._sparse_pca import MiniBatchSparsePCA, SparsePCA +from ._truncated_svd import TruncatedSVD + +__all__ = [ + "NMF", + "PCA", + "DictionaryLearning", + "FactorAnalysis", + "FastICA", + "IncrementalPCA", + "KernelPCA", + "LatentDirichletAllocation", + "MiniBatchDictionaryLearning", + "MiniBatchNMF", + "MiniBatchSparsePCA", + "SparseCoder", + "SparsePCA", + "TruncatedSVD", + "dict_learning", + "dict_learning_online", + "fastica", + "non_negative_factorization", + "randomized_svd", + "sparse_encode", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_base.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..783c316b50f27b784767b019be3605be9b832027 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_base.py @@ -0,0 +1,202 @@ +"""Principal Component Analysis Base Classes""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from scipy import linalg + +from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin +from ..utils._array_api import _fill_or_add_to_diagonal, device, get_namespace +from ..utils.validation import check_is_fitted, validate_data + + +class _BasePCA( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, metaclass=ABCMeta +): + """Base class for PCA methods. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + def get_covariance(self): + """Compute data covariance with the generative model. + + ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` + where S**2 contains the explained variances, and sigma2 contains the + noise variances. + + Returns + ------- + cov : array of shape=(n_features, n_features) + Estimated covariance of data. + """ + xp, _ = get_namespace(self.components_) + + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * xp.sqrt(exp_var[:, np.newaxis]) + exp_var_diff = exp_var - self.noise_variance_ + exp_var_diff = xp.where( + exp_var > self.noise_variance_, + exp_var_diff, + xp.asarray(0.0, device=device(exp_var), dtype=exp_var.dtype), + ) + cov = (components_.T * exp_var_diff) @ components_ + _fill_or_add_to_diagonal(cov, self.noise_variance_, xp) + return cov + + def get_precision(self): + """Compute data precision matrix with the generative model. + + Equals the inverse of the covariance but computed with + the matrix inversion lemma for efficiency. + + Returns + ------- + precision : array, shape=(n_features, n_features) + Estimated precision of data. 
+ """ + xp, is_array_api_compliant = get_namespace(self.components_) + + n_features = self.components_.shape[1] + + # handle corner cases first + if self.n_components_ == 0: + return xp.eye(n_features) / self.noise_variance_ + + if is_array_api_compliant: + linalg_inv = xp.linalg.inv + else: + linalg_inv = linalg.inv + + if self.noise_variance_ == 0.0: + return linalg_inv(self.get_covariance()) + + # Get precision using matrix inversion lemma + components_ = self.components_ + exp_var = self.explained_variance_ + if self.whiten: + components_ = components_ * xp.sqrt(exp_var[:, np.newaxis]) + exp_var_diff = exp_var - self.noise_variance_ + exp_var_diff = xp.where( + exp_var > self.noise_variance_, + exp_var_diff, + xp.asarray(0.0, device=device(exp_var)), + ) + precision = components_ @ components_.T / self.noise_variance_ + _fill_or_add_to_diagonal(precision, 1.0 / exp_var_diff, xp) + precision = components_.T @ linalg_inv(precision) @ components_ + precision /= -(self.noise_variance_**2) + _fill_or_add_to_diagonal(precision, 1.0 / self.noise_variance_, xp) + return precision + + @abstractmethod + def fit(self, X, y=None): + """Placeholder for fit. Subclasses should implement this method! + + Fit the model with X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + self : object + Returns the instance itself. + """ + + def transform(self, X): + """Apply dimensionality reduction to X. + + X is projected on the first principal components previously extracted + from a training set. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_new : array-like of shape (n_samples, n_components) + Projection of X in the first principal components, where `n_samples` + is the number of samples and `n_components` is the number of the components. + """ + xp, _ = get_namespace(X, self.components_, self.explained_variance_) + + check_is_fitted(self) + + X = validate_data( + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse=("csr", "csc"), + reset=False, + ) + return self._transform(X, xp=xp, x_is_centered=False) + + def _transform(self, X, xp, x_is_centered=False): + X_transformed = X @ self.components_.T + if not x_is_centered: + # Apply the centering after the projection. + # For dense X this avoids copying or mutating the data passed by + # the caller. + # For sparse X it keeps sparsity and avoids having to wrap X into + # a linear operator. + X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T + if self.whiten: + # For some solvers (such as "arpack" and "covariance_eigh"), on + # rank deficient data, some components can have a variance + # arbitrarily close to zero, leading to non-finite results when + # whitening. To avoid this problem we clip the variance below. + scale = xp.sqrt(self.explained_variance_) + min_scale = xp.finfo(scale.dtype).eps + scale[scale < min_scale] = min_scale + X_transformed /= scale + return X_transformed + + def inverse_transform(self, X): + """Transform data back to its original space. + + In other words, return an input `X_original` whose transform would be X. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + New data, where `n_samples` is the number of samples + and `n_components` is the number of components. + + Returns + ------- + X_original : array-like of shape (n_samples, n_features) + Original data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Notes + ----- + If whitening is enabled, inverse_transform will compute the + exact inverse operation, which includes reversing whitening. + """ + xp, _ = get_namespace(X) + + if self.whiten: + scaled_components = ( + xp.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_ + ) + return X @ scaled_components + self.mean_ + else: + return X @ self.components_ + self.mean_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_cdnmf_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_cdnmf_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..b2a07fb275bded974524b0c372a931de850d9142 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_cdnmf_fast.pyx @@ -0,0 +1,38 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython cimport floating +from libc.math cimport fabs + + +def _update_cdnmf_fast(floating[:, ::1] W, floating[:, :] HHt, + floating[:, :] XHt, Py_ssize_t[::1] permutation): + cdef: + floating violation = 0 + Py_ssize_t n_components = W.shape[1] + Py_ssize_t n_samples = W.shape[0] # n_features for H update + floating grad, pg, hess + Py_ssize_t i, r, s, t + + with nogil: + for s in range(n_components): + t = permutation[s] + + for i in range(n_samples): + # gradient = GW[t, i] where GW = np.dot(W, HHt) - XHt + grad = -XHt[i, t] + + for r in range(n_components): + grad += HHt[t, r] * W[i, r] + + # projected gradient + pg = min(0., grad) if W[i, t] == 0 else grad + violation += fabs(pg) + + # Hessian + hess = HHt[t, t] + + if hess != 0: + W[i, t] = max(W[i, t] - grad / hess, 0.) 
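+    # `violation` accumulates the absolute projected gradient over all updated
+    # entries; the caller can use it as a convergence measure across sweeps.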
+ + return violation diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_dict_learning.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_dict_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..ae40e28e9f013295dc5b2c4c8dd365fda7ac6bc6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_dict_learning.py @@ -0,0 +1,2329 @@ +"""Dictionary learning.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import sys +import time +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import linalg + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..linear_model import Lars, Lasso, LassoLars, orthogonal_mp_gram +from ..utils import check_array, check_random_state, gen_batches, gen_even_slices +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import _randomized_svd, row_norms, svd_flip +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted, validate_data + + +def _check_positive_coding(method, positive): + if positive and method in ["omp", "lars"]: + raise ValueError( + "Positive constraint not supported for '{}' coding method.".format(method) + ) + + +def _sparse_encode_precomputed( + X, + dictionary, + *, + gram=None, + cov=None, + algorithm="lasso_lars", + regularization=None, + copy_cov=True, + init=None, + max_iter=1000, + verbose=0, + positive=False, +): + """Generic sparse coding with precomputed Gram and/or covariance matrices. + + Each row of the result is the solution to a Lasso problem. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data matrix. + + dictionary : ndarray of shape (n_components, n_features) + The dictionary matrix against which to solve the sparse coding of + the data. Some of the algorithms assume normalized rows. + + gram : ndarray of shape (n_components, n_components), default=None + Precomputed Gram matrix, `dictionary * dictionary'` + gram can be `None` if method is 'threshold'. + + cov : ndarray of shape (n_components, n_samples), default=None + Precomputed covariance, `dictionary * X'`. + + algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, \ + default='lasso_lars' + The algorithm used: + + * `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + * `'lasso_lars'`: uses Lars to compute the Lasso solution; + * `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if + the estimated components are sparse; + * `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution; + * `'threshold'`: squashes to zero all coefficients less than + regularization from the projection `dictionary * data'`. + + regularization : int or float, default=None + The regularization parameter. It corresponds to alpha when + algorithm is `'lasso_lars'`, `'lasso_cd'` or `'threshold'`. + Otherwise it corresponds to `n_nonzero_coefs`. + + init : ndarray of shape (n_samples, n_components), default=None + Initialization value of the sparse code. Only used if + `algorithm='lasso_cd'`. + + max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. 
+ + copy_cov : bool, default=True + Whether to copy the precomputed covariance matrix; if `False`, it may + be overwritten. + + verbose : int, default=0 + Controls the verbosity; the higher, the more messages. + + positive: bool, default=False + Whether to enforce a positivity constraint on the sparse code. + + .. versionadded:: 0.20 + + Returns + ------- + code : ndarray of shape (n_components, n_features) + The sparse codes. + """ + n_samples, n_features = X.shape + n_components = dictionary.shape[0] + + if algorithm == "lasso_lars": + alpha = float(regularization) / n_features # account for scaling + try: + err_mgt = np.seterr(all="ignore") + + # Not passing in verbose=max(0, verbose-1) because Lars.fit already + # corrects the verbosity level. + lasso_lars = LassoLars( + alpha=alpha, + fit_intercept=False, + verbose=verbose, + precompute=gram, + fit_path=False, + positive=positive, + max_iter=max_iter, + ) + lasso_lars.fit(dictionary.T, X.T, Xy=cov) + new_code = lasso_lars.coef_ + finally: + np.seterr(**err_mgt) + + elif algorithm == "lasso_cd": + alpha = float(regularization) / n_features # account for scaling + + # TODO: Make verbosity argument for Lasso? + # sklearn.linear_model.coordinate_descent.enet_path has a verbosity + # argument that we could pass in from Lasso. + clf = Lasso( + alpha=alpha, + fit_intercept=False, + precompute=gram, + max_iter=max_iter, + warm_start=True, + positive=positive, + ) + + if init is not None: + # In some workflows using coordinate descent algorithms: + # - users might provide NumPy arrays with read-only buffers + # - `joblib` might memmap arrays making their buffer read-only + # TODO: move this handling (which is currently too broad) + # closer to the actual private function which need buffers to be writable. + if not init.flags["WRITEABLE"]: + init = np.array(init) + clf.coef_ = init + + clf.fit(dictionary.T, X.T, check_input=False) + new_code = clf.coef_ + + elif algorithm == "lars": + try: + err_mgt = np.seterr(all="ignore") + + # Not passing in verbose=max(0, verbose-1) because Lars.fit already + # corrects the verbosity level. 
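+            # unlike the lasso variants above, 'lars' targets an exact number
+            # of nonzero coefficients per sample (n_nonzero_coefs) rather than
+            # penalizing the L1 norm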
+ lars = Lars( + fit_intercept=False, + verbose=verbose, + precompute=gram, + n_nonzero_coefs=int(regularization), + fit_path=False, + ) + lars.fit(dictionary.T, X.T, Xy=cov) + new_code = lars.coef_ + finally: + np.seterr(**err_mgt) + + elif algorithm == "threshold": + new_code = (np.sign(cov) * np.maximum(np.abs(cov) - regularization, 0)).T + if positive: + np.clip(new_code, 0, None, out=new_code) + + elif algorithm == "omp": + new_code = orthogonal_mp_gram( + Gram=gram, + Xy=cov, + n_nonzero_coefs=int(regularization), + tol=None, + norms_squared=row_norms(X, squared=True), + copy_Xy=copy_cov, + ).T + + return new_code.reshape(n_samples, n_components) + + +@validate_params( + { + "X": ["array-like"], + "dictionary": ["array-like"], + "gram": ["array-like", None], + "cov": ["array-like", None], + "algorithm": [ + StrOptions({"lasso_lars", "lasso_cd", "lars", "omp", "threshold"}) + ], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "alpha": [Interval(Real, 0, None, closed="left"), None], + "copy_cov": ["boolean"], + "init": ["array-like", None], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "n_jobs": [Integral, None], + "check_input": ["boolean"], + "verbose": ["verbose"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +# XXX : could be moved to the linear_model module +def sparse_encode( + X, + dictionary, + *, + gram=None, + cov=None, + algorithm="lasso_lars", + n_nonzero_coefs=None, + alpha=None, + copy_cov=True, + init=None, + max_iter=1000, + n_jobs=None, + check_input=True, + verbose=0, + positive=False, +): + """Sparse coding. + + Each row of the result is the solution to a sparse coding problem. + The goal is to find a sparse array `code` such that:: + + X ~= code * dictionary + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data matrix. + + dictionary : array-like of shape (n_components, n_features) + The dictionary matrix against which to solve the sparse coding of + the data. Some of the algorithms assume normalized rows for meaningful + output. + + gram : array-like of shape (n_components, n_components), default=None + Precomputed Gram matrix, `dictionary * dictionary'`. + + cov : array-like of shape (n_components, n_samples), default=None + Precomputed covariance, `dictionary' * X`. + + algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, \ + default='lasso_lars' + The algorithm used: + + * `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + * `'lasso_lars'`: uses Lars to compute the Lasso solution; + * `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if + the estimated components are sparse; + * `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution; + * `'threshold'`: squashes to zero all coefficients less than + regularization from the projection `dictionary * data'`. + + n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and `algorithm='omp'` + and is overridden by `alpha` in the `omp` case. If `None`, then + `n_nonzero_coefs=int(n_features / 10)`. + + alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. 
+ If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of + the reconstruction error targeted. In this case, it overrides + `n_nonzero_coefs`. + If `None`, default to 1. + + copy_cov : bool, default=True + Whether to copy the precomputed covariance matrix; if `False`, it may + be overwritten. + + init : ndarray of shape (n_samples, n_components), default=None + Initialization value of the sparse codes. Only used if + `algorithm='lasso_cd'`. + + max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + check_input : bool, default=True + If `False`, the input arrays X and dictionary will not be checked. + + verbose : int, default=0 + Controls the verbosity; the higher, the more messages. + + positive : bool, default=False + Whether to enforce positivity when finding the encoding. + + .. versionadded:: 0.20 + + Returns + ------- + code : ndarray of shape (n_samples, n_components) + The sparse codes. + + See Also + -------- + sklearn.linear_model.lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + sklearn.linear_model.orthogonal_mp : Solves Orthogonal Matching Pursuit problems. + sklearn.linear_model.Lasso : Train Linear Model with L1 prior as regularizer. + SparseCoder : Find a sparse representation of data from a fixed precomputed + dictionary. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.decomposition import sparse_encode + >>> X = np.array([[-1, -1, -1], [0, 0, 3]]) + >>> dictionary = np.array( + ... [[0, 1, 0], + ... [-1, -1, 2], + ... [1, 1, 1], + ... [0, 1, 1], + ... [0, 2, 1]], + ... dtype=np.float64 + ... 
) + >>> sparse_encode(X, dictionary, alpha=1e-10) + array([[ 0., 0., -1., 0., 0.], + [ 0., 1., 1., 0., 0.]]) + """ + if check_input: + if algorithm == "lasso_cd": + dictionary = check_array( + dictionary, order="C", dtype=[np.float64, np.float32] + ) + X = check_array(X, order="C", dtype=[np.float64, np.float32]) + else: + dictionary = check_array(dictionary) + X = check_array(X) + + if dictionary.shape[1] != X.shape[1]: + raise ValueError( + "Dictionary and X have different numbers of features:" + "dictionary.shape: {} X.shape{}".format(dictionary.shape, X.shape) + ) + + _check_positive_coding(algorithm, positive) + + return _sparse_encode( + X, + dictionary, + gram=gram, + cov=cov, + algorithm=algorithm, + n_nonzero_coefs=n_nonzero_coefs, + alpha=alpha, + copy_cov=copy_cov, + init=init, + max_iter=max_iter, + n_jobs=n_jobs, + verbose=verbose, + positive=positive, + ) + + +def _sparse_encode( + X, + dictionary, + *, + gram=None, + cov=None, + algorithm="lasso_lars", + n_nonzero_coefs=None, + alpha=None, + copy_cov=True, + init=None, + max_iter=1000, + n_jobs=None, + verbose=0, + positive=False, +): + """Sparse coding without input/parameter validation.""" + + n_samples, n_features = X.shape + n_components = dictionary.shape[0] + + if algorithm in ("lars", "omp"): + regularization = n_nonzero_coefs + if regularization is None: + regularization = min(max(n_features / 10, 1), n_components) + else: + regularization = alpha + if regularization is None: + regularization = 1.0 + + if gram is None and algorithm != "threshold": + gram = np.dot(dictionary, dictionary.T) + + if cov is None and algorithm != "lasso_cd": + copy_cov = False + cov = np.dot(dictionary, X.T) + + if effective_n_jobs(n_jobs) == 1 or algorithm == "threshold": + code = _sparse_encode_precomputed( + X, + dictionary, + gram=gram, + cov=cov, + algorithm=algorithm, + regularization=regularization, + copy_cov=copy_cov, + init=init, + max_iter=max_iter, + verbose=verbose, + positive=positive, + ) + return code + + # Enter parallel code block + n_samples = X.shape[0] + n_components = dictionary.shape[0] + code = np.empty((n_samples, n_components)) + slices = list(gen_even_slices(n_samples, effective_n_jobs(n_jobs))) + + code_views = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(_sparse_encode_precomputed)( + X[this_slice], + dictionary, + gram=gram, + cov=cov[:, this_slice] if cov is not None else None, + algorithm=algorithm, + regularization=regularization, + copy_cov=copy_cov, + init=init[this_slice] if init is not None else None, + max_iter=max_iter, + verbose=verbose, + positive=positive, + ) + for this_slice in slices + ) + for this_slice, this_view in zip(slices, code_views): + code[this_slice] = this_view + return code + + +def _update_dict( + dictionary, + Y, + code, + A=None, + B=None, + verbose=False, + random_state=None, + positive=False, +): + """Update the dense dictionary factor in place. + + Parameters + ---------- + dictionary : ndarray of shape (n_components, n_features) + Value of the dictionary at the previous iteration. + + Y : ndarray of shape (n_samples, n_features) + Data matrix. + + code : ndarray of shape (n_samples, n_components) + Sparse coding of the data against which to optimize the dictionary. + + A : ndarray of shape (n_components, n_components), default=None + Together with `B`, sufficient stats of the online model to update the + dictionary. + + B : ndarray of shape (n_features, n_components), default=None + Together with `A`, sufficient stats of the online model to update the + dictionary. 
+ + verbose: bool, default=False + Degree of output the procedure will print. + + random_state : int, RandomState instance or None, default=None + Used for randomly initializing the dictionary. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + positive : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + """ + n_samples, n_components = code.shape + random_state = check_random_state(random_state) + + if A is None: + A = code.T @ code + if B is None: + B = Y.T @ code + + n_unused = 0 + + for k in range(n_components): + if A[k, k] > 1e-6: + # 1e-6 is arbitrary but consistent with the spams implementation + dictionary[k] += (B[:, k] - A[k] @ dictionary) / A[k, k] + else: + # kth atom is almost never used -> sample a new one from the data + newd = Y[random_state.choice(n_samples)] + + # add small noise to avoid making the sparse coding ill conditioned + noise_level = 0.01 * (newd.std() or 1) # avoid 0 std + noise = random_state.normal(0, noise_level, size=len(newd)) + + dictionary[k] = newd + noise + code[:, k] = 0 + n_unused += 1 + + if positive: + np.clip(dictionary[k], 0, None, out=dictionary[k]) + + # Projection on the constraint set ||V_k|| <= 1 + dictionary[k] /= max(linalg.norm(dictionary[k]), 1) + + if verbose and n_unused > 0: + print(f"{n_unused} unused atoms resampled.") + + +def _dict_learning( + X, + n_components, + *, + alpha, + max_iter, + tol, + method, + n_jobs, + dict_init, + code_init, + callback, + verbose, + random_state, + return_n_iter, + positive_dict, + positive_code, + method_max_iter, +): + """Main dictionary learning algorithm""" + t0 = time.time() + # Init the code and the dictionary with SVD of Y + if code_init is not None and dict_init is not None: + code = np.array(code_init, order="F") + # Don't copy V, it will happen below + dictionary = dict_init + else: + code, S, dictionary = linalg.svd(X, full_matrices=False) + # flip the initial code's sign to enforce deterministic output + code, dictionary = svd_flip(code, dictionary) + dictionary = S[:, np.newaxis] * dictionary + r = len(dictionary) + if n_components <= r: # True even if n_components=None + code = code[:, :n_components] + dictionary = dictionary[:n_components, :] + else: + code = np.c_[code, np.zeros((len(code), n_components - r))] + dictionary = np.r_[ + dictionary, np.zeros((n_components - r, dictionary.shape[1])) + ] + + # Fortran-order dict better suited for the sparse coding which is the + # bottleneck of this algorithm. 
+ dictionary = np.asfortranarray(dictionary) + + errors = [] + current_cost = np.nan + + if verbose == 1: + print("[dict_learning]", end=" ") + + # If max_iter is 0, number of iterations returned should be zero + ii = -1 + + for ii in range(max_iter): + dt = time.time() - t0 + if verbose == 1: + sys.stdout.write(".") + sys.stdout.flush() + elif verbose: + print( + "Iteration % 3i (elapsed time: % 3is, % 4.1fmn, current cost % 7.3f)" + % (ii, dt, dt / 60, current_cost) + ) + + # Update code + code = sparse_encode( + X, + dictionary, + algorithm=method, + alpha=alpha, + init=code, + n_jobs=n_jobs, + positive=positive_code, + max_iter=method_max_iter, + verbose=verbose, + ) + + # Update dictionary in place + _update_dict( + dictionary, + X, + code, + verbose=verbose, + random_state=random_state, + positive=positive_dict, + ) + + # Cost function + current_cost = 0.5 * np.sum((X - code @ dictionary) ** 2) + alpha * np.sum( + np.abs(code) + ) + errors.append(current_cost) + + if ii > 0: + dE = errors[-2] - errors[-1] + # assert(dE >= -tol * errors[-1]) + if dE < tol * errors[-1]: + if verbose == 1: + # A line return + print("") + elif verbose: + print("--- Convergence reached after %d iterations" % ii) + break + if ii % 5 == 0 and callback is not None: + callback(locals()) + + if return_n_iter: + return code, dictionary, errors, ii + 1 + else: + return code, dictionary, errors + + +@validate_params( + { + "X": ["array-like"], + "return_code": ["boolean"], + "method": [StrOptions({"cd", "lars"})], + "method_max_iter": [Interval(Integral, 0, None, closed="left")], + }, + prefer_skip_nested_validation=False, +) +def dict_learning_online( + X, + n_components=2, + *, + alpha=1, + max_iter=100, + return_code=True, + dict_init=None, + callback=None, + batch_size=256, + verbose=False, + shuffle=True, + n_jobs=None, + method="lars", + random_state=None, + positive_dict=False, + positive_code=False, + method_max_iter=1000, + tol=1e-3, + max_no_improvement=10, +): + """Solve a dictionary learning matrix factorization problem online. + + Finds the best dictionary and the corresponding sparse code for + approximating the data matrix X by solving:: + + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 = 1 for all 0 <= k < n_components + + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. + This is accomplished by repeatedly iterating over mini-batches by slicing + the input data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data matrix. + + n_components : int or None, default=2 + Number of dictionary atoms to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : float, default=1 + Sparsity controlling parameter. + + max_iter : int, default=100 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion heuristics. + + .. versionadded:: 1.1 + + return_code : bool, default=True + Whether to also return the code U or just the dictionary `V`. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial values for the dictionary for warm restart scenarios. + If `None`, the initial values for the dictionary are created + with an SVD decomposition of the data via + :func:`~sklearn.utils.extmath.randomized_svd`. 
+ + callback : callable, default=None + A callable that gets invoked at the end of each iteration. + + batch_size : int, default=256 + The number of samples to take in each batch. + + .. versionchanged:: 1.3 + The default value of `batch_size` changed from 3 to 256 in version 1.3. + + verbose : bool, default=False + To control the verbosity of the procedure. + + shuffle : bool, default=True + Whether to shuffle the data before splitting it in batches. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + method : {'lars', 'cd'}, default='lars' + * `'lars'`: uses the least angle regression method to solve the lasso + problem (`linear_model.lars_path`); + * `'cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). Lars will be faster if + the estimated components are sparse. + + random_state : int, RandomState instance or None, default=None + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + method_max_iter : int, default=1000 + Maximum number of iterations to perform when solving the lasso problem. + + .. versionadded:: 0.22 + + tol : float, default=1e-3 + Control early stopping based on the norm of the differences in the + dictionary between 2 steps. + + To disable early stopping based on changes in the dictionary, set + `tol` to 0.0. + + .. versionadded:: 1.1 + + max_no_improvement : int, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + + To disable convergence detection based on cost function, set + `max_no_improvement` to None. + + .. versionadded:: 1.1 + + Returns + ------- + code : ndarray of shape (n_samples, n_components), + The sparse code (only returned if `return_code=True`). + + dictionary : ndarray of shape (n_components, n_features), + The solutions to the dictionary learning problem. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to `True`. + + See Also + -------- + dict_learning : Solve a dictionary learning matrix factorization problem. + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchDictionaryLearning : A faster, less accurate, version of the dictionary + learning algorithm. + SparsePCA : Sparse Principal Components Analysis. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import dict_learning_online + >>> X, _, _ = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42, + ... ) + >>> U, V = dict_learning_online( + ... X, n_components=15, alpha=0.2, max_iter=20, batch_size=3, random_state=42 + ... 
) + + We can check the level of sparsity of `U`: + + >>> np.mean(U == 0) + np.float64(0.53) + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = U @ V + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.053) + """ + transform_algorithm = "lasso_" + method + + est = MiniBatchDictionaryLearning( + n_components=n_components, + alpha=alpha, + max_iter=max_iter, + n_jobs=n_jobs, + fit_algorithm=method, + batch_size=batch_size, + shuffle=shuffle, + dict_init=dict_init, + random_state=random_state, + transform_algorithm=transform_algorithm, + transform_alpha=alpha, + positive_code=positive_code, + positive_dict=positive_dict, + transform_max_iter=method_max_iter, + verbose=verbose, + callback=callback, + tol=tol, + max_no_improvement=max_no_improvement, + ).fit(X) + + if not return_code: + return est.components_ + else: + code = est.transform(X) + return code, est.components_ + + +@validate_params( + { + "X": ["array-like"], + "method": [StrOptions({"lars", "cd"})], + "return_n_iter": ["boolean"], + "method_max_iter": [Interval(Integral, 0, None, closed="left")], + }, + prefer_skip_nested_validation=False, +) +def dict_learning( + X, + n_components, + *, + alpha, + max_iter=100, + tol=1e-8, + method="lars", + n_jobs=None, + dict_init=None, + code_init=None, + callback=None, + verbose=False, + random_state=None, + return_n_iter=False, + positive_dict=False, + positive_code=False, + method_max_iter=1000, +): + """Solve a dictionary learning matrix factorization problem. + + Finds the best dictionary and the corresponding sparse code for + approximating the data matrix X by solving:: + + (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 = 1 for all 0 <= k < n_components + + where V is the dictionary and U is the sparse code. ||.||_Fro stands for + the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm + which is the sum of the absolute values of all the entries in the matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data matrix. + + n_components : int + Number of dictionary atoms to extract. + + alpha : int or float + Sparsity controlling parameter. + + max_iter : int, default=100 + Maximum number of iterations to perform. + + tol : float, default=1e-8 + Tolerance for the stopping condition. + + method : {'lars', 'cd'}, default='lars' + The method used: + + * `'lars'`: uses the least angle regression method to solve the lasso + problem (`linear_model.lars_path`); + * `'cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). Lars will be faster if + the estimated components are sparse. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial value for the dictionary for warm restart scenarios. Only used + if `code_init` and `dict_init` are not None. + + code_init : ndarray of shape (n_samples, n_components), default=None + Initial value for the sparse code for warm restart scenarios. Only used + if `code_init` and `dict_init` are not None. 
+ + callback : callable, default=None + Callable that gets invoked every five iterations. + + verbose : bool, default=False + To control the verbosity of the procedure. + + random_state : int, RandomState instance or None, default=None + Used for randomly initializing the dictionary. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + method_max_iter : int, default=1000 + Maximum number of iterations to perform. + + .. versionadded:: 0.22 + + Returns + ------- + code : ndarray of shape (n_samples, n_components) + The sparse code factor in the matrix factorization. + + dictionary : ndarray of shape (n_components, n_features), + The dictionary factor in the matrix factorization. + + errors : array + Vector of errors at each iteration. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is + set to True. + + See Also + -------- + dict_learning_online : Solve a dictionary learning matrix factorization + problem online. + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchDictionaryLearning : A faster, less accurate version + of the dictionary learning algorithm. + SparsePCA : Sparse Principal Components Analysis. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import dict_learning + >>> X, _, _ = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42, + ... 
) + >>> U, V, errors = dict_learning(X, n_components=15, alpha=0.1, random_state=42) + + We can check the level of sparsity of `U`: + + >>> np.mean(U == 0) + np.float64(0.62) + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = U @ V + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.0192) + """ + estimator = DictionaryLearning( + n_components=n_components, + alpha=alpha, + max_iter=max_iter, + tol=tol, + fit_algorithm=method, + n_jobs=n_jobs, + dict_init=dict_init, + callback=callback, + code_init=code_init, + verbose=verbose, + random_state=random_state, + positive_code=positive_code, + positive_dict=positive_dict, + transform_max_iter=method_max_iter, + ).set_output(transform="default") + code = estimator.fit_transform(X) + if return_n_iter: + return ( + code, + estimator.components_, + estimator.error_, + estimator.n_iter_, + ) + return code, estimator.components_, estimator.error_ + + +class _BaseSparseCoding(ClassNamePrefixFeaturesOutMixin, TransformerMixin): + """Base class from SparseCoder and DictionaryLearning algorithms.""" + + def __init__( + self, + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ): + self.transform_algorithm = transform_algorithm + self.transform_n_nonzero_coefs = transform_n_nonzero_coefs + self.transform_alpha = transform_alpha + self.transform_max_iter = transform_max_iter + self.split_sign = split_sign + self.n_jobs = n_jobs + self.positive_code = positive_code + + def _transform(self, X, dictionary): + """Private method allowing to accommodate both DictionaryLearning and + SparseCoder.""" + X = validate_data(self, X, reset=False) + + if hasattr(self, "alpha") and self.transform_alpha is None: + transform_alpha = self.alpha + else: + transform_alpha = self.transform_alpha + + code = sparse_encode( + X, + dictionary, + algorithm=self.transform_algorithm, + n_nonzero_coefs=self.transform_n_nonzero_coefs, + alpha=transform_alpha, + max_iter=self.transform_max_iter, + n_jobs=self.n_jobs, + positive=self.positive_code, + ) + + if self.split_sign: + # feature vector is split into a positive and negative side + n_samples, n_features = code.shape + split_code = np.empty((n_samples, 2 * n_features)) + split_code[:, :n_features] = np.maximum(code, 0) + split_code[:, n_features:] = -np.minimum(code, 0) + code = split_code + + return code + + def transform(self, X): + """Encode the data as a sparse combination of the dictionary atoms. + + Coding method is determined by the object parameter + `transform_algorithm`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Test data to be transformed, must have the same number of + features as the data used to train the model. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed data. 
+ """ + check_is_fitted(self) + return self._transform(X, self.components_) + + def _inverse_transform(self, code, dictionary): + """Private method allowing to accommodate both DictionaryLearning and + SparseCoder.""" + code = check_array(code) + # compute number of expected features in code + expected_n_components = dictionary.shape[0] + if self.split_sign: + expected_n_components += expected_n_components + if not code.shape[1] == expected_n_components: + raise ValueError( + "The number of components in the code is different from the " + "number of components in the dictionary." + f"Expected {expected_n_components}, got {code.shape[1]}." + ) + if self.split_sign: + n_samples, n_features = code.shape + n_features //= 2 + code = code[:, :n_features] - code[:, n_features:] + + return code @ dictionary + + def inverse_transform(self, X): + """Transform data back to its original space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + Data to be transformed back. Must have the same number of + components as the data used to train the model. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + return self._inverse_transform(X, self.components_) + + +class SparseCoder(_BaseSparseCoding, BaseEstimator): + """Sparse coding. + + Finds a sparse representation of data against a fixed, precomputed + dictionary. + + Each row of the result is the solution to a sparse coding problem. + The goal is to find a sparse array `code` such that:: + + X ~= code * dictionary + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + dictionary : ndarray of shape (n_components, n_features) + The dictionary atoms used for sparse coding. Lines are assumed to be + normalized to unit norm. + + transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ + 'threshold'}, default='omp' + Algorithm used to transform the data: + + - `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + - `'lasso_lars'`: uses Lars to compute the Lasso solution; + - `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (linear_model.Lasso). `'lasso_lars'` will be faster if + the estimated components are sparse; + - `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution; + - `'threshold'`: squashes to zero all coefficients less than alpha from + the projection ``dictionary * X'``. + + transform_n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and `algorithm='omp'` + and is overridden by `alpha` in the `omp` case. If `None`, then + `transform_n_nonzero_coefs=int(n_features / 10)`. + + transform_alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. + If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of + the reconstruction error targeted. In this case, it overrides + `n_nonzero_coefs`. + If `None`, default to 1. + + split_sign : bool, default=False + Whether to split the sparse feature vector into the concatenation of + its negative part and its positive part. This can improve the + performance of downstream classifiers. + + n_jobs : int, default=None + Number of parallel jobs to run. 
+ ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + transform_max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `lasso_lars`. + + .. versionadded:: 0.22 + + Attributes + ---------- + n_components_ : int + Number of atoms. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchDictionaryLearning : A faster, less accurate, version of the + dictionary learning algorithm. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparsePCA : Sparse Principal Components Analysis. + sparse_encode : Sparse coding where each row of the result is the solution + to a sparse coding problem. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.decomposition import SparseCoder + >>> X = np.array([[-1, -1, -1], [0, 0, 3]]) + >>> dictionary = np.array( + ... [[0, 1, 0], + ... [-1, -1, 2], + ... [1, 1, 1], + ... [0, 1, 1], + ... [0, 2, 1]], + ... dtype=np.float64 + ... ) + >>> coder = SparseCoder( + ... dictionary=dictionary, transform_algorithm='lasso_lars', + ... transform_alpha=1e-10, + ... ) + >>> coder.transform(X) + array([[ 0., 0., -1., 0., 0.], + [ 0., 1., 1., 0., 0.]]) + """ + + def __init__( + self, + dictionary, + *, + transform_algorithm="omp", + transform_n_nonzero_coefs=None, + transform_alpha=None, + split_sign=False, + n_jobs=None, + positive_code=False, + transform_max_iter=1000, + ): + super().__init__( + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ) + self.dictionary = dictionary + + def fit(self, X, y=None): + """Do nothing and return the estimator unchanged. + + This method is just there to implement the usual API and hence + work in pipelines. + + Parameters + ---------- + X : Ignored + Not used, present for API consistency by convention. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + return self + + def transform(self, X, y=None): + """Encode the data as a sparse combination of the dictionary atoms. + + Coding method is determined by the object parameter + `transform_algorithm`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed data. + """ + return super()._transform(X, self.dictionary) + + def inverse_transform(self, X): + """Transform data back to its original space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + Data to be transformed back. Must have the same number of + components as the data used to train the model. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Transformed data. 
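+
+        Notes
+        -----
+        The reconstruction is a plain linear map: with ``split_sign=False``
+        this returns ``X @ dictionary``; with ``split_sign=True`` the positive
+        and negative halves of the code are recombined before the product.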
+ """ + return self._inverse_transform(X, self.dictionary) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + @property + def n_components_(self): + """Number of atoms.""" + return self.dictionary.shape[0] + + @property + def n_features_in_(self): + """Number of features seen during `fit`.""" + return self.dictionary.shape[1] + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.n_components_ + + +class DictionaryLearning(_BaseSparseCoding, BaseEstimator): + """Dictionary learning. + + Finds a dictionary (a set of atoms) that performs well at sparsely + encoding the fitted data. + + Solves the optimization problem:: + + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 <= 1 for all 0 <= k < n_components + + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of dictionary elements to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : float, default=1.0 + Sparsity controlling parameter. + + max_iter : int, default=1000 + Maximum number of iterations to perform. + + tol : float, default=1e-8 + Tolerance for numerical error. + + fit_algorithm : {'lars', 'cd'}, default='lars' + * `'lars'`: uses the least angle regression method to solve the lasso + problem (:func:`~sklearn.linear_model.lars_path`); + * `'cd'`: uses the coordinate descent method to compute the + Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be + faster if the estimated components are sparse. + + .. versionadded:: 0.17 + *cd* coordinate descent method to improve speed. + + transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ + 'threshold'}, default='omp' + Algorithm used to transform the data: + + - `'lars'`: uses the least angle regression method + (:func:`~sklearn.linear_model.lars_path`); + - `'lasso_lars'`: uses Lars to compute the Lasso solution. + - `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'` + will be faster if the estimated components are sparse. + - `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution. + - `'threshold'`: squashes to zero all coefficients less than alpha from + the projection ``dictionary * X'``. + + .. versionadded:: 0.17 + *lasso_cd* coordinate descent method to improve speed. + + transform_n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and + `algorithm='omp'`. If `None`, then + `transform_n_nonzero_coefs=int(n_features / 10)`. + + transform_alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. + If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `None`, defaults to `alpha`. + + .. versionchanged:: 1.2 + When None, default value changed from 1.0 to `alpha`. + + n_jobs : int or None, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 
+ ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + code_init : ndarray of shape (n_samples, n_components), default=None + Initial value for the code, for warm restart. Only used if `code_init` + and `dict_init` are not None. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial values for the dictionary, for warm restart. Only used if + `code_init` and `dict_init` are not None. + + callback : callable, default=None + Callable that gets invoked every five iterations. + + .. versionadded:: 1.3 + + verbose : bool, default=False + To control the verbosity of the procedure. + + split_sign : bool, default=False + Whether to split the sparse feature vector into the concatenation of + its negative part and its positive part. This can improve the + performance of downstream classifiers. + + random_state : int, RandomState instance or None, default=None + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + transform_max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. + + .. versionadded:: 0.22 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + dictionary atoms extracted from the data + + error_ : array + vector of errors at each iteration + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + MiniBatchDictionaryLearning: A faster, less accurate, version of the + dictionary learning algorithm. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparseCoder : Find a sparse representation of data from a fixed, + precomputed dictionary. + SparsePCA : Sparse Principal Components Analysis. + + References + ---------- + + J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning + for sparse coding (https://www.di.ens.fr/~fbach/mairal_icml09.pdf) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import DictionaryLearning + >>> X, dictionary, code = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42, + ... ) + >>> dict_learner = DictionaryLearning( + ... n_components=15, transform_algorithm='lasso_lars', transform_alpha=0.1, + ... random_state=42, + ... 
) + >>> X_transformed = dict_learner.fit(X).transform(X) + + We can check the level of sparsity of `X_transformed`: + + >>> np.mean(X_transformed == 0) + np.float64(0.527) + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = X_transformed @ dict_learner.components_ + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.056) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "fit_algorithm": [StrOptions({"lars", "cd"})], + "transform_algorithm": [ + StrOptions({"lasso_lars", "lasso_cd", "lars", "omp", "threshold"}) + ], + "transform_n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "transform_alpha": [Interval(Real, 0, None, closed="left"), None], + "n_jobs": [Integral, None], + "code_init": [np.ndarray, None], + "dict_init": [np.ndarray, None], + "callback": [callable, None], + "verbose": ["verbose"], + "split_sign": ["boolean"], + "random_state": ["random_state"], + "positive_code": ["boolean"], + "positive_dict": ["boolean"], + "transform_max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + max_iter=1000, + tol=1e-8, + fit_algorithm="lars", + transform_algorithm="omp", + transform_n_nonzero_coefs=None, + transform_alpha=None, + n_jobs=None, + code_init=None, + dict_init=None, + callback=None, + verbose=False, + split_sign=False, + random_state=None, + positive_code=False, + positive_dict=False, + transform_max_iter=1000, + ): + super().__init__( + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ) + self.n_components = n_components + self.alpha = alpha + self.max_iter = max_iter + self.tol = tol + self.fit_algorithm = fit_algorithm + self.code_init = code_init + self.dict_init = dict_init + self.callback = callback + self.verbose = verbose + self.random_state = random_state + self.positive_dict = positive_dict + + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + self.fit_transform(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit the model from data in X and return the transformed data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + V : ndarray of shape (n_samples, n_components) + Transformed data. 
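+
+        Notes
+        -----
+        The dictionary is estimated with the batch solver (alternating sparse
+        coding and dictionary updates); the code returned here is the one from
+        the final iteration of that solver, not a re-encoding of ``X`` with
+        ``transform_algorithm``. Use :meth:`transform` for the latter.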
+ """ + _check_positive_coding(method=self.fit_algorithm, positive=self.positive_code) + + method = "lasso_" + self.fit_algorithm + + random_state = check_random_state(self.random_state) + X = validate_data(self, X) + + if self.n_components is None: + n_components = X.shape[1] + else: + n_components = self.n_components + + V, U, E, self.n_iter_ = _dict_learning( + X, + n_components, + alpha=self.alpha, + tol=self.tol, + max_iter=self.max_iter, + method=method, + method_max_iter=self.transform_max_iter, + n_jobs=self.n_jobs, + code_init=self.code_init, + dict_init=self.dict_init, + callback=self.callback, + verbose=self.verbose, + random_state=random_state, + return_n_iter=True, + positive_dict=self.positive_dict, + positive_code=self.positive_code, + ) + self.components_ = U + self.error_ = E + + return V + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): + """Mini-batch dictionary learning. + + Finds a dictionary (a set of atoms) that performs well at sparsely + encoding the fitted data. + + Solves the optimization problem:: + + (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 + (U,V) + with || V_k ||_2 <= 1 for all 0 <= k < n_components + + ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for + the entry-wise matrix norm which is the sum of the absolute values + of all the entries in the matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of dictionary elements to extract. + + alpha : float, default=1 + Sparsity controlling parameter. + + max_iter : int, default=1_000 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion heuristics. + + .. versionadded:: 1.1 + + fit_algorithm : {'lars', 'cd'}, default='lars' + The algorithm used: + + - `'lars'`: uses the least angle regression method to solve the lasso + problem (`linear_model.lars_path`) + - `'cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). Lars will be faster if + the estimated components are sparse. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + batch_size : int, default=256 + Number of samples in each mini-batch. + + .. versionchanged:: 1.3 + The default value of `batch_size` changed from 3 to 256 in version 1.3. + + shuffle : bool, default=True + Whether to shuffle the samples before forming batches. + + dict_init : ndarray of shape (n_components, n_features), default=None + Initial value of the dictionary for warm restart scenarios. + + transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', \ + 'threshold'}, default='omp' + Algorithm used to transform the data: + + - `'lars'`: uses the least angle regression method + (`linear_model.lars_path`); + - `'lasso_lars'`: uses Lars to compute the Lasso solution. + - `'lasso_cd'`: uses the coordinate descent method to compute the + Lasso solution (`linear_model.Lasso`). `'lasso_lars'` will be faster + if the estimated components are sparse. 
+ - `'omp'`: uses orthogonal matching pursuit to estimate the sparse + solution. + - `'threshold'`: squashes to zero all coefficients less than alpha from + the projection ``dictionary * X'``. + + transform_n_nonzero_coefs : int, default=None + Number of nonzero coefficients to target in each column of the + solution. This is only used by `algorithm='lars'` and + `algorithm='omp'`. If `None`, then + `transform_n_nonzero_coefs=int(n_features / 10)`. + + transform_alpha : float, default=None + If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the + penalty applied to the L1 norm. + If `algorithm='threshold'`, `alpha` is the absolute value of the + threshold below which coefficients will be squashed to zero. + If `None`, defaults to `alpha`. + + .. versionchanged:: 1.2 + When None, default value changed from 1.0 to `alpha`. + + verbose : bool or int, default=False + To control the verbosity of the procedure. + + split_sign : bool, default=False + Whether to split the sparse feature vector into the concatenation of + its negative part and its positive part. This can improve the + performance of downstream classifiers. + + random_state : int, RandomState instance or None, default=None + Used for initializing the dictionary when ``dict_init`` is not + specified, randomly shuffling the data when ``shuffle`` is set to + ``True``, and updating the dictionary. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + positive_code : bool, default=False + Whether to enforce positivity when finding the code. + + .. versionadded:: 0.20 + + positive_dict : bool, default=False + Whether to enforce positivity when finding the dictionary. + + .. versionadded:: 0.20 + + transform_max_iter : int, default=1000 + Maximum number of iterations to perform if `algorithm='lasso_cd'` or + `'lasso_lars'`. + + .. versionadded:: 0.22 + + callback : callable, default=None + A callable that gets invoked at the end of each iteration. + + .. versionadded:: 1.1 + + tol : float, default=1e-3 + Control early stopping based on the norm of the differences in the + dictionary between 2 steps. + + To disable early stopping based on changes in the dictionary, set + `tol` to 0.0. + + .. versionadded:: 1.1 + + max_no_improvement : int, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + + To disable convergence detection based on cost function, set + `max_no_improvement` to None. + + .. versionadded:: 1.1 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Components extracted from the data. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations over the full dataset. + + n_steps_ : int + Number of mini-batches processed. + + .. versionadded:: 1.1 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparseCoder : Find a sparse representation of data from a fixed, + precomputed dictionary. + SparsePCA : Sparse Principal Components Analysis. + + References + ---------- + + J. Mairal, F. Bach, J. Ponce, G. 
Sapiro, 2009: Online dictionary learning + for sparse coding (https://www.di.ens.fr/~fbach/mairal_icml09.pdf) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_sparse_coded_signal + >>> from sklearn.decomposition import MiniBatchDictionaryLearning + >>> X, dictionary, code = make_sparse_coded_signal( + ... n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10, + ... random_state=42) + >>> dict_learner = MiniBatchDictionaryLearning( + ... n_components=15, batch_size=3, transform_algorithm='lasso_lars', + ... transform_alpha=0.1, max_iter=20, random_state=42) + >>> X_transformed = dict_learner.fit_transform(X) + + We can check the level of sparsity of `X_transformed`: + + >>> np.mean(X_transformed == 0) > 0.5 + np.True_ + + We can compare the average squared euclidean norm of the reconstruction + error of the sparse coded signal relative to the squared euclidean norm of + the original signal: + + >>> X_hat = X_transformed @ dict_learner.components_ + >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) + np.float64(0.052) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "fit_algorithm": [StrOptions({"cd", "lars"})], + "n_jobs": [None, Integral], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "dict_init": [None, np.ndarray], + "transform_algorithm": [ + StrOptions({"lasso_lars", "lasso_cd", "lars", "omp", "threshold"}) + ], + "transform_n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "transform_alpha": [Interval(Real, 0, None, closed="left"), None], + "verbose": ["verbose"], + "split_sign": ["boolean"], + "random_state": ["random_state"], + "positive_code": ["boolean"], + "positive_dict": ["boolean"], + "transform_max_iter": [Interval(Integral, 0, None, closed="left")], + "callback": [None, callable], + "tol": [Interval(Real, 0, None, closed="left")], + "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + max_iter=1_000, + fit_algorithm="lars", + n_jobs=None, + batch_size=256, + shuffle=True, + dict_init=None, + transform_algorithm="omp", + transform_n_nonzero_coefs=None, + transform_alpha=None, + verbose=False, + split_sign=False, + random_state=None, + positive_code=False, + positive_dict=False, + transform_max_iter=1000, + callback=None, + tol=1e-3, + max_no_improvement=10, + ): + super().__init__( + transform_algorithm, + transform_n_nonzero_coefs, + transform_alpha, + split_sign, + n_jobs, + positive_code, + transform_max_iter, + ) + self.n_components = n_components + self.alpha = alpha + self.max_iter = max_iter + self.fit_algorithm = fit_algorithm + self.dict_init = dict_init + self.verbose = verbose + self.shuffle = shuffle + self.batch_size = batch_size + self.split_sign = split_sign + self.random_state = random_state + self.positive_dict = positive_dict + self.callback = callback + self.max_no_improvement = max_no_improvement + self.tol = tol + + def _check_params(self, X): + # n_components + self._n_components = self.n_components + if self._n_components is None: + self._n_components = X.shape[1] + + # fit_algorithm + _check_positive_coding(self.fit_algorithm, self.positive_code) + self._fit_algorithm = "lasso_" + self.fit_algorithm + + # batch_size + self._batch_size = 
min(self.batch_size, X.shape[0]) + + def _initialize_dict(self, X, random_state): + """Initialization of the dictionary.""" + if self.dict_init is not None: + dictionary = self.dict_init + else: + # Init V with SVD of X + _, S, dictionary = _randomized_svd( + X, self._n_components, random_state=random_state + ) + dictionary = S[:, np.newaxis] * dictionary + + if self._n_components <= len(dictionary): + dictionary = dictionary[: self._n_components, :] + else: + dictionary = np.concatenate( + ( + dictionary, + np.zeros( + (self._n_components - len(dictionary), dictionary.shape[1]), + dtype=dictionary.dtype, + ), + ) + ) + + dictionary = check_array(dictionary, order="F", dtype=X.dtype, copy=False) + dictionary = np.require(dictionary, requirements="W") + + return dictionary + + def _update_inner_stats(self, X, code, batch_size, step): + """Update the inner stats inplace.""" + if step < batch_size - 1: + theta = (step + 1) * batch_size + else: + theta = batch_size**2 + step + 1 - batch_size + beta = (theta + 1 - batch_size) / (theta + 1) + + self._A *= beta + self._A += code.T @ code / batch_size + self._B *= beta + self._B += X.T @ code / batch_size + + def _minibatch_step(self, X, dictionary, random_state, step): + """Perform the update on the dictionary for one minibatch.""" + batch_size = X.shape[0] + + # Compute code for this batch + code = _sparse_encode( + X, + dictionary, + algorithm=self._fit_algorithm, + alpha=self.alpha, + n_jobs=self.n_jobs, + positive=self.positive_code, + max_iter=self.transform_max_iter, + verbose=self.verbose, + ) + + batch_cost = ( + 0.5 * ((X - code @ dictionary) ** 2).sum() + + self.alpha * np.sum(np.abs(code)) + ) / batch_size + + # Update inner stats + self._update_inner_stats(X, code, batch_size, step) + + # Update dictionary + _update_dict( + dictionary, + X, + code, + self._A, + self._B, + verbose=self.verbose, + random_state=random_state, + positive=self.positive_dict, + ) + + return batch_cost + + def _check_convergence( + self, X, batch_cost, new_dict, old_dict, n_samples, step, n_steps + ): + """Helper function to encapsulate the early stopping logic. + + Early stopping is based on two factors: + - A small change of the dictionary between two minibatch updates. This is + controlled by the tol parameter. + - No more improvement on a smoothed estimate of the objective function for a + a certain number of consecutive minibatch updates. This is controlled by + the max_no_improvement parameter. + """ + batch_size = X.shape[0] + + # counts steps starting from 1 for user friendly verbose mode. 
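+        # Sketch of the smoothing applied below (the numbers are illustrative
+        # only): with alpha = batch_size / (n_samples + 1), e.g. 256 / 10001
+        # ~= 0.026, the update ewa_cost <- (1 - alpha) * ewa_cost + alpha *
+        # batch_cost means that roughly the last n_samples / batch_size
+        # mini-batches dominate the smoothed cost monitored by
+        # max_no_improvement.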
+ step = step + 1 + + # Ignore 100 first steps or 1 epoch to avoid initializing the ewa_cost with a + # too bad value + if step <= min(100, n_samples / batch_size): + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch cost: {batch_cost}") + return False + + # Compute an Exponentially Weighted Average of the cost function to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_cost is None: + self._ewa_cost = batch_cost + else: + alpha = batch_size / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_cost = self._ewa_cost * (1 - alpha) + batch_cost * alpha + + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch cost: " + f"{batch_cost}, ewa cost: {self._ewa_cost}" + ) + + # Early stopping based on change of dictionary + dict_diff = linalg.norm(new_dict - old_dict) / self._n_components + if self.tol > 0 and dict_diff <= self.tol: + if self.verbose: + print(f"Converged (small dictionary change) at step {step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # cost function + if self._ewa_cost_min is None or self._ewa_cost < self._ewa_cost_min: + self._no_improvement = 0 + self._ewa_cost_min = self._ewa_cost + else: + self._no_improvement += 1 + + if ( + self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement + ): + if self.verbose: + print( + "Converged (lack of improvement in objective function) " + f"at step {step}/{n_steps}" + ) + return True + + return False + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. 
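+
+        Notes
+        -----
+        Fitting cycles over (optionally shuffled) mini-batches for at most
+        ``max_iter`` passes over the data, stopping earlier when the
+        dictionary change between two updates falls below ``tol`` or when the
+        smoothed objective has not improved for ``max_no_improvement``
+        consecutive mini-batches.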
+ """ + X = validate_data( + self, X, dtype=[np.float64, np.float32], order="C", copy=False + ) + + self._check_params(X) + self._random_state = check_random_state(self.random_state) + + dictionary = self._initialize_dict(X, self._random_state) + old_dict = dictionary.copy() + + if self.shuffle: + X_train = X.copy() + self._random_state.shuffle(X_train) + else: + X_train = X + + n_samples, n_features = X_train.shape + + if self.verbose: + print("[dict_learning]") + + # Inner stats + self._A = np.zeros( + (self._n_components, self._n_components), dtype=X_train.dtype + ) + self._B = np.zeros((n_features, self._n_components), dtype=X_train.dtype) + + # Attributes to monitor the convergence + self._ewa_cost = None + self._ewa_cost_min = None + self._no_improvement = 0 + + batches = gen_batches(n_samples, self._batch_size) + batches = itertools.cycle(batches) + n_steps_per_iter = int(np.ceil(n_samples / self._batch_size)) + n_steps = self.max_iter * n_steps_per_iter + + i = -1 # to allow max_iter = 0 + + for i, batch in zip(range(n_steps), batches): + X_batch = X_train[batch] + + batch_cost = self._minibatch_step( + X_batch, dictionary, self._random_state, i + ) + + if self._check_convergence( + X_batch, batch_cost, dictionary, old_dict, n_samples, i, n_steps + ): + break + + # XXX callback param added for backward compat in #18975 but a common + # unified callback API should be preferred + if self.callback is not None: + self.callback(locals()) + + old_dict[:] = dictionary + + self.n_steps_ = i + 1 + self.n_iter_ = np.ceil(self.n_steps_ / n_steps_per_iter) + self.components_ = dictionary + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Update the model using the data in X as a mini-batch. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Return the instance itself. + """ + has_components = hasattr(self, "components_") + + X = validate_data( + self, X, dtype=[np.float64, np.float32], order="C", reset=not has_components + ) + + if not has_components: + # This instance has not been fitted yet (fit or partial_fit) + self._check_params(X) + self._random_state = check_random_state(self.random_state) + + dictionary = self._initialize_dict(X, self._random_state) + + self.n_steps_ = 0 + + self._A = np.zeros((self._n_components, self._n_components), dtype=X.dtype) + self._B = np.zeros((X.shape[1], self._n_components), dtype=X.dtype) + else: + dictionary = self.components_ + + self._minibatch_step(X, dictionary, self._random_state, self.n_steps_) + + self.components_ = dictionary + self.n_steps_ += 1 + + return self + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_factor_analysis.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_factor_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..d6d5e72a5b7d3a3b032a1465de639f60ebc58d7f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_factor_analysis.py @@ -0,0 +1,457 @@ +"""Factor Analysis. 
+ +A latent linear variable model. + +FactorAnalysis is similar to probabilistic PCA implemented by PCA.score +While PCA assumes Gaussian noise with the same variance for each +feature, the FactorAnalysis model assumes different variances for +each of them. + +This implementation is based on David Barber's Book, +Bayesian Reasoning and Machine Learning, +http://www.cs.ucl.ac.uk/staff/d.barber/brml, +Algorithm 21.1 +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import log, sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import linalg + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_svd, fast_logdet, squared_norm +from ..utils.validation import check_is_fitted, validate_data + + +class FactorAnalysis(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Factor Analysis (FA). + + A simple linear generative model with Gaussian latent variables. + + The observations are assumed to be caused by a linear transformation of + lower dimensional latent factors and added Gaussian noise. + Without loss of generality the factors are distributed according to a + Gaussian with zero mean and unit covariance. The noise is also zero mean + and has an arbitrary diagonal covariance matrix. + + If we would restrict the model further, by assuming that the Gaussian + noise is even isotropic (all diagonal entries are the same) we would obtain + :class:`PCA`. + + FactorAnalysis performs a maximum likelihood estimate of the so-called + `loading` matrix, the transformation of the latent variables to the + observed ones, using SVD based approach. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.13 + + Parameters + ---------- + n_components : int, default=None + Dimensionality of latent space, the number of components + of ``X`` that are obtained after ``transform``. + If None, n_components is set to the number of features. + + tol : float, default=1e-2 + Stopping tolerance for log-likelihood increase. + + copy : bool, default=True + Whether to make a copy of X. If ``False``, the input X gets overwritten + during fitting. + + max_iter : int, default=1000 + Maximum number of iterations. + + noise_variance_init : array-like of shape (n_features,), default=None + The initial guess of the noise variance for each feature. + If None, it defaults to np.ones(n_features). + + svd_method : {'lapack', 'randomized'}, default='randomized' + Which SVD method to use. If 'lapack' use standard SVD from + scipy.linalg, if 'randomized' use fast ``randomized_svd`` function. + Defaults to 'randomized'. For most applications 'randomized' will + be sufficiently precise while providing significant speed gains. + Accuracy can also be improved by setting higher values for + `iterated_power`. If this is not sufficient, for maximum precision + you should choose 'lapack'. + + iterated_power : int, default=3 + Number of iterations for the power method. 3 by default. Only used + if ``svd_method`` equals 'randomized'. + + rotation : {'varimax', 'quartimax'}, default=None + If not None, apply the indicated rotation. Currently, varimax and + quartimax are implemented. See + `"The varimax criterion for analytic rotation in factor analysis" + `_ + H. F. Kaiser, 1958. + + .. 
versionadded:: 0.24 + + random_state : int or RandomState instance, default=0 + Only used when ``svd_method`` equals 'randomized'. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Components with maximum variance. + + loglike_ : list of shape (n_iterations,) + The log likelihood at each iteration. + + noise_variance_ : ndarray of shape (n_features,) + The estimated noise variance for each feature. + + n_iter_ : int + Number of iterations run. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PCA: Principal component analysis is also a latent linear variable model + which however assumes equal noise variance for each feature. + This extra assumption makes probabilistic PCA faster as it can be + computed in closed form. + FastICA: Independent component analysis, a latent variable model with + non-Gaussian latent variables. + + References + ---------- + - David Barber, Bayesian Reasoning and Machine Learning, + Algorithm 21.1. + + - Christopher M. Bishop: Pattern Recognition and Machine Learning, + Chapter 12.2.4. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import FactorAnalysis + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = FactorAnalysis(n_components=7, random_state=0) + >>> X_transformed = transformer.fit_transform(X) + >>> X_transformed.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 0, None, closed="left"), None], + "tol": [Interval(Real, 0.0, None, closed="left")], + "copy": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "noise_variance_init": ["array-like", None], + "svd_method": [StrOptions({"randomized", "lapack"})], + "iterated_power": [Interval(Integral, 0, None, closed="left")], + "rotation": [StrOptions({"varimax", "quartimax"}), None], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + tol=1e-2, + copy=True, + max_iter=1000, + noise_variance_init=None, + svd_method="randomized", + iterated_power=3, + rotation=None, + random_state=0, + ): + self.n_components = n_components + self.copy = copy + self.tol = tol + self.max_iter = max_iter + self.svd_method = svd_method + + self.noise_variance_init = noise_variance_init + self.iterated_power = iterated_power + self.random_state = random_state + self.rotation = rotation + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the FactorAnalysis model to X using SVD based approach. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : Ignored + Ignored parameter. + + Returns + ------- + self : object + FactorAnalysis class instance. 
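+
+        Notes
+        -----
+        Each iteration rescales the centered data by the current per-feature
+        noise estimate, takes a truncated SVD (``lapack`` or ``randomized``
+        according to ``svd_method``), updates the loading matrix and the noise
+        variances, and stops once the log-likelihood gain falls below ``tol``
+        or ``max_iter`` is reached (a ``ConvergenceWarning`` is raised in the
+        latter case).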
+ """ + X = validate_data( + self, X, copy=self.copy, dtype=np.float64, force_writeable=True + ) + + n_samples, n_features = X.shape + n_components = self.n_components + if n_components is None: + n_components = n_features + + self.mean_ = np.mean(X, axis=0) + X -= self.mean_ + + # some constant terms + nsqrt = sqrt(n_samples) + llconst = n_features * log(2.0 * np.pi) + n_components + var = np.var(X, axis=0) + + if self.noise_variance_init is None: + psi = np.ones(n_features, dtype=X.dtype) + else: + if len(self.noise_variance_init) != n_features: + raise ValueError( + "noise_variance_init dimension does not " + "with number of features : %d != %d" + % (len(self.noise_variance_init), n_features) + ) + psi = np.array(self.noise_variance_init) + + loglike = [] + old_ll = -np.inf + SMALL = 1e-12 + + # we'll modify svd outputs to return unexplained variance + # to allow for unified computation of loglikelihood + if self.svd_method == "lapack": + + def my_svd(X): + _, s, Vt = linalg.svd(X, full_matrices=False, check_finite=False) + return ( + s[:n_components], + Vt[:n_components], + squared_norm(s[n_components:]), + ) + + else: # svd_method == "randomized" + random_state = check_random_state(self.random_state) + + def my_svd(X): + _, s, Vt = _randomized_svd( + X, + n_components, + random_state=random_state, + n_iter=self.iterated_power, + ) + return s, Vt, squared_norm(X) - squared_norm(s) + + for i in range(self.max_iter): + # SMALL helps numerics + sqrt_psi = np.sqrt(psi) + SMALL + s, Vt, unexp_var = my_svd(X / (sqrt_psi * nsqrt)) + s **= 2 + # Use 'maximum' here to avoid sqrt problems. + W = np.sqrt(np.maximum(s - 1.0, 0.0))[:, np.newaxis] * Vt + del Vt + W *= sqrt_psi + + # loglikelihood + ll = llconst + np.sum(np.log(s)) + ll += unexp_var + np.sum(np.log(psi)) + ll *= -n_samples / 2.0 + loglike.append(ll) + if (ll - old_ll) < self.tol: + break + old_ll = ll + + psi = np.maximum(var - np.sum(W**2, axis=0), SMALL) + else: + warnings.warn( + "FactorAnalysis did not converge." + " You might want" + " to increase the number of iterations.", + ConvergenceWarning, + ) + + self.components_ = W + if self.rotation is not None: + self.components_ = self._rotate(W) + self.noise_variance_ = psi + self.loglike_ = loglike + self.n_iter_ = i + 1 + return self + + def transform(self, X): + """Apply dimensionality reduction to X using the model. + + Compute the expected mean of the latent variables. + See Barber, 21.2.33 (or Bishop, 12.66). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + The latent variables of X. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + Ih = np.eye(len(self.components_)) + + X_transformed = X - self.mean_ + + Wpsi = self.components_ / self.noise_variance_ + cov_z = linalg.inv(Ih + np.dot(Wpsi, self.components_.T)) + tmp = np.dot(X_transformed, Wpsi.T) + X_transformed = np.dot(tmp, cov_z) + + return X_transformed + + def get_covariance(self): + """Compute data covariance with the FactorAnalysis model. + + ``cov = components_.T * components_ + diag(noise_variance)`` + + Returns + ------- + cov : ndarray of shape (n_features, n_features) + Estimated covariance of data. + """ + check_is_fitted(self) + + cov = np.dot(self.components_.T, self.components_) + cov.flat[:: len(cov) + 1] += self.noise_variance_ # modify diag inplace + return cov + + def get_precision(self): + """Compute data precision matrix with the FactorAnalysis model. 
+ + Returns + ------- + precision : ndarray of shape (n_features, n_features) + Estimated precision of data. + """ + check_is_fitted(self) + + n_features = self.components_.shape[1] + + # handle corner cases first + if self.n_components == 0: + return np.diag(1.0 / self.noise_variance_) + if self.n_components == n_features: + return linalg.inv(self.get_covariance()) + + # Get precision using matrix inversion lemma + components_ = self.components_ + precision = np.dot(components_ / self.noise_variance_, components_.T) + precision.flat[:: len(precision) + 1] += 1.0 + precision = np.dot(components_.T, np.dot(linalg.inv(precision), components_)) + precision /= self.noise_variance_[:, np.newaxis] + precision /= -self.noise_variance_[np.newaxis, :] + precision.flat[:: len(precision) + 1] += 1.0 / self.noise_variance_ + return precision + + def score_samples(self, X): + """Compute the log-likelihood of each sample. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data. + + Returns + ------- + ll : ndarray of shape (n_samples,) + Log-likelihood of each sample under the current model. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + Xr = X - self.mean_ + precision = self.get_precision() + n_features = X.shape[1] + log_like = -0.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1) + log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision)) + return log_like + + def score(self, X, y=None): + """Compute the average log-likelihood of the samples. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data. + + y : Ignored + Ignored parameter. + + Returns + ------- + ll : float + Average log-likelihood of the samples under the current model. + """ + return np.mean(self.score_samples(X)) + + def _rotate(self, components, n_components=None, tol=1e-6): + "Rotate the factor analysis solution." + # note that tol is not exposed + return _ortho_rotation(components.T, method=self.rotation, tol=tol)[ + : self.n_components + ] + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + +def _ortho_rotation(components, method="varimax", tol=1e-6, max_iter=100): + """Return rotated components.""" + nrow, ncol = components.shape + rotation_matrix = np.eye(ncol) + var = 0 + + for _ in range(max_iter): + comp_rot = np.dot(components, rotation_matrix) + if method == "varimax": + tmp = comp_rot * np.transpose((comp_rot**2).sum(axis=0) / nrow) + elif method == "quartimax": + tmp = 0 + u, s, v = np.linalg.svd(np.dot(components.T, comp_rot**3 - tmp)) + rotation_matrix = np.dot(u, v) + var_new = np.sum(s) + if var != 0 and var_new < var * (1 + tol): + break + var = var_new + + return np.dot(components, rotation_matrix).T diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_fastica.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_fastica.py new file mode 100644 index 0000000000000000000000000000000000000000..efda7bfca56b60f361d6bafa1edf0d66effe3ef6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_fastica.py @@ -0,0 +1,804 @@ +""" +Python implementation of the fast ICA algorithms. + +Reference: Tables 8.3 and 8.4 page 196 in the book: +Independent Component Analysis, by Hyvarinen et al. 
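+
+A rough NumPy sketch of the parallel fixed-point update implemented in this
+module (purely illustrative, with synthetic data that is centered and whitened
+first; the real code below adds tolerance checks and more careful bookkeeping):
+
+>>> import numpy as np
+>>> rng = np.random.RandomState(0)
+>>> S = rng.uniform(-1, 1, size=(2, 1000))            # independent non-Gaussian sources
+>>> X = np.array([[1.0, 0.5], [0.5, 1.0]]) @ S        # observed mixtures
+>>> X -= X.mean(axis=1, keepdims=True)
+>>> d, E = np.linalg.eigh(np.cov(X))
+>>> X_white = (E / np.sqrt(d)).T @ X                  # whiten to unit covariance
+>>> W = np.linalg.qr(rng.standard_normal((2, 2)))[0]  # random orthonormal start
+>>> for _ in range(50):
+...     gwtx = np.tanh(W @ X_white)                   # logcosh non-linearity
+...     g_wtx = (1.0 - gwtx ** 2).mean(axis=1)        # mean of its derivative
+...     W1 = gwtx @ X_white.T / X_white.shape[1] - g_wtx[:, np.newaxis] * W
+...     s, u = np.linalg.eigh(W1 @ W1.T)              # symmetric decorrelation
+...     W = (u * (1.0 / np.sqrt(s))) @ u.T @ W1
+>>> assert np.allclose(W @ W.T, np.eye(2))            # unmixing matrix stays orthonormal
+
+``W @ X_white`` then approximates the sources up to sign and permutation.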
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import linalg + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import as_float_array, check_array, check_random_state +from ..utils._param_validation import Interval, Options, StrOptions, validate_params +from ..utils.validation import check_is_fitted, validate_data + +__all__ = ["FastICA", "fastica"] + + +def _gs_decorrelation(w, W, j): + """ + Orthonormalize w wrt the first j rows of W. + + Parameters + ---------- + w : ndarray of shape (n,) + Array to be orthogonalized + + W : ndarray of shape (p, n) + Null space definition + + j : int < p + The no of (from the first) rows of Null space W wrt which w is + orthogonalized. + + Notes + ----- + Assumes that W is orthogonal + w changed in place + """ + w -= np.linalg.multi_dot([w, W[:j].T, W[:j]]) + return w + + +def _sym_decorrelation(W): + """Symmetric decorrelation + i.e. W <- (W * W.T) ^{-1/2} * W + """ + s, u = linalg.eigh(np.dot(W, W.T)) + # Avoid sqrt of negative values because of rounding errors. Note that + # np.sqrt(tiny) is larger than tiny and therefore this clipping also + # prevents division by zero in the next step. + s = np.clip(s, a_min=np.finfo(W.dtype).tiny, a_max=None) + + # u (resp. s) contains the eigenvectors (resp. square roots of + # the eigenvalues) of W * W.T + return np.linalg.multi_dot([u * (1.0 / np.sqrt(s)), u.T, W]) + + +def _ica_def(X, tol, g, fun_args, max_iter, w_init): + """Deflationary FastICA using fun approx to neg-entropy function + + Used internally by FastICA. + """ + + n_components = w_init.shape[0] + W = np.zeros((n_components, n_components), dtype=X.dtype) + n_iter = [] + + # j is the index of the extracted component + for j in range(n_components): + w = w_init[j, :].copy() + w /= np.sqrt((w**2).sum()) + + for i in range(max_iter): + gwtx, g_wtx = g(np.dot(w.T, X), fun_args) + + w1 = (X * gwtx).mean(axis=1) - g_wtx.mean() * w + + _gs_decorrelation(w1, W, j) + + w1 /= np.sqrt((w1**2).sum()) + + lim = np.abs(np.abs((w1 * w).sum()) - 1) + w = w1 + if lim < tol: + break + + n_iter.append(i + 1) + W[j, :] = w + + return W, max(n_iter) + + +def _ica_par(X, tol, g, fun_args, max_iter, w_init): + """Parallel FastICA. + + Used internally by FastICA --main loop + + """ + W = _sym_decorrelation(w_init) + del w_init + p_ = float(X.shape[1]) + for ii in range(max_iter): + gwtx, g_wtx = g(np.dot(W, X), fun_args) + W1 = _sym_decorrelation(np.dot(gwtx, X.T) / p_ - g_wtx[:, np.newaxis] * W) + del gwtx, g_wtx + # builtin max, abs are faster than numpy counter parts. + # np.einsum allows having the lowest memory footprint. + # It is faster than np.diag(np.dot(W1, W.T)). + lim = max(abs(abs(np.einsum("ij,ij->i", W1, W)) - 1)) + W = W1 + if lim < tol: + break + else: + warnings.warn( + ( + "FastICA did not converge. Consider increasing " + "tolerance or the maximum number of iterations." + ), + ConvergenceWarning, + ) + + return W, ii + 1 + + +# Some standard non-linear functions. +# XXX: these should be optimized, as they can be a bottleneck. +def _logcosh(x, fun_args=None): + alpha = fun_args.get("alpha", 1.0) # comment it out? 
+ + x *= alpha + gx = np.tanh(x, x) # apply the tanh inplace + g_x = np.empty(x.shape[0], dtype=x.dtype) + # XXX compute in chunks to avoid extra allocation + for i, gx_i in enumerate(gx): # please don't vectorize. + g_x[i] = (alpha * (1 - gx_i**2)).mean() + return gx, g_x + + +def _exp(x, fun_args): + exp = np.exp(-(x**2) / 2) + gx = x * exp + g_x = (1 - x**2) * exp + return gx, g_x.mean(axis=-1) + + +def _cube(x, fun_args): + return x**3, (3 * x**2).mean(axis=-1) + + +@validate_params( + { + "X": ["array-like"], + "return_X_mean": ["boolean"], + "compute_sources": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def fastica( + X, + n_components=None, + *, + algorithm="parallel", + whiten="unit-variance", + fun="logcosh", + fun_args=None, + max_iter=200, + tol=1e-04, + w_init=None, + whiten_solver="svd", + random_state=None, + return_X_mean=False, + compute_sources=True, + return_n_iter=False, +): + """Perform Fast Independent Component Analysis. + + The implementation is based on [1]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + n_components : int, default=None + Number of components to use. If None is passed, all are used. + + algorithm : {'parallel', 'deflation'}, default='parallel' + Specify which algorithm to use for FastICA. + + whiten : str or bool, default='unit-variance' + Specify the whitening strategy to use. + + - If 'arbitrary-variance', a whitening with variance + arbitrary is used. + - If 'unit-variance', the whitening matrix is rescaled to ensure that + each recovered source has unit variance. + - If False, the data is already considered to be whitened, and no + whitening is performed. + + .. versionchanged:: 1.3 + The default value of `whiten` changed to 'unit-variance' in 1.3. + + fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh' + The functional form of the G function used in the + approximation to neg-entropy. Could be either 'logcosh', 'exp', + or 'cube'. + You can also provide your own function. It should return a tuple + containing the value of the function, and of its derivative, in the + point. The derivative should be averaged along its last dimension. + Example:: + + def my_g(x): + return x ** 3, (3 * x ** 2).mean(axis=-1) + + fun_args : dict, default=None + Arguments to send to the functional form. + If empty or None and if fun='logcosh', fun_args will take value + {'alpha' : 1.0}. + + max_iter : int, default=200 + Maximum number of iterations to perform. + + tol : float, default=1e-4 + A positive scalar giving the tolerance at which the + un-mixing matrix is considered to have converged. + + w_init : ndarray of shape (n_components, n_components), default=None + Initial un-mixing array. If `w_init=None`, then an array of values + drawn from a normal distribution is used. + + whiten_solver : {"eigh", "svd"}, default="svd" + The solver to use for whitening. + + - "svd" is more stable numerically if the problem is degenerate, and + often faster when `n_samples <= n_features`. + + - "eigh" is generally more memory efficient when + `n_samples >= n_features`, and can be faster when + `n_samples >= 50 * n_features`. + + .. versionadded:: 1.2 + + random_state : int, RandomState instance or None, default=None + Used to initialize ``w_init`` when not specified, with a + normal distribution. 
Pass an int, for reproducible results + across multiple function calls. + See :term:`Glossary `. + + return_X_mean : bool, default=False + If True, X_mean is returned too. + + compute_sources : bool, default=True + If False, sources are not computed, but only the rotation matrix. + This can save memory when working with big data. Defaults to True. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + K : ndarray of shape (n_components, n_features) or None + If whiten is 'True', K is the pre-whitening matrix that projects data + onto the first n_components principal components. If whiten is 'False', + K is 'None'. + + W : ndarray of shape (n_components, n_components) + The square matrix that unmixes the data after whitening. + The mixing matrix is the pseudo-inverse of matrix ``W K`` + if K is not None, else it is the inverse of W. + + S : ndarray of shape (n_samples, n_components) or None + Estimated source matrix. + + X_mean : ndarray of shape (n_features,) + The mean over features. Returned only if return_X_mean is True. + + n_iter : int + If the algorithm is "deflation", n_iter is the + maximum number of iterations run across all components. Else + they are just the number of iterations taken to converge. This is + returned only when return_n_iter is set to `True`. + + Notes + ----- + The data matrix X is considered to be a linear combination of + non-Gaussian (independent) components i.e. X = AS where columns of S + contain the independent components and A is a linear mixing + matrix. In short ICA attempts to `un-mix' the data by estimating an + un-mixing matrix W where ``S = W K X.`` + While FastICA was proposed to estimate as many sources + as features, it is possible to estimate less by setting + n_components < n_features. It this case K is not a square matrix + and the estimated A is the pseudo-inverse of ``W K``. + + This implementation was originally made for data of shape + [n_features, n_samples]. Now the input is transposed + before the algorithm is applied. This makes it slightly + faster for Fortran-ordered input. + + References + ---------- + .. [1] A. Hyvarinen and E. Oja, "Fast Independent Component Analysis", + Algorithms and Applications, Neural Networks, 13(4-5), 2000, + pp. 411-430. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import fastica + >>> X, _ = load_digits(return_X_y=True) + >>> K, W, S = fastica(X, n_components=7, random_state=0, whiten='unit-variance') + >>> K.shape + (7, 64) + >>> W.shape + (7, 7) + >>> S.shape + (1797, 7) + """ + est = FastICA( + n_components=n_components, + algorithm=algorithm, + whiten=whiten, + fun=fun, + fun_args=fun_args, + max_iter=max_iter, + tol=tol, + w_init=w_init, + whiten_solver=whiten_solver, + random_state=random_state, + ) + est._validate_params() + S = est._fit_transform(X, compute_sources=compute_sources) + + if est.whiten in ["unit-variance", "arbitrary-variance"]: + K = est.whitening_ + X_mean = est.mean_ + else: + K = None + X_mean = None + + returned_values = [K, est._unmixing, S] + if return_X_mean: + returned_values.append(X_mean) + if return_n_iter: + returned_values.append(est.n_iter_) + + return returned_values + + +class FastICA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """FastICA: a fast algorithm for Independent Component Analysis. + + The implementation is based on [1]_. + + Read more in the :ref:`User Guide `. 
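+
+    As an illustrative check of the functional interface documented above (the
+    data below is synthetic and not part of the upstream docstring), the
+    returned pieces satisfy ``S = (X - X_mean) @ (W K).T``:
+
+    >>> import numpy as np
+    >>> from sklearn.decomposition import fastica
+    >>> rng = np.random.RandomState(0)
+    >>> S_true = rng.uniform(-1, 1, size=(1000, 2))       # independent sources
+    >>> X = S_true @ np.array([[1.0, 0.5], [0.5, 1.0]])   # mixed observations
+    >>> K, W, S, X_mean = fastica(X, random_state=0, return_X_mean=True)
+    >>> S.shape
+    (1000, 2)
+    >>> assert np.allclose(S, (X - X_mean) @ (W @ K).T)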
+ + Parameters + ---------- + n_components : int, default=None + Number of components to use. If None is passed, all are used. + + algorithm : {'parallel', 'deflation'}, default='parallel' + Specify which algorithm to use for FastICA. + + whiten : str or bool, default='unit-variance' + Specify the whitening strategy to use. + + - If 'arbitrary-variance', a whitening with variance + arbitrary is used. + - If 'unit-variance', the whitening matrix is rescaled to ensure that + each recovered source has unit variance. + - If False, the data is already considered to be whitened, and no + whitening is performed. + + .. versionchanged:: 1.3 + The default value of `whiten` changed to 'unit-variance' in 1.3. + + fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh' + The functional form of the G function used in the + approximation to neg-entropy. Could be either 'logcosh', 'exp', + or 'cube'. + You can also provide your own function. It should return a tuple + containing the value of the function, and of its derivative, in the + point. The derivative should be averaged along its last dimension. + Example:: + + def my_g(x): + return x ** 3, (3 * x ** 2).mean(axis=-1) + + fun_args : dict, default=None + Arguments to send to the functional form. + If empty or None and if fun='logcosh', fun_args will take value + {'alpha' : 1.0}. + + max_iter : int, default=200 + Maximum number of iterations during fit. + + tol : float, default=1e-4 + A positive scalar giving the tolerance at which the + un-mixing matrix is considered to have converged. + + w_init : array-like of shape (n_components, n_components), default=None + Initial un-mixing array. If `w_init=None`, then an array of values + drawn from a normal distribution is used. + + whiten_solver : {"eigh", "svd"}, default="svd" + The solver to use for whitening. + + - "svd" is more stable numerically if the problem is degenerate, and + often faster when `n_samples <= n_features`. + + - "eigh" is generally more memory efficient when + `n_samples >= n_features`, and can be faster when + `n_samples >= 50 * n_features`. + + .. versionadded:: 1.2 + + random_state : int, RandomState instance or None, default=None + Used to initialize ``w_init`` when not specified, with a + normal distribution. Pass an int, for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The linear operator to apply to the data to get the independent + sources. This is equal to the unmixing matrix when ``whiten`` is + False, and equal to ``np.dot(unmixing_matrix, self.whitening_)`` when + ``whiten`` is True. + + mixing_ : ndarray of shape (n_features, n_components) + The pseudo-inverse of ``components_``. It is the linear operator + that maps independent sources to the data. + + mean_ : ndarray of shape(n_features,) + The mean over features. Only set if `self.whiten` is True. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + If the algorithm is "deflation", n_iter is the + maximum number of iterations run across all components. Else + they are just the number of iterations taken to converge. + + whitening_ : ndarray of shape (n_components, n_features) + Only set if whiten is 'True'. 
This is the pre-whitening matrix + that projects data onto the first `n_components` principal components. + + See Also + -------- + PCA : Principal component analysis (PCA). + IncrementalPCA : Incremental principal components analysis (IPCA). + KernelPCA : Kernel Principal component analysis (KPCA). + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + SparsePCA : Sparse Principal Components Analysis (SparsePCA). + + References + ---------- + .. [1] A. Hyvarinen and E. Oja, Independent Component Analysis: + Algorithms and Applications, Neural Networks, 13(4-5), 2000, + pp. 411-430. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import FastICA + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = FastICA(n_components=7, + ... random_state=0, + ... whiten='unit-variance') + >>> X_transformed = transformer.fit_transform(X) + >>> X_transformed.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "algorithm": [StrOptions({"parallel", "deflation"})], + "whiten": [ + StrOptions({"arbitrary-variance", "unit-variance"}), + Options(bool, {False}), + ], + "fun": [StrOptions({"logcosh", "exp", "cube"}), callable], + "fun_args": [dict, None], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "w_init": ["array-like", None], + "whiten_solver": [StrOptions({"eigh", "svd"})], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + algorithm="parallel", + whiten="unit-variance", + fun="logcosh", + fun_args=None, + max_iter=200, + tol=1e-4, + w_init=None, + whiten_solver="svd", + random_state=None, + ): + super().__init__() + self.n_components = n_components + self.algorithm = algorithm + self.whiten = whiten + self.fun = fun + self.fun_args = fun_args + self.max_iter = max_iter + self.tol = tol + self.w_init = w_init + self.whiten_solver = whiten_solver + self.random_state = random_state + + def _fit_transform(self, X, compute_sources=False): + """Fit the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + compute_sources : bool, default=False + If False, sources are not computes but only the rotation matrix. + This can save memory when working with big data. Defaults to False. + + Returns + ------- + S : ndarray of shape (n_samples, n_components) or None + Sources matrix. `None` if `compute_sources` is `False`. 
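+
+        As a small illustrative check (synthetic data, not part of the upstream
+        docstring), the fitted ``mixing_`` attribute is the pseudo-inverse of
+        ``components_``, as stated in the class docstring above:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import FastICA
+        >>> rng = np.random.RandomState(0)
+        >>> S = rng.uniform(-1, 1, size=(1000, 2))
+        >>> X = S @ np.array([[1.0, 0.5], [0.5, 1.0]])
+        >>> ica = FastICA(random_state=0).fit(X)
+        >>> assert np.allclose(ica.mixing_, np.linalg.pinv(ica.components_))
+        >>> ica.components_.shape
+        (2, 2)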
+ """ + XT = validate_data( + self, + X, + copy=self.whiten, + dtype=[np.float64, np.float32], + ensure_min_samples=2, + ).T + fun_args = {} if self.fun_args is None else self.fun_args + random_state = check_random_state(self.random_state) + + alpha = fun_args.get("alpha", 1.0) + if not 1 <= alpha <= 2: + raise ValueError("alpha must be in [1,2]") + + if self.fun == "logcosh": + g = _logcosh + elif self.fun == "exp": + g = _exp + elif self.fun == "cube": + g = _cube + elif callable(self.fun): + + def g(x, fun_args): + return self.fun(x, **fun_args) + + n_features, n_samples = XT.shape + n_components = self.n_components + if not self.whiten and n_components is not None: + n_components = None + warnings.warn("Ignoring n_components with whiten=False.") + + if n_components is None: + n_components = min(n_samples, n_features) + if n_components > min(n_samples, n_features): + n_components = min(n_samples, n_features) + warnings.warn( + "n_components is too large: it will be set to %s" % n_components + ) + + if self.whiten: + # Centering the features of X + X_mean = XT.mean(axis=-1) + XT -= X_mean[:, np.newaxis] + + # Whitening and preprocessing by PCA + if self.whiten_solver == "eigh": + # Faster when num_samples >> n_features + d, u = linalg.eigh(XT.dot(X)) + sort_indices = np.argsort(d)[::-1] + eps = np.finfo(d.dtype).eps * 10 + degenerate_idx = d < eps + if np.any(degenerate_idx): + warnings.warn( + "There are some small singular values, using " + "whiten_solver = 'svd' might lead to more " + "accurate results." + ) + d[degenerate_idx] = eps # For numerical issues + np.sqrt(d, out=d) + d, u = d[sort_indices], u[:, sort_indices] + elif self.whiten_solver == "svd": + u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] + + # Give consistent eigenvectors for both svd solvers + u *= np.sign(u[0]) + + K = (u / d).T[:n_components] # see (6.33) p.140 + del u, d + X1 = np.dot(K, XT) + # see (13.6) p.267 Here X1 is white and data + # in X has been projected onto a subspace by PCA + X1 *= np.sqrt(n_samples) + else: + # X must be casted to floats to avoid typing issues with numpy + # 2.0 and the line below + X1 = as_float_array(XT, copy=False) # copy has been taken care of + + w_init = self.w_init + if w_init is None: + w_init = np.asarray( + random_state.normal(size=(n_components, n_components)), dtype=X1.dtype + ) + + else: + w_init = np.asarray(w_init) + if w_init.shape != (n_components, n_components): + raise ValueError( + "w_init has invalid shape -- should be %(shape)s" + % {"shape": (n_components, n_components)} + ) + + kwargs = { + "tol": self.tol, + "g": g, + "fun_args": fun_args, + "max_iter": self.max_iter, + "w_init": w_init, + } + + if self.algorithm == "parallel": + W, n_iter = _ica_par(X1, **kwargs) + elif self.algorithm == "deflation": + W, n_iter = _ica_def(X1, **kwargs) + del X1 + + self.n_iter_ = n_iter + + if compute_sources: + if self.whiten: + S = np.linalg.multi_dot([W, K, XT]).T + else: + S = np.dot(W, XT).T + else: + S = None + + if self.whiten: + if self.whiten == "unit-variance": + if not compute_sources: + S = np.linalg.multi_dot([W, K, XT]).T + S_std = np.std(S, axis=0, keepdims=True) + S /= S_std + W /= S_std.T + + self.components_ = np.dot(W, K) + self.mean_ = X_mean + self.whitening_ = K + else: + self.components_ = W + + self.mixing_ = linalg.pinv(self.components_, check_finite=False) + self._unmixing = W + + return S + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit the model and recover the sources from 
X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Estimated sources obtained by transforming the data with the + estimated unmixing matrix. + """ + return self._fit_transform(X, compute_sources=True) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + self._fit_transform(X, compute_sources=False) + return self + + def transform(self, X, copy=True): + """Recover the sources from X (apply the unmixing matrix). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to transform, where `n_samples` is the number of samples + and `n_features` is the number of features. + + copy : bool, default=True + If False, data passed to fit can be overwritten. Defaults to True. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Estimated sources obtained by transforming the data with the + estimated unmixing matrix. + """ + check_is_fitted(self) + + X = validate_data( + self, + X, + copy=(copy and self.whiten), + dtype=[np.float64, np.float32], + reset=False, + ) + if self.whiten: + X -= self.mean_ + + return np.dot(X, self.components_.T) + + def inverse_transform(self, X, copy=True): + """Transform the sources back to the mixed data (apply mixing matrix). + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + Sources, where `n_samples` is the number of samples + and `n_components` is the number of components. + copy : bool, default=True + If False, data passed to fit are overwritten. Defaults to True. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Reconstructed data obtained with the mixing matrix. 
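+
+        As an illustrative sketch (synthetic data, not from the upstream
+        docstring): when no dimensionality is discarded, ``inverse_transform``
+        undoes ``transform`` up to floating point error:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import FastICA
+        >>> rng = np.random.RandomState(0)
+        >>> S = rng.uniform(-1, 1, size=(500, 2))
+        >>> X = S @ np.array([[2.0, 1.0], [1.0, 2.0]])
+        >>> ica = FastICA(random_state=0).fit(X)
+        >>> X_back = ica.inverse_transform(ica.transform(X))
+        >>> assert np.allclose(X_back, X)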
+ """ + check_is_fitted(self) + + X = check_array(X, copy=(copy and self.whiten), dtype=[np.float64, np.float32]) + X = np.dot(X, self.mixing_.T) + if self.whiten: + X += self.mean_ + + return X + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_incremental_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_incremental_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..da617ef8fa787402810e17a563ce3152b5e1da89 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_incremental_pca.py @@ -0,0 +1,426 @@ +"""Incremental Principal Components Analysis.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral + +import numpy as np +from scipy import linalg, sparse + +from sklearn.utils import metadata_routing + +from ..base import _fit_context +from ..utils import gen_batches +from ..utils._param_validation import Interval +from ..utils.extmath import _incremental_mean_and_var, svd_flip +from ..utils.validation import validate_data +from ._base import _BasePCA + + +class IncrementalPCA(_BasePCA): + """Incremental principal components analysis (IPCA). + + Linear dimensionality reduction using Singular Value Decomposition of + the data, keeping only the most significant singular vectors to + project the data to a lower dimensional space. The input data is centered + but not scaled for each feature before applying the SVD. + + Depending on the size of the input data, this algorithm can be much more + memory efficient than a PCA, and allows sparse input. + + This algorithm has constant memory complexity, on the order + of ``batch_size * n_features``, enabling use of np.memmap files without + loading the entire file into memory. For sparse matrices, the input + is converted to dense in batches (in order to be able to subtract the + mean) which avoids storing the entire dense matrix at any one time. + + The computational overhead of each SVD is + ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples + remain in memory at a time. There will be ``n_samples / batch_size`` SVD + computations to get the principal components, versus 1 large SVD of + complexity ``O(n_samples * n_features ** 2)`` for PCA. + + For a usage example, see + :ref:`sphx_glr_auto_examples_decomposition_plot_incremental_pca.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + n_components : int, default=None + Number of components to keep. If ``n_components`` is ``None``, + then ``n_components`` is set to ``min(n_samples, n_features)``. + + whiten : bool, default=False + When True (False by default) the ``components_`` vectors are divided + by ``n_samples`` times ``components_`` to ensure uncorrelated outputs + with unit component-wise variances. + + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometimes + improve the predictive accuracy of the downstream estimators by + making data respect some hard-wired assumptions. + + copy : bool, default=True + If False, X will be overwritten. ``copy=False`` can be used to + save memory but is unsafe for general use. 
+ + batch_size : int, default=None + The number of samples to use for each batch. Only used when calling + ``fit``. If ``batch_size`` is ``None``, then ``batch_size`` + is inferred from the data and set to ``5 * n_features``, to provide a + balance between approximation accuracy and memory consumption. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Principal axes in feature space, representing the directions of + maximum variance in the data. Equivalently, the right singular + vectors of the centered input data, parallel to its eigenvectors. + The components are sorted by decreasing ``explained_variance_``. + + explained_variance_ : ndarray of shape (n_components,) + Variance explained by each of the selected components. + + explained_variance_ratio_ : ndarray of shape (n_components,) + Percentage of variance explained by each of the selected components. + If all components are stored, the sum of explained variances is equal + to 1.0. + + singular_values_ : ndarray of shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, aggregate over calls to ``partial_fit``. + + var_ : ndarray of shape (n_features,) + Per-feature empirical variance, aggregate over calls to + ``partial_fit``. + + noise_variance_ : float + The estimated noise covariance following the Probabilistic PCA model + from Tipping and Bishop 1999. See "Pattern Recognition and + Machine Learning" by C. Bishop, 12.2.1 p. 574 or + http://www.miketipping.com/papers/met-mppca.pdf. + + n_components_ : int + The estimated number of components. Relevant when + ``n_components=None``. + + n_samples_seen_ : int + The number of samples processed by the estimator. Will be reset on + new calls to fit, but increments across ``partial_fit`` calls. + + batch_size_ : int + Inferred batch size from ``batch_size``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PCA : Principal component analysis (PCA). + KernelPCA : Kernel Principal component analysis (KPCA). + SparsePCA : Sparse Principal Components Analysis (SparsePCA). + TruncatedSVD : Dimensionality reduction using truncated SVD. + + Notes + ----- + Implements the incremental PCA model from: + *D. Ross, J. Lim, R. Lin, M. Yang, Incremental Learning for Robust Visual + Tracking, International Journal of Computer Vision, Volume 77, Issue 1-3, + pp. 125-141, May 2008.* + See https://www.cs.toronto.edu/~dross/ivt/RossLimLinYang_ijcv.pdf + + This model is an extension of the Sequential Karhunen-Loeve Transform from: + :doi:`A. Levy and M. Lindenbaum, Sequential Karhunen-Loeve Basis Extraction and + its Application to Images, IEEE Transactions on Image Processing, Volume 9, + Number 8, pp. 1371-1374, August 2000. <10.1109/83.855432>` + + We have specifically abstained from an optimization used by authors of both + papers, a QR decomposition used in specific situations to reduce the + algorithmic complexity of the SVD. The source for this technique is + *Matrix Computations, Third Edition, G. Holub and C. Van Loan, Chapter 5, + section 5.4.4, pp 252-253.*. 
This technique has been omitted because it is + advantageous only when decomposing a matrix with ``n_samples`` (rows) + >= 5/3 * ``n_features`` (columns), and hurts the readability of the + implemented algorithm. This would be a good opportunity for future + optimization, if it is deemed necessary. + + References + ---------- + D. Ross, J. Lim, R. Lin, M. Yang. Incremental Learning for Robust Visual + Tracking, International Journal of Computer Vision, Volume 77, + Issue 1-3, pp. 125-141, May 2008. + + G. Golub and C. Van Loan. Matrix Computations, Third Edition, Chapter 5, + Section 5.4.4, pp. 252-253. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import IncrementalPCA + >>> from scipy import sparse + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = IncrementalPCA(n_components=7, batch_size=200) + >>> # either partially fit on smaller batches of data + >>> transformer.partial_fit(X[:100, :]) + IncrementalPCA(batch_size=200, n_components=7) + >>> # or let the fit function itself divide the data into batches + >>> X_sparse = sparse.csr_matrix(X) + >>> X_transformed = transformer.fit_transform(X_sparse) + >>> X_transformed.shape + (1797, 7) + """ + + __metadata_request__partial_fit = {"check_input": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left"), None], + "whiten": ["boolean"], + "copy": ["boolean"], + "batch_size": [Interval(Integral, 1, None, closed="left"), None], + } + + def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None): + self.n_components = n_components + self.whiten = whiten + self.copy = copy + self.batch_size = batch_size + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model with X, using minibatches of size batch_size. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + self.components_ = None + self.n_samples_seen_ = 0 + self.mean_ = 0.0 + self.var_ = 0.0 + self.singular_values_ = None + self.explained_variance_ = None + self.explained_variance_ratio_ = None + self.noise_variance_ = None + + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "lil"], + copy=self.copy, + dtype=[np.float64, np.float32], + force_writeable=True, + ) + n_samples, n_features = X.shape + + if self.batch_size is None: + self.batch_size_ = 5 * n_features + else: + self.batch_size_ = self.batch_size + + for batch in gen_batches( + n_samples, self.batch_size_, min_batch_size=self.n_components or 0 + ): + X_batch = X[batch] + if sparse.issparse(X_batch): + X_batch = X_batch.toarray() + self.partial_fit(X_batch, check_input=False) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, check_input=True): + """Incremental fit with X. All of X is processed as a single batch. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + check_input : bool, default=True + Run check_array on X. 
+ + Returns + ------- + self : object + Returns the instance itself. + """ + first_pass = not hasattr(self, "components_") + + if check_input: + if sparse.issparse(X): + raise TypeError( + "IncrementalPCA.partial_fit does not support " + "sparse input. Either convert data to dense " + "or use IncrementalPCA.fit to do so in batches." + ) + X = validate_data( + self, + X, + copy=self.copy, + dtype=[np.float64, np.float32], + force_writeable=True, + reset=first_pass, + ) + n_samples, n_features = X.shape + if first_pass: + self.components_ = None + + if self.n_components is None: + if self.components_ is None: + self.n_components_ = min(n_samples, n_features) + else: + self.n_components_ = self.components_.shape[0] + elif not self.n_components <= n_features: + raise ValueError( + "n_components=%r invalid for n_features=%d, need " + "more rows than columns for IncrementalPCA " + "processing" % (self.n_components, n_features) + ) + elif self.n_components > n_samples and first_pass: + raise ValueError( + f"n_components={self.n_components} must be less or equal to " + f"the batch number of samples {n_samples} for the first " + "partial_fit call." + ) + else: + self.n_components_ = self.n_components + + if (self.components_ is not None) and ( + self.components_.shape[0] != self.n_components_ + ): + raise ValueError( + "Number of input features has changed from %i " + "to %i between calls to partial_fit! Try " + "setting n_components to a fixed value." + % (self.components_.shape[0], self.n_components_) + ) + + # This is the first partial_fit + if not hasattr(self, "n_samples_seen_"): + self.n_samples_seen_ = 0 + self.mean_ = 0.0 + self.var_ = 0.0 + + # Update stats - they are 0 if this is the first step + col_mean, col_var, n_total_samples = _incremental_mean_and_var( + X, + last_mean=self.mean_, + last_variance=self.var_, + last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]), + ) + n_total_samples = n_total_samples[0] + + # Whitening + if self.n_samples_seen_ == 0: + # If it is the first step, simply whiten X + X -= col_mean + else: + col_batch_mean = np.mean(X, axis=0) + X -= col_batch_mean + # Build matrix of combined previous basis and new data + mean_correction = np.sqrt( + (self.n_samples_seen_ / n_total_samples) * n_samples + ) * (self.mean_ - col_batch_mean) + X = np.vstack( + ( + self.singular_values_.reshape((-1, 1)) * self.components_, + X, + mean_correction, + ) + ) + + U, S, Vt = linalg.svd(X, full_matrices=False, check_finite=False) + U, Vt = svd_flip(U, Vt, u_based_decision=False) + explained_variance = S**2 / (n_total_samples - 1) + explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples) + + self.n_samples_seen_ = n_total_samples + self.components_ = Vt[: self.n_components_] + self.singular_values_ = S[: self.n_components_] + self.mean_ = col_mean + self.var_ = col_var + self.explained_variance_ = explained_variance[: self.n_components_] + self.explained_variance_ratio_ = explained_variance_ratio[: self.n_components_] + # we already checked `self.n_components <= n_samples` above + if self.n_components_ not in (n_samples, n_features): + self.noise_variance_ = explained_variance[self.n_components_ :].mean() + else: + self.noise_variance_ = 0.0 + return self + + def transform(self, X): + """Apply dimensionality reduction to X. + + X is projected on the first principal components previously extracted + from a training set, using minibatches of size batch_size if X is + sparse. 
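+
+        As an illustrative sketch (synthetic data, not from the upstream
+        docstring): feeding successive mini-batches to ``partial_fit`` gives
+        the same model as a single ``fit`` with a matching ``batch_size``,
+        since ``fit`` itself just loops over ``partial_fit``:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import IncrementalPCA
+        >>> rng = np.random.RandomState(0)
+        >>> X = rng.standard_normal((100, 10))
+        >>> ipca_batched = IncrementalPCA(n_components=3)
+        >>> for chunk in np.array_split(X, 5):        # five batches of 20 samples
+        ...     _ = ipca_batched.partial_fit(chunk)
+        >>> ipca_full = IncrementalPCA(n_components=3, batch_size=20).fit(X)
+        >>> assert np.allclose(ipca_batched.mean_, X.mean(axis=0))
+        >>> assert np.allclose(ipca_batched.components_, ipca_full.components_)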
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Projection of X in the first principal components. + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.decomposition import IncrementalPCA + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], + ... [1, 1], [2, 1], [3, 2]]) + >>> ipca = IncrementalPCA(n_components=2, batch_size=3) + >>> ipca.fit(X) + IncrementalPCA(batch_size=3, n_components=2) + >>> ipca.transform(X) # doctest: +SKIP + """ + if sparse.issparse(X): + n_samples = X.shape[0] + output = [] + for batch in gen_batches( + n_samples, self.batch_size_, min_batch_size=self.n_components or 0 + ): + output.append(super().transform(X[batch].toarray())) + return np.vstack(output) + else: + return super().transform(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Beware that fit accepts sparse data but partial_fit doesn't + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_kernel_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_kernel_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..cd862079a1682deed3705f42da5672af8ca10acb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_kernel_pca.py @@ -0,0 +1,579 @@ +"""Kernel Principal Components Analysis.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.linalg import eigh +from scipy.sparse.linalg import eigsh + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import NotFittedError +from ..metrics.pairwise import pairwise_kernels +from ..preprocessing import KernelCenterer +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_eigsh, svd_flip +from ..utils.validation import ( + _check_psd_eigenvalues, + check_is_fitted, + validate_data, +) + + +class KernelPCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Kernel Principal component analysis (KPCA). + + Non-linear dimensionality reduction through the use of kernels [1]_, see also + :ref:`metrics`. + + It uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD + or the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the + truncated SVD, depending on the shape of the input data and the number of + components to extract. It can also use a randomized truncated SVD by the + method proposed in [3]_, see `eigen_solver`. + + For a usage example and comparison between + Principal Components Analysis (PCA) and its kernelized version (KPCA), see + :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py`. + + For a usage example in denoising images using KPCA, see + :ref:`sphx_glr_auto_examples_applications_plot_digits_denoising.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of components. If None, all non-zero components are kept. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} \ + or callable, default='linear' + Kernel used for PCA. 
+ + gamma : float, default=None + Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other + kernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``. + + degree : float, default=3 + Degree for poly kernels. Ignored by other kernels. + + coef0 : float, default=1 + Independent term in poly and sigmoid kernels. + Ignored by other kernels. + + kernel_params : dict, default=None + Parameters (keyword arguments) and + values for kernel passed as callable object. + Ignored by other kernels. + + alpha : float, default=1.0 + Hyperparameter of the ridge regression that learns the + inverse transform (when fit_inverse_transform=True). + + fit_inverse_transform : bool, default=False + Learn the inverse transform for non-precomputed kernels + (i.e. learn to find the pre-image of a point). This method is based + on [2]_. + + eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, \ + default='auto' + Select eigensolver to use. If `n_components` is much + less than the number of training samples, randomized (or arpack to a + smaller extent) may be more efficient than the dense eigensolver. + Randomized SVD is performed according to the method of Halko et al + [3]_. + + auto : + the solver is selected by a default policy based on n_samples + (the number of training samples) and `n_components`: + if the number of components to extract is less than 10 (strict) and + the number of samples is more than 200 (strict), the 'arpack' + method is enabled. Otherwise the exact full eigenvalue + decomposition is computed and optionally truncated afterwards + ('dense' method). + dense : + run exact full eigenvalue decomposition calling the standard + LAPACK solver via `scipy.linalg.eigh`, and select the components + by postprocessing + arpack : + run SVD truncated to n_components calling ARPACK solver using + `scipy.sparse.linalg.eigsh`. It requires strictly + 0 < n_components < n_samples + randomized : + run randomized SVD by the method of Halko et al. [3]_. The current + implementation selects eigenvalues based on their module; therefore + using this method can lead to unexpected results if the kernel is + not positive semi-definite. See also [4]_. + + .. versionchanged:: 1.0 + `'randomized'` was added. + + tol : float, default=0 + Convergence tolerance for arpack. + If 0, optimal value will be chosen by arpack. + + max_iter : int, default=None + Maximum number of iterations for arpack. + If None, optimal value will be chosen by arpack. + + iterated_power : int >= 0, or 'auto', default='auto' + Number of iterations for the power method computed by + svd_solver == 'randomized'. When 'auto', it is set to 7 when + `n_components < 0.1 * min(X.shape)`, other it is set to 4. + + .. versionadded:: 1.0 + + remove_zero_eig : bool, default=False + If True, then all components with zero eigenvalues are removed, so + that the number of components in the output may be < n_components + (and sometimes even zero due to numerical instability). + When n_components is None, this parameter is ignored and components + with zero eigenvalues are removed regardless. + + random_state : int, RandomState instance or None, default=None + Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 0.18 + + copy_X : bool, default=True + If True, input X is copied and stored by the model in the `X_fit_` + attribute. 
If no further changes will be done to X, setting + `copy_X=False` saves memory by storing a reference. + + .. versionadded:: 0.18 + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.18 + + Attributes + ---------- + eigenvalues_ : ndarray of shape (n_components,) + Eigenvalues of the centered kernel matrix in decreasing order. + If `n_components` and `remove_zero_eig` are not set, + then all values are stored. + + eigenvectors_ : ndarray of shape (n_samples, n_components) + Eigenvectors of the centered kernel matrix. If `n_components` and + `remove_zero_eig` are not set, then all components are stored. + + dual_coef_ : ndarray of shape (n_samples, n_features) + Inverse transform matrix. Only available when + ``fit_inverse_transform`` is True. + + X_transformed_fit_ : ndarray of shape (n_samples, n_components) + Projection of the fitted data on the kernel principal components. + Only available when ``fit_inverse_transform`` is True. + + X_fit_ : ndarray of shape (n_samples, n_features) + The data used to fit the model. If `copy_X=False`, then `X_fit_` is + a reference. This attribute is used for the calls to transform. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + gamma_ : float + Kernel coefficient for rbf, poly and sigmoid kernels. When `gamma` + is explicitly provided, this is just the same as `gamma`. When `gamma` + is `None`, this is the actual value of kernel coefficient. + + .. versionadded:: 1.3 + + See Also + -------- + FastICA : A fast algorithm for Independent Component Analysis. + IncrementalPCA : Incremental Principal Component Analysis. + NMF : Non-Negative Matrix Factorization. + PCA : Principal Component Analysis. + SparsePCA : Sparse Principal Component Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + + References + ---------- + .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + "Kernel principal component analysis." + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + `_ + + .. [2] `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ + + .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. + "Finding structure with randomness: Probabilistic algorithms for + constructing approximate matrix decompositions." + SIAM review 53.2 (2011): 217-288. <0909.4061>` + + .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert. + "A randomized algorithm for the decomposition of matrices." + Applied and Computational Harmonic Analysis 30.1 (2011): 47-68. 
+ `_ + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.decomposition import KernelPCA + >>> X, _ = load_digits(return_X_y=True) + >>> transformer = KernelPCA(n_components=7, kernel='linear') + >>> X_transformed = transformer.fit_transform(X) + >>> X_transformed.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "kernel": [ + StrOptions({"linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"}), + callable, + ], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + ], + "degree": [Interval(Real, 0, None, closed="left")], + "coef0": [Interval(Real, None, None, closed="neither")], + "kernel_params": [dict, None], + "alpha": [Interval(Real, 0, None, closed="left")], + "fit_inverse_transform": ["boolean"], + "eigen_solver": [StrOptions({"auto", "dense", "arpack", "randomized"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "iterated_power": [ + Interval(Integral, 0, None, closed="left"), + StrOptions({"auto"}), + ], + "remove_zero_eig": ["boolean"], + "random_state": ["random_state"], + "copy_X": ["boolean"], + "n_jobs": [None, Integral], + } + + def __init__( + self, + n_components=None, + *, + kernel="linear", + gamma=None, + degree=3, + coef0=1, + kernel_params=None, + alpha=1.0, + fit_inverse_transform=False, + eigen_solver="auto", + tol=0, + max_iter=None, + iterated_power="auto", + remove_zero_eig=False, + random_state=None, + copy_X=True, + n_jobs=None, + ): + self.n_components = n_components + self.kernel = kernel + self.kernel_params = kernel_params + self.gamma = gamma + self.degree = degree + self.coef0 = coef0 + self.alpha = alpha + self.fit_inverse_transform = fit_inverse_transform + self.eigen_solver = eigen_solver + self.tol = tol + self.max_iter = max_iter + self.iterated_power = iterated_power + self.remove_zero_eig = remove_zero_eig + self.random_state = random_state + self.n_jobs = n_jobs + self.copy_X = copy_X + + def _get_kernel(self, X, Y=None): + if callable(self.kernel): + params = self.kernel_params or {} + else: + params = {"gamma": self.gamma_, "degree": self.degree, "coef0": self.coef0} + return pairwise_kernels( + X, Y, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **params + ) + + def _fit_transform_in_place(self, K): + """Fit's using kernel K""" + # center kernel in place + K = self._centerer.fit(K).transform(K, copy=False) + + # adjust n_components according to user inputs + if self.n_components is None: + n_components = K.shape[0] # use all dimensions + else: + n_components = min(K.shape[0], self.n_components) + + # compute eigenvectors + if self.eigen_solver == "auto": + if K.shape[0] > 200 and n_components < 10: + eigen_solver = "arpack" + else: + eigen_solver = "dense" + else: + eigen_solver = self.eigen_solver + + if eigen_solver == "dense": + # Note: subset_by_index specifies the indices of smallest/largest to return + self.eigenvalues_, self.eigenvectors_ = eigh( + K, subset_by_index=(K.shape[0] - n_components, K.shape[0] - 1) + ) + elif eigen_solver == "arpack": + v0 = _init_arpack_v0(K.shape[0], self.random_state) + self.eigenvalues_, self.eigenvectors_ = eigsh( + K, n_components, which="LA", tol=self.tol, maxiter=self.max_iter, v0=v0 + ) + elif eigen_solver == "randomized": + self.eigenvalues_, self.eigenvectors_ = _randomized_eigsh( + K, + n_components=n_components, + n_iter=self.iterated_power, + 
random_state=self.random_state, + selection="module", + ) + + # make sure that the eigenvalues are ok and fix numerical issues + self.eigenvalues_ = _check_psd_eigenvalues( + self.eigenvalues_, enable_warnings=False + ) + + # flip eigenvectors' sign to enforce deterministic output + self.eigenvectors_, _ = svd_flip(u=self.eigenvectors_, v=None) + + # sort eigenvectors in descending order + indices = self.eigenvalues_.argsort()[::-1] + self.eigenvalues_ = self.eigenvalues_[indices] + self.eigenvectors_ = self.eigenvectors_[:, indices] + + # remove eigenvectors with a zero eigenvalue (null space) if required + if self.remove_zero_eig or self.n_components is None: + self.eigenvectors_ = self.eigenvectors_[:, self.eigenvalues_ > 0] + self.eigenvalues_ = self.eigenvalues_[self.eigenvalues_ > 0] + + # Maintenance note on Eigenvectors normalization + # ---------------------------------------------- + # there is a link between + # the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)' + # if v is an eigenvector of K + # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' + # if u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)'Phi(X) + # + # At this stage our self.eigenvectors_ (the v) have norm 1, we need to scale + # them so that eigenvectors in kernel feature space (the u) have norm=1 + # instead + # + # We COULD scale them here: + # self.eigenvectors_ = self.eigenvectors_ / np.sqrt(self.eigenvalues_) + # + # But choose to perform that LATER when needed, in `fit()` and in + # `transform()`. + + return K + + def _fit_inverse_transform(self, X_transformed, X): + if hasattr(X, "tocsr"): + raise NotImplementedError( + "Inverse transform not implemented for sparse matrices!" + ) + + n_samples = X_transformed.shape[0] + K = self._get_kernel(X_transformed) + K.flat[:: n_samples + 1] += self.alpha + self.dual_coef_ = linalg.solve(K, X, assume_a="pos", overwrite_a=True) + self.X_transformed_fit_ = X_transformed + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + if self.fit_inverse_transform and self.kernel == "precomputed": + raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") + X = validate_data(self, X, accept_sparse="csr", copy=self.copy_X) + self.gamma_ = 1 / X.shape[1] if self.gamma is None else self.gamma + self._centerer = KernelCenterer().set_output(transform="default") + K = self._get_kernel(X) + # When kernel="precomputed", K is X but it's safe to perform in place operations + # on K because a copy was made before if requested by copy_X. + self._fit_transform_in_place(K) + + if self.fit_inverse_transform: + # no need to use the kernel to transform X, use shortcut expression + X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_) + + self._fit_inverse_transform(X_transformed, X) + + self.X_fit_ = X + return self + + def fit_transform(self, X, y=None, **params): + """Fit the model from data in X and transform X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. 
+ + y : Ignored + Not used, present for API consistency by convention. + + **params : kwargs + Parameters (keyword arguments) and values passed to + the fit_transform instance. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed values. + """ + self.fit(X, **params) + + # no need to use the kernel to transform X, use shortcut expression + X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_) + + if self.fit_inverse_transform: + self._fit_inverse_transform(X_transformed, X) + + return X_transformed + + def transform(self, X): + """Transform X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Projection of X in the first principal components, where `n_samples` + is the number of samples and `n_components` is the number of the components. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse="csr", reset=False) + + # Compute centered gram matrix between X and training data X_fit_ + K = self._centerer.transform(self._get_kernel(X, self.X_fit_)) + + # scale eigenvectors (properly account for null-space for dot product) + non_zeros = np.flatnonzero(self.eigenvalues_) + scaled_alphas = np.zeros_like(self.eigenvectors_) + scaled_alphas[:, non_zeros] = self.eigenvectors_[:, non_zeros] / np.sqrt( + self.eigenvalues_[non_zeros] + ) + + # Project with a scalar product between K and the scaled eigenvectors + return np.dot(K, scaled_alphas) + + def inverse_transform(self, X): + """Transform X back to original space. + + ``inverse_transform`` approximates the inverse transformation using + a learned pre-image. The pre-image is learned by kernel ridge + regression of the original data on their low-dimensional representation + vectors. + + .. note: + :meth:`~sklearn.decomposition.fit` internally uses a centered + kernel. As the centered kernel no longer contains the information + of the mean of kernel features, such information is not taken into + account in reconstruction. + + .. note:: + When users want to compute inverse transformation for 'linear' + kernel, it is recommended that they use + :class:`~sklearn.decomposition.PCA` instead. Unlike + :class:`~sklearn.decomposition.PCA`, + :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform`` + does not reconstruct the mean of data when 'linear' kernel is used + due to the use of centered kernel. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_components) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Original data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + References + ---------- + `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ + """ + if not self.fit_inverse_transform: + raise NotFittedError( + "The fit_inverse_transform parameter was not" + " set to True when instantiating and hence " + "the inverse transform is not available." 
+ ) + + K = self._get_kernel(X, self.X_transformed_fit_) + return np.dot(K, self.dual_coef_) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.pairwise = self.kernel == "precomputed" + return tags + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.eigenvalues_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_lda.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_lda.py new file mode 100644 index 0000000000000000000000000000000000000000..94b1413745a2214572a06f7bfceaeffd5403ba48 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_lda.py @@ -0,0 +1,959 @@ +""" + +============================================================= +Online Latent Dirichlet Allocation with variational inference +============================================================= + +This implementation is modified from Matthew D. Hoffman's onlineldavb code +Link: https://github.com/blei-lab/onlineldavb +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from joblib import effective_n_jobs +from scipy.special import gammaln, logsumexp + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..utils import check_random_state, gen_batches, gen_even_slices +from ..utils._param_validation import Interval, StrOptions +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_is_fitted, check_non_negative, validate_data +from ._online_lda_fast import ( + _dirichlet_expectation_1d as cy_dirichlet_expectation_1d, +) +from ._online_lda_fast import ( + _dirichlet_expectation_2d, +) +from ._online_lda_fast import ( + mean_change as cy_mean_change, +) + +EPS = np.finfo(float).eps + + +def _update_doc_distribution( + X, + exp_topic_word_distr, + doc_topic_prior, + max_doc_update_iter, + mean_change_tol, + cal_sstats, + random_state, +): + """E-step: update document-topic distribution. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + exp_topic_word_distr : ndarray of shape (n_topics, n_features) + Exponential value of expectation of log topic word distribution. + In the literature, this is `exp(E[log(beta)])`. + + doc_topic_prior : float + Prior of document topic distribution `theta`. + + max_doc_update_iter : int + Max number of iterations for updating document topic distribution in + the E-step. + + mean_change_tol : float + Stopping tolerance for updating document topic distribution in E-step. + + cal_sstats : bool + Parameter that indicate to calculate sufficient statistics or not. + Set `cal_sstats` to `True` when we need to run M-step. + + random_state : RandomState instance or None + Parameter that indicate how to initialize document topic distribution. + Set `random_state` to None will initialize document topic distribution + to a constant number. + + Returns + ------- + (doc_topic_distr, suff_stats) : + `doc_topic_distr` is unnormalized topic distribution for each document. + In the literature, this is `gamma`. we can calculate `E[log(theta)]` + from it. + `suff_stats` is expected sufficient statistics for the M-step. + When `cal_sstats == False`, this will be None. 
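For intuition, a minimal NumPy sketch (made-up values) of how the unnormalized `gamma` returned by this E-step maps to the per-document topic proportions that `LatentDirichletAllocation.transform(..., normalize=True)` reports, i.e. a simple row normalization:

    import numpy as np

    gamma = np.array([[2.0, 6.0, 2.0],    # hypothetical unnormalized doc-topic weights
                      [1.0, 1.0, 8.0]])
    theta = gamma / gamma.sum(axis=1, keepdims=True)  # rows now sum to 1
    # theta -> [[0.2, 0.6, 0.2],
    #           [0.1, 0.1, 0.8]]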
+ + """ + is_sparse_x = sp.issparse(X) + n_samples, n_features = X.shape + n_topics = exp_topic_word_distr.shape[0] + + if random_state: + doc_topic_distr = random_state.gamma(100.0, 0.01, (n_samples, n_topics)).astype( + X.dtype, copy=False + ) + else: + doc_topic_distr = np.ones((n_samples, n_topics), dtype=X.dtype) + + # In the literature, this is `exp(E[log(theta)])` + exp_doc_topic = np.exp(_dirichlet_expectation_2d(doc_topic_distr)) + + # diff on `component_` (only calculate it when `cal_diff` is True) + suff_stats = ( + np.zeros(exp_topic_word_distr.shape, dtype=X.dtype) if cal_sstats else None + ) + + if is_sparse_x: + X_data = X.data + X_indices = X.indices + X_indptr = X.indptr + + # These cython functions are called in a nested loop on usually very small arrays + # (length=n_topics). In that case, finding the appropriate signature of the + # fused-typed function can be more costly than its execution, hence the dispatch + # is done outside of the loop. + ctype = "float" if X.dtype == np.float32 else "double" + mean_change = cy_mean_change[ctype] + dirichlet_expectation_1d = cy_dirichlet_expectation_1d[ctype] + eps = np.finfo(X.dtype).eps + + for idx_d in range(n_samples): + if is_sparse_x: + ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]] + cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]] + else: + ids = np.nonzero(X[idx_d, :])[0] + cnts = X[idx_d, ids] + + doc_topic_d = doc_topic_distr[idx_d, :] + # The next one is a copy, since the inner loop overwrites it. + exp_doc_topic_d = exp_doc_topic[idx_d, :].copy() + exp_topic_word_d = exp_topic_word_distr[:, ids] + + # Iterate between `doc_topic_d` and `norm_phi` until convergence + for _ in range(0, max_doc_update_iter): + last_d = doc_topic_d + + # The optimal phi_{dwk} is proportional to + # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]). + norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps + + doc_topic_d = exp_doc_topic_d * np.dot(cnts / norm_phi, exp_topic_word_d.T) + # Note: adds doc_topic_prior to doc_topic_d, in-place. + dirichlet_expectation_1d(doc_topic_d, doc_topic_prior, exp_doc_topic_d) + + if mean_change(last_d, doc_topic_d) < mean_change_tol: + break + doc_topic_distr[idx_d, :] = doc_topic_d + + # Contribution of document d to the expected sufficient + # statistics for the M step. + if cal_sstats: + norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps + suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi) + + return (doc_topic_distr, suff_stats) + + +class LatentDirichletAllocation( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): + """Latent Dirichlet Allocation with online variational Bayes algorithm. + + The implementation is based on [1]_ and [2]_. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=10 + Number of topics. + + .. versionchanged:: 0.19 + ``n_topics`` was renamed to ``n_components`` + + doc_topic_prior : float, default=None + Prior of document topic distribution `theta`. If the value is None, + defaults to `1 / n_components`. + In [1]_, this is called `alpha`. + + topic_word_prior : float, default=None + Prior of topic word distribution `beta`. If the value is None, defaults + to `1 / n_components`. + In [1]_, this is called `eta`. + + learning_method : {'batch', 'online'}, default='batch' + Method used to update `_component`. Only used in :meth:`fit` method. + In general, if the data size is large, the online update will be much + faster than the batch update. 
+ + Valid options: + + - 'batch': Batch variational Bayes method. Use all training data in each EM + update. Old `components_` will be overwritten in each iteration. + - 'online': Online variational Bayes method. In each EM update, use mini-batch + of training data to update the ``components_`` variable incrementally. The + learning rate is controlled by the ``learning_decay`` and the + ``learning_offset`` parameters. + + .. versionchanged:: 0.20 + The default learning method is now ``"batch"``. + + learning_decay : float, default=0.7 + It is a parameter that control learning rate in the online learning + method. The value should be set between (0.5, 1.0] to guarantee + asymptotic convergence. When the value is 0.0 and batch_size is + ``n_samples``, the update method is same as batch learning. In the + literature, this is called kappa. + + learning_offset : float, default=10.0 + A (positive) parameter that downweights early iterations in online + learning. It should be greater than 1.0. In the literature, this is + called tau_0. + + max_iter : int, default=10 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the :meth:`fit` method, and not the + :meth:`partial_fit` method. + + batch_size : int, default=128 + Number of documents to use in each EM iteration. Only used in online + learning. + + evaluate_every : int, default=-1 + How often to evaluate perplexity. Only used in `fit` method. + set it to 0 or negative number to not evaluate perplexity in + training at all. Evaluating perplexity can help you check convergence + in training process, but it will also increase total training time. + Evaluating perplexity in every iteration might increase training time + up to two-fold. + + total_samples : int, default=1e6 + Total number of documents. Only used in the :meth:`partial_fit` method. + + perp_tol : float, default=1e-1 + Perplexity tolerance. Only used when ``evaluate_every`` is greater than 0. + + mean_change_tol : float, default=1e-3 + Stopping tolerance for updating document topic distribution in E-step. + + max_doc_update_iter : int, default=100 + Max number of iterations for updating document topic distribution in + the E-step. + + n_jobs : int, default=None + The number of jobs to use in the E-step. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + Verbosity level. + + random_state : int, RandomState instance or None, default=None + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Variational parameters for topic word distribution. Since the complete + conditional for topic word distribution is a Dirichlet, + ``components_[i, j]`` can be viewed as pseudocount that represents the + number of times word `j` was assigned to topic `i`. + It can also be viewed as distribution over the words for each topic + after normalization: + ``model.components_ / model.components_.sum(axis=1)[:, np.newaxis]``. + + exp_dirichlet_component_ : ndarray of shape (n_components, n_features) + Exponential value of expectation of log topic word distribution. + In the literature, this is `exp(E[log(beta)])`. + + n_batch_iter_ : int + Number of iterations of the EM step. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of passes over the dataset. + + bound_ : float + Final perplexity score on training set. + + doc_topic_prior_ : float + Prior of document topic distribution `theta`. If the value is None, + it is `1 / n_components`. + + random_state_ : RandomState instance + RandomState instance that is generated either from a seed, the random + number generator or by `np.random`. + + topic_word_prior_ : float + Prior of topic word distribution `beta`. If the value is None, it is + `1 / n_components`. + + See Also + -------- + sklearn.discriminant_analysis.LinearDiscriminantAnalysis: + A classifier with a linear decision boundary, generated by fitting + class conditional densities to the data and using Bayes' rule. + + References + ---------- + .. [1] "Online Learning for Latent Dirichlet Allocation", Matthew D. + Hoffman, David M. Blei, Francis Bach, 2010 + https://github.com/blei-lab/onlineldavb + + .. [2] "Stochastic Variational Inference", Matthew D. Hoffman, + David M. Blei, Chong Wang, John Paisley, 2013 + + Examples + -------- + >>> from sklearn.decomposition import LatentDirichletAllocation + >>> from sklearn.datasets import make_multilabel_classification + >>> # This produces a feature matrix of token counts, similar to what + >>> # CountVectorizer would produce on text. + >>> X, _ = make_multilabel_classification(random_state=0) + >>> lda = LatentDirichletAllocation(n_components=5, + ... random_state=0) + >>> lda.fit(X) + LatentDirichletAllocation(...) + >>> # get topics for some given samples: + >>> lda.transform(X[-2:]) + array([[0.00360392, 0.25499205, 0.0036211 , 0.64236448, 0.09541846], + [0.15297572, 0.00362644, 0.44412786, 0.39568399, 0.003586 ]]) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 0, None, closed="neither")], + "doc_topic_prior": [None, Interval(Real, 0, 1, closed="both")], + "topic_word_prior": [None, Interval(Real, 0, 1, closed="both")], + "learning_method": [StrOptions({"batch", "online"})], + "learning_decay": [Interval(Real, 0, 1, closed="both")], + "learning_offset": [Interval(Real, 1.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "batch_size": [Interval(Integral, 0, None, closed="neither")], + "evaluate_every": [Interval(Integral, None, None, closed="neither")], + "total_samples": [Interval(Real, 0, None, closed="neither")], + "perp_tol": [Interval(Real, 0, None, closed="left")], + "mean_change_tol": [Interval(Real, 0, None, closed="left")], + "max_doc_update_iter": [Interval(Integral, 0, None, closed="left")], + "n_jobs": [None, Integral], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=10, + *, + doc_topic_prior=None, + topic_word_prior=None, + learning_method="batch", + learning_decay=0.7, + learning_offset=10.0, + max_iter=10, + batch_size=128, + evaluate_every=-1, + total_samples=1e6, + perp_tol=1e-1, + mean_change_tol=1e-3, + max_doc_update_iter=100, + n_jobs=None, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.doc_topic_prior = doc_topic_prior + self.topic_word_prior = topic_word_prior + self.learning_method = learning_method + self.learning_decay = learning_decay + self.learning_offset = learning_offset + self.max_iter = max_iter + self.batch_size = 
batch_size + self.evaluate_every = evaluate_every + self.total_samples = total_samples + self.perp_tol = perp_tol + self.mean_change_tol = mean_change_tol + self.max_doc_update_iter = max_doc_update_iter + self.n_jobs = n_jobs + self.verbose = verbose + self.random_state = random_state + + def _init_latent_vars(self, n_features, dtype=np.float64): + """Initialize latent variables.""" + + self.random_state_ = check_random_state(self.random_state) + self.n_batch_iter_ = 1 + self.n_iter_ = 0 + + if self.doc_topic_prior is None: + self.doc_topic_prior_ = 1.0 / self.n_components + else: + self.doc_topic_prior_ = self.doc_topic_prior + + if self.topic_word_prior is None: + self.topic_word_prior_ = 1.0 / self.n_components + else: + self.topic_word_prior_ = self.topic_word_prior + + init_gamma = 100.0 + init_var = 1.0 / init_gamma + # In the literature, this is called `lambda` + self.components_ = self.random_state_.gamma( + init_gamma, init_var, (self.n_components, n_features) + ).astype(dtype, copy=False) + + # In the literature, this is `exp(E[log(beta)])` + self.exp_dirichlet_component_ = np.exp( + _dirichlet_expectation_2d(self.components_) + ) + + def _e_step(self, X, cal_sstats, random_init, parallel=None): + """E-step in EM update. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + cal_sstats : bool + Parameter that indicate whether to calculate sufficient statistics + or not. Set ``cal_sstats`` to True when we need to run M-step. + + random_init : bool + Parameter that indicate whether to initialize document topic + distribution randomly in the E-step. Set it to True in training + steps. + + parallel : joblib.Parallel, default=None + Pre-initialized instance of joblib.Parallel. + + Returns + ------- + (doc_topic_distr, suff_stats) : + `doc_topic_distr` is unnormalized topic distribution for each + document. In the literature, this is called `gamma`. + `suff_stats` is expected sufficient statistics for the M-step. + When `cal_sstats == False`, it will be None. + + """ + + # Run e-step in parallel + random_state = self.random_state_ if random_init else None + + # TODO: make Parallel._effective_n_jobs public instead? + n_jobs = effective_n_jobs(self.n_jobs) + if parallel is None: + parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) + results = parallel( + delayed(_update_doc_distribution)( + X[idx_slice, :], + self.exp_dirichlet_component_, + self.doc_topic_prior_, + self.max_doc_update_iter, + self.mean_change_tol, + cal_sstats, + random_state, + ) + for idx_slice in gen_even_slices(X.shape[0], n_jobs) + ) + + # merge result + doc_topics, sstats_list = zip(*results) + doc_topic_distr = np.vstack(doc_topics) + + if cal_sstats: + # This step finishes computing the sufficient statistics for the + # M-step. + suff_stats = np.zeros(self.components_.shape, dtype=self.components_.dtype) + for sstats in sstats_list: + suff_stats += sstats + suff_stats *= self.exp_dirichlet_component_ + else: + suff_stats = None + + return (doc_topic_distr, suff_stats) + + def _em_step(self, X, total_samples, batch_update, parallel=None): + """EM update for 1 iteration. + + update `component_` by batch VB or online VB. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + total_samples : int + Total number of documents. It is only used when + batch_update is `False`. + + batch_update : bool + Parameter that controls updating method. 
+ `True` for batch learning, `False` for online learning. + + parallel : joblib.Parallel, default=None + Pre-initialized instance of joblib.Parallel + + Returns + ------- + doc_topic_distr : ndarray of shape (n_samples, n_components) + Unnormalized document topic distribution. + """ + + # E-step + _, suff_stats = self._e_step( + X, cal_sstats=True, random_init=True, parallel=parallel + ) + + # M-step + if batch_update: + self.components_ = self.topic_word_prior_ + suff_stats + else: + # online update + # In the literature, the weight is `rho` + weight = np.power( + self.learning_offset + self.n_batch_iter_, -self.learning_decay + ) + doc_ratio = float(total_samples) / X.shape[0] + self.components_ *= 1 - weight + self.components_ += weight * ( + self.topic_word_prior_ + doc_ratio * suff_stats + ) + + # update `component_` related variables + self.exp_dirichlet_component_ = np.exp( + _dirichlet_expectation_2d(self.components_) + ) + self.n_batch_iter_ += 1 + return + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float32", "float64"] + return tags + + def _check_non_neg_array(self, X, reset_n_features, whom): + """check X format + + check X format and make sure no negative value in X. + + Parameters + ---------- + X : array-like or sparse matrix + + """ + dtype = [np.float64, np.float32] if reset_n_features else self.components_.dtype + + X = validate_data( + self, + X, + reset=reset_n_features, + accept_sparse="csr", + dtype=dtype, + ) + check_non_negative(X, whom) + + return X + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online VB with Mini-Batch update. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Partially fitted estimator. + """ + first_time = not hasattr(self, "components_") + + X = self._check_non_neg_array( + X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit" + ) + n_samples, n_features = X.shape + batch_size = self.batch_size + + # initialize parameters or check + if first_time: + self._init_latent_vars(n_features, dtype=X.dtype) + + if n_features != self.components_.shape[1]: + raise ValueError( + "The provided data has %d dimensions while " + "the model was trained with feature size %d." + % (n_features, self.components_.shape[1]) + ) + + n_jobs = effective_n_jobs(self.n_jobs) + with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel: + for idx_slice in gen_batches(n_samples, batch_size): + self._em_step( + X[idx_slice, :], + total_samples=self.total_samples, + batch_update=False, + parallel=parallel, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn model for the data X with variational Bayes method. + + When `learning_method` is 'online', use mini-batch update. + Otherwise, use batch update. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self + Fitted estimator. 
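As a usage sketch of the batch/online choice this fitting path implements (toy count matrix with arbitrary values; only the public estimator API is used):

    import numpy as np
    from sklearn.decomposition import LatentDirichletAllocation

    # fake document-word counts: 100 documents over a 20-term vocabulary
    X = np.random.RandomState(0).poisson(1.0, size=(100, 20))
    lda = LatentDirichletAllocation(
        n_components=5, learning_method="online", batch_size=16,
        max_iter=5, random_state=0,
    )
    doc_topics = lda.fit_transform(X)  # (100, 5): one row of topic proportions per document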
+ """ + X = self._check_non_neg_array( + X, reset_n_features=True, whom="LatentDirichletAllocation.fit" + ) + n_samples, n_features = X.shape + max_iter = self.max_iter + evaluate_every = self.evaluate_every + learning_method = self.learning_method + + batch_size = self.batch_size + + # initialize parameters + self._init_latent_vars(n_features, dtype=X.dtype) + # change to perplexity later + last_bound = None + n_jobs = effective_n_jobs(self.n_jobs) + with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel: + for i in range(max_iter): + if learning_method == "online": + for idx_slice in gen_batches(n_samples, batch_size): + self._em_step( + X[idx_slice, :], + total_samples=n_samples, + batch_update=False, + parallel=parallel, + ) + else: + # batch update + self._em_step( + X, total_samples=n_samples, batch_update=True, parallel=parallel + ) + + # check perplexity + if evaluate_every > 0 and (i + 1) % evaluate_every == 0: + doc_topics_distr, _ = self._e_step( + X, cal_sstats=False, random_init=False, parallel=parallel + ) + bound = self._perplexity_precomp_distr( + X, doc_topics_distr, sub_sampling=False + ) + if self.verbose: + print( + "iteration: %d of max_iter: %d, perplexity: %.4f" + % (i + 1, max_iter, bound) + ) + + if last_bound and abs(last_bound - bound) < self.perp_tol: + break + last_bound = bound + + elif self.verbose: + print("iteration: %d of max_iter: %d" % (i + 1, max_iter)) + self.n_iter_ += 1 + + # calculate final perplexity value on train set + doc_topics_distr, _ = self._e_step( + X, cal_sstats=False, random_init=False, parallel=parallel + ) + self.bound_ = self._perplexity_precomp_distr( + X, doc_topics_distr, sub_sampling=False + ) + + return self + + def _unnormalized_transform(self, X): + """Transform data X according to fitted model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + Returns + ------- + doc_topic_distr : ndarray of shape (n_samples, n_components) + Document topic distribution for X. + """ + doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False) + + return doc_topic_distr + + def transform(self, X, *, normalize=True): + """Transform data X according to the fitted model. + + .. versionchanged:: 0.18 + `doc_topic_distr` is now normalized. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + normalize : bool, default=True + Whether to normalize the document topic distribution. + + Returns + ------- + doc_topic_distr : ndarray of shape (n_samples, n_components) + Document topic distribution for X. + """ + check_is_fitted(self) + X = self._check_non_neg_array( + X, reset_n_features=False, whom="LatentDirichletAllocation.transform" + ) + doc_topic_distr = self._unnormalized_transform(X) + if normalize: + doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis] + return doc_topic_distr + + def fit_transform(self, X, y=None, *, normalize=True): + """ + Fit to data, then transform it. + + Fits transformer to `X` and `y` and returns a transformed version of `X`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input samples. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + Target values (None for unsupervised transformations). + + normalize : bool, default=True + Whether to normalize the document topic distribution in `transform`. 
+ + Returns + ------- + X_new : ndarray array of shape (n_samples, n_components) + Transformed array. + """ + return self.fit(X, y).transform(X, normalize=normalize) + + def _approx_bound(self, X, doc_topic_distr, sub_sampling): + """Estimate the variational bound. + + Estimate the variational bound over "all documents" using only the + documents passed in as X. Since log-likelihood of each word cannot + be computed directly, we use this bound to estimate it. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + doc_topic_distr : ndarray of shape (n_samples, n_components) + Document topic distribution. In the literature, this is called + gamma. + + sub_sampling : bool, default=False + Compensate for subsampling of documents. + It is used in calculate bound in online learning. + + Returns + ------- + score : float + + """ + + def _loglikelihood(prior, distr, dirichlet_distr, size): + # calculate log-likelihood + score = np.sum((prior - distr) * dirichlet_distr) + score += np.sum(gammaln(distr) - gammaln(prior)) + score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1))) + return score + + is_sparse_x = sp.issparse(X) + n_samples, n_components = doc_topic_distr.shape + n_features = self.components_.shape[1] + score = 0 + + dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr) + dirichlet_component_ = _dirichlet_expectation_2d(self.components_) + doc_topic_prior = self.doc_topic_prior_ + topic_word_prior = self.topic_word_prior_ + + if is_sparse_x: + X_data = X.data + X_indices = X.indices + X_indptr = X.indptr + + # E[log p(docs | theta, beta)] + for idx_d in range(0, n_samples): + if is_sparse_x: + ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]] + cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]] + else: + ids = np.nonzero(X[idx_d, :])[0] + cnts = X[idx_d, ids] + temp = ( + dirichlet_doc_topic[idx_d, :, np.newaxis] + dirichlet_component_[:, ids] + ) + norm_phi = logsumexp(temp, axis=0) + score += np.dot(cnts, norm_phi) + + # compute E[log p(theta | alpha) - log q(theta | gamma)] + score += _loglikelihood( + doc_topic_prior, doc_topic_distr, dirichlet_doc_topic, self.n_components + ) + + # Compensate for the subsampling of the population of documents + if sub_sampling: + doc_ratio = float(self.total_samples) / n_samples + score *= doc_ratio + + # E[log p(beta | eta) - log q (beta | lambda)] + score += _loglikelihood( + topic_word_prior, self.components_, dirichlet_component_, n_features + ) + + return score + + def score(self, X, y=None): + """Calculate approximate log-likelihood as score. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + score : float + Use approximate bound as score. + """ + check_is_fitted(self) + X = self._check_non_neg_array( + X, reset_n_features=False, whom="LatentDirichletAllocation.score" + ) + + doc_topic_distr = self._unnormalized_transform(X) + score = self._approx_bound(X, doc_topic_distr, sub_sampling=False) + return score + + def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False): + """Calculate approximate perplexity for data X with ability to accept + precomputed doc_topic_distr + + Perplexity is defined as exp(-1. * log-likelihood per word) + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. 
+ + doc_topic_distr : ndarray of shape (n_samples, n_components), \ + default=None + Document topic distribution. + If it is None, it will be generated by applying transform on X. + + Returns + ------- + score : float + Perplexity score. + """ + if doc_topic_distr is None: + doc_topic_distr = self._unnormalized_transform(X) + else: + n_samples, n_components = doc_topic_distr.shape + if n_samples != X.shape[0]: + raise ValueError( + "Number of samples in X and doc_topic_distr do not match." + ) + + if n_components != self.n_components: + raise ValueError("Number of topics does not match.") + + current_samples = X.shape[0] + bound = self._approx_bound(X, doc_topic_distr, sub_sampling) + + if sub_sampling: + word_cnt = X.sum() * (float(self.total_samples) / current_samples) + else: + word_cnt = X.sum() + perword_bound = bound / word_cnt + + return np.exp(-1.0 * perword_bound) + + def perplexity(self, X, sub_sampling=False): + """Calculate approximate perplexity for data X. + + Perplexity is defined as exp(-1. * log-likelihood per word) + + .. versionchanged:: 0.19 + *doc_topic_distr* argument has been deprecated and is ignored + because user no longer has access to unnormalized distribution + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document word matrix. + + sub_sampling : bool + Do sub-sampling or not. + + Returns + ------- + score : float + Perplexity score. + """ + check_is_fitted(self) + X = self._check_non_neg_array( + X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity" + ) + return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_nmf.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_nmf.py new file mode 100644 index 0000000000000000000000000000000000000000..4c963538619a38d35bb74affb7ad8ebb64c071eb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_nmf.py @@ -0,0 +1,2409 @@ +"""Non-negative matrix factorization.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import time +import warnings +from abc import ABC +from math import sqrt +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy import linalg + +from .._config import config_context +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_random_state, gen_batches +from ..utils._param_validation import ( + Interval, + StrOptions, + validate_params, +) +from ..utils.extmath import _randomized_svd, safe_sparse_dot, squared_norm +from ..utils.validation import ( + check_is_fitted, + check_non_negative, + validate_data, +) +from ._cdnmf_fast import _update_cdnmf_fast + +EPSILON = np.finfo(np.float32).eps + + +def norm(x): + """Dot product-based Euclidean norm implementation. + + See: http://fa.bianp.net/blog/2011/computing-the-vector-norm/ + + Parameters + ---------- + x : array-like + Vector for which to compute the norm. + """ + return sqrt(squared_norm(x)) + + +def trace_dot(X, Y): + """Trace of np.dot(X, Y.T). + + Parameters + ---------- + X : array-like + First matrix. + Y : array-like + Second matrix. 
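A quick NumPy check of the identity this helper relies on (arbitrary small matrices):

    import numpy as np

    X = np.arange(6.0).reshape(2, 3)
    Y = np.ones((2, 3))
    # trace(X @ Y.T) equals the sum of element-wise products,
    # i.e. the dot product of the raveled arrays
    assert np.isclose(np.trace(X @ Y.T), np.dot(X.ravel(), Y.ravel()))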
+ """ + return np.dot(X.ravel(), Y.ravel()) + + +def _check_init(A, shape, whom): + A = check_array(A) + if shape[0] != "auto" and A.shape[0] != shape[0]: + raise ValueError( + f"Array with wrong first dimension passed to {whom}. Expected {shape[0]}, " + f"but got {A.shape[0]}." + ) + if shape[1] != "auto" and A.shape[1] != shape[1]: + raise ValueError( + f"Array with wrong second dimension passed to {whom}. Expected {shape[1]}, " + f"but got {A.shape[1]}." + ) + check_non_negative(A, whom) + if np.max(A) == 0: + raise ValueError(f"Array passed to {whom} is full of zeros.") + + +def _beta_divergence(X, W, H, beta, square_root=False): + """Compute the beta-divergence of X and dot(W, H). + + Parameters + ---------- + X : float or array-like of shape (n_samples, n_features) + + W : float or array-like of shape (n_samples, n_components) + + H : float or array-like of shape (n_components, n_features) + + beta : float or {'frobenius', 'kullback-leibler', 'itakura-saito'} + Parameter of the beta-divergence. + If beta == 2, this is half the Frobenius *squared* norm. + If beta == 1, this is the generalized Kullback-Leibler divergence. + If beta == 0, this is the Itakura-Saito divergence. + Else, this is the general beta-divergence. + + square_root : bool, default=False + If True, return np.sqrt(2 * res) + For beta == 2, it corresponds to the Frobenius norm. + + Returns + ------- + res : float + Beta divergence of X and np.dot(X, H). + """ + beta = _beta_loss_to_float(beta) + + # The method can be called with scalars + if not sp.issparse(X): + X = np.atleast_2d(X) + W = np.atleast_2d(W) + H = np.atleast_2d(H) + + # Frobenius norm + if beta == 2: + # Avoid the creation of the dense np.dot(W, H) if X is sparse. + if sp.issparse(X): + norm_X = np.dot(X.data, X.data) + norm_WH = trace_dot(np.linalg.multi_dot([W.T, W, H]), H) + cross_prod = trace_dot((X @ H.T), W) + res = (norm_X + norm_WH - 2.0 * cross_prod) / 2.0 + else: + res = squared_norm(X - np.dot(W, H)) / 2.0 + + if square_root: + return np.sqrt(res * 2) + else: + return res + + if sp.issparse(X): + # compute np.dot(W, H) only where X is nonzero + WH_data = _special_sparse_dot(W, H, X).data + X_data = X.data + else: + WH = np.dot(W, H) + WH_data = WH.ravel() + X_data = X.ravel() + + # do not affect the zeros: here 0 ** (-1) = 0 and not infinity + indices = X_data > EPSILON + WH_data = WH_data[indices] + X_data = X_data[indices] + + # used to avoid division by zero + WH_data[WH_data < EPSILON] = EPSILON + + # generalized Kullback-Leibler divergence + if beta == 1: + # fast and memory efficient computation of np.sum(np.dot(W, H)) + sum_WH = np.dot(np.sum(W, axis=0), np.sum(H, axis=1)) + # computes np.sum(X * log(X / WH)) only where X is nonzero + div = X_data / WH_data + res = np.dot(X_data, np.log(div)) + # add full np.sum(np.dot(W, H)) - np.sum(X) + res += sum_WH - X_data.sum() + + # Itakura-Saito divergence + elif beta == 0: + div = X_data / WH_data + res = np.sum(div) - np.prod(X.shape) - np.sum(np.log(div)) + + # beta-divergence, beta not in (0, 1, 2) + else: + if sp.issparse(X): + # slow loop, but memory efficient computation of : + # np.sum(np.dot(W, H) ** beta) + sum_WH_beta = 0 + for i in range(X.shape[1]): + sum_WH_beta += np.sum(np.dot(W, H[:, i]) ** beta) + + else: + sum_WH_beta = np.sum(WH**beta) + + sum_X_WH = np.dot(X_data, WH_data ** (beta - 1)) + res = (X_data**beta).sum() - beta * sum_X_WH + res += sum_WH_beta * (beta - 1) + res /= beta * (beta - 1) + + if square_root: + res = max(res, 0) # avoid negative number due to rounding 
errors + return np.sqrt(2 * res) + else: + return res + + +def _special_sparse_dot(W, H, X): + """Computes np.dot(W, H), only where X is non zero.""" + if sp.issparse(X): + ii, jj = X.nonzero() + n_vals = ii.shape[0] + dot_vals = np.empty(n_vals) + n_components = W.shape[1] + + batch_size = max(n_components, n_vals // n_components) + for start in range(0, n_vals, batch_size): + batch = slice(start, start + batch_size) + dot_vals[batch] = np.multiply(W[ii[batch], :], H.T[jj[batch], :]).sum( + axis=1 + ) + + WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape) + return WH.tocsr() + else: + return np.dot(W, H) + + +def _beta_loss_to_float(beta_loss): + """Convert string beta_loss to float.""" + beta_loss_map = {"frobenius": 2, "kullback-leibler": 1, "itakura-saito": 0} + if isinstance(beta_loss, str): + beta_loss = beta_loss_map[beta_loss] + return beta_loss + + +def _initialize_nmf(X, n_components, init=None, eps=1e-6, random_state=None): + """Algorithms for NMF initialization. + + Computes an initial guess for the non-negative + rank k matrix approximation for X: X = WH. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix to be decomposed. + + n_components : int + The number of components desired in the approximation. + + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None + Method used to initialize the procedure. + Valid options: + + - None: 'nndsvda' if n_components <= min(n_samples, n_features), + otherwise 'random'. + + - 'random': non-negative random matrices, scaled with: + sqrt(X.mean() / n_components) + + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - 'custom': use custom matrices W and H + + .. versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + + eps : float, default=1e-6 + Truncate all values less then this in output to zero. + + random_state : int, RandomState instance or None, default=None + Used when ``init`` == 'nndsvdar' or 'random'. Pass an int for + reproducible results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + W : array-like of shape (n_samples, n_components) + Initial guesses for solving X ~= WH. + + H : array-like of shape (n_components, n_features) + Initial guesses for solving X ~= WH. + + References + ---------- + C. Boutsidis, E. 
Gallopoulos: SVD based initialization: A head start for + nonnegative matrix factorization - Pattern Recognition, 2008 + http://tinyurl.com/nndsvd + """ + check_non_negative(X, "NMF initialization") + n_samples, n_features = X.shape + + if ( + init is not None + and init != "random" + and n_components > min(n_samples, n_features) + ): + raise ValueError( + "init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)".format(init) + ) + + if init is None: + if n_components <= min(n_samples, n_features): + init = "nndsvda" + else: + init = "random" + + # Random initialization + if init == "random": + avg = np.sqrt(X.mean() / n_components) + rng = check_random_state(random_state) + H = avg * rng.standard_normal(size=(n_components, n_features)).astype( + X.dtype, copy=False + ) + W = avg * rng.standard_normal(size=(n_samples, n_components)).astype( + X.dtype, copy=False + ) + np.abs(H, out=H) + np.abs(W, out=W) + return W, H + + # NNDSVD initialization + U, S, V = _randomized_svd(X, n_components, random_state=random_state) + W = np.zeros_like(U) + H = np.zeros_like(V) + + # The leading singular triplet is non-negative + # so it can be used as is for initialization. + W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) + H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) + + for j in range(1, n_components): + x, y = U[:, j], V[j, :] + + # extract positive and negative parts of column vectors + x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) + x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) + + # and their norms + x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) + x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) + + m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm + + # choose update + if m_p > m_n: + u = x_p / x_p_nrm + v = y_p / y_p_nrm + sigma = m_p + else: + u = x_n / x_n_nrm + v = y_n / y_n_nrm + sigma = m_n + + lbd = np.sqrt(S[j] * sigma) + W[:, j] = lbd * u + H[j, :] = lbd * v + + W[W < eps] = 0 + H[H < eps] = 0 + + if init == "nndsvd": + pass + elif init == "nndsvda": + avg = X.mean() + W[W == 0] = avg + H[H == 0] = avg + elif init == "nndsvdar": + rng = check_random_state(random_state) + avg = X.mean() + W[W == 0] = abs(avg * rng.standard_normal(size=len(W[W == 0])) / 100) + H[H == 0] = abs(avg * rng.standard_normal(size=len(H[H == 0])) / 100) + else: + raise ValueError( + "Invalid init parameter: got %r instead of one of %r" + % (init, (None, "random", "nndsvd", "nndsvda", "nndsvdar")) + ) + + return W, H + + +def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, random_state): + """Helper function for _fit_coordinate_descent. + + Update W to minimize the objective function, iterating once over all + coordinates. By symmetry, to update H, one can call + _update_coordinate_descent(X.T, Ht, W, ...). + + """ + n_components = Ht.shape[1] + + HHt = np.dot(Ht.T, Ht) + XHt = safe_sparse_dot(X, Ht) + + # L2 regularization corresponds to increase of the diagonal of HHt + if l2_reg != 0.0: + # adds l2_reg only on the diagonal + HHt.flat[:: n_components + 1] += l2_reg + # L1 regularization corresponds to decrease of each element of XHt + if l1_reg != 0.0: + XHt -= l1_reg + + if shuffle: + permutation = random_state.permutation(n_components) + else: + permutation = np.arange(n_components) + # The following seems to be required on 64-bit Windows w/ Python 3.5. 
+ permutation = np.asarray(permutation, dtype=np.intp) + return _update_cdnmf_fast(W, HHt, XHt, permutation) + + +def _fit_coordinate_descent( + X, + W, + H, + tol=1e-4, + max_iter=200, + l1_reg_W=0, + l1_reg_H=0, + l2_reg_W=0, + l2_reg_H=0, + update_H=True, + verbose=0, + shuffle=False, + random_state=None, +): + """Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent + + The objective function is minimized with an alternating minimization of W + and H. Each minimization is done with a cyclic (up to a permutation of the + features) Coordinate Descent. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Constant matrix. + + W : array-like of shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like of shape (n_components, n_features) + Initial guess for the solution. + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + max_iter : int, default=200 + Maximum number of iterations before timing out. + + l1_reg_W : float, default=0. + L1 regularization parameter for W. + + l1_reg_H : float, default=0. + L1 regularization parameter for H. + + l2_reg_W : float, default=0. + L2 regularization parameter for W. + + l2_reg_H : float, default=0. + L2 regularization parameter for H. + + update_H : bool, default=True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : int, default=0 + The verbosity level. + + shuffle : bool, default=False + If true, randomize the order of coordinates in the CD solver. + + random_state : int, RandomState instance or None, default=None + Used to randomize the coordinates in the CD solver, when + ``shuffle`` is set to ``True``. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : ndarray of shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. 
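The same coordinate-descent path is reachable through the public estimator; a minimal sketch with made-up non-negative data:

    import numpy as np
    from sklearn.decomposition import NMF

    X = np.abs(np.random.RandomState(0).standard_normal((6, 4)))  # toy non-negative matrix
    nmf = NMF(n_components=2, solver="cd", init="nndsvda", max_iter=500, random_state=0)
    W = nmf.fit_transform(X)          # (6, 2)
    H = nmf.components_               # (2, 4)
    print(np.linalg.norm(X - W @ H))  # Frobenius reconstruction error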
+ """ + # so W and Ht are both in C order in memory + Ht = check_array(H.T, order="C") + X = check_array(X, accept_sparse="csr") + + rng = check_random_state(random_state) + + for n_iter in range(1, max_iter + 1): + violation = 0.0 + + # Update W + violation += _update_coordinate_descent( + X, W, Ht, l1_reg_W, l2_reg_W, shuffle, rng + ) + # Update H + if update_H: + violation += _update_coordinate_descent( + X.T, Ht, W, l1_reg_H, l2_reg_H, shuffle, rng + ) + + if n_iter == 1: + violation_init = violation + + if violation_init == 0: + break + + if verbose: + print("violation:", violation / violation_init) + + if violation / violation_init <= tol: + if verbose: + print("Converged at iteration", n_iter + 1) + break + + return W, Ht.T, n_iter + + +def _multiplicative_update_w( + X, + W, + H, + beta_loss, + l1_reg_W, + l2_reg_W, + gamma, + H_sum=None, + HHt=None, + XHt=None, + update_H=True, +): + """Update W in Multiplicative Update NMF.""" + if beta_loss == 2: + # Numerator + if XHt is None: + XHt = safe_sparse_dot(X, H.T) + if update_H: + # avoid a copy of XHt, which will be re-computed (update_H=True) + numerator = XHt + else: + # preserve the XHt, which is not re-computed (update_H=False) + numerator = XHt.copy() + + # Denominator + if HHt is None: + HHt = np.dot(H, H.T) + denominator = np.dot(W, HHt) + + else: + # Numerator + # if X is sparse, compute WH only where X is non zero + WH_safe_X = _special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1.0 < 0: + WH[WH < EPSILON] = EPSILON + + # to avoid taking a negative power of zero + if beta_loss - 2.0 < 0: + WH_safe_X_data[WH_safe_X_data < EPSILON] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(X * (dot(W, H) ** (beta_loss - 2)), H.T) + numerator = safe_sparse_dot(WH_safe_X, H.T) + + # Denominator + if beta_loss == 1: + if H_sum is None: + H_sum = np.sum(H, axis=1) # shape(n_components, ) + denominator = H_sum[np.newaxis, :] + + else: + # computation of WHHt = dot(dot(W, H) ** beta_loss - 1, H.T) + if sp.issparse(X): + # memory efficient computation + # (compute row by row, avoiding the dense matrix WH) + WHHt = np.empty(W.shape) + for i in range(X.shape[0]): + WHi = np.dot(W[i, :], H) + if beta_loss - 1 < 0: + WHi[WHi < EPSILON] = EPSILON + WHi **= beta_loss - 1 + WHHt[i, :] = np.dot(WHi, H.T) + else: + WH **= beta_loss - 1 + WHHt = np.dot(WH, H.T) + denominator = WHHt + + # Add L1 and L2 regularization + if l1_reg_W > 0: + denominator += l1_reg_W + if l2_reg_W > 0: + denominator = denominator + l2_reg_W * W + denominator[denominator == 0] = EPSILON + + numerator /= denominator + delta_W = numerator + + # gamma is in ]0, 1] + if gamma != 1: + delta_W **= gamma + + W *= delta_W + + return W, H_sum, HHt, XHt + + +def _multiplicative_update_h( + X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma, A=None, B=None, rho=None +): + """update H in Multiplicative Update NMF.""" + if beta_loss == 2: + numerator = safe_sparse_dot(W.T, X) + denominator = np.linalg.multi_dot([W.T, W, H]) + + else: + # Numerator + WH_safe_X = 
_special_sparse_dot(W, H, X) + if sp.issparse(X): + WH_safe_X_data = WH_safe_X.data + X_data = X.data + else: + WH_safe_X_data = WH_safe_X + X_data = X + # copy used in the Denominator + WH = WH_safe_X.copy() + if beta_loss - 1.0 < 0: + WH[WH < EPSILON] = EPSILON + + # to avoid division by zero + if beta_loss - 2.0 < 0: + WH_safe_X_data[WH_safe_X_data < EPSILON] = EPSILON + + if beta_loss == 1: + np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data) + elif beta_loss == 0: + # speeds up computation time + # refer to /numpy/numpy/issues/9363 + WH_safe_X_data **= -1 + WH_safe_X_data **= 2 + # element-wise multiplication + WH_safe_X_data *= X_data + else: + WH_safe_X_data **= beta_loss - 2 + # element-wise multiplication + WH_safe_X_data *= X_data + + # here numerator = dot(W.T, (dot(W, H) ** (beta_loss - 2)) * X) + numerator = safe_sparse_dot(W.T, WH_safe_X) + + # Denominator + if beta_loss == 1: + W_sum = np.sum(W, axis=0) # shape(n_components, ) + W_sum[W_sum == 0] = 1.0 + denominator = W_sum[:, np.newaxis] + + # beta_loss not in (1, 2) + else: + # computation of WtWH = dot(W.T, dot(W, H) ** beta_loss - 1) + if sp.issparse(X): + # memory efficient computation + # (compute column by column, avoiding the dense matrix WH) + WtWH = np.empty(H.shape) + for i in range(X.shape[1]): + WHi = np.dot(W, H[:, i]) + if beta_loss - 1 < 0: + WHi[WHi < EPSILON] = EPSILON + WHi **= beta_loss - 1 + WtWH[:, i] = np.dot(W.T, WHi) + else: + WH **= beta_loss - 1 + WtWH = np.dot(W.T, WH) + denominator = WtWH + + # Add L1 and L2 regularization + if l1_reg_H > 0: + denominator += l1_reg_H + if l2_reg_H > 0: + denominator = denominator + l2_reg_H * H + denominator[denominator == 0] = EPSILON + + if A is not None and B is not None: + # Updates for the online nmf + if gamma != 1: + H **= 1 / gamma + numerator *= H + A *= rho + B *= rho + A += numerator + B += denominator + H = A / B + + if gamma != 1: + H **= gamma + else: + delta_H = numerator + delta_H /= denominator + if gamma != 1: + delta_H **= gamma + H *= delta_H + + return H + + +def _fit_multiplicative_update( + X, + W, + H, + beta_loss="frobenius", + max_iter=200, + tol=1e-4, + l1_reg_W=0, + l1_reg_H=0, + l2_reg_W=0, + l2_reg_H=0, + update_H=True, + verbose=0, +): + """Compute Non-negative Matrix Factorization with Multiplicative Update. + + The objective function is _beta_divergence(X, WH) and is minimized with an + alternating minimization of W and H. Each minimization is done with a + Multiplicative Update. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Constant input matrix. + + W : array-like of shape (n_samples, n_components) + Initial guess for the solution. + + H : array-like of shape (n_components, n_features) + Initial guess for the solution. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}. + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. + + max_iter : int, default=200 + Number of iterations. + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + l1_reg_W : float, default=0. + L1 regularization parameter for W. + + l1_reg_H : float, default=0. + L1 regularization parameter for H. + + l2_reg_W : float, default=0. 
+ L2 regularization parameter for W. + + l2_reg_H : float, default=0. + L2 regularization parameter for H. + + update_H : bool, default=True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + verbose : int, default=0 + The verbosity level. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : ndarray of shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + The number of iterations done by the algorithm. + + References + ---------- + Lee, D. D., & Seung, H., S. (2001). Algorithms for Non-negative Matrix + Factorization. Adv. Neural Inform. Process. Syst.. 13. + Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix + factorization with the beta-divergence. Neural Computation, 23(9). + """ + start_time = time.time() + + beta_loss = _beta_loss_to_float(beta_loss) + + # gamma for Maximization-Minimization (MM) algorithm [Fevotte 2011] + if beta_loss < 1: + gamma = 1.0 / (2.0 - beta_loss) + elif beta_loss > 2: + gamma = 1.0 / (beta_loss - 1.0) + else: + gamma = 1.0 + + # used for the convergence criterion + error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True) + previous_error = error_at_init + + H_sum, HHt, XHt = None, None, None + for n_iter in range(1, max_iter + 1): + # update W + # H_sum, HHt and XHt are saved and reused if not update_H + W, H_sum, HHt, XHt = _multiplicative_update_w( + X, + W, + H, + beta_loss=beta_loss, + l1_reg_W=l1_reg_W, + l2_reg_W=l2_reg_W, + gamma=gamma, + H_sum=H_sum, + HHt=HHt, + XHt=XHt, + update_H=update_H, + ) + + # necessary for stability with beta_loss < 1 + if beta_loss < 1: + W[W < np.finfo(np.float64).eps] = 0.0 + + # update H (only at fit or fit_transform) + if update_H: + H = _multiplicative_update_h( + X, + W, + H, + beta_loss=beta_loss, + l1_reg_H=l1_reg_H, + l2_reg_H=l2_reg_H, + gamma=gamma, + ) + + # These values will be recomputed since H changed + H_sum, HHt, XHt = None, None, None + + # necessary for stability with beta_loss < 1 + if beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0.0 + + # test convergence criterion every 10 iterations + if tol > 0 and n_iter % 10 == 0: + error = _beta_divergence(X, W, H, beta_loss, square_root=True) + + if verbose: + iter_time = time.time() + print( + "Epoch %02d reached after %.3f seconds, error: %f" + % (n_iter, iter_time - start_time, error) + ) + + if (previous_error - error) / error_at_init < tol: + break + previous_error = error + + # do not print if we have already printed in the convergence test + if verbose and (tol == 0 or n_iter % 10 != 0): + end_time = time.time() + print( + "Epoch %02d reached after %.3f seconds." % (n_iter, end_time - start_time) + ) + + return W, H, n_iter + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "W": ["array-like", None], + "H": ["array-like", None], + "update_H": ["boolean"], + }, + prefer_skip_nested_validation=False, +) +def non_negative_factorization( + X, + W=None, + H=None, + n_components="auto", + *, + init=None, + update_H=True, + solver="cd", + beta_loss="frobenius", + tol=1e-4, + max_iter=200, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + random_state=None, + verbose=0, + shuffle=False, +): + """Compute Non-negative Matrix Factorization (NMF). + + Find two non-negative matrices (W, H) whose product approximates the non- + negative matrix X. 
This factorization can be used for example for + dimensionality reduction, source separation or topic extraction. + + The objective function is: + + .. math:: + + L(W, H) &= 0.5 * ||X - WH||_{loss}^2 + + &+ alpha\\_W * l1\\_ratio * n\\_features * ||vec(W)||_1 + + &+ alpha\\_H * l1\\_ratio * n\\_samples * ||vec(H)||_1 + + &+ 0.5 * alpha\\_W * (1 - l1\\_ratio) * n\\_features * ||W||_{Fro}^2 + + &+ 0.5 * alpha\\_H * (1 - l1\\_ratio) * n\\_samples * ||H||_{Fro}^2, + + where :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) and + :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm) + + The generic norm :math:`||X - WH||_{loss}^2` may represent + the Frobenius norm or another supported beta-divergence loss. + The choice between options is controlled by the `beta_loss` parameter. + + The regularization terms are scaled by `n_features` for `W` and by `n_samples` for + `H` to keep their impact balanced with respect to one another and to the data fit + term as independent as possible of the size `n_samples` of the training set. + + The objective function is minimized with an alternating minimization of W + and H. If H is given and update_H=False, it solves for W only. + + Note that the transformed data is named W and the components matrix is named H. In + the NMF literature, the naming convention is usually the opposite since the data + matrix X is transposed. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Constant matrix. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is initialised as an array of zeros, unless + `solver='mu'`, then it is filled with values calculated by + `np.sqrt(X.mean() / self._n_components)`. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is used as a constant, to solve for W only. + If `None`, uses the initialisation method specified in `init`. + + n_components : int or {'auto'} or None, default='auto' + Number of components. If `None`, all features are kept. + If `n_components='auto'`, the number of components is automatically inferred + from `W` or `H` shapes. + + .. versionchanged:: 1.4 + Added `'auto'` value. + + .. versionchanged:: 1.6 + Default value changed from `None` to `'auto'`. + + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + Method used to initialize the procedure. + + Valid options: + + - None: 'nndsvda' if n_components < n_features, otherwise 'random'. + - 'random': non-negative random matrices, scaled with: + `sqrt(X.mean() / n_components)` + - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + - 'nndsvda': NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + - 'nndsvdar': NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + - 'custom': If `update_H=True`, use custom matrices W and H which must both + be provided. If `update_H=False`, then only custom matrix H is used. + + .. versionchanged:: 0.23 + The default value of `init` changed from 'random' to None in 0.23. + + .. 
versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + + update_H : bool, default=True + Set to True, both W and H will be estimated from initial guesses. + Set to False, only W will be estimated. + + solver : {'cd', 'mu'}, default='cd' + Numerical solver to use: + + - 'cd' is a Coordinate Descent solver that uses Fast Hierarchical + Alternating Least Squares (Fast HALS). + - 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + max_iter : int, default=200 + Maximum number of iterations before timing out. + + alpha_W : float, default=0.0 + Constant that multiplies the regularization terms of `W`. Set it to zero + (default) to have no regularization on `W`. + + .. versionadded:: 1.0 + + alpha_H : float or "same", default="same" + Constant that multiplies the regularization terms of `H`. Set it to zero to + have no regularization on `H`. If "same" (default), it takes the same value as + `alpha_W`. + + .. versionadded:: 1.0 + + l1_ratio : float, default=0.0 + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + random_state : int, RandomState instance or None, default=None + Used for NMF initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + The verbosity level. + + shuffle : bool, default=False + If true, randomize the order of coordinates in the CD solver. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Solution to the non-negative least squares problem. + + H : ndarray of shape (n_components, n_features) + Solution to the non-negative least squares problem. + + n_iter : int + Actual number of iterations. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. + + .. [2] :doi:`"Algorithms for nonnegative matrix factorization with the + beta-divergence" <10.1162/NECO_a_00168>` + Fevotte, C., & Idier, J. (2011). Neural Computation, 23(9). + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import non_negative_factorization + >>> W, H, n_iter = non_negative_factorization( + ... 
X, n_components=2, init='random', random_state=0) + """ + est = NMF( + n_components=n_components, + init=init, + solver=solver, + beta_loss=beta_loss, + tol=tol, + max_iter=max_iter, + random_state=random_state, + alpha_W=alpha_W, + alpha_H=alpha_H, + l1_ratio=l1_ratio, + verbose=verbose, + shuffle=shuffle, + ) + est._validate_params() + + X = check_array(X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32]) + + with config_context(assume_finite=True): + W, H, n_iter = est._fit_transform(X, W=W, H=H, update_H=update_H) + + return W, H, n_iter + + +class _BaseNMF(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, ABC): + """Base class for NMF and MiniBatchNMF.""" + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + StrOptions({"auto"}), + ], + "init": [ + StrOptions({"random", "nndsvd", "nndsvda", "nndsvdar", "custom"}), + None, + ], + "beta_loss": [ + StrOptions({"frobenius", "kullback-leibler", "itakura-saito"}), + Real, + ], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "random_state": ["random_state"], + "alpha_W": [Interval(Real, 0, None, closed="left")], + "alpha_H": [Interval(Real, 0, None, closed="left"), StrOptions({"same"})], + "l1_ratio": [Interval(Real, 0, 1, closed="both")], + "verbose": ["verbose"], + } + + def __init__( + self, + n_components="auto", + *, + init=None, + beta_loss="frobenius", + tol=1e-4, + max_iter=200, + random_state=None, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + verbose=0, + ): + self.n_components = n_components + self.init = init + self.beta_loss = beta_loss + self.tol = tol + self.max_iter = max_iter + self.random_state = random_state + self.alpha_W = alpha_W + self.alpha_H = alpha_H + self.l1_ratio = l1_ratio + self.verbose = verbose + + def _check_params(self, X): + # n_components + self._n_components = self.n_components + if self._n_components is None: + self._n_components = X.shape[1] + + # beta_loss + self._beta_loss = _beta_loss_to_float(self.beta_loss) + + def _check_w_h(self, X, W, H, update_H): + """Check W and H, or initialize them.""" + n_samples, n_features = X.shape + + if self.init == "custom" and update_H: + _check_init(H, (self._n_components, n_features), "NMF (input H)") + _check_init(W, (n_samples, self._n_components), "NMF (input W)") + if self._n_components == "auto": + self._n_components = H.shape[0] + + if H.dtype != X.dtype or W.dtype != X.dtype: + raise TypeError( + "H and W should have the same dtype as X. Got " + "H.dtype = {} and W.dtype = {}.".format(H.dtype, W.dtype) + ) + + elif not update_H: + if W is not None: + warnings.warn( + "When update_H=False, the provided initial W is not used.", + RuntimeWarning, + ) + + _check_init(H, (self._n_components, n_features), "NMF (input H)") + if self._n_components == "auto": + self._n_components = H.shape[0] + + if H.dtype != X.dtype: + raise TypeError( + "H should have the same dtype as X. Got H.dtype = {}.".format( + H.dtype + ) + ) + + # 'mu' solver should not be initialized by zeros + if self.solver == "mu": + avg = np.sqrt(X.mean() / self._n_components) + W = np.full((n_samples, self._n_components), avg, dtype=X.dtype) + else: + W = np.zeros((n_samples, self._n_components), dtype=X.dtype) + + else: + if W is not None or H is not None: + warnings.warn( + ( + "When init!='custom', provided W or H are ignored. Set " + " init='custom' to use them as initialization." 
+ ), + RuntimeWarning, + ) + + if self._n_components == "auto": + self._n_components = X.shape[1] + + W, H = _initialize_nmf( + X, self._n_components, init=self.init, random_state=self.random_state + ) + + return W, H + + def _compute_regularization(self, X): + """Compute scaled regularization terms.""" + n_samples, n_features = X.shape + alpha_W = self.alpha_W + alpha_H = self.alpha_W if self.alpha_H == "same" else self.alpha_H + + l1_reg_W = n_features * alpha_W * self.l1_ratio + l1_reg_H = n_samples * alpha_H * self.l1_ratio + l2_reg_W = n_features * alpha_W * (1.0 - self.l1_ratio) + l2_reg_H = n_samples * alpha_H * (1.0 - self.l1_ratio) + + return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H + + def fit(self, X, y=None, **params): + """Learn a NMF model for the data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + **params : kwargs + Parameters (keyword arguments) and values passed to + the fit_transform instance. + + Returns + ------- + self : object + Returns the instance itself. + """ + # param validation is done in fit_transform + + self.fit_transform(X, **params) + return self + + def inverse_transform(self, X): + """Transform data back to its original space. + + .. versionadded:: 0.18 + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_components) + Transformed data matrix. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Returns a data matrix of the original shape. + """ + + check_is_fitted(self) + return X @ self.components_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.positive_only = True + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class NMF(_BaseNMF): + """Non-Negative Matrix Factorization (NMF). + + Find two non-negative matrices, i.e. matrices with all non-negative elements, (W, H) + whose product approximates the non-negative matrix X. This factorization can be used + for example for dimensionality reduction, source separation or topic extraction. + + The objective function is: + + .. math:: + + L(W, H) &= 0.5 * ||X - WH||_{loss}^2 + + &+ alpha\\_W * l1\\_ratio * n\\_features * ||vec(W)||_1 + + &+ alpha\\_H * l1\\_ratio * n\\_samples * ||vec(H)||_1 + + &+ 0.5 * alpha\\_W * (1 - l1\\_ratio) * n\\_features * ||W||_{Fro}^2 + + &+ 0.5 * alpha\\_H * (1 - l1\\_ratio) * n\\_samples * ||H||_{Fro}^2, + + where :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) and + :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm). + + The generic norm :math:`||X - WH||_{loss}` may represent + the Frobenius norm or another supported beta-divergence loss. + The choice between options is controlled by the `beta_loss` parameter. + + The regularization terms are scaled by `n_features` for `W` and by `n_samples` for + `H` to keep their impact balanced with respect to one another and to the data fit + term as independent as possible of the size `n_samples` of the training set. + + The objective function is minimized with an alternating minimization of W + and H. + + Note that the transformed data is named W and the components matrix is named H. 
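To make the scaling performed by `_compute_regularization` above concrete, here is a tiny standalone sketch; the sample and feature counts are invented purely for illustration.

# Hypothetical sizes and penalties, mirroring _compute_regularization above.
n_samples, n_features = 100, 20
alpha_W, alpha_H, l1_ratio = 0.1, "same", 0.5

alpha_H = alpha_W if alpha_H == "same" else alpha_H
l1_reg_W = n_features * alpha_W * l1_ratio          # 20 * 0.1 * 0.5 = 1.0
l1_reg_H = n_samples * alpha_H * l1_ratio           # 100 * 0.1 * 0.5 = 5.0
l2_reg_W = n_features * alpha_W * (1.0 - l1_ratio)  # 1.0
l2_reg_H = n_samples * alpha_H * (1.0 - l1_ratio)   # 5.0

Scaling the W penalties by n_features and the H penalties by n_samples is what keeps the two penalty terms comparable to each other and to the data-fit term, as stated in the class docstrings.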
In + the NMF literature, the naming convention is usually the opposite since the data + matrix X is transposed. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or {'auto'} or None, default='auto' + Number of components. If `None`, all features are kept. + If `n_components='auto'`, the number of components is automatically inferred + from W or H shapes. + + .. versionchanged:: 1.4 + Added `'auto'` value. + + .. versionchanged:: 1.6 + Default value changed from `None` to `'auto'`. + + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + Method used to initialize the procedure. + Valid options: + + - `None`: 'nndsvda' if n_components <= min(n_samples, n_features), + otherwise random. + + - `'random'`: non-negative random matrices, scaled with: + `sqrt(X.mean() / n_components)` + + - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness) + + - `'nndsvda'`: NNDSVD with zeros filled with the average of X + (better when sparsity is not desired) + + - `'nndsvdar'` NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired) + + - `'custom'`: Use custom matrices `W` and `H` which must both be provided. + + .. versionchanged:: 1.1 + When `init=None` and n_components is less than n_samples and n_features + defaults to `nndsvda` instead of `nndsvd`. + + solver : {'cd', 'mu'}, default='cd' + Numerical solver to use: + + - 'cd' is a Coordinate Descent solver. + - 'mu' is a Multiplicative Update solver. + + .. versionadded:: 0.17 + Coordinate Descent solver. + + .. versionadded:: 0.19 + Multiplicative Update solver. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + Beta divergence to be minimized, measuring the distance between X + and the dot product WH. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input + matrix X cannot contain zeros. Used only in 'mu' solver. + + .. versionadded:: 0.19 + + tol : float, default=1e-4 + Tolerance of the stopping condition. + + max_iter : int, default=200 + Maximum number of iterations before timing out. + + random_state : int, RandomState instance or None, default=None + Used for initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + alpha_W : float, default=0.0 + Constant that multiplies the regularization terms of `W`. Set it to zero + (default) to have no regularization on `W`. + + .. versionadded:: 1.0 + + alpha_H : float or "same", default="same" + Constant that multiplies the regularization terms of `H`. Set it to zero to + have no regularization on `H`. If "same" (default), it takes the same value as + `alpha_W`. + + .. versionadded:: 1.0 + + l1_ratio : float, default=0.0 + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + .. versionadded:: 0.17 + Regularization parameter *l1_ratio* used in the Coordinate Descent + solver. + + verbose : int, default=0 + Whether to be verbose. 
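As the `solver` and `beta_loss` entries above indicate, only the 'mu' solver supports divergences other than Frobenius. A short hedged usage sketch, reusing the same toy array as the docstring examples:

import numpy as np
from sklearn.decomposition import NMF

X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
model = NMF(n_components=2, solver="mu", beta_loss="kullback-leibler",
            init="nndsvda", max_iter=500, random_state=0)
W = model.fit_transform(X)   # minimizes the KL divergence between X and W @ H
H = model.components_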
+ + shuffle : bool, default=False + If true, randomize the order of coordinates in the CD solver. + + .. versionadded:: 0.17 + *shuffle* parameter used in the Coordinate Descent solver. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_components_ : int + The number of components. It is same as the `n_components` parameter + if it was given. Otherwise, it will be same as the number of + features. + + reconstruction_err_ : float + Frobenius norm of the matrix difference, or beta-divergence, between + the training data ``X`` and the reconstructed data ``WH`` from + the fitted model. + + n_iter_ : int + Actual number of iterations. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis. + PCA : Principal component analysis. + SparseCoder : Find a sparse representation of data from a fixed, + precomputed dictionary. + SparsePCA : Sparse Principal Components Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. + + .. [2] :doi:`"Algorithms for nonnegative matrix factorization with the + beta-divergence" <10.1162/NECO_a_00168>` + Fevotte, C., & Idier, J. (2011). Neural Computation, 23(9). + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import NMF + >>> model = NMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + """ + + _parameter_constraints: dict = { + **_BaseNMF._parameter_constraints, + "solver": [StrOptions({"mu", "cd"})], + "shuffle": ["boolean"], + } + + def __init__( + self, + n_components="auto", + *, + init=None, + solver="cd", + beta_loss="frobenius", + tol=1e-4, + max_iter=200, + random_state=None, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + verbose=0, + shuffle=False, + ): + super().__init__( + n_components=n_components, + init=init, + beta_loss=beta_loss, + tol=tol, + max_iter=max_iter, + random_state=random_state, + alpha_W=alpha_W, + alpha_H=alpha_H, + l1_ratio=l1_ratio, + verbose=verbose, + ) + + self.solver = solver + self.shuffle = shuffle + + def _check_params(self, X): + super()._check_params(X) + + # solver + if self.solver != "mu" and self.beta_loss not in (2, "frobenius"): + # 'mu' is the only solver that handles other beta losses than 'frobenius' + raise ValueError( + f"Invalid beta_loss parameter: solver {self.solver!r} does not handle " + f"beta_loss = {self.beta_loss!r}" + ) + if self.solver == "mu" and self.init == "nndsvd": + warnings.warn( + ( + "The multiplicative update ('mu') solver cannot update " + "zeros present in the initialization, and so leads to " + "poorer results when used jointly with init='nndsvd'. 
" + "You may try init='nndsvda' or init='nndsvdar' instead." + ), + UserWarning, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, W=None, H=None): + """Learn a NMF model for the data X and returns the transformed data. + + This is more efficient than calling fit followed by transform. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + X = validate_data( + self, X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] + ) + + with config_context(assume_finite=True): + W, H, n_iter = self._fit_transform(X, W=W, H=H) + + self.reconstruction_err_ = _beta_divergence( + X, W, H, self._beta_loss, square_root=True + ) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_iter_ = n_iter + + return W + + def _fit_transform(self, X, y=None, W=None, H=None, update_H=True): + """Learn a NMF model for the data X and returns the transformed data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed + + y : Ignored + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is initialised as an array of zeros, unless + `solver='mu'`, then it is filled with values calculated by + `np.sqrt(X.mean() / self._n_components)`. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is used as a constant, to solve for W only. + If `None`, uses the initialisation method specified in `init`. + + update_H : bool, default=True + If True, both W and H will be estimated from initial guesses, + this corresponds to a call to the 'fit_transform' method. + If False, only W will be estimated, this corresponds to a call + to the 'transform' method. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + + H : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_iter_ : int + Actual number of iterations. + """ + # check parameters + self._check_params(X) + + if X.min() == 0 and self._beta_loss <= 0: + raise ValueError( + "When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values " + "to X, or use a positive beta_loss." 
+ ) + + # initialize or check W and H + W, H = self._check_w_h(X, W, H, update_H) + + # scale the regularization terms + l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = self._compute_regularization(X) + + if self.solver == "cd": + W, H, n_iter = _fit_coordinate_descent( + X, + W, + H, + self.tol, + self.max_iter, + l1_reg_W, + l1_reg_H, + l2_reg_W, + l2_reg_H, + update_H=update_H, + verbose=self.verbose, + shuffle=self.shuffle, + random_state=self.random_state, + ) + elif self.solver == "mu": + W, H, n_iter, *_ = _fit_multiplicative_update( + X, + W, + H, + self._beta_loss, + self.max_iter, + self.tol, + l1_reg_W, + l1_reg_H, + l2_reg_W, + l2_reg_H, + update_H, + self.verbose, + ) + else: + raise ValueError("Invalid solver parameter '%s'." % self.solver) + + if n_iter == self.max_iter and self.tol > 0: + warnings.warn( + "Maximum number of iterations %d reached. Increase " + "it to improve convergence." % self.max_iter, + ConvergenceWarning, + ) + + return W, H, n_iter + + def transform(self, X): + """Transform the data X according to the fitted NMF model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=[np.float64, np.float32], + reset=False, + ensure_non_negative=True, + ) + + with config_context(assume_finite=True): + W, *_ = self._fit_transform(X, H=self.components_, update_H=False) + + return W + + +class MiniBatchNMF(_BaseNMF): + """Mini-Batch Non-Negative Matrix Factorization (NMF). + + .. versionadded:: 1.1 + + Find two non-negative matrices, i.e. matrices with all non-negative elements, + (`W`, `H`) whose product approximates the non-negative matrix `X`. This + factorization can be used for example for dimensionality reduction, source + separation or topic extraction. + + The objective function is: + + .. math:: + + L(W, H) &= 0.5 * ||X - WH||_{loss}^2 + + &+ alpha\\_W * l1\\_ratio * n\\_features * ||vec(W)||_1 + + &+ alpha\\_H * l1\\_ratio * n\\_samples * ||vec(H)||_1 + + &+ 0.5 * alpha\\_W * (1 - l1\\_ratio) * n\\_features * ||W||_{Fro}^2 + + &+ 0.5 * alpha\\_H * (1 - l1\\_ratio) * n\\_samples * ||H||_{Fro}^2, + + where :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) and + :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm). + + The generic norm :math:`||X - WH||_{loss}^2` may represent + the Frobenius norm or another supported beta-divergence loss. + The choice between options is controlled by the `beta_loss` parameter. + + The objective function is minimized with an alternating minimization of `W` + and `H`. + + Note that the transformed data is named `W` and the components matrix is + named `H`. In the NMF literature, the naming convention is usually the opposite + since the data matrix `X` is transposed. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int or {'auto'} or None, default='auto' + Number of components. If `None`, all features are kept. + If `n_components='auto'`, the number of components is automatically inferred + from W or H shapes. + + .. versionchanged:: 1.4 + Added `'auto'` value. + + .. versionchanged:: 1.6 + Default value changed from `None` to `'auto'`. 
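The `transform` method above reuses the fitted `components_` and re-runs the solver with `update_H=False`, i.e. it only solves for W on new data. A hedged sketch with arbitrary random matrices:

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
X_train = np.abs(rng.standard_normal((50, 8)))
X_new = np.abs(rng.standard_normal((5, 8)))

nmf = NMF(n_components=3, init="nndsvda", max_iter=400, random_state=0)
W_train = nmf.fit_transform(X_train)  # estimates both W and H
W_new = nmf.transform(X_new)          # estimates W only, H fixed to nmf.components_
X_approx = W_new @ nmf.components_    # equivalent to nmf.inverse_transform(W_new)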
+ + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + Method used to initialize the procedure. + Valid options: + + - `None`: 'nndsvda' if `n_components <= min(n_samples, n_features)`, + otherwise random. + + - `'random'`: non-negative random matrices, scaled with: + `sqrt(X.mean() / n_components)` + + - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD) + initialization (better for sparseness). + + - `'nndsvda'`: NNDSVD with zeros filled with the average of X + (better when sparsity is not desired). + + - `'nndsvdar'` NNDSVD with zeros filled with small random values + (generally faster, less accurate alternative to NNDSVDa + for when sparsity is not desired). + + - `'custom'`: Use custom matrices `W` and `H` which must both be provided. + + batch_size : int, default=1024 + Number of samples in each mini-batch. Large batch sizes + give better long-term convergence at the cost of a slower start. + + beta_loss : float or {'frobenius', 'kullback-leibler', \ + 'itakura-saito'}, default='frobenius' + Beta divergence to be minimized, measuring the distance between `X` + and the dot product `WH`. Note that values different from 'frobenius' + (or 2) and 'kullback-leibler' (or 1) lead to significantly slower + fits. Note that for `beta_loss <= 0` (or 'itakura-saito'), the input + matrix `X` cannot contain zeros. + + tol : float, default=1e-4 + Control early stopping based on the norm of the differences in `H` + between 2 steps. To disable early stopping based on changes in `H`, set + `tol` to 0.0. + + max_no_improvement : int, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + To disable convergence detection based on cost function, set + `max_no_improvement` to None. + + max_iter : int, default=200 + Maximum number of iterations over the complete dataset before + timing out. + + alpha_W : float, default=0.0 + Constant that multiplies the regularization terms of `W`. Set it to zero + (default) to have no regularization on `W`. + + alpha_H : float or "same", default="same" + Constant that multiplies the regularization terms of `H`. Set it to zero to + have no regularization on `H`. If "same" (default), it takes the same value as + `alpha_W`. + + l1_ratio : float, default=0.0 + The regularization mixing parameter, with 0 <= l1_ratio <= 1. + For l1_ratio = 0 the penalty is an elementwise L2 penalty + (aka Frobenius Norm). + For l1_ratio = 1 it is an elementwise L1 penalty. + For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + + forget_factor : float, default=0.7 + Amount of rescaling of past information. Its value could be 1 with + finite datasets. Choosing values < 1 is recommended with online + learning as more recent batches will weight more than past batches. + + fresh_restarts : bool, default=False + Whether to completely solve for W at each step. Doing fresh restarts will likely + lead to a better solution for a same number of iterations but it is much slower. + + fresh_restarts_max_iter : int, default=30 + Maximum number of iterations when solving for W at each step. Only used when + doing fresh restarts. These iterations may be stopped early based on a small + change of W controlled by `tol`. + + transform_max_iter : int, default=None + Maximum number of iterations when solving for W at transform time. + If None, it defaults to `max_iter`. 
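Note that `forget_factor` is not applied per sample: `_check_params` further down rescales it per mini-batch as `rho = forget_factor ** (batch_size / n_samples)`, and the accumulators A and B in `_multiplicative_update_h` are multiplied by this rho before each batch update. A quick numeric check with made-up sizes:

forget_factor = 0.7
batch_size, n_samples = 1024, 10240
rho = forget_factor ** (batch_size / n_samples)  # 0.7 ** 0.1, roughly 0.965
# Past sufficient statistics are discounted by ~3.5% per mini-batch here,
# so older batches fade geometrically while recent ones dominate.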
+ + random_state : int, RandomState instance or None, default=None + Used for initialisation (when ``init`` == 'nndsvdar' or + 'random'), and in Coordinate Descent. Pass an int for reproducible + results across multiple function calls. + See :term:`Glossary `. + + verbose : bool, default=False + Whether to be verbose. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_components_ : int + The number of components. It is same as the `n_components` parameter + if it was given. Otherwise, it will be same as the number of + features. + + reconstruction_err_ : float + Frobenius norm of the matrix difference, or beta-divergence, between + the training data `X` and the reconstructed data `WH` from + the fitted model. + + n_iter_ : int + Actual number of started iterations over the whole dataset. + + n_steps_ : int + Number of mini-batches processed. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + See Also + -------- + NMF : Non-negative matrix factorization. + MiniBatchDictionaryLearning : Finds a dictionary that can best be used to represent + data using a sparse code. + + References + ---------- + .. [1] :doi:`"Fast local algorithms for large scale nonnegative matrix and tensor + factorizations" <10.1587/transfun.E92.A.708>` + Cichocki, Andrzej, and P. H. A. N. Anh-Huy. IEICE transactions on fundamentals + of electronics, communications and computer sciences 92.3: 708-721, 2009. + + .. [2] :doi:`"Algorithms for nonnegative matrix factorization with the + beta-divergence" <10.1162/NECO_a_00168>` + Fevotte, C., & Idier, J. (2011). Neural Computation, 23(9). + + .. [3] :doi:`"Online algorithms for nonnegative matrix factorization with the + Itakura-Saito divergence" <10.1109/ASPAA.2011.6082314>` + Lefevre, A., Bach, F., Fevotte, C. (2011). WASPA. 
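Because `partial_fit` (defined at the end of this class) updates the same accumulators one batch at a time, the estimator can also be trained out of core. A hedged sketch with invented shapes and chunking:

import numpy as np
from sklearn.decomposition import MiniBatchNMF

rng = np.random.default_rng(0)
X = np.abs(rng.standard_normal((2000, 30)))       # stand-in for a large dataset

mbnmf = MiniBatchNMF(n_components=5, batch_size=256, random_state=0)
for chunk in np.array_split(X, 8):                # pretend chunks arrive one by one
    mbnmf.partial_fit(chunk)

W = mbnmf.transform(X)                            # project using the learned components_
print(mbnmf.n_steps_, mbnmf.components_.shape)    # 8 steps, (5, 30) dictionary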
+ + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]]) + >>> from sklearn.decomposition import MiniBatchNMF + >>> model = MiniBatchNMF(n_components=2, init='random', random_state=0) + >>> W = model.fit_transform(X) + >>> H = model.components_ + """ + + _parameter_constraints: dict = { + **_BaseNMF._parameter_constraints, + "max_no_improvement": [Interval(Integral, 1, None, closed="left"), None], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "forget_factor": [Interval(Real, 0, 1, closed="both")], + "fresh_restarts": ["boolean"], + "fresh_restarts_max_iter": [Interval(Integral, 1, None, closed="left")], + "transform_max_iter": [Interval(Integral, 1, None, closed="left"), None], + } + + def __init__( + self, + n_components="auto", + *, + init=None, + batch_size=1024, + beta_loss="frobenius", + tol=1e-4, + max_no_improvement=10, + max_iter=200, + alpha_W=0.0, + alpha_H="same", + l1_ratio=0.0, + forget_factor=0.7, + fresh_restarts=False, + fresh_restarts_max_iter=30, + transform_max_iter=None, + random_state=None, + verbose=0, + ): + super().__init__( + n_components=n_components, + init=init, + beta_loss=beta_loss, + tol=tol, + max_iter=max_iter, + random_state=random_state, + alpha_W=alpha_W, + alpha_H=alpha_H, + l1_ratio=l1_ratio, + verbose=verbose, + ) + + self.max_no_improvement = max_no_improvement + self.batch_size = batch_size + self.forget_factor = forget_factor + self.fresh_restarts = fresh_restarts + self.fresh_restarts_max_iter = fresh_restarts_max_iter + self.transform_max_iter = transform_max_iter + + def _check_params(self, X): + super()._check_params(X) + + # batch_size + self._batch_size = min(self.batch_size, X.shape[0]) + + # forget_factor + self._rho = self.forget_factor ** (self._batch_size / X.shape[0]) + + # gamma for Maximization-Minimization (MM) algorithm [Fevotte 2011] + if self._beta_loss < 1: + self._gamma = 1.0 / (2.0 - self._beta_loss) + elif self._beta_loss > 2: + self._gamma = 1.0 / (self._beta_loss - 1.0) + else: + self._gamma = 1.0 + + # transform_max_iter + self._transform_max_iter = ( + self.max_iter + if self.transform_max_iter is None + else self.transform_max_iter + ) + + return self + + def _solve_W(self, X, H, max_iter): + """Minimize the objective function w.r.t W. + + Update W with H being fixed, until convergence. This is the heart + of `transform` but it's also used during `fit` when doing fresh restarts. + """ + avg = np.sqrt(X.mean() / self._n_components) + W = np.full((X.shape[0], self._n_components), avg, dtype=X.dtype) + W_buffer = W.copy() + + # Get scaled regularization terms. Done for each minibatch to take into account + # variable sizes of minibatches. + l1_reg_W, _, l2_reg_W, _ = self._compute_regularization(X) + + for _ in range(max_iter): + W, *_ = _multiplicative_update_w( + X, W, H, self._beta_loss, l1_reg_W, l2_reg_W, self._gamma + ) + + W_diff = linalg.norm(W - W_buffer) / linalg.norm(W) + if self.tol > 0 and W_diff <= self.tol: + break + + W_buffer[:] = W + + return W + + def _minibatch_step(self, X, W, H, update_H): + """Perform the update of W and H for one minibatch.""" + batch_size = X.shape[0] + + # get scaled regularization terms. Done for each minibatch to take into account + # variable sizes of minibatches. 
+ l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = self._compute_regularization(X) + + # update W + if self.fresh_restarts or W is None: + W = self._solve_W(X, H, self.fresh_restarts_max_iter) + else: + W, *_ = _multiplicative_update_w( + X, W, H, self._beta_loss, l1_reg_W, l2_reg_W, self._gamma + ) + + # necessary for stability with beta_loss < 1 + if self._beta_loss < 1: + W[W < np.finfo(np.float64).eps] = 0.0 + + batch_cost = ( + _beta_divergence(X, W, H, self._beta_loss) + + l1_reg_W * W.sum() + + l1_reg_H * H.sum() + + l2_reg_W * (W**2).sum() + + l2_reg_H * (H**2).sum() + ) / batch_size + + # update H (only at fit or fit_transform) + if update_H: + H[:] = _multiplicative_update_h( + X, + W, + H, + beta_loss=self._beta_loss, + l1_reg_H=l1_reg_H, + l2_reg_H=l2_reg_H, + gamma=self._gamma, + A=self._components_numerator, + B=self._components_denominator, + rho=self._rho, + ) + + # necessary for stability with beta_loss < 1 + if self._beta_loss <= 1: + H[H < np.finfo(np.float64).eps] = 0.0 + + return batch_cost + + def _minibatch_convergence( + self, X, batch_cost, H, H_buffer, n_samples, step, n_steps + ): + """Helper function to encapsulate the early stopping logic""" + batch_size = X.shape[0] + + # counts steps starting from 1 for user friendly verbose mode. + step = step + 1 + + # Ignore first iteration because H is not updated yet. + if step == 1: + if self.verbose: + print(f"Minibatch step {step}/{n_steps}: mean batch cost: {batch_cost}") + return False + + # Compute an Exponentially Weighted Average of the cost function to + # monitor the convergence while discarding minibatch-local stochastic + # variability: https://en.wikipedia.org/wiki/Moving_average + if self._ewa_cost is None: + self._ewa_cost = batch_cost + else: + alpha = batch_size / (n_samples + 1) + alpha = min(alpha, 1) + self._ewa_cost = self._ewa_cost * (1 - alpha) + batch_cost * alpha + + # Log progress to be able to monitor convergence + if self.verbose: + print( + f"Minibatch step {step}/{n_steps}: mean batch cost: " + f"{batch_cost}, ewa cost: {self._ewa_cost}" + ) + + # Early stopping based on change of H + H_diff = linalg.norm(H - H_buffer) / linalg.norm(H) + if self.tol > 0 and H_diff <= self.tol: + if self.verbose: + print(f"Converged (small H change) at step {step}/{n_steps}") + return True + + # Early stopping heuristic due to lack of improvement on smoothed + # cost function + if self._ewa_cost_min is None or self._ewa_cost < self._ewa_cost_min: + self._no_improvement = 0 + self._ewa_cost_min = self._ewa_cost + else: + self._no_improvement += 1 + + if ( + self.max_no_improvement is not None + and self._no_improvement >= self.max_no_improvement + ): + if self.verbose: + print( + "Converged (lack of improvement in objective function) " + f"at step {step}/{n_steps}" + ) + return True + + return False + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, W=None, H=None): + """Learn a NMF model for the data X and returns the transformed data. + + This is more efficient than calling fit followed by transform. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed. + + y : Ignored + Not used, present here for API consistency by convention. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. 
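A standalone sketch of the exponentially weighted average that `_minibatch_convergence` above uses to smooth the per-batch cost before checking for lack of improvement; the cost values below are fabricated:

batch_size, n_samples = 256, 2000
alpha = min(batch_size / (n_samples + 1), 1)      # smoothing weight, ~0.128 here

ewa_cost = None
for batch_cost in [1.0, 0.80, 0.75, 0.74, 0.74]:  # made-up per-batch costs
    if ewa_cost is None:
        ewa_cost = batch_cost                     # first batch seeds the average
    else:
        ewa_cost = ewa_cost * (1 - alpha) + batch_cost * alpha
print(ewa_cost)                                   # smoothed cost, lags the raw values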
+ + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `None`, uses the initialisation method specified in `init`. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + X = validate_data( + self, X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] + ) + + with config_context(assume_finite=True): + W, H, n_iter, n_steps = self._fit_transform(X, W=W, H=H) + + self.reconstruction_err_ = _beta_divergence( + X, W, H, self._beta_loss, square_root=True + ) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_iter_ = n_iter + self.n_steps_ = n_steps + + return W + + def _fit_transform(self, X, W=None, H=None, update_H=True): + """Learn a NMF model for the data X and returns the transformed data. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is initialised as an array of zeros, unless + `solver='mu'`, then it is filled with values calculated by + `np.sqrt(X.mean() / self._n_components)`. + If `None`, uses the initialisation method specified in `init`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + If `update_H=False`, it is used as a constant, to solve for W only. + If `None`, uses the initialisation method specified in `init`. + + update_H : bool, default=True + If True, both W and H will be estimated from initial guesses, + this corresponds to a call to the `fit_transform` method. + If False, only W will be estimated, this corresponds to a call + to the `transform` method. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + + H : ndarray of shape (n_components, n_features) + Factorization matrix, sometimes called 'dictionary'. + + n_iter : int + Actual number of started iterations over the whole dataset. + + n_steps : int + Number of mini-batches processed. + """ + check_non_negative(X, "MiniBatchNMF (input X)") + self._check_params(X) + + if X.min() == 0 and self._beta_loss <= 0: + raise ValueError( + "When beta_loss <= 0 and X contains zeros, " + "the solver may diverge. Please add small values " + "to X, or use a positive beta_loss." 
+ ) + + n_samples = X.shape[0] + + # initialize or check W and H + W, H = self._check_w_h(X, W, H, update_H) + H_buffer = H.copy() + + # Initialize auxiliary matrices + self._components_numerator = H.copy() + self._components_denominator = np.ones(H.shape, dtype=H.dtype) + + # Attributes to monitor the convergence + self._ewa_cost = None + self._ewa_cost_min = None + self._no_improvement = 0 + + batches = gen_batches(n_samples, self._batch_size) + batches = itertools.cycle(batches) + n_steps_per_iter = int(np.ceil(n_samples / self._batch_size)) + n_steps = self.max_iter * n_steps_per_iter + + for i, batch in zip(range(n_steps), batches): + batch_cost = self._minibatch_step(X[batch], W[batch], H, update_H) + + if update_H and self._minibatch_convergence( + X[batch], batch_cost, H, H_buffer, n_samples, i, n_steps + ): + break + + H_buffer[:] = H + + if self.fresh_restarts: + W = self._solve_W(X, H, self._transform_max_iter) + + n_steps = i + 1 + n_iter = int(np.ceil(n_steps / n_steps_per_iter)) + + if n_iter == self.max_iter and self.tol > 0: + warnings.warn( + ( + f"Maximum number of iterations {self.max_iter} reached. " + "Increase it to improve convergence." + ), + ConvergenceWarning, + ) + + return W, H, n_iter, n_steps + + def transform(self, X): + """Transform the data X according to the fitted MiniBatchNMF model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be transformed by the model. + + Returns + ------- + W : ndarray of shape (n_samples, n_components) + Transformed data. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=[np.float64, np.float32], + reset=False, + ) + + W = self._solve_W(X, self.components_, self._transform_max_iter) + + return W + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, W=None, H=None): + """Update the model using the data in `X` as a mini-batch. + + This method is expected to be called several times consecutively + on different chunks of a dataset so as to implement out-of-core + or online learning. + + This is especially useful when the whole dataset is too big to fit in + memory at once (see :ref:`scaling_strategies`). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data matrix to be decomposed. + + y : Ignored + Not used, present here for API consistency by convention. + + W : array-like of shape (n_samples, n_components), default=None + If `init='custom'`, it is used as initial guess for the solution. + Only used for the first call to `partial_fit`. + + H : array-like of shape (n_components, n_features), default=None + If `init='custom'`, it is used as initial guess for the solution. + Only used for the first call to `partial_fit`. + + Returns + ------- + self + Returns the instance itself. 
+ """ + has_components = hasattr(self, "components_") + + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=[np.float64, np.float32], + reset=not has_components, + ) + + if not has_components: + # This instance has not been fitted yet (fit or partial_fit) + self._check_params(X) + _, H = self._check_w_h(X, W=W, H=H, update_H=True) + + self._components_numerator = H.copy() + self._components_denominator = np.ones(H.shape, dtype=H.dtype) + self.n_steps_ = 0 + else: + H = self.components_ + + self._minibatch_step(X, None, H, update_H=True) + + self.n_components_ = H.shape[0] + self.components_ = H + self.n_steps_ += 1 + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_online_lda_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_online_lda_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..14f45ba9675f5e6b188d3dfdd1c06bdf4136f0ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_online_lda_fast.pyx @@ -0,0 +1,110 @@ +import numpy as np + + +from cython cimport floating +from libc.math cimport exp, fabs, log + +from ..utils._typedefs cimport float64_t, intp_t + + +def mean_change(const floating[:] arr_1, const floating[:] arr_2): + """Calculate the mean difference between two arrays. + + Equivalent to np.abs(arr_1 - arr2).mean(). + """ + + cdef float64_t total, diff + cdef intp_t i, size + + size = arr_1.shape[0] + total = 0.0 + for i in range(size): + diff = fabs(arr_1[i] - arr_2[i]) + total += diff + + return total / size + + +def _dirichlet_expectation_1d( + floating[:] doc_topic, + floating doc_topic_prior, + floating[:] out +): + """Dirichlet expectation for a single sample: + exp(E[log(theta)]) for theta ~ Dir(doc_topic) + after adding doc_topic_prior to doc_topic, in-place. + + Equivalent to + doc_topic += doc_topic_prior + out[:] = np.exp(psi(doc_topic) - psi(np.sum(doc_topic))) + """ + + cdef floating dt, psi_total, total + cdef intp_t i, size + + size = doc_topic.shape[0] + + total = 0.0 + for i in range(size): + dt = doc_topic[i] + doc_topic_prior + doc_topic[i] = dt + total += dt + psi_total = psi(total) + + for i in range(size): + out[i] = exp(psi(doc_topic[i]) - psi_total) + + +def _dirichlet_expectation_2d(const floating[:, :] arr): + """Dirichlet expectation for multiple samples: + E[log(theta)] for theta ~ Dir(arr). + + Equivalent to psi(arr) - psi(np.sum(arr, axis=1))[:, np.newaxis]. + + Note that unlike _dirichlet_expectation_1d, this function doesn't compute + the exp and doesn't add in the prior. + """ + cdef floating row_total, psi_row_total + cdef floating[:, :] d_exp + cdef intp_t i, j, n_rows, n_cols + + n_rows = arr.shape[0] + n_cols = arr.shape[1] + + d_exp = np.empty_like(arr) + for i in range(n_rows): + row_total = 0 + for j in range(n_cols): + row_total += arr[i, j] + psi_row_total = psi(row_total) + + for j in range(n_cols): + d_exp[i, j] = psi(arr[i, j]) - psi_row_total + + return d_exp.base + + +# Psi function for positive arguments. Optimized for speed, not accuracy. +# +# After: J. Bernardo (1976). Algorithm AS 103: Psi (Digamma) Function. +# https://www.uv.es/~bernardo/1976AppStatist.pdf +cdef floating psi(floating x) noexcept nogil: + cdef double EULER = 0.577215664901532860606512090082402431 + if x <= 1e-6: + # psi(x) = -EULER - 1/x + O(x) + return -EULER - 1. / x + + cdef floating r, result = 0 + + # psi(x + 1) = psi(x) + 1/x + while x < 6: + result -= 1. 
/ x + x += 1 + + # psi(x) = log(x) - 1/(2x) - 1/(12x**2) + 1/(120x**4) - 1/(252x**6) + # + O(1/x**8) + r = 1. / x + result += log(x) - .5 * r + r = r * r + result -= r * ((1./12.) - r * ((1./120.) - r * (1./252.))) + return result diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0d21d5d38be9288a7e3c3405bd4ca449cfd808 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_pca.py @@ -0,0 +1,857 @@ +"""Principal Component Analysis.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from math import lgamma, log, sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.sparse import issparse +from scipy.sparse.linalg import svds + +from ..base import _fit_context +from ..utils import check_random_state +from ..utils._arpack import _init_arpack_v0 +from ..utils._array_api import _convert_to_numpy, get_namespace +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils.extmath import _randomized_svd, fast_logdet, stable_cumsum, svd_flip +from ..utils.sparsefuncs import _implicit_column_offset, mean_variance_axis +from ..utils.validation import check_is_fitted, validate_data +from ._base import _BasePCA + + +def _assess_dimension(spectrum, rank, n_samples): + """Compute the log-likelihood of a rank ``rank`` dataset. + + The dataset is assumed to be embedded in gaussian noise of shape(n, + dimf) having spectrum ``spectrum``. This implements the method of + T. P. Minka. + + Parameters + ---------- + spectrum : ndarray of shape (n_features,) + Data spectrum. + rank : int + Tested rank value. It should be strictly lower than n_features, + otherwise the method isn't specified (division by zero in equation + (31) from the paper). + n_samples : int + Number of samples. + + Returns + ------- + ll : float + The log-likelihood. + + References + ---------- + This implements the method of `Thomas P. Minka: + Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604 + `_ + """ + xp, _ = get_namespace(spectrum) + + n_features = spectrum.shape[0] + if not 1 <= rank < n_features: + raise ValueError("the tested rank should be in [1, n_features - 1]") + + eps = 1e-15 + + if spectrum[rank - 1] < eps: + # When the tested rank is associated with a small eigenvalue, there's + # no point in computing the log-likelihood: it's going to be very + # small and won't be the max anyway. Also, it can lead to numerical + # issues below when computing pa, in particular in log((spectrum[i] - + # spectrum[j]) because this will take the log of something very small. 
+ return -xp.inf + + pu = -rank * log(2.0) + for i in range(1, rank + 1): + pu += ( + lgamma((n_features - i + 1) / 2.0) - log(xp.pi) * (n_features - i + 1) / 2.0 + ) + + pl = xp.sum(xp.log(spectrum[:rank])) + pl = -pl * n_samples / 2.0 + + v = max(eps, xp.sum(spectrum[rank:]) / (n_features - rank)) + pv = -log(v) * n_samples * (n_features - rank) / 2.0 + + m = n_features * rank - rank * (rank + 1.0) / 2.0 + pp = log(2.0 * xp.pi) * (m + rank) / 2.0 + + pa = 0.0 + spectrum_ = xp.asarray(spectrum, copy=True) + spectrum_[rank:n_features] = v + for i in range(rank): + for j in range(i + 1, spectrum.shape[0]): + pa += log( + (spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i]) + ) + log(n_samples) + + ll = pu + pl + pv + pp - pa / 2.0 - rank * log(n_samples) / 2.0 + + return ll + + +def _infer_dimension(spectrum, n_samples): + """Infers the dimension of a dataset with a given spectrum. + + The returned value will be in [1, n_features - 1]. + """ + xp, _ = get_namespace(spectrum) + + ll = xp.empty_like(spectrum) + ll[0] = -xp.inf # we don't want to return n_components = 0 + for rank in range(1, spectrum.shape[0]): + ll[rank] = _assess_dimension(spectrum, rank, n_samples) + return xp.argmax(ll) + + +class PCA(_BasePCA): + """Principal component analysis (PCA). + + Linear dimensionality reduction using Singular Value Decomposition of the + data to project it to a lower dimensional space. The input data is centered + but not scaled for each feature before applying the SVD. + + It uses the LAPACK implementation of the full SVD or a randomized truncated + SVD by the method of Halko et al. 2009, depending on the shape of the input + data and the number of components to extract. + + With sparse inputs, the ARPACK implementation of the truncated SVD can be + used (i.e. through :func:`scipy.sparse.linalg.svds`). Alternatively, one + may consider :class:`TruncatedSVD` where the data are not centered. + + Notice that this class only supports sparse inputs for some solvers such as + "arpack" and "covariance_eigh". See :class:`TruncatedSVD` for an + alternative with sparse data. + + For a usage example, see + :ref:`sphx_glr_auto_examples_decomposition_plot_pca_iris.py` + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, float or 'mle', default=None + Number of components to keep. + if n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's + MLE is used to guess the dimension. Use of ``n_components == 'mle'`` + will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``. + + If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the + number of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components. + + If ``svd_solver == 'arpack'``, the number of components must be + strictly less than the minimum of n_features and n_samples. + + Hence, the None case results in:: + + n_components == min(n_samples, n_features) - 1 + + copy : bool, default=True + If False, data passed to fit are overwritten and running + fit(X).transform(X) will not yield the expected results, + use fit_transform(X) instead. + + whiten : bool, default=False + When True (False by default) the `components_` vectors are multiplied + by the square root of n_samples and then divided by the singular values + to ensure uncorrelated outputs with unit component-wise variances. 
+ + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometime + improve the predictive accuracy of the downstream estimators by + making their data respect some hard-wired assumptions. + + svd_solver : {'auto', 'full', 'covariance_eigh', 'arpack', 'randomized'},\ + default='auto' + "auto" : + The solver is selected by a default 'auto' policy is based on `X.shape` and + `n_components`: if the input data has fewer than 1000 features and + more than 10 times as many samples, then the "covariance_eigh" + solver is used. Otherwise, if the input data is larger than 500x500 + and the number of components to extract is lower than 80% of the + smallest dimension of the data, then the more efficient + "randomized" method is selected. Otherwise the exact "full" SVD is + computed and optionally truncated afterwards. + "full" : + Run exact full SVD calling the standard LAPACK solver via + `scipy.linalg.svd` and select the components by postprocessing + "covariance_eigh" : + Precompute the covariance matrix (on centered data), run a + classical eigenvalue decomposition on the covariance matrix + typically using LAPACK and select the components by postprocessing. + This solver is very efficient for n_samples >> n_features and small + n_features. It is, however, not tractable otherwise for large + n_features (large memory footprint required to materialize the + covariance matrix). Also note that compared to the "full" solver, + this solver effectively doubles the condition number and is + therefore less numerical stable (e.g. on input data with a large + range of singular values). + "arpack" : + Run SVD truncated to `n_components` calling ARPACK solver via + `scipy.sparse.linalg.svds`. It requires strictly + `0 < n_components < min(X.shape)` + "randomized" : + Run randomized SVD by the method of Halko et al. + + .. versionadded:: 0.18.0 + + .. versionchanged:: 1.5 + Added the 'covariance_eigh' solver. + + tol : float, default=0.0 + Tolerance for singular values computed by svd_solver == 'arpack'. + Must be of range [0.0, infinity). + + .. versionadded:: 0.18.0 + + iterated_power : int or 'auto', default='auto' + Number of iterations for the power method computed by + svd_solver == 'randomized'. + Must be of range [0, infinity). + + .. versionadded:: 0.18.0 + + n_oversamples : int, default=10 + This parameter is only relevant when `svd_solver="randomized"`. + It corresponds to the additional number of random vectors to sample the + range of `X` so as to ensure proper conditioning. See + :func:`~sklearn.utils.extmath.randomized_svd` for more details. + + .. versionadded:: 1.1 + + power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto' + Power iteration normalizer for randomized SVD solver. + Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` + for more details. + + .. versionadded:: 1.1 + + random_state : int, RandomState instance or None, default=None + Used when the 'arpack' or 'randomized' solvers are used. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 0.18.0 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Principal axes in feature space, representing the directions of + maximum variance in the data. Equivalently, the right singular + vectors of the centered input data, parallel to its eigenvectors. + The components are sorted by decreasing ``explained_variance_``. 
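The fractional form of `n_components` described above (keep as many components as needed to reach a target explained-variance ratio, used with the 'full' solver) in a short sketch; the data is random and purely illustrative:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 10)) @ rng.standard_normal((10, 10))

pca = PCA(n_components=0.95, svd_solver="full").fit(X)
print(pca.n_components_)                       # however many components reach 95%
print(pca.explained_variance_ratio_.sum())     # at least 0.95 by construction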
+ + explained_variance_ : ndarray of shape (n_components,) + The amount of variance explained by each of the selected components. + The variance estimation uses `n_samples - 1` degrees of freedom. + + Equal to n_components largest eigenvalues + of the covariance matrix of X. + + .. versionadded:: 0.18 + + explained_variance_ratio_ : ndarray of shape (n_components,) + Percentage of variance explained by each of the selected components. + + If ``n_components`` is not set then all components are stored and the + sum of the ratios is equal to 1.0. + + singular_values_ : ndarray of shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + .. versionadded:: 0.19 + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + + Equal to `X.mean(axis=0)`. + + n_components_ : int + The estimated number of components. When n_components is set + to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this + number is estimated from input data. Otherwise it equals the parameter + n_components, or the lesser value of n_features and n_samples + if n_components is None. + + n_samples_ : int + Number of samples in the training data. + + noise_variance_ : float + The estimated noise covariance following the Probabilistic PCA model + from Tipping and Bishop 1999. See "Pattern Recognition and + Machine Learning" by C. Bishop, 12.2.1 p. 574 or + http://www.miketipping.com/papers/met-mppca.pdf. It is required to + compute the estimated data covariance and score samples. + + Equal to the average of (min(n_features, n_samples) - n_components) + smallest eigenvalues of the covariance matrix of X. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + KernelPCA : Kernel Principal Component Analysis. + SparsePCA : Sparse Principal Component Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + IncrementalPCA : Incremental Principal Component Analysis. + + References + ---------- + For n_components == 'mle', this class uses the method from: + `Minka, T. P.. "Automatic choice of dimensionality for PCA". + In NIPS, pp. 598-604 `_ + + Implements the probabilistic PCA model from: + `Tipping, M. E., and Bishop, C. M. (1999). "Probabilistic principal + component analysis". Journal of the Royal Statistical Society: + Series B (Statistical Methodology), 61(3), 611-622. + `_ + via the score and score_samples methods. + + For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`. + + For svd_solver == 'randomized', see: + :doi:`Halko, N., Martinsson, P. G., and Tropp, J. A. (2011). + "Finding structure with randomness: Probabilistic algorithms for + constructing approximate matrix decompositions". + SIAM review, 53(2), 217-288. + <10.1137/090771806>` + and also + :doi:`Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011). + "A randomized algorithm for the decomposition of matrices". + Applied and Computational Harmonic Analysis, 30(1), 47-68. 
+ <10.1016/j.acha.2010.02.003>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.decomposition import PCA + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> pca = PCA(n_components=2) + >>> pca.fit(X) + PCA(n_components=2) + >>> print(pca.explained_variance_ratio_) + [0.9924 0.0075] + >>> print(pca.singular_values_) + [6.30061 0.54980] + + >>> pca = PCA(n_components=2, svd_solver='full') + >>> pca.fit(X) + PCA(n_components=2, svd_solver='full') + >>> print(pca.explained_variance_ratio_) + [0.9924 0.00755] + >>> print(pca.singular_values_) + [6.30061 0.54980] + + >>> pca = PCA(n_components=1, svd_solver='arpack') + >>> pca.fit(X) + PCA(n_components=1, svd_solver='arpack') + >>> print(pca.explained_variance_ratio_) + [0.99244] + >>> print(pca.singular_values_) + [6.30061] + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 0, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + StrOptions({"mle"}), + None, + ], + "copy": ["boolean"], + "whiten": ["boolean"], + "svd_solver": [ + StrOptions({"auto", "full", "covariance_eigh", "arpack", "randomized"}) + ], + "tol": [Interval(Real, 0, None, closed="left")], + "iterated_power": [ + StrOptions({"auto"}), + Interval(Integral, 0, None, closed="left"), + ], + "n_oversamples": [Interval(Integral, 1, None, closed="left")], + "power_iteration_normalizer": [StrOptions({"auto", "QR", "LU", "none"})], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + copy=True, + whiten=False, + svd_solver="auto", + tol=0.0, + iterated_power="auto", + n_oversamples=10, + power_iteration_normalizer="auto", + random_state=None, + ): + self.n_components = n_components + self.copy = copy + self.whiten = whiten + self.svd_solver = svd_solver + self.tol = tol + self.iterated_power = iterated_power + self.n_oversamples = n_oversamples + self.power_iteration_normalizer = power_iteration_normalizer + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model with X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Ignored. + + Returns + ------- + self : object + Returns the instance itself. + """ + self._fit(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit the model with X and apply the dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Ignored. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed values. + + Notes + ----- + This method returns a Fortran-ordered array. To convert it to a + C-ordered array, use 'np.ascontiguousarray'. + """ + U, S, _, X, x_is_centered, xp = self._fit(X) + if U is not None: + U = U[:, : self.n_components_] + + if self.whiten: + # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples) + U *= sqrt(X.shape[0] - 1) + else: + # X_new = X * V = U * S * Vt * V = U * S + U *= S[: self.n_components_] + + return U + else: # solver="covariance_eigh" does not compute U at fit time. 
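+            # (Editorial note) This branch is only reached for the
+            # "covariance_eigh" solver, which never materializes U during
+            # fit; the transformed output is therefore obtained by
+            # projecting X onto the fitted components via _transform.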
+ return self._transform(X, xp, x_is_centered=x_is_centered) + + def _fit(self, X): + """Dispatch to the right submethod depending on the chosen solver.""" + xp, is_array_api_compliant = get_namespace(X) + + # Raise an error for sparse input and unsupported svd_solver + if issparse(X) and self.svd_solver not in ["auto", "arpack", "covariance_eigh"]: + raise TypeError( + 'PCA only support sparse inputs with the "arpack" and' + f' "covariance_eigh" solvers, while "{self.svd_solver}" was passed. See' + " TruncatedSVD for a possible alternative." + ) + if self.svd_solver == "arpack" and is_array_api_compliant: + raise ValueError( + "PCA with svd_solver='arpack' is not supported for Array API inputs." + ) + + # Validate the data, without ever forcing a copy as any solver that + # supports sparse input data and the `covariance_eigh` solver are + # written in a way to avoid the need for any inplace modification of + # the input data contrary to the other solvers. + # The copy will happen + # later, only if needed, once the solver negotiation below is done. + X = validate_data( + self, + X, + dtype=[xp.float64, xp.float32], + force_writeable=True, + accept_sparse=("csr", "csc"), + ensure_2d=True, + copy=False, + ) + self._fit_svd_solver = self.svd_solver + if self._fit_svd_solver == "auto" and issparse(X): + self._fit_svd_solver = "arpack" + + if self.n_components is None: + if self._fit_svd_solver != "arpack": + n_components = min(X.shape) + else: + n_components = min(X.shape) - 1 + else: + n_components = self.n_components + + if self._fit_svd_solver == "auto": + # Tall and skinny problems are best handled by precomputing the + # covariance matrix. + if X.shape[1] <= 1_000 and X.shape[0] >= 10 * X.shape[1]: + self._fit_svd_solver = "covariance_eigh" + # Small problem or n_components == 'mle', just call full PCA + elif max(X.shape) <= 500 or n_components == "mle": + self._fit_svd_solver = "full" + elif 1 <= n_components < 0.8 * min(X.shape): + self._fit_svd_solver = "randomized" + # This is also the case of n_components in (0, 1) + else: + self._fit_svd_solver = "full" + + # Call different fits for either full or truncated SVD + if self._fit_svd_solver in ("full", "covariance_eigh"): + return self._fit_full(X, n_components, xp, is_array_api_compliant) + elif self._fit_svd_solver in ["arpack", "randomized"]: + return self._fit_truncated(X, n_components, xp) + + def _fit_full(self, X, n_components, xp, is_array_api_compliant): + """Fit the model by computing full SVD on X.""" + n_samples, n_features = X.shape + + if n_components == "mle": + if n_samples < n_features: + raise ValueError( + "n_components='mle' is only supported if n_samples >= n_features" + ) + elif not 0 <= n_components <= min(n_samples, n_features): + raise ValueError( + f"n_components={n_components} must be between 0 and " + f"min(n_samples, n_features)={min(n_samples, n_features)} with " + f"svd_solver={self._fit_svd_solver!r}" + ) + + self.mean_ = xp.mean(X, axis=0) + # When X is a scipy sparse matrix, self.mean_ is a numpy matrix, so we need + # to transform it to a 1D array. Note that this is not the case when X + # is a scipy sparse array. + # TODO: remove the following two lines when scikit-learn only depends + # on scipy versions that no longer support scipy.sparse matrices. 
+ self.mean_ = xp.reshape(xp.asarray(self.mean_), (-1,)) + + if self._fit_svd_solver == "full": + X_centered = xp.asarray(X, copy=True) if self.copy else X + X_centered -= self.mean_ + x_is_centered = not self.copy + + if not is_array_api_compliant: + # Use scipy.linalg with NumPy/SciPy inputs for the sake of not + # introducing unanticipated behavior changes. In the long run we + # could instead decide to always use xp.linalg.svd for all inputs, + # but that would make this code rely on numpy's SVD instead of + # scipy's. It's not 100% clear whether they use the same LAPACK + # solver by default though (assuming both are built against the + # same BLAS). + U, S, Vt = linalg.svd(X_centered, full_matrices=False) + else: + U, S, Vt = xp.linalg.svd(X_centered, full_matrices=False) + explained_variance_ = (S**2) / (n_samples - 1) + + else: + assert self._fit_svd_solver == "covariance_eigh" + # In the following, we center the covariance matrix C afterwards + # (without centering the data X first) to avoid an unnecessary copy + # of X. Note that the mean_ attribute is still needed to center + # test data in the transform method. + # + # Note: at the time of writing, `xp.cov` does not exist in the + # Array API standard: + # https://github.com/data-apis/array-api/issues/43 + # + # Besides, using `numpy.cov`, as of numpy 1.26.0, would not be + # memory efficient for our use case when `n_samples >> n_features`: + # `numpy.cov` centers a copy of the data before computing the + # matrix product instead of subtracting a small `(n_features, + # n_features)` square matrix from the gram matrix X.T @ X, as we do + # below. + x_is_centered = False + C = X.T @ X + C -= ( + n_samples + * xp.reshape(self.mean_, (-1, 1)) + * xp.reshape(self.mean_, (1, -1)) + ) + C /= n_samples - 1 + eigenvals, eigenvecs = xp.linalg.eigh(C) + + # When X is a scipy sparse matrix, the following two datastructures + # are returned as instances of the soft-deprecated numpy.matrix + # class. Note that this problem does not occur when X is a scipy + # sparse array (or another other kind of supported array). + # TODO: remove the following two lines when scikit-learn only + # depends on scipy versions that no longer support scipy.sparse + # matrices. + eigenvals = xp.reshape(xp.asarray(eigenvals), (-1,)) + eigenvecs = xp.asarray(eigenvecs) + + eigenvals = xp.flip(eigenvals, axis=0) + eigenvecs = xp.flip(eigenvecs, axis=1) + + # The covariance matrix C is positive semi-definite by + # construction. However, the eigenvalues returned by xp.linalg.eigh + # can be slightly negative due to numerical errors. This would be + # an issue for the subsequent sqrt, hence the manual clipping. + eigenvals[eigenvals < 0.0] = 0.0 + explained_variance_ = eigenvals + + # Re-construct SVD of centered X indirectly and make it consistent + # with the other solvers. + S = xp.sqrt(eigenvals * (n_samples - 1)) + Vt = eigenvecs.T + U = None + + # flip eigenvectors' sign to enforce deterministic output + U, Vt = svd_flip(U, Vt, u_based_decision=False) + + components_ = Vt + + # Get variance explained by singular values + total_var = xp.sum(explained_variance_) + explained_variance_ratio_ = explained_variance_ / total_var + singular_values_ = xp.asarray(S, copy=True) # Store the singular values. 
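+
+        # (Editorial note) The block below resolves the user-facing
+        # ``n_components`` into an integer: 'mle' applies Minka's estimator
+        # (_infer_dimension) to the eigenvalue spectrum, while a float in
+        # (0, 1) keeps the smallest number of components whose cumulative
+        # explained-variance ratio exceeds that threshold.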
+ + # Postprocess the number of components required + if n_components == "mle": + n_components = _infer_dimension(explained_variance_, n_samples) + elif 0 < n_components < 1.0: + # number of components for which the cumulated explained + # variance percentage is superior to the desired threshold + # side='right' ensures that number of features selected + # their variance is always greater than n_components float + # passed. More discussion in issue: #15669 + if is_array_api_compliant: + # Convert to numpy as xp.cumsum and xp.searchsorted are not + # part of the Array API standard yet: + # + # https://github.com/data-apis/array-api/issues/597 + # https://github.com/data-apis/array-api/issues/688 + # + # Furthermore, it's not always safe to call them for namespaces + # that already implement them: for instance as + # cupy.searchsorted does not accept a float as second argument. + explained_variance_ratio_np = _convert_to_numpy( + explained_variance_ratio_, xp=xp + ) + else: + explained_variance_ratio_np = explained_variance_ratio_ + ratio_cumsum = stable_cumsum(explained_variance_ratio_np) + n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1 + + # Compute noise covariance using Probabilistic PCA model + # The sigma2 maximum likelihood (cf. eq. 12.46) + if n_components < min(n_features, n_samples): + self.noise_variance_ = xp.mean(explained_variance_[n_components:]) + else: + self.noise_variance_ = 0.0 + + self.n_samples_ = n_samples + self.n_components_ = n_components + # Assign a copy of the result of the truncation of the components in + # order to: + # - release the memory used by the discarded components, + # - ensure that the kept components are allocated contiguously in + # memory to make the transform method faster by leveraging cache + # locality. + self.components_ = xp.asarray(components_[:n_components, :], copy=True) + + # We do the same for the other arrays for the sake of consistency. + self.explained_variance_ = xp.asarray( + explained_variance_[:n_components], copy=True + ) + self.explained_variance_ratio_ = xp.asarray( + explained_variance_ratio_[:n_components], copy=True + ) + self.singular_values_ = xp.asarray(singular_values_[:n_components], copy=True) + + return U, S, Vt, X, x_is_centered, xp + + def _fit_truncated(self, X, n_components, xp): + """Fit the model by computing truncated SVD (by ARPACK or randomized) + on X. 
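+
+        Sparse input is centered implicitly (via an implicit column offset)
+        so that it is never densified.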
+ """ + n_samples, n_features = X.shape + + svd_solver = self._fit_svd_solver + if isinstance(n_components, str): + raise ValueError( + "n_components=%r cannot be a string with svd_solver='%s'" + % (n_components, svd_solver) + ) + elif not 1 <= n_components <= min(n_samples, n_features): + raise ValueError( + "n_components=%r must be between 1 and " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + elif svd_solver == "arpack" and n_components == min(n_samples, n_features): + raise ValueError( + "n_components=%r must be strictly less than " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), svd_solver) + ) + + random_state = check_random_state(self.random_state) + + # Center data + total_var = None + if issparse(X): + self.mean_, var = mean_variance_axis(X, axis=0) + total_var = var.sum() * n_samples / (n_samples - 1) # ddof=1 + X_centered = _implicit_column_offset(X, self.mean_) + x_is_centered = False + else: + self.mean_ = xp.mean(X, axis=0) + X_centered = xp.asarray(X, copy=True) if self.copy else X + X_centered -= self.mean_ + x_is_centered = not self.copy + + if svd_solver == "arpack": + v0 = _init_arpack_v0(min(X.shape), random_state) + U, S, Vt = svds(X_centered, k=n_components, tol=self.tol, v0=v0) + # svds doesn't abide by scipy.linalg.svd/randomized_svd + # conventions, so reverse its outputs. + S = S[::-1] + # flip eigenvectors' sign to enforce deterministic output + U, Vt = svd_flip(U[:, ::-1], Vt[::-1], u_based_decision=False) + + elif svd_solver == "randomized": + # sign flipping is done inside + U, S, Vt = _randomized_svd( + X_centered, + n_components=n_components, + n_oversamples=self.n_oversamples, + n_iter=self.iterated_power, + power_iteration_normalizer=self.power_iteration_normalizer, + flip_sign=False, + random_state=random_state, + ) + U, Vt = svd_flip(U, Vt, u_based_decision=False) + + self.n_samples_ = n_samples + self.components_ = Vt + self.n_components_ = n_components + + # Get variance explained by singular values + self.explained_variance_ = (S**2) / (n_samples - 1) + + # Workaround in-place variance calculation since at the time numpy + # did not have a way to calculate variance in-place. + # + # TODO: update this code to either: + # * Use the array-api variance calculation, unless memory usage suffers + # * Update sklearn.utils.extmath._incremental_mean_and_var to support array-api + # See: https://github.com/scikit-learn/scikit-learn/pull/18689#discussion_r1335540991 + if total_var is None: + N = X.shape[0] - 1 + X_centered **= 2 + total_var = xp.sum(X_centered) / N + + self.explained_variance_ratio_ = self.explained_variance_ / total_var + self.singular_values_ = xp.asarray(S, copy=True) # Store the singular values. + + if self.n_components_ < min(n_features, n_samples): + self.noise_variance_ = total_var - xp.sum(self.explained_variance_) + self.noise_variance_ /= min(n_features, n_samples) - n_components + else: + self.noise_variance_ = 0.0 + + return U, S, Vt, X, x_is_centered, xp + + def score_samples(self, X): + """Return the log-likelihood of each sample. + + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + Returns + ------- + ll : ndarray of shape (n_samples,) + Log-likelihood of each sample under the current model. 
+ """ + check_is_fitted(self) + xp, _ = get_namespace(X) + X = validate_data(self, X, dtype=[xp.float64, xp.float32], reset=False) + Xr = X - self.mean_ + n_features = X.shape[1] + precision = self.get_precision() + log_like = -0.5 * xp.sum(Xr * (Xr @ precision), axis=1) + log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision)) + return log_like + + def score(self, X, y=None): + """Return the average log-likelihood of all samples. + + See. "Pattern Recognition and Machine Learning" + by C. Bishop, 12.2.1 p. 574 + or http://www.miketipping.com/papers/met-mppca.pdf + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : Ignored + Ignored. + + Returns + ------- + ll : float + Average log-likelihood of the samples under the current model. + """ + xp, _ = get_namespace(X) + return float(xp.mean(self.score_samples(X))) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.array_api_support = True + tags.input_tags.sparse = self.svd_solver in ( + "auto", + "arpack", + "covariance_eigh", + ) + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_sparse_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_sparse_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..2717230c9df92511543eed80c4c52d39e54d15d3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_sparse_pca.py @@ -0,0 +1,548 @@ +"""Matrix factorization with Sparse PCA.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..linear_model import ridge_regression +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import svd_flip +from ..utils.validation import check_array, check_is_fitted, validate_data +from ._dict_learning import MiniBatchDictionaryLearning, dict_learning + + +class _BaseSparsePCA(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Base class for SparsePCA and MiniBatchSparsePCA""" + + _parameter_constraints: dict = { + "n_components": [None, Interval(Integral, 1, None, closed="left")], + "alpha": [Interval(Real, 0.0, None, closed="left")], + "ridge_alpha": [Interval(Real, 0.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "method": [StrOptions({"lars", "cd"})], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + ridge_alpha=0.01, + max_iter=1000, + tol=1e-8, + method="lars", + n_jobs=None, + verbose=False, + random_state=None, + ): + self.n_components = n_components + self.alpha = alpha + self.ridge_alpha = ridge_alpha + self.max_iter = max_iter + self.tol = tol + self.method = method + self.n_jobs = n_jobs + self.verbose = verbose + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. 
+ + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + random_state = check_random_state(self.random_state) + X = validate_data(self, X) + + self.mean_ = X.mean(axis=0) + X = X - self.mean_ + + if self.n_components is None: + n_components = X.shape[1] + else: + n_components = self.n_components + + return self._fit(X, n_components, random_state) + + def transform(self, X): + """Least Squares projection of the data onto the sparse components. + + To avoid instability issues in case the system is under-determined, + regularization can be applied (Ridge regression) via the + `ridge_alpha` parameter. + + Note that Sparse PCA components orthogonality is not enforced as in PCA + hence one cannot use a simple linear projection. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Test data to be transformed, must have the same number of + features as the data used to train the model. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Transformed data. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + X = X - self.mean_ + + U = ridge_regression( + self.components_.T, X.T, self.ridge_alpha, solver="cholesky" + ) + + return U + + def inverse_transform(self, X): + """Transform data from the latent space to the original space. + + This inversion is an approximation due to the loss of information + induced by the forward decomposition. + + .. versionadded:: 1.2 + + Parameters + ---------- + X : ndarray of shape (n_samples, n_components) + Data in the latent space. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Reconstructed data in the original space. + """ + check_is_fitted(self) + X = check_array(X) + + return (X @ self.components_) + self.mean_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class SparsePCA(_BaseSparsePCA): + """Sparse Principal Components Analysis (SparsePCA). + + Finds the set of sparse components that can optimally reconstruct + the data. The amount of sparseness is controllable by the coefficient + of the L1 penalty, given by the parameter alpha. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of sparse atoms to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : float, default=1 + Sparsity controlling parameter. Higher values lead to sparser + components. + + ridge_alpha : float, default=0.01 + Amount of ridge shrinkage to apply in order to improve + conditioning when calling the transform method. + + max_iter : int, default=1000 + Maximum number of iterations to perform. + + tol : float, default=1e-8 + Tolerance for the stopping condition. + + method : {'lars', 'cd'}, default='lars' + Method to be used for optimization. + lars: uses the least angle regression method to solve the lasso problem + (linear_model.lars_path) + cd: uses the coordinate descent method to compute the + Lasso solution (linear_model.Lasso). Lars will be faster if + the estimated components are sparse. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. 
See :term:`Glossary ` + for more details. + + U_init : ndarray of shape (n_samples, n_components), default=None + Initial values for the loadings for warm restart scenarios. Only used + if `U_init` and `V_init` are not None. + + V_init : ndarray of shape (n_components, n_features), default=None + Initial values for the components for warm restart scenarios. Only used + if `U_init` and `V_init` are not None. + + verbose : int or bool, default=False + Controls the verbosity; the higher, the more messages. Defaults to 0. + + random_state : int, RandomState instance or None, default=None + Used during dictionary learning. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Sparse components extracted from the data. + + error_ : ndarray + Vector of errors at each iteration. + + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + + n_iter_ : int + Number of iterations run. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + Equal to ``X.mean(axis=0)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PCA : Principal Component Analysis implementation. + MiniBatchSparsePCA : Mini batch variant of `SparsePCA` that is faster but less + accurate. + DictionaryLearning : Generic dictionary learning problem using a sparse code. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.decomposition import SparsePCA + >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0) + >>> transformer = SparsePCA(n_components=5, random_state=0) + >>> transformer.fit(X) + SparsePCA(...) 
+ >>> X_transformed = transformer.transform(X) + >>> X_transformed.shape + (200, 5) + >>> # most values in the components_ are zero (sparsity) + >>> np.mean(transformer.components_ == 0) + np.float64(0.9666) + """ + + _parameter_constraints: dict = { + **_BaseSparsePCA._parameter_constraints, + "U_init": [None, np.ndarray], + "V_init": [None, np.ndarray], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + ridge_alpha=0.01, + max_iter=1000, + tol=1e-8, + method="lars", + n_jobs=None, + U_init=None, + V_init=None, + verbose=False, + random_state=None, + ): + super().__init__( + n_components=n_components, + alpha=alpha, + ridge_alpha=ridge_alpha, + max_iter=max_iter, + tol=tol, + method=method, + n_jobs=n_jobs, + verbose=verbose, + random_state=random_state, + ) + self.U_init = U_init + self.V_init = V_init + + def _fit(self, X, n_components, random_state): + """Specialized `fit` for SparsePCA.""" + + code_init = self.V_init.T if self.V_init is not None else None + dict_init = self.U_init.T if self.U_init is not None else None + code, dictionary, E, self.n_iter_ = dict_learning( + X.T, + n_components, + alpha=self.alpha, + tol=self.tol, + max_iter=self.max_iter, + method=self.method, + n_jobs=self.n_jobs, + verbose=self.verbose, + random_state=random_state, + code_init=code_init, + dict_init=dict_init, + return_n_iter=True, + ) + # flip eigenvectors' sign to enforce deterministic output + code, dictionary = svd_flip(code, dictionary, u_based_decision=True) + self.components_ = code.T + components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis] + components_norm[components_norm == 0] = 1 + self.components_ /= components_norm + self.n_components_ = len(self.components_) + + self.error_ = E + return self + + +class MiniBatchSparsePCA(_BaseSparsePCA): + """Mini-batch Sparse Principal Components Analysis. + + Finds the set of sparse components that can optimally reconstruct + the data. The amount of sparseness is controllable by the coefficient + of the L1 penalty, given by the parameter alpha. + + For an example comparing sparse PCA to PCA, see + :ref:`sphx_glr_auto_examples_decomposition_plot_faces_decomposition.py` + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Number of sparse atoms to extract. If None, then ``n_components`` + is set to ``n_features``. + + alpha : int, default=1 + Sparsity controlling parameter. Higher values lead to sparser + components. + + ridge_alpha : float, default=0.01 + Amount of ridge shrinkage to apply in order to improve + conditioning when calling the transform method. + + max_iter : int, default=1_000 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion heuristics. + + .. versionadded:: 1.2 + + callback : callable, default=None + Callable that gets invoked every five iterations. + + batch_size : int, default=3 + The number of features to take in each mini batch. + + verbose : int or bool, default=False + Controls the verbosity; the higher, the more messages. Defaults to 0. + + shuffle : bool, default=True + Whether to shuffle the data before splitting it in batches. + + n_jobs : int, default=None + Number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + method : {'lars', 'cd'}, default='lars' + Method to be used for optimization. 
+ lars: uses the least angle regression method to solve the lasso problem + (linear_model.lars_path) + cd: uses the coordinate descent method to compute the + Lasso solution (linear_model.Lasso). Lars will be faster if + the estimated components are sparse. + + random_state : int, RandomState instance or None, default=None + Used for random shuffling when ``shuffle`` is set to ``True``, + during online dictionary learning. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + tol : float, default=1e-3 + Control early stopping based on the norm of the differences in the + dictionary between 2 steps. + + To disable early stopping based on changes in the dictionary, set + `tol` to 0.0. + + .. versionadded:: 1.1 + + max_no_improvement : int or None, default=10 + Control early stopping based on the consecutive number of mini batches + that does not yield an improvement on the smoothed cost function. + + To disable convergence detection based on cost function, set + `max_no_improvement` to `None`. + + .. versionadded:: 1.1 + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + Sparse components extracted from the data. + + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + + n_iter_ : int + Number of iterations run. + + mean_ : ndarray of shape (n_features,) + Per-feature empirical mean, estimated from the training set. + Equal to ``X.mean(axis=0)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + IncrementalPCA : Incremental principal components analysis. + PCA : Principal component analysis. + SparsePCA : Sparse Principal Components Analysis. + TruncatedSVD : Dimensionality reduction using truncated SVD. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.decomposition import MiniBatchSparsePCA + >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0) + >>> transformer = MiniBatchSparsePCA(n_components=5, batch_size=50, + ... max_iter=10, random_state=0) + >>> transformer.fit(X) + MiniBatchSparsePCA(...) 
+ >>> X_transformed = transformer.transform(X) + >>> X_transformed.shape + (200, 5) + >>> # most values in the components_ are zero (sparsity) + >>> np.mean(transformer.components_ == 0) + np.float64(0.9) + """ + + _parameter_constraints: dict = { + **_BaseSparsePCA._parameter_constraints, + "max_iter": [Interval(Integral, 0, None, closed="left")], + "callback": [None, callable], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "max_no_improvement": [Interval(Integral, 0, None, closed="left"), None], + } + + def __init__( + self, + n_components=None, + *, + alpha=1, + ridge_alpha=0.01, + max_iter=1_000, + callback=None, + batch_size=3, + verbose=False, + shuffle=True, + n_jobs=None, + method="lars", + random_state=None, + tol=1e-3, + max_no_improvement=10, + ): + super().__init__( + n_components=n_components, + alpha=alpha, + ridge_alpha=ridge_alpha, + max_iter=max_iter, + tol=tol, + method=method, + n_jobs=n_jobs, + verbose=verbose, + random_state=random_state, + ) + self.callback = callback + self.batch_size = batch_size + self.shuffle = shuffle + self.max_no_improvement = max_no_improvement + + def _fit(self, X, n_components, random_state): + """Specialized `fit` for MiniBatchSparsePCA.""" + + transform_algorithm = "lasso_" + self.method + est = MiniBatchDictionaryLearning( + n_components=n_components, + alpha=self.alpha, + max_iter=self.max_iter, + dict_init=None, + batch_size=self.batch_size, + shuffle=self.shuffle, + n_jobs=self.n_jobs, + fit_algorithm=self.method, + random_state=random_state, + transform_algorithm=transform_algorithm, + transform_alpha=self.alpha, + verbose=self.verbose, + callback=self.callback, + tol=self.tol, + max_no_improvement=self.max_no_improvement, + ) + est.set_output(transform="default") + est.fit(X.T) + + self.components_, self.n_iter_ = est.transform(X.T).T, est.n_iter_ + + components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis] + components_norm[components_norm == 0] = 1 + self.components_ /= components_norm + self.n_components_ = len(self.components_) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/_truncated_svd.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_truncated_svd.py new file mode 100644 index 0000000000000000000000000000000000000000..6165aba4e8db6a0eaa8c81d54a98214e6c782cae --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/_truncated_svd.py @@ -0,0 +1,322 @@ +"""Truncated SVD for sparse matrices, aka latent semantic analysis (LSA).""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy.sparse.linalg import svds + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..utils import check_array, check_random_state +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import _randomized_svd, safe_sparse_dot, svd_flip +from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import check_is_fitted, validate_data + +__all__ = ["TruncatedSVD"] + + +class TruncatedSVD(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Dimensionality reduction using truncated SVD (aka LSA). + + This transformer performs linear dimensionality reduction by means of + truncated singular value decomposition (SVD). 
Contrary to PCA, this + estimator does not center the data before computing the singular value + decomposition. This means it can work with sparse matrices + efficiently. + + In particular, truncated SVD works on term count/tf-idf matrices as + returned by the vectorizers in :mod:`sklearn.feature_extraction.text`. In + that context, it is known as latent semantic analysis (LSA). + + This estimator supports two algorithms: a fast randomized SVD solver, and + a "naive" algorithm that uses ARPACK as an eigensolver on `X * X.T` or + `X.T * X`, whichever is more efficient. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Desired dimensionality of output data. + If algorithm='arpack', must be strictly less than the number of features. + If algorithm='randomized', must be less than or equal to the number of features. + The default value is useful for visualisation. For LSA, a value of + 100 is recommended. + + algorithm : {'arpack', 'randomized'}, default='randomized' + SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy + (scipy.sparse.linalg.svds), or "randomized" for the randomized + algorithm due to Halko (2009). + + n_iter : int, default=5 + Number of iterations for randomized SVD solver. Not used by ARPACK. The + default is larger than the default in + :func:`~sklearn.utils.extmath.randomized_svd` to handle sparse + matrices that may have large slowly decaying spectrum. + + n_oversamples : int, default=10 + Number of oversamples for randomized SVD solver. Not used by ARPACK. + See :func:`~sklearn.utils.extmath.randomized_svd` for a complete + description. + + .. versionadded:: 1.1 + + power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto' + Power iteration normalizer for randomized SVD solver. + Not used by ARPACK. See :func:`~sklearn.utils.extmath.randomized_svd` + for more details. + + .. versionadded:: 1.1 + + random_state : int, RandomState instance or None, default=None + Used during randomized svd. Pass an int for reproducible results across + multiple function calls. + See :term:`Glossary `. + + tol : float, default=0.0 + Tolerance for ARPACK. 0 means machine precision. Ignored by randomized + SVD solver. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The right singular vectors of the input data. + + explained_variance_ : ndarray of shape (n_components,) + The variance of the training samples transformed by a projection to + each component. + + explained_variance_ratio_ : ndarray of shape (n_components,) + Percentage of variance explained by each of the selected components. + + singular_values_ : ndarray of shape (n_components,) + The singular values corresponding to each of the selected components. + The singular values are equal to the 2-norms of the ``n_components`` + variables in the lower-dimensional space. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + DictionaryLearning : Find a dictionary that sparsely encodes data. + FactorAnalysis : A simple linear generative model with + Gaussian latent variables. + IncrementalPCA : Incremental principal components analysis. + KernelPCA : Kernel Principal component analysis. + NMF : Non-Negative Matrix Factorization. 
+ PCA : Principal component analysis. + + Notes + ----- + SVD suffers from a problem called "sign indeterminacy", which means the + sign of the ``components_`` and the output from transform depend on the + algorithm and random state. To work around this, fit instances of this + class to data once, then keep the instance around to do transformations. + + References + ---------- + :arxiv:`Halko, et al. (2009). "Finding structure with randomness: + Stochastic algorithms for constructing approximate matrix decompositions" + <0909.4061>` + + Examples + -------- + >>> from sklearn.decomposition import TruncatedSVD + >>> from scipy.sparse import csr_matrix + >>> import numpy as np + >>> np.random.seed(0) + >>> X_dense = np.random.rand(100, 100) + >>> X_dense[:, 2 * np.arange(50)] = 0 + >>> X = csr_matrix(X_dense) + >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) + >>> svd.fit(X) + TruncatedSVD(n_components=5, n_iter=7, random_state=42) + >>> print(svd.explained_variance_ratio_) + [0.0157 0.0512 0.0499 0.0479 0.0453] + >>> print(svd.explained_variance_ratio_.sum()) + 0.2102 + >>> print(svd.singular_values_) + [35.2410 4.5981 4.5420 4.4486 4.3288] + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "algorithm": [StrOptions({"arpack", "randomized"})], + "n_iter": [Interval(Integral, 0, None, closed="left")], + "n_oversamples": [Interval(Integral, 1, None, closed="left")], + "power_iteration_normalizer": [StrOptions({"auto", "OR", "LU", "none"})], + "random_state": ["random_state"], + "tol": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + n_components=2, + *, + algorithm="randomized", + n_iter=5, + n_oversamples=10, + power_iteration_normalizer="auto", + random_state=None, + tol=0.0, + ): + self.algorithm = algorithm + self.n_components = n_components + self.n_iter = n_iter + self.n_oversamples = n_oversamples + self.power_iteration_normalizer = power_iteration_normalizer + self.random_state = random_state + self.tol = tol + + def fit(self, X, y=None): + """Fit model on training data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the transformer object. + """ + self.fit_transform(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit model to X and perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. + """ + X = validate_data(self, X, accept_sparse=["csr", "csc"], ensure_min_features=2) + random_state = check_random_state(self.random_state) + + if self.algorithm == "arpack": + v0 = _init_arpack_v0(min(X.shape), random_state) + U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol, v0=v0) + # svds doesn't abide by scipy.linalg.svd/randomized_svd + # conventions, so reverse its outputs. + Sigma = Sigma[::-1] + # u_based_decision=False is needed to be consistent with PCA. 
+ U, VT = svd_flip(U[:, ::-1], VT[::-1], u_based_decision=False) + + elif self.algorithm == "randomized": + if self.n_components > X.shape[1]: + raise ValueError( + f"n_components({self.n_components}) must be <=" + f" n_features({X.shape[1]})." + ) + U, Sigma, VT = _randomized_svd( + X, + self.n_components, + n_iter=self.n_iter, + n_oversamples=self.n_oversamples, + power_iteration_normalizer=self.power_iteration_normalizer, + random_state=random_state, + flip_sign=False, + ) + U, VT = svd_flip(U, VT, u_based_decision=False) + + self.components_ = VT + + # As a result of the SVD approximation error on X ~ U @ Sigma @ V.T, + # X @ V is not the same as U @ Sigma + if self.algorithm == "randomized" or ( + self.algorithm == "arpack" and self.tol > 0 + ): + X_transformed = safe_sparse_dot(X, self.components_.T) + else: + X_transformed = U * Sigma + + # Calculate explained variance & explained variance ratio + self.explained_variance_ = exp_var = np.var(X_transformed, axis=0) + if sp.issparse(X): + _, full_var = mean_variance_axis(X, axis=0) + full_var = full_var.sum() + else: + full_var = np.var(X, axis=0).sum() + self.explained_variance_ratio_ = exp_var / full_var + self.singular_values_ = Sigma # Store the singular values. + + return X_transformed + + def transform(self, X): + """Perform dimensionality reduction on X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Reduced version of X. This will always be a dense array. + """ + check_is_fitted(self) + X = validate_data(self, X, accept_sparse=["csr", "csc"], reset=False) + return safe_sparse_dot(X, self.components_.T) + + def inverse_transform(self, X): + """Transform X back to its original space. + + Returns an array X_original whose transform would be X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_components) + New data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Note that this is always a dense array. 
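+
+        Examples
+        --------
+        A minimal round-trip sketch (illustrative); the reconstruction is
+        only approximate since just ``n_components`` directions are kept:
+
+        >>> import numpy as np
+        >>> from sklearn.decomposition import TruncatedSVD
+        >>> X = np.random.RandomState(0).rand(10, 6)
+        >>> svd = TruncatedSVD(n_components=3, random_state=0).fit(X)
+        >>> svd.inverse_transform(svd.transform(X)).shape
+        (10, 6)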
+ """ + X = check_array(X) + return np.dot(X, self.components_) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/meson.build b/.venv/lib/python3.12/site-packages/sklearn/decomposition/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..75b67a46981f4e394b73332980b4088087d6bc23 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/meson.build @@ -0,0 +1,14 @@ +py.extension_module( + '_online_lda_fast', + [cython_gen.process('_online_lda_fast.pyx'), utils_cython_tree], + subdir: 'sklearn/decomposition', + install: true +) + +py.extension_module( + '_cdnmf_fast', + cython_gen.process('_cdnmf_fast.pyx'), + dependencies: [np_dep], + subdir: 'sklearn/decomposition', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_dict_learning.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_dict_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..717c56d0abdbecb8636033f9515a41a8f35f1151 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_dict_learning.py @@ -0,0 +1,988 @@ +import itertools +import warnings +from functools import partial + +import numpy as np +import pytest + +import sklearn +from sklearn.base import clone +from sklearn.decomposition import ( + DictionaryLearning, + MiniBatchDictionaryLearning, + SparseCoder, + dict_learning, + dict_learning_online, + sparse_encode, +) +from sklearn.decomposition._dict_learning import _update_dict +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils import check_array +from sklearn.utils._testing import ( + TempMemmap, + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + check_transformer_data_not_an_array, + check_transformer_general, + check_transformers_unfitted, +) +from sklearn.utils.parallel import Parallel + +rng_global = np.random.RandomState(0) +n_samples, n_features = 10, 8 +X = rng_global.randn(n_samples, n_features) + + +def test_sparse_encode_shapes_omp(): + rng = np.random.RandomState(0) + algorithms = ["omp", "lasso_lars", "lasso_cd", "lars", "threshold"] + for n_components, n_samples in itertools.product([1, 5], [1, 9]): + X_ = rng.randn(n_samples, n_features) + dictionary = rng.randn(n_components, n_features) + for algorithm, n_jobs in itertools.product(algorithms, [1, 2]): + code = sparse_encode(X_, dictionary, algorithm=algorithm, n_jobs=n_jobs) + assert code.shape == (n_samples, n_components) + + +def test_dict_learning_shapes(): + n_components = 5 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert dico.components_.shape == (n_components, n_features) + + n_components = 1 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert dico.components_.shape == (n_components, n_features) + assert dico.transform(X).shape == 
(X.shape[0], n_components) + + +def test_dict_learning_overcomplete(): + n_components = 12 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert dico.components_.shape == (n_components, n_features) + + +def test_max_iter(): + def ricker_function(resolution, center, width): + """Discrete sub-sampled Ricker (Mexican hat) wavelet""" + x = np.linspace(0, resolution - 1, resolution) + x = ( + (2 / (np.sqrt(3 * width) * np.pi**0.25)) + * (1 - (x - center) ** 2 / width**2) + * np.exp(-((x - center) ** 2) / (2 * width**2)) + ) + return x + + def ricker_matrix(width, resolution, n_components): + """Dictionary of Ricker (Mexican hat) wavelets""" + centers = np.linspace(0, resolution - 1, n_components) + D = np.empty((n_components, resolution)) + for i, center in enumerate(centers): + D[i] = ricker_function(resolution, center, width) + D /= np.sqrt(np.sum(D**2, axis=1))[:, np.newaxis] + return D + + transform_algorithm = "lasso_cd" + resolution = 1024 + subsampling = 3 # subsampling factor + n_components = resolution // subsampling + + # Compute a wavelet dictionary + D_multi = np.r_[ + tuple( + ricker_matrix( + width=w, resolution=resolution, n_components=n_components // 5 + ) + for w in (10, 50, 100, 500, 1000) + ) + ] + + X = np.linspace(0, resolution - 1, resolution) + first_quarter = X < resolution / 4 + X[first_quarter] = 3.0 + X[np.logical_not(first_quarter)] = -1.0 + X = X.reshape(1, -1) + + # check that the underlying model fails to converge + with pytest.warns(ConvergenceWarning): + model = SparseCoder( + D_multi, transform_algorithm=transform_algorithm, transform_max_iter=1 + ) + model.fit_transform(X) + + # check that the underlying model converges w/o warnings + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + model = SparseCoder( + D_multi, transform_algorithm=transform_algorithm, transform_max_iter=2000 + ) + model.fit_transform(X) + + +def test_dict_learning_lars_positive_parameter(): + n_components = 5 + alpha = 1 + err_msg = "Positive constraint not supported for 'lars' coding method." 
+ with pytest.raises(ValueError, match=err_msg): + dict_learning(X, n_components, alpha=alpha, positive_code=True) + + +@pytest.mark.parametrize( + "transform_algorithm", + [ + "lasso_lars", + "lasso_cd", + "threshold", + ], +) +@pytest.mark.parametrize("positive_code", [False, True]) +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_dict_learning_positivity(transform_algorithm, positive_code, positive_dict): + n_components = 5 + dico = DictionaryLearning( + n_components, + transform_algorithm=transform_algorithm, + random_state=0, + positive_code=positive_code, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + code = dico.transform(X) + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + if positive_code: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_dict_learning_lars_dict_positivity(positive_dict): + n_components = 5 + dico = DictionaryLearning( + n_components, + transform_algorithm="lars", + random_state=0, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + + +def test_dict_learning_lars_code_positivity(): + n_components = 5 + dico = DictionaryLearning( + n_components, + transform_algorithm="lars", + random_state=0, + positive_code=True, + fit_algorithm="cd", + ).fit(X) + + err_msg = "Positive constraint not supported for '{}' coding method." + err_msg = err_msg.format("lars") + with pytest.raises(ValueError, match=err_msg): + dico.transform(X) + + +def test_dict_learning_reconstruction(): + n_components = 12 + dico = DictionaryLearning( + n_components, transform_algorithm="omp", transform_alpha=0.001, random_state=0 + ) + code = dico.fit(X).transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X) + assert_array_almost_equal(dico.inverse_transform(code), X) + + dico.set_params(transform_algorithm="lasso_lars") + code = dico.transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X, decimal=2) + assert_array_almost_equal(dico.inverse_transform(code), X, decimal=2) + + # test error raised for wrong code size + with pytest.raises(ValueError, match="Expected 12, got 11."): + dico.inverse_transform(code[:, :-1]) + + # used to test lars here too, but there's no guarantee the number of + # nonzero atoms is right. 
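+
+
+# Editorial sketch (not part of the upstream test suite): the reconstruction
+# identity exercised above boils down to ``X ~= code @ dico.components_``,
+# for instance:
+#
+#     dico = DictionaryLearning(12, transform_algorithm="omp",
+#                               transform_alpha=0.001, random_state=0).fit(X)
+#     code = dico.transform(X)
+#     np.allclose(code @ dico.components_, X, atol=1e-4)
+#
+# which is expected to hold given the assertions in the tests above.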
+ + +def test_dict_learning_reconstruction_parallel(): + # regression test that parallel reconstruction works with n_jobs>1 + n_components = 12 + dico = DictionaryLearning( + n_components, + transform_algorithm="omp", + transform_alpha=0.001, + random_state=0, + n_jobs=4, + ) + code = dico.fit(X).transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X) + + dico.set_params(transform_algorithm="lasso_lars") + code = dico.transform(X) + assert_array_almost_equal(np.dot(code, dico.components_), X, decimal=2) + + +def test_dict_learning_lassocd_readonly_data(): + n_components = 12 + with TempMemmap(X) as X_read_only: + dico = DictionaryLearning( + n_components, + transform_algorithm="lasso_cd", + transform_alpha=0.001, + random_state=0, + n_jobs=4, + ) + with ignore_warnings(category=ConvergenceWarning): + code = dico.fit(X_read_only).transform(X_read_only) + assert_array_almost_equal( + np.dot(code, dico.components_), X_read_only, decimal=2 + ) + + +def test_dict_learning_nonzero_coefs(): + n_components = 4 + dico = DictionaryLearning( + n_components, + transform_algorithm="lars", + transform_n_nonzero_coefs=3, + random_state=0, + ) + code = dico.fit(X).transform(X[np.newaxis, 1]) + assert len(np.flatnonzero(code)) == 3 + + dico.set_params(transform_algorithm="omp") + code = dico.transform(X[np.newaxis, 1]) + assert len(np.flatnonzero(code)) == 3 + + +def test_dict_learning_split(): + n_components = 5 + dico = DictionaryLearning( + n_components, transform_algorithm="threshold", random_state=0 + ) + code = dico.fit(X).transform(X) + Xr = dico.inverse_transform(code) + + dico.split_sign = True + split_code = dico.transform(X) + + assert_array_almost_equal( + split_code[:, :n_components] - split_code[:, n_components:], code + ) + + Xr2 = dico.inverse_transform(split_code) + assert_array_almost_equal(Xr, Xr2) + + +def test_dict_learning_online_shapes(): + rng = np.random.RandomState(0) + n_components = 8 + + code, dictionary = dict_learning_online( + X, + n_components=n_components, + batch_size=4, + max_iter=10, + method="cd", + random_state=rng, + return_code=True, + ) + assert code.shape == (n_samples, n_components) + assert dictionary.shape == (n_components, n_features) + assert np.dot(code, dictionary).shape == X.shape + + dictionary = dict_learning_online( + X, + n_components=n_components, + batch_size=4, + max_iter=10, + method="cd", + random_state=rng, + return_code=False, + ) + assert dictionary.shape == (n_components, n_features) + + +def test_dict_learning_online_lars_positive_parameter(): + err_msg = "Positive constraint not supported for 'lars' coding method." 
+ with pytest.raises(ValueError, match=err_msg): + dict_learning_online(X, batch_size=4, max_iter=10, positive_code=True) + + +@pytest.mark.parametrize( + "transform_algorithm", + [ + "lasso_lars", + "lasso_cd", + "threshold", + ], +) +@pytest.mark.parametrize("positive_code", [False, True]) +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_minibatch_dictionary_learning_positivity( + transform_algorithm, positive_code, positive_dict +): + n_components = 8 + dico = MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=10, + transform_algorithm=transform_algorithm, + random_state=0, + positive_code=positive_code, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + code = dico.transform(X) + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + if positive_code: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_minibatch_dictionary_learning_lars(positive_dict): + n_components = 8 + + dico = MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=10, + transform_algorithm="lars", + random_state=0, + positive_dict=positive_dict, + fit_algorithm="cd", + ).fit(X) + + if positive_dict: + assert (dico.components_ >= 0).all() + else: + assert (dico.components_ < 0).any() + + +@pytest.mark.parametrize("positive_code", [False, True]) +@pytest.mark.parametrize("positive_dict", [False, True]) +def test_dict_learning_online_positivity(positive_code, positive_dict): + rng = np.random.RandomState(0) + n_components = 8 + + code, dictionary = dict_learning_online( + X, + n_components=n_components, + batch_size=4, + method="cd", + alpha=1, + random_state=rng, + positive_dict=positive_dict, + positive_code=positive_code, + ) + if positive_dict: + assert (dictionary >= 0).all() + else: + assert (dictionary < 0).any() + if positive_code: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +def test_dict_learning_online_verbosity(): + # test verbosity for better coverage + n_components = 5 + import sys + from io import StringIO + + old_stdout = sys.stdout + try: + sys.stdout = StringIO() + + # convergence monitoring verbosity + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, verbose=1, tol=0.1, random_state=0 + ) + dico.fit(X) + dico = MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=5, + verbose=1, + max_no_improvement=2, + random_state=0, + ) + dico.fit(X) + # higher verbosity level + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, verbose=2, random_state=0 + ) + dico.fit(X) + + # function API verbosity + dict_learning_online( + X, + n_components=n_components, + batch_size=4, + alpha=1, + verbose=1, + random_state=0, + ) + dict_learning_online( + X, + n_components=n_components, + batch_size=4, + alpha=1, + verbose=2, + random_state=0, + ) + finally: + sys.stdout = old_stdout + + assert dico.components_.shape == (n_components, n_features) + + +def test_dict_learning_online_estimator_shapes(): + n_components = 5 + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, random_state=0 + ) + dico.fit(X) + assert dico.components_.shape == (n_components, n_features) + + +def test_dict_learning_online_overcomplete(): + n_components = 12 + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=5, random_state=0 + ).fit(X) + assert dico.components_.shape == (n_components, n_features) + 
+ +def test_dict_learning_online_initialization(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) + dico = MiniBatchDictionaryLearning( + n_components, batch_size=4, max_iter=0, dict_init=V, random_state=0 + ).fit(X) + assert_array_equal(dico.components_, V) + + +def test_dict_learning_online_readonly_initialization(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) + V.setflags(write=False) + MiniBatchDictionaryLearning( + n_components, + batch_size=4, + max_iter=1, + dict_init=V, + random_state=0, + shuffle=False, + ).fit(X) + + +def test_dict_learning_online_partial_fit(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + dict1 = MiniBatchDictionaryLearning( + n_components, + max_iter=10, + batch_size=1, + alpha=1, + shuffle=False, + dict_init=V, + max_no_improvement=None, + tol=0.0, + random_state=0, + ).fit(X) + dict2 = MiniBatchDictionaryLearning( + n_components, alpha=1, dict_init=V, random_state=0 + ) + for i in range(10): + for sample in X: + dict2.partial_fit(sample[np.newaxis, :]) + + assert not np.all(sparse_encode(X, dict1.components_, alpha=1) == 0) + assert_array_almost_equal(dict1.components_, dict2.components_, decimal=2) + + # partial_fit should ignore max_iter (#17433) + assert dict1.n_steps_ == dict2.n_steps_ == 100 + + +def test_sparse_encode_shapes(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + for algo in ("lasso_lars", "lasso_cd", "lars", "omp", "threshold"): + code = sparse_encode(X, V, algorithm=algo) + assert code.shape == (n_samples, n_components) + + +@pytest.mark.parametrize("algo", ["lasso_lars", "lasso_cd", "threshold"]) +@pytest.mark.parametrize("positive", [False, True]) +def test_sparse_encode_positivity(algo, positive): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + code = sparse_encode(X, V, algorithm=algo, positive=positive) + if positive: + assert (code >= 0).all() + else: + assert (code < 0).any() + + +@pytest.mark.parametrize("algo", ["lars", "omp"]) +def test_sparse_encode_unavailable_positivity(algo): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + err_msg = "Positive constraint not supported for '{}' coding method." 
+ err_msg = err_msg.format(algo) + with pytest.raises(ValueError, match=err_msg): + sparse_encode(X, V, algorithm=algo, positive=True) + + +def test_sparse_encode_input(): + n_components = 100 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + Xf = check_array(X, order="F") + for algo in ("lasso_lars", "lasso_cd", "lars", "omp", "threshold"): + a = sparse_encode(X, V, algorithm=algo) + b = sparse_encode(Xf, V, algorithm=algo) + assert_array_almost_equal(a, b) + + +def test_sparse_encode_error(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + code = sparse_encode(X, V, alpha=0.001) + assert not np.all(code == 0) + assert np.sqrt(np.sum((np.dot(code, V) - X) ** 2)) < 0.1 + + +def test_sparse_encode_error_default_sparsity(): + rng = np.random.RandomState(0) + X = rng.randn(100, 64) + D = rng.randn(2, 64) + code = ignore_warnings(sparse_encode)(X, D, algorithm="omp", n_nonzero_coefs=None) + assert code.shape == (100, 2) + + +def test_sparse_coder_estimator(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + coder = SparseCoder( + dictionary=V, transform_algorithm="lasso_lars", transform_alpha=0.001 + ) + code = coder.fit_transform(X) + Xr = coder.inverse_transform(code) + assert not np.all(code == 0) + assert np.sqrt(np.sum((np.dot(code, V) - X) ** 2)) < 0.1 + np.testing.assert_allclose(Xr, np.dot(code, V)) + + +def test_sparse_coder_estimator_clone(): + n_components = 12 + rng = np.random.RandomState(0) + V = rng.randn(n_components, n_features) # random init + V /= np.sum(V**2, axis=1)[:, np.newaxis] + coder = SparseCoder( + dictionary=V, transform_algorithm="lasso_lars", transform_alpha=0.001 + ) + cloned = clone(coder) + assert id(cloned) != id(coder) + np.testing.assert_allclose(cloned.dictionary, coder.dictionary) + assert id(cloned.dictionary) != id(coder.dictionary) + assert cloned.n_components_ == coder.n_components_ + assert cloned.n_features_in_ == coder.n_features_in_ + data = np.random.rand(n_samples, n_features).astype(np.float32) + np.testing.assert_allclose(cloned.transform(data), coder.transform(data)) + + +def test_sparse_coder_parallel_mmap(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/5956 + # Test that SparseCoder does not error by passing reading only + # arrays to child processes + + rng = np.random.RandomState(777) + n_components, n_features = 40, 64 + init_dict = rng.rand(n_components, n_features) + # Ensure that `data` is >2M. Joblib memory maps arrays + # if they are larger than 1MB. 
The 4 accounts for float32 + # data type + n_samples = int(2e6) // (4 * n_features) + data = np.random.rand(n_samples, n_features).astype(np.float32) + + sc = SparseCoder(init_dict, transform_algorithm="omp", n_jobs=2) + sc.fit_transform(data) + + +def test_sparse_coder_common_transformer(): + rng = np.random.RandomState(777) + n_components, n_features = 40, 3 + init_dict = rng.rand(n_components, n_features) + + sc = SparseCoder(init_dict) + + check_transformer_data_not_an_array(sc.__class__.__name__, sc) + check_transformer_general(sc.__class__.__name__, sc) + check_transformer_general_memmap = partial( + check_transformer_general, readonly_memmap=True + ) + check_transformer_general_memmap(sc.__class__.__name__, sc) + check_transformers_unfitted(sc.__class__.__name__, sc) + + +def test_sparse_coder_n_features_in(): + d = np.array([[1, 2, 3], [1, 2, 3]]) + sc = SparseCoder(d) + assert sc.n_features_in_ == d.shape[1] + + +def test_update_dict(): + # Check the dict update in batch mode vs online mode + # Non-regression test for #4866 + rng = np.random.RandomState(0) + + code = np.array([[0.5, -0.5], [0.1, 0.9]]) + dictionary = np.array([[1.0, 0.0], [0.6, 0.8]]) + + X = np.dot(code, dictionary) + rng.randn(2, 2) + + # full batch update + newd_batch = dictionary.copy() + _update_dict(newd_batch, X, code) + + # online update + A = np.dot(code.T, code) + B = np.dot(X.T, code) + newd_online = dictionary.copy() + _update_dict(newd_online, X, code, A, B) + + assert_allclose(newd_batch, newd_online) + + +@pytest.mark.parametrize( + "algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +@pytest.mark.parametrize("data_type", (np.float32, np.float64)) +# Note: do not check integer input because `lasso_lars` and `lars` fail with +# `ValueError` in `_lars_path_solver` +def test_sparse_encode_dtype_match(data_type, algorithm): + n_components = 6 + rng = np.random.RandomState(0) + dictionary = rng.randn(n_components, n_features) + code = sparse_encode( + X.astype(data_type), dictionary.astype(data_type), algorithm=algorithm + ) + assert code.dtype == data_type + + +@pytest.mark.parametrize( + "algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +def test_sparse_encode_numerical_consistency(algorithm): + # verify numerical consistency among np.float32 and np.float64 + rtol = 1e-4 + n_components = 6 + rng = np.random.RandomState(0) + dictionary = rng.randn(n_components, n_features) + code_32 = sparse_encode( + X.astype(np.float32), dictionary.astype(np.float32), algorithm=algorithm + ) + code_64 = sparse_encode( + X.astype(np.float64), dictionary.astype(np.float64), algorithm=algorithm + ) + assert_allclose(code_32, code_64, rtol=rtol) + + +@pytest.mark.parametrize( + "transform_algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +@pytest.mark.parametrize("data_type", (np.float32, np.float64)) +# Note: do not check integer input because `lasso_lars` and `lars` fail with +# `ValueError` in `_lars_path_solver` +def test_sparse_coder_dtype_match(data_type, transform_algorithm): + # Verify preserving dtype for transform in sparse coder + n_components = 6 + rng = np.random.RandomState(0) + dictionary = rng.randn(n_components, n_features) + coder = SparseCoder( + dictionary.astype(data_type), transform_algorithm=transform_algorithm + ) + code = coder.transform(X.astype(data_type)) + assert code.dtype == data_type + + +@pytest.mark.parametrize("fit_algorithm", ("lars", "cd")) +@pytest.mark.parametrize( + "transform_algorithm", ("lasso_lars", "lasso_cd", "lars", 
"threshold", "omp") +) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_dictionary_learning_dtype_match( + data_type, + expected_type, + fit_algorithm, + transform_algorithm, +): + # Verify preserving dtype for fit and transform in dictionary learning class + dict_learner = DictionaryLearning( + n_components=8, + fit_algorithm=fit_algorithm, + transform_algorithm=transform_algorithm, + random_state=0, + ) + dict_learner.fit(X.astype(data_type)) + assert dict_learner.components_.dtype == expected_type + assert dict_learner.transform(X.astype(data_type)).dtype == expected_type + + +@pytest.mark.parametrize("fit_algorithm", ("lars", "cd")) +@pytest.mark.parametrize( + "transform_algorithm", ("lasso_lars", "lasso_cd", "lars", "threshold", "omp") +) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_minibatch_dictionary_learning_dtype_match( + data_type, + expected_type, + fit_algorithm, + transform_algorithm, +): + # Verify preserving dtype for fit and transform in minibatch dictionary learning + dict_learner = MiniBatchDictionaryLearning( + n_components=8, + batch_size=10, + fit_algorithm=fit_algorithm, + transform_algorithm=transform_algorithm, + max_iter=100, + tol=1e-1, + random_state=0, + ) + dict_learner.fit(X.astype(data_type)) + + assert dict_learner.components_.dtype == expected_type + assert dict_learner.transform(X.astype(data_type)).dtype == expected_type + assert dict_learner._A.dtype == expected_type + assert dict_learner._B.dtype == expected_type + + +@pytest.mark.parametrize("method", ("lars", "cd")) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_dict_learning_dtype_match(data_type, expected_type, method): + # Verify output matrix dtype + rng = np.random.RandomState(0) + n_components = 8 + code, dictionary, _ = dict_learning( + X.astype(data_type), + n_components=n_components, + alpha=1, + random_state=rng, + method=method, + ) + assert code.dtype == expected_type + assert dictionary.dtype == expected_type + + +@pytest.mark.parametrize("method", ("lars", "cd")) +def test_dict_learning_numerical_consistency(method): + # verify numerically consistent among np.float32 and np.float64 + rtol = 1e-6 + n_components = 4 + alpha = 2 + + U_64, V_64, _ = dict_learning( + X.astype(np.float64), + n_components=n_components, + alpha=alpha, + random_state=0, + method=method, + ) + U_32, V_32, _ = dict_learning( + X.astype(np.float32), + n_components=n_components, + alpha=alpha, + random_state=0, + method=method, + ) + + # Optimal solution (U*, V*) is not unique. + # If (U*, V*) is optimal solution, (-U*,-V*) is also optimal, + # and (column permutated U*, row permutated V*) are also optional + # as long as holding UV. + # So here UV, ||U||_1,1 and sum(||V_k||_2^2) are verified + # instead of comparing directly U and V. 
+ assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol) + assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol) + assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol) + # verify an obtained solution is not degenerate + assert np.mean(U_64 != 0.0) > 0.05 + assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0) + + +@pytest.mark.parametrize("method", ("lars", "cd")) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ), +) +def test_dict_learning_online_dtype_match(data_type, expected_type, method): + # Verify output matrix dtype + rng = np.random.RandomState(0) + n_components = 8 + code, dictionary = dict_learning_online( + X.astype(data_type), + n_components=n_components, + alpha=1, + batch_size=10, + random_state=rng, + method=method, + ) + assert code.dtype == expected_type + assert dictionary.dtype == expected_type + + +@pytest.mark.parametrize("method", ("lars", "cd")) +def test_dict_learning_online_numerical_consistency(method): + # verify numerically consistent among np.float32 and np.float64 + rtol = 1e-4 + n_components = 4 + alpha = 1 + + U_64, V_64 = dict_learning_online( + X.astype(np.float64), + n_components=n_components, + max_iter=1_000, + alpha=alpha, + batch_size=10, + random_state=0, + method=method, + tol=0.0, + max_no_improvement=None, + ) + U_32, V_32 = dict_learning_online( + X.astype(np.float32), + n_components=n_components, + max_iter=1_000, + alpha=alpha, + batch_size=10, + random_state=0, + method=method, + tol=0.0, + max_no_improvement=None, + ) + + # Optimal solution (U*, V*) is not unique. + # If (U*, V*) is optimal solution, (-U*,-V*) is also optimal, + # and (column permutated U*, row permutated V*) are also optional + # as long as holding UV. + # So here UV, ||U||_1,1 and sum(||V_k||_2) are verified + # instead of comparing directly U and V. + assert_allclose(np.matmul(U_64, V_64), np.matmul(U_32, V_32), rtol=rtol) + assert_allclose(np.sum(np.abs(U_64)), np.sum(np.abs(U_32)), rtol=rtol) + assert_allclose(np.sum(V_64**2), np.sum(V_32**2), rtol=rtol) + # verify an obtained solution is not degenerate + assert np.mean(U_64 != 0.0) > 0.05 + assert np.count_nonzero(U_64 != 0.0) == np.count_nonzero(U_32 != 0.0) + + +@pytest.mark.parametrize( + "estimator", + [ + SparseCoder(X.T), + DictionaryLearning(), + MiniBatchDictionaryLearning(batch_size=4, max_iter=10), + ], + ids=lambda x: x.__class__.__name__, +) +def test_get_feature_names_out(estimator): + """Check feature names for dict learning estimators.""" + estimator.fit(X) + n_components = X.shape[1] + + feature_names_out = estimator.get_feature_names_out() + estimator_name = estimator.__class__.__name__.lower() + assert_array_equal( + feature_names_out, + [f"{estimator_name}{i}" for i in range(n_components)], + ) + + +def test_cd_work_on_joblib_memmapped_data(monkeypatch): + monkeypatch.setattr( + sklearn.decomposition._dict_learning, + "Parallel", + partial(Parallel, max_nbytes=100), + ) + + rng = np.random.RandomState(0) + X_train = rng.randn(10, 10) + + dict_learner = DictionaryLearning( + n_components=5, + random_state=0, + n_jobs=2, + fit_algorithm="cd", + max_iter=50, + verbose=True, + ) + + # This must run and complete without error. 
+ dict_learner.fit(X_train) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_factor_analysis.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_factor_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..9175829695b0d8695cdf294419869d8aaf066489 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_factor_analysis.py @@ -0,0 +1,109 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import combinations + +import numpy as np +import pytest + +from sklearn.decomposition import FactorAnalysis +from sklearn.decomposition._factor_analysis import _ortho_rotation +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal + + +def test_factor_analysis(global_random_seed): + # Test FactorAnalysis ability to recover the data covariance structure + rng = np.random.RandomState(global_random_seed) + n_samples, n_features, n_components = 20, 5, 3 + + # Some random settings for the generative model + W = rng.randn(n_components, n_features) + # latent variable of dim 3, 20 of it + h = rng.randn(n_samples, n_components) + # using gamma to model different noise variance + # per component + noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features) + + # generate observations + # wlog, mean is 0 + X = np.dot(h, W) + noise + + fas = [] + for method in ["randomized", "lapack"]: + fa = FactorAnalysis(n_components=n_components, svd_method=method) + fa.fit(X) + fas.append(fa) + + X_t = fa.transform(X) + assert X_t.shape == (n_samples, n_components) + + assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum()) + assert_almost_equal(fa.score_samples(X).mean(), fa.score(X)) + + diff = np.all(np.diff(fa.loglike_)) + assert diff > 0.0, "Log likelihood dif not increase" + + # Sample Covariance + scov = np.cov(X, rowvar=0.0, bias=1.0) + + # Model Covariance + mcov = fa.get_covariance() + diff = np.sum(np.abs(scov - mcov)) / W.size + assert diff < 0.2, "Mean absolute difference is %f" % diff + fa = FactorAnalysis( + n_components=n_components, noise_variance_init=np.ones(n_features) + ) + with pytest.raises(ValueError): + fa.fit(X[:, :2]) + + def f(x, y): + return np.abs(getattr(x, y)) # sign will not be equal + + fa1, fa2 = fas + for attr in ["loglike_", "components_", "noise_variance_"]: + assert_almost_equal(f(fa1, attr), f(fa2, attr)) + + fa1.max_iter = 1 + fa1.verbose = True + with pytest.warns(ConvergenceWarning): + fa1.fit(X) + + # Test get_covariance and get_precision with n_components == n_features + # with n_components < n_features and with n_components == 0 + for n_components in [0, 2, X.shape[1]]: + fa.n_components = n_components + fa.fit(X) + cov = fa.get_covariance() + precision = fa.get_precision() + assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12) + + # test rotation + n_components = 2 + + results, projections = {}, {} + for method in (None, "varimax", "quartimax"): + fa_var = FactorAnalysis(n_components=n_components, rotation=method) + results[method] = fa_var.fit_transform(X) + projections[method] = fa_var.get_covariance() + for rot1, rot2 in combinations([None, "varimax", "quartimax"], 2): + assert not np.allclose(results[rot1], results[rot2]) + assert np.allclose(projections[rot1], projections[rot2], atol=3) + + # test against R's psych::principal with rotate="varimax" + # (i.e., the values below stem from rotating the 
components in R) + # R's factor analysis returns quite different values; therefore, we only + # test the rotation itself + factors = np.array( + [ + [0.89421016, -0.35854928, -0.27770122, 0.03773647], + [-0.45081822, -0.89132754, 0.0932195, -0.01787973], + [0.99500666, -0.02031465, 0.05426497, -0.11539407], + [0.96822861, -0.06299656, 0.24411001, 0.07540887], + ] + ) + r_solution = np.array( + [[0.962, 0.052], [-0.141, 0.989], [0.949, -0.300], [0.937, -0.251]] + ) + rotated = _ortho_rotation(factors[:, :n_components], method="varimax").T + assert_array_almost_equal(np.abs(rotated), np.abs(r_solution), decimal=3) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_fastica.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_fastica.py new file mode 100644 index 0000000000000000000000000000000000000000..6f8c9c55db621a36b6153aeef28a830ac7675a25 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_fastica.py @@ -0,0 +1,457 @@ +""" +Test the fastica algorithm. +""" + +import itertools +import os +import warnings + +import numpy as np +import pytest +from scipy import stats + +from sklearn.decomposition import PCA, FastICA, fastica +from sklearn.decomposition._fastica import _gs_decorrelation +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils._testing import assert_allclose, ignore_warnings + + +def center_and_norm(x, axis=-1): + """Centers and norms x **in place** + + Parameters + ----------- + x: ndarray + Array with an axis of observations (statistical units) measured on + random variables. + axis: int, optional + Axis along which the mean and variance are calculated. + """ + x = np.rollaxis(x, axis) + x -= x.mean(axis=0) + x /= x.std(axis=0) + + +def test_gs(global_random_seed): + # Test gram schmidt orthonormalization + # generate a random orthogonal matrix + rng = np.random.RandomState(global_random_seed) + W, _, _ = np.linalg.svd(rng.randn(10, 10)) + w = rng.randn(10) + _gs_decorrelation(w, W, 10) + assert (w**2).sum() < 1.0e-10 + w = rng.randn(10) + u = _gs_decorrelation(w, W, 5) + tmp = np.dot(u, W.T) + assert (tmp[:5] ** 2).sum() < 1.0e-10 + + +def test_fastica_attributes_dtypes(global_dtype): + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + fica = FastICA( + n_components=5, max_iter=1000, whiten="unit-variance", random_state=0 + ).fit(X) + assert fica.components_.dtype == global_dtype + assert fica.mixing_.dtype == global_dtype + assert fica.mean_.dtype == global_dtype + assert fica.whitening_.dtype == global_dtype + + +def test_fastica_return_dtypes(global_dtype): + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + k_, mixing_, s_ = fastica( + X, max_iter=1000, whiten="unit-variance", random_state=rng + ) + assert k_.dtype == global_dtype + assert mixing_.dtype == global_dtype + assert s_.dtype == global_dtype + + +@pytest.mark.parametrize("add_noise", [True, False]) +def test_fastica_simple(add_noise, global_random_seed, global_dtype): + if ( + global_random_seed == 20 + and global_dtype == np.float32 + and not add_noise + and os.getenv("DISTRIB") == "ubuntu" + ): + pytest.xfail( + "FastICA instability with Ubuntu Atlas build with float32 " + "global_dtype. For more details, see " + "https://github.com/scikit-learn/scikit-learn/issues/24131#issuecomment-1208091119" + ) + + # Test the FastICA algorithm on very simple data. 
+ rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + # Generate two sources: + s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 + s2 = stats.t.rvs(1, size=n_samples, random_state=global_random_seed) + s = np.c_[s1, s2].T + center_and_norm(s) + s = s.astype(global_dtype) + s1, s2 = s + + # Mixing angle + phi = 0.6 + mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) + mixing = mixing.astype(global_dtype) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(2, 1000) + + center_and_norm(m) + + # function as fun arg + def g_test(x): + return x**3, (3 * x**2).mean(axis=-1) + + algos = ["parallel", "deflation"] + nls = ["logcosh", "exp", "cube", g_test] + whitening = ["arbitrary-variance", "unit-variance", False] + for algo, nl, whiten in itertools.product(algos, nls, whitening): + if whiten: + k_, mixing_, s_ = fastica( + m.T, fun=nl, whiten=whiten, algorithm=algo, random_state=rng + ) + with pytest.raises(ValueError): + fastica(m.T, fun=np.tanh, whiten=whiten, algorithm=algo) + else: + pca = PCA(n_components=2, whiten=True, random_state=rng) + X = pca.fit_transform(m.T) + k_, mixing_, s_ = fastica( + X, fun=nl, algorithm=algo, whiten=False, random_state=rng + ) + with pytest.raises(ValueError): + fastica(X, fun=np.tanh, algorithm=algo) + s_ = s_.T + # Check that the mixing model described in the docstring holds: + if whiten: + # XXX: exact reconstruction to standard relative tolerance is not + # possible. This is probably expected when add_noise is True but we + # also need a non-trivial atol in float32 when add_noise is False. + # + # Note that the 2 sources are non-Gaussian in this test. + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose(np.dot(np.dot(mixing_, k_), m), s_, atol=atol) + + center_and_norm(s_) + s1_, s2_ = s_ + # Check to see if the sources have been estimated + # in the wrong order + if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): + s2_, s1_ = s_ + s1_ *= np.sign(np.dot(s1_, s1)) + s2_ *= np.sign(np.dot(s2_, s2)) + + # Check that we have estimated the original sources + if not add_noise: + assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-2) + assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-2) + else: + assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-1) + assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-1) + + # Test FastICA class + _, _, sources_fun = fastica( + m.T, fun=nl, algorithm=algo, random_state=global_random_seed + ) + ica = FastICA(fun=nl, algorithm=algo, random_state=global_random_seed) + sources = ica.fit_transform(m.T) + assert ica.components_.shape == (2, 2) + assert sources.shape == (1000, 2) + + assert_allclose(sources_fun, sources) + # Set atol to account for the different magnitudes of the elements in sources + # (from 1e-4 to 1e1). + atol = np.max(np.abs(sources)) * (1e-5 if global_dtype == np.float32 else 1e-7) + assert_allclose(sources, ica.transform(m.T), atol=atol) + + assert ica.mixing_.shape == (2, 2) + + ica = FastICA(fun=np.tanh, algorithm=algo) + with pytest.raises(ValueError): + ica.fit(m.T) + + +def test_fastica_nowhiten(): + m = [[0, 1], [1, 0]] + + # test for issue #697 + ica = FastICA(n_components=1, whiten=False, random_state=0) + warn_msg = "Ignoring n_components with whiten=False." + with pytest.warns(UserWarning, match=warn_msg): + ica.fit(m) + assert hasattr(ica, "mixing_") + + +def test_fastica_convergence_fail(global_random_seed): + # Test the FastICA algorithm on very simple data + # (see test_non_square_fastica). 
+ # Ensure a ConvergenceWarning raised if the tolerance is sufficiently low. + rng = np.random.RandomState(global_random_seed) + + n_samples = 1000 + # Generate two sources: + t = np.linspace(0, 100, n_samples) + s1 = np.sin(t) + s2 = np.ceil(np.sin(np.pi * t)) + s = np.c_[s1, s2].T + center_and_norm(s) + + # Mixing matrix + mixing = rng.randn(6, 2) + m = np.dot(mixing, s) + + # Do fastICA with tolerance 0. to ensure failing convergence + warn_msg = ( + "FastICA did not converge. Consider increasing tolerance " + "or the maximum number of iterations." + ) + with pytest.warns(ConvergenceWarning, match=warn_msg): + ica = FastICA( + algorithm="parallel", n_components=2, random_state=rng, max_iter=2, tol=0.0 + ) + ica.fit(m.T) + + +@pytest.mark.parametrize("add_noise", [True, False]) +def test_non_square_fastica(global_random_seed, add_noise): + # Test the FastICA algorithm on very simple data. + rng = np.random.RandomState(global_random_seed) + + n_samples = 1000 + # Generate two sources: + t = np.linspace(0, 100, n_samples) + s1 = np.sin(t) + s2 = np.ceil(np.sin(np.pi * t)) + s = np.c_[s1, s2].T + center_and_norm(s) + s1, s2 = s + + # Mixing matrix + mixing = rng.randn(6, 2) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(6, n_samples) + + center_and_norm(m) + + k_, mixing_, s_ = fastica( + m.T, n_components=2, whiten="unit-variance", random_state=rng + ) + s_ = s_.T + + # Check that the mixing model described in the docstring holds: + assert_allclose(s_, np.dot(np.dot(mixing_, k_), m)) + + center_and_norm(s_) + s1_, s2_ = s_ + # Check to see if the sources have been estimated + # in the wrong order + if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): + s2_, s1_ = s_ + s1_ *= np.sign(np.dot(s1_, s1)) + s2_ *= np.sign(np.dot(s2_, s2)) + + # Check that we have estimated the original sources + if not add_noise: + assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-3) + assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-3) + + +def test_fit_transform(global_random_seed, global_dtype): + """Test unit variance of transformed data using FastICA algorithm. + + Check that `fit_transform` gives the same result as applying + `fit` and then `transform`. + + Bug #13056 + """ + # multivariate uniform data in [0, 1] + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((100, 10)).astype(global_dtype) + max_iter = 300 + for whiten, n_components in [["unit-variance", 5], [False, None]]: + n_components_ = n_components if n_components is not None else X.shape[1] + + ica = FastICA( + n_components=n_components, max_iter=max_iter, whiten=whiten, random_state=0 + ) + with warnings.catch_warnings(): + # make sure that numerical errors do not cause sqrt of negative + # values + warnings.simplefilter("error", RuntimeWarning) + # XXX: for some seeds, the model does not converge. + # However this is not what we test here. 
+ warnings.simplefilter("ignore", ConvergenceWarning) + Xt = ica.fit_transform(X) + assert ica.components_.shape == (n_components_, 10) + assert Xt.shape == (X.shape[0], n_components_) + + ica2 = FastICA( + n_components=n_components, max_iter=max_iter, whiten=whiten, random_state=0 + ) + with warnings.catch_warnings(): + # make sure that numerical errors do not cause sqrt of negative + # values + warnings.simplefilter("error", RuntimeWarning) + warnings.simplefilter("ignore", ConvergenceWarning) + ica2.fit(X) + assert ica2.components_.shape == (n_components_, 10) + Xt2 = ica2.transform(X) + + # XXX: we have to set atol for this test to pass for all seeds when + # fitting with float32 data. Is this revealing a bug? + if global_dtype: + atol = np.abs(Xt2).mean() / 1e6 + else: + atol = 0.0 # the default rtol is enough for float64 data + assert_allclose(Xt, Xt2, atol=atol) + + +@pytest.mark.filterwarnings("ignore:Ignoring n_components with whiten=False.") +@pytest.mark.parametrize( + "whiten, n_components, expected_mixing_shape", + [ + ("arbitrary-variance", 5, (10, 5)), + ("arbitrary-variance", 10, (10, 10)), + ("unit-variance", 5, (10, 5)), + ("unit-variance", 10, (10, 10)), + (False, 5, (10, 10)), + (False, 10, (10, 10)), + ], +) +def test_inverse_transform( + whiten, n_components, expected_mixing_shape, global_random_seed, global_dtype +): + # Test FastICA.inverse_transform + n_samples = 100 + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((n_samples, 10)).astype(global_dtype) + + ica = FastICA(n_components=n_components, random_state=rng, whiten=whiten) + with warnings.catch_warnings(): + # For some dataset (depending on the value of global_dtype) the model + # can fail to converge but this should not impact the definition of + # a valid inverse transform. + warnings.simplefilter("ignore", ConvergenceWarning) + Xt = ica.fit_transform(X) + assert ica.mixing_.shape == expected_mixing_shape + X2 = ica.inverse_transform(Xt) + assert X.shape == X2.shape + + # reversibility test in non-reduction case + if n_components == X.shape[1]: + # XXX: we have to set atol for this test to pass for all seeds when + # fitting with float32 data. Is this revealing a bug? + if global_dtype: + # XXX: dividing by a smaller number makes + # tests fail for some seeds. + atol = np.abs(X2).mean() / 1e5 + else: + atol = 0.0 # the default rtol is enough for float64 data + assert_allclose(X, X2, atol=atol) + + +def test_fastica_errors(): + n_features = 3 + n_samples = 10 + rng = np.random.RandomState(0) + X = rng.random_sample((n_samples, n_features)) + w_init = rng.randn(n_features + 1, n_features + 1) + with pytest.raises(ValueError, match=r"alpha must be in \[1,2\]"): + fastica(X, fun_args={"alpha": 0}) + with pytest.raises( + ValueError, match=r"w_init has invalid shape.+should be \(3L?, 3L?\)" + ): + fastica(X, w_init=w_init) + + +def test_fastica_whiten_unit_variance(global_random_seed): + """Test unit variance of transformed data using FastICA algorithm. 
+ + Bug #13056 + """ + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((100, 10)) + n_components = X.shape[1] + ica = FastICA(n_components=n_components, whiten="unit-variance", random_state=0) + Xt = ica.fit_transform(X) + + assert np.var(Xt) == pytest.approx(1.0) + + +@pytest.mark.parametrize("whiten", ["arbitrary-variance", "unit-variance", False]) +@pytest.mark.parametrize("return_X_mean", [True, False]) +@pytest.mark.parametrize("return_n_iter", [True, False]) +def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): + n_features = 3 + n_samples = 10 + rng = np.random.RandomState(0) + X = rng.random_sample((n_samples, n_features)) + + expected_len = 3 + return_X_mean + return_n_iter + + out = fastica( + X, whiten=whiten, return_n_iter=return_n_iter, return_X_mean=return_X_mean + ) + + assert len(out) == expected_len + if not whiten: + assert out[0] is None + + +@pytest.mark.parametrize("add_noise", [True, False]) +def test_fastica_simple_different_solvers(add_noise, global_random_seed): + """Test FastICA is consistent between whiten_solvers.""" + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + # Generate two sources: + s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 + s2 = stats.t.rvs(1, size=n_samples, random_state=rng) + s = np.c_[s1, s2].T + center_and_norm(s) + s1, s2 = s + + # Mixing angle + phi = rng.rand() * 2 * np.pi + mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(2, 1000) + + center_and_norm(m) + + outs = {} + for solver in ("svd", "eigh"): + ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver=solver) + sources = ica.fit_transform(m.T) + outs[solver] = sources + assert ica.components_.shape == (2, 2) + assert sources.shape == (1000, 2) + + # compared numbers are not all on the same magnitude. Using a small atol to + # make the test less brittle + assert_allclose(outs["eigh"], outs["svd"], atol=1e-12) + + +def test_fastica_eigh_low_rank_warning(global_random_seed): + """Test FastICA eigh solver raises warning for low-rank data.""" + rng = np.random.RandomState(global_random_seed) + A = rng.randn(10, 2) + X = A @ A.T + ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh") + msg = "There are some small singular values" + + with pytest.warns(UserWarning, match=msg): + with ignore_warnings(category=ConvergenceWarning): + # The FastICA solver may not converge for some data with specific + # random seeds but this happens after the whiten step so this is + # not want we want to test here. 
+ ica.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_incremental_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_incremental_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..c4ea1c222901c0159fdf90c9675cdaad4da60450 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_incremental_pca.py @@ -0,0 +1,487 @@ +"""Tests for Incremental PCA.""" + +import itertools +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn import datasets +from sklearn.decomposition import PCA, IncrementalPCA +from sklearn.utils._testing import ( + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_almost_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS + +iris = datasets.load_iris() + + +def test_incremental_pca(): + # Incremental PCA on dense arrays. + X = iris.data + batch_size = X.shape[0] // 3 + ipca = IncrementalPCA(n_components=2, batch_size=batch_size) + pca = PCA(n_components=2) + pca.fit_transform(X) + + X_transformed = ipca.fit_transform(X) + + assert X_transformed.shape == (X.shape[0], 2) + np.testing.assert_allclose( + ipca.explained_variance_ratio_.sum(), + pca.explained_variance_ratio_.sum(), + rtol=1e-3, + ) + + for n_components in [1, 2, X.shape[1]]: + ipca = IncrementalPCA(n_components, batch_size=batch_size) + ipca.fit(X) + cov = ipca.get_covariance() + precision = ipca.get_precision() + np.testing.assert_allclose( + np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-13 + ) + + +@pytest.mark.parametrize( + "sparse_container", CSC_CONTAINERS + CSR_CONTAINERS + LIL_CONTAINERS +) +def test_incremental_pca_sparse(sparse_container): + # Incremental PCA on sparse arrays. + X = iris.data + pca = PCA(n_components=2) + pca.fit_transform(X) + X_sparse = sparse_container(X) + batch_size = X_sparse.shape[0] // 3 + ipca = IncrementalPCA(n_components=2, batch_size=batch_size) + + X_transformed = ipca.fit_transform(X_sparse) + + assert X_transformed.shape == (X_sparse.shape[0], 2) + np.testing.assert_allclose( + ipca.explained_variance_ratio_.sum(), + pca.explained_variance_ratio_.sum(), + rtol=1e-3, + ) + + for n_components in [1, 2, X.shape[1]]: + ipca = IncrementalPCA(n_components, batch_size=batch_size) + ipca.fit(X_sparse) + cov = ipca.get_covariance() + precision = ipca.get_precision() + np.testing.assert_allclose( + np.dot(cov, precision), np.eye(X_sparse.shape[1]), atol=1e-13 + ) + + with pytest.raises( + TypeError, + match=( + "IncrementalPCA.partial_fit does not support " + "sparse input. Either convert data to dense " + "or use IncrementalPCA.fit to do so in batches." + ), + ): + ipca.partial_fit(X_sparse) + + +def test_incremental_pca_check_projection(global_random_seed): + # Test that the projection of data is correct. 
+ rng = np.random.RandomState(global_random_seed) + n, p = 100, 3 + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5]) + Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5]) + + # Get the reconstruction of the generated data X + # Note that Xt has the same "components" as X, just separated + # This is what we want to ensure is recreated correctly + Yt = IncrementalPCA(n_components=2).fit(X).transform(Xt) + + # Normalize + Yt /= np.sqrt((Yt**2).sum()) + + # Make sure that the first element of Yt is ~1, this means + # the reconstruction worked as expected + assert_almost_equal(np.abs(Yt[0][0]), 1.0, 1) + + +def test_incremental_pca_inverse(global_random_seed): + # Test that the projection of data can be inverted. + rng = np.random.RandomState(global_random_seed) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X) + Y = ipca.transform(X) + Y_inverse = ipca.inverse_transform(Y) + assert_almost_equal(X, Y_inverse, decimal=3) + + +def test_incremental_pca_validation(): + # Test that n_components is <= n_features. + X = np.array([[0, 1, 0], [1, 0, 0]]) + n_samples, n_features = X.shape + n_components = 4 + with pytest.raises( + ValueError, + match=( + "n_components={} invalid" + " for n_features={}, need more rows than" + " columns for IncrementalPCA" + " processing".format(n_components, n_features) + ), + ): + IncrementalPCA(n_components, batch_size=10).fit(X) + + # Test that n_components is also <= n_samples in first call to partial fit. + n_components = 3 + with pytest.raises( + ValueError, + match=( + f"n_components={n_components} must be less or equal to the batch " + f"number of samples {n_samples} for the first partial_fit call." + ), + ): + IncrementalPCA(n_components=n_components).partial_fit(X) + + +def test_n_samples_equal_n_components(): + # Ensures no warning is raised when n_samples==n_components + # Non-regression test for gh-19050 + ipca = IncrementalPCA(n_components=5) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + ipca.partial_fit(np.random.randn(5, 7)) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + ipca.fit(np.random.randn(5, 7)) + + +def test_n_components_none(): + # Ensures that n_components == None is handled correctly + rng = np.random.RandomState(1999) + for n_samples, n_features in [(50, 10), (10, 50)]: + X = rng.rand(n_samples, n_features) + ipca = IncrementalPCA(n_components=None) + + # First partial_fit call, ipca.n_components_ is inferred from + # min(X.shape) + ipca.partial_fit(X) + assert ipca.n_components_ == min(X.shape) + + # Second partial_fit call, ipca.n_components_ is inferred from + # ipca.components_ computed from the first partial_fit call + ipca.partial_fit(X) + assert ipca.n_components_ == ipca.components_.shape[0] + + +def test_incremental_pca_set_params(): + # Test that components_ sign is stable over batch sizes. 
+ rng = np.random.RandomState(1999) + n_samples = 100 + n_features = 20 + X = rng.randn(n_samples, n_features) + X2 = rng.randn(n_samples, n_features) + X3 = rng.randn(n_samples, n_features) + ipca = IncrementalPCA(n_components=20) + ipca.fit(X) + # Decreasing number of components + ipca.set_params(n_components=10) + with pytest.raises(ValueError): + ipca.partial_fit(X2) + # Increasing number of components + ipca.set_params(n_components=15) + with pytest.raises(ValueError): + ipca.partial_fit(X3) + # Returning to original setting + ipca.set_params(n_components=20) + ipca.partial_fit(X) + + +def test_incremental_pca_num_features_change(): + # Test that changing n_components will raise an error. + rng = np.random.RandomState(1999) + n_samples = 100 + X = rng.randn(n_samples, 20) + X2 = rng.randn(n_samples, 50) + ipca = IncrementalPCA(n_components=None) + ipca.fit(X) + with pytest.raises(ValueError): + ipca.partial_fit(X2) + + +def test_incremental_pca_batch_signs(global_random_seed): + # Test that components_ sign is stable over batch sizes. + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 3 + X = rng.randn(n_samples, n_features) + all_components = [] + batch_sizes = np.arange(10, 20) + for batch_size in batch_sizes: + ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X) + all_components.append(ipca.components_) + + for i, j in itertools.pairwise(all_components): + assert_almost_equal(np.sign(i), np.sign(j), decimal=6) + + +def test_incremental_pca_partial_fit_small_batch(): + # Test that there is no minimum batch size after the first partial_fit + # Non-regression test + rng = np.random.RandomState(1999) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + n_components = p + pipca = IncrementalPCA(n_components=n_components) + pipca.partial_fit(X[:n_components]) + for idx in range(n_components, n): + pipca.partial_fit(X[idx : idx + 1]) + + pca = PCA(n_components=n_components) + pca.fit(X) + + assert_allclose(pca.components_, pipca.components_, atol=1e-3) + + +def test_incremental_pca_batch_values(global_random_seed): + # Test that components_ values are stable over batch sizes. + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 3 + X = rng.randn(n_samples, n_features) + all_components = [] + batch_sizes = np.arange(20, 40, 3) + for batch_size in batch_sizes: + ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X) + all_components.append(ipca.components_) + + for i, j in itertools.pairwise(all_components): + assert_almost_equal(i, j, decimal=1) + + +def test_incremental_pca_batch_rank(): + # Test sample size in each batch is always larger or equal to n_components + rng = np.random.RandomState(1999) + n_samples = 100 + n_features = 20 + X = rng.randn(n_samples, n_features) + all_components = [] + batch_sizes = np.arange(20, 90, 3) + for batch_size in batch_sizes: + ipca = IncrementalPCA(n_components=20, batch_size=batch_size).fit(X) + all_components.append(ipca.components_) + + for components_i, components_j in itertools.pairwise(all_components): + assert_allclose_dense_sparse(components_i, components_j) + + +def test_incremental_pca_partial_fit(global_random_seed): + # Test that fit and partial_fit get equivalent results. 
+ rng = np.random.RandomState(global_random_seed) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + batch_size = 10 + ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X) + pipca = IncrementalPCA(n_components=2, batch_size=batch_size) + # Add one to make sure endpoint is included + batch_itr = np.arange(0, n + 1, batch_size) + for i, j in itertools.pairwise(batch_itr): + pipca.partial_fit(X[i:j, :]) + assert_almost_equal(ipca.components_, pipca.components_, decimal=3) + + +def test_incremental_pca_against_pca_iris(): + # Test that IncrementalPCA and PCA are approximate (to a sign flip). + X = iris.data + + Y_pca = PCA(n_components=2).fit_transform(X) + Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X) + + assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1) + + +def test_incremental_pca_against_pca_random_data(global_random_seed): + # Test that IncrementalPCA and PCA are approximate (to a sign flip). + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 3 + X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features) + + Y_pca = PCA(n_components=3).fit_transform(X) + Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X) + + assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1) + + +def test_explained_variances(): + # Test that PCA and IncrementalPCA calculations match + X = datasets.make_low_rank_matrix( + 1000, 100, tail_strength=0.0, effective_rank=10, random_state=1999 + ) + prec = 3 + n_samples, n_features = X.shape + for nc in [None, 99]: + pca = PCA(n_components=nc).fit(X) + ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X) + assert_almost_equal( + pca.explained_variance_, ipca.explained_variance_, decimal=prec + ) + assert_almost_equal( + pca.explained_variance_ratio_, ipca.explained_variance_ratio_, decimal=prec + ) + assert_almost_equal(pca.noise_variance_, ipca.noise_variance_, decimal=prec) + + +def test_singular_values(global_random_seed): + # Check that the IncrementalPCA output has the correct singular values + + rng = np.random.RandomState(global_random_seed) + n_samples = 1000 + n_features = 100 + + X = datasets.make_low_rank_matrix( + n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng + ) + + pca = PCA(n_components=10, svd_solver="full", random_state=rng).fit(X) + ipca = IncrementalPCA(n_components=10, batch_size=150).fit(X) + assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2) + + # Compare to the Frobenius norm + X_pca = pca.transform(X) + X_ipca = ipca.transform(X) + assert_array_almost_equal( + np.sum(pca.singular_values_**2.0), np.linalg.norm(X_pca, "fro") ** 2.0, 12 + ) + assert_array_almost_equal( + np.sum(ipca.singular_values_**2.0), np.linalg.norm(X_ipca, "fro") ** 2.0, 2 + ) + + # Compare to the 2-norms of the score vectors + assert_array_almost_equal( + pca.singular_values_, np.sqrt(np.sum(X_pca**2.0, axis=0)), 12 + ) + assert_array_almost_equal( + ipca.singular_values_, np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2 + ) + + # Set the singular values and see what we get back + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 110 + + X = datasets.make_low_rank_matrix( + n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng + ) + + pca 
= PCA(n_components=3, svd_solver="full", random_state=rng) + ipca = IncrementalPCA(n_components=3, batch_size=100) + + X_pca = pca.fit_transform(X) + X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) + X_pca[:, 0] *= 3.142 + X_pca[:, 1] *= 2.718 + + X_hat = np.dot(X_pca, pca.components_) + pca.fit(X_hat) + ipca.fit(X_hat) + assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14) + assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14) + + +def test_whitening(global_random_seed): + # Test that PCA and IncrementalPCA transforms match to sign flip. + X = datasets.make_low_rank_matrix( + 1000, 10, tail_strength=0.0, effective_rank=2, random_state=global_random_seed + ) + atol = 1e-3 + for nc in [None, 9]: + pca = PCA(whiten=True, n_components=nc).fit(X) + ipca = IncrementalPCA(whiten=True, n_components=nc, batch_size=250).fit(X) + + # Since the data is rank deficient, some components are pure noise. We + # should not expect those dimensions to carry any signal and their + # values might be arbitrarily changed by implementation details of the + # internal SVD solver. We therefore filter them out before comparison. + stable_mask = pca.explained_variance_ratio_ > 1e-12 + + Xt_pca = pca.transform(X) + Xt_ipca = ipca.transform(X) + assert_allclose( + np.abs(Xt_pca)[:, stable_mask], + np.abs(Xt_ipca)[:, stable_mask], + atol=atol, + ) + + # The noisy dimensions are in the null space of the inverse transform, + # so they are not influencing the reconstruction. We therefore don't + # need to apply the mask here. + Xinv_ipca = ipca.inverse_transform(Xt_ipca) + Xinv_pca = pca.inverse_transform(Xt_pca) + assert_allclose(X, Xinv_ipca, atol=atol) + assert_allclose(X, Xinv_pca, atol=atol) + assert_allclose(Xinv_pca, Xinv_ipca, atol=atol) + + +def test_incremental_pca_partial_fit_float_division(): + # Test to ensure float division is used in all versions of Python + # (non-regression test for issue #9489) + + rng = np.random.RandomState(0) + A = rng.randn(5, 3) + 2 + B = rng.randn(7, 3) + 5 + + pca = IncrementalPCA(n_components=2) + pca.partial_fit(A) + # Set n_samples_seen_ to be a floating point number instead of an int + pca.n_samples_seen_ = float(pca.n_samples_seen_) + pca.partial_fit(B) + singular_vals_float_samples_seen = pca.singular_values_ + + pca2 = IncrementalPCA(n_components=2) + pca2.partial_fit(A) + pca2.partial_fit(B) + singular_vals_int_samples_seen = pca2.singular_values_ + + np.testing.assert_allclose( + singular_vals_float_samples_seen, singular_vals_int_samples_seen + ) + + +def test_incremental_pca_fit_overflow_error(): + # Test for overflow error on Windows OS + # (non-regression test for issue #17693) + rng = np.random.RandomState(0) + A = rng.rand(500000, 2) + + ipca = IncrementalPCA(n_components=2, batch_size=10000) + ipca.fit(A) + + pca = PCA(n_components=2) + pca.fit(A) + + np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_) + + +def test_incremental_pca_feature_names_out(): + """Check feature names out for IncrementalPCA.""" + ipca = IncrementalPCA(n_components=2).fit(iris.data) + + names = ipca.get_feature_names_out() + assert_array_equal([f"incrementalpca{i}" for i in range(2)], names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_kernel_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_kernel_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..57ae75c184622679b1db7350eab8b1ff9f94296e --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_kernel_pca.py @@ -0,0 +1,566 @@ +import warnings + +import numpy as np +import pytest + +import sklearn +from sklearn.datasets import load_iris, make_blobs, make_circles +from sklearn.decomposition import PCA, KernelPCA +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import Perceptron +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import _check_psd_eigenvalues + + +def test_kernel_pca(global_random_seed): + """Nominal test for all solvers and all known kernels + a custom one + + It tests + - that fit_transform is equivalent to fit+transform + - that the shapes of transforms and inverse transforms are correct + """ + rng = np.random.RandomState(global_random_seed) + X_fit = rng.random_sample((5, 4)) + X_pred = rng.random_sample((2, 4)) + + def histogram(x, y, **kwargs): + # Histogram kernel implemented as a callable. + assert kwargs == {} # no kernel_params that we didn't ask for + return np.minimum(x, y).sum() + + for eigen_solver in ("auto", "dense", "arpack", "randomized"): + for kernel in ("linear", "rbf", "poly", histogram): + # histogram kernel produces singular matrix inside linalg.solve + # XXX use a least-squares approximation? + inv = not callable(kernel) + + # transform fit data + kpca = KernelPCA( + 4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv + ) + X_fit_transformed = kpca.fit_transform(X_fit) + X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit) + assert_array_almost_equal( + np.abs(X_fit_transformed), np.abs(X_fit_transformed2) + ) + + # non-regression test: previously, gamma would be 0 by default, + # forcing all eigenvalues to 0 under the poly kernel + assert X_fit_transformed.size != 0 + + # transform new data + X_pred_transformed = kpca.transform(X_pred) + assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1] + + # inverse transform + if inv: + X_pred2 = kpca.inverse_transform(X_pred_transformed) + assert X_pred2.shape == X_pred.shape + + +def test_kernel_pca_invalid_parameters(): + """Check that kPCA raises an error if the parameters are invalid + + Tests fitting inverse transform with a precomputed kernel raises a + ValueError. + """ + estimator = KernelPCA( + n_components=10, fit_inverse_transform=True, kernel="precomputed" + ) + err_ms = "Cannot fit_inverse_transform with a precomputed kernel" + with pytest.raises(ValueError, match=err_ms): + estimator.fit(np.random.randn(10, 10)) + + +def test_kernel_pca_consistent_transform(global_random_seed): + """Check robustness to mutations in the original training array + + Test that after fitting a kPCA model, it stays independent of any + mutation of the values of the original data object by relying on an + internal copy. 
+ """ + # X_fit_ needs to retain the old, unmodified copy of X + state = np.random.RandomState(global_random_seed) + X = state.rand(10, 10) + kpca = KernelPCA(random_state=state).fit(X) + transformed1 = kpca.transform(X) + + X_copy = X.copy() + X[:, 0] = 666 + transformed2 = kpca.transform(X_copy) + assert_array_almost_equal(transformed1, transformed2) + + +def test_kernel_pca_deterministic_output(global_random_seed): + """Test that Kernel PCA produces deterministic output + + Tests that the same inputs and random state produce the same output. + """ + rng = np.random.RandomState(global_random_seed) + X = rng.rand(10, 10) + eigen_solver = ("arpack", "dense") + + for solver in eigen_solver: + transformed_X = np.zeros((20, 2)) + for i in range(20): + kpca = KernelPCA(n_components=2, eigen_solver=solver, random_state=rng) + transformed_X[i, :] = kpca.fit_transform(X)[0] + assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_kernel_pca_sparse(csr_container, global_random_seed): + """Test that kPCA works on a sparse data input. + + Same test as ``test_kernel_pca except inverse_transform`` since it's not + implemented for sparse matrices. + """ + rng = np.random.RandomState(global_random_seed) + X_fit = csr_container(rng.random_sample((5, 4))) + X_pred = csr_container(rng.random_sample((2, 4))) + + for eigen_solver in ("auto", "arpack", "randomized"): + for kernel in ("linear", "rbf", "poly"): + # transform fit data + kpca = KernelPCA( + 4, + kernel=kernel, + eigen_solver=eigen_solver, + fit_inverse_transform=False, + random_state=0, + ) + X_fit_transformed = kpca.fit_transform(X_fit) + X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit) + assert_array_almost_equal( + np.abs(X_fit_transformed), np.abs(X_fit_transformed2) + ) + + # transform new data + X_pred_transformed = kpca.transform(X_pred) + assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1] + + # inverse transform: not available for sparse matrices + # XXX: should we raise another exception type here? For instance: + # NotImplementedError. + with pytest.raises(NotFittedError): + kpca.inverse_transform(X_pred_transformed) + + +@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) +@pytest.mark.parametrize("n_features", [4, 10]) +def test_kernel_pca_linear_kernel(solver, n_features, global_random_seed): + """Test that kPCA with linear kernel is equivalent to PCA for all solvers. + + KernelPCA with linear kernel should produce the same output as PCA. + """ + rng = np.random.RandomState(global_random_seed) + X_fit = rng.random_sample((5, n_features)) + X_pred = rng.random_sample((2, n_features)) + + # for a linear kernel, kernel PCA should find the same projection as PCA + # modulo the sign (direction) + # fit only the first four components: fifth is near zero eigenvalue, so + # can be trimmed due to roundoff error + n_comps = 3 if solver == "arpack" else 4 + assert_array_almost_equal( + np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit).transform(X_pred)), + np.abs( + PCA(n_comps, svd_solver=solver if solver != "dense" else "full") + .fit(X_fit) + .transform(X_pred) + ), + ) + + +def test_kernel_pca_n_components(): + """Test that `n_components` is correctly taken into account for projections + + For all solvers this tests that the output has the correct shape depending + on the selected number of components. 
+ """ + rng = np.random.RandomState(0) + X_fit = rng.random_sample((5, 4)) + X_pred = rng.random_sample((2, 4)) + + for eigen_solver in ("dense", "arpack", "randomized"): + for c in [1, 2, 4]: + kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver) + shape = kpca.fit(X_fit).transform(X_pred).shape + + assert shape == (2, c) + + +def test_remove_zero_eig(): + """Check that the ``remove_zero_eig`` parameter works correctly. + + Tests that the null-space (Zero) eigenvalues are removed when + remove_zero_eig=True, whereas they are not by default. + """ + X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]]) + + # n_components=None (default) => remove_zero_eig is True + kpca = KernelPCA() + Xt = kpca.fit_transform(X) + assert Xt.shape == (3, 0) + + kpca = KernelPCA(n_components=2) + Xt = kpca.fit_transform(X) + assert Xt.shape == (3, 2) + + kpca = KernelPCA(n_components=2, remove_zero_eig=True) + Xt = kpca.fit_transform(X) + assert Xt.shape == (3, 0) + + +def test_leave_zero_eig(): + """Non-regression test for issue #12141 (PR #12143) + + This test checks that fit().transform() returns the same result as + fit_transform() in case of non-removed zero eigenvalue. + """ + X_fit = np.array([[1, 1], [0, 0]]) + + # Assert that even with all np warnings on, there is no div by zero warning + with warnings.catch_warnings(): + # There might be warnings about the kernel being badly conditioned, + # but there should not be warnings about division by zero. + # (Numpy division by zero warning can have many message variants, but + # at least we know that it is a RuntimeWarning so lets check only this) + warnings.simplefilter("error", RuntimeWarning) + with np.errstate(all="warn"): + k = KernelPCA(n_components=2, remove_zero_eig=False, eigen_solver="dense") + # Fit, then transform + A = k.fit(X_fit).transform(X_fit) + # Do both at once + B = k.fit_transform(X_fit) + # Compare + assert_array_almost_equal(np.abs(A), np.abs(B)) + + +def test_kernel_pca_precomputed(global_random_seed): + """Test that kPCA works with a precomputed kernel, for all solvers""" + rng = np.random.RandomState(global_random_seed) + X_fit = rng.random_sample((5, 4)) + X_pred = rng.random_sample((2, 4)) + + for eigen_solver in ("dense", "arpack", "randomized"): + X_kpca = ( + KernelPCA(4, eigen_solver=eigen_solver, random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + + X_kpca2 = ( + KernelPCA( + 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 + ) + .fit(np.dot(X_fit, X_fit.T)) + .transform(np.dot(X_pred, X_fit.T)) + ) + + X_kpca_train = KernelPCA( + 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 + ).fit_transform(np.dot(X_fit, X_fit.T)) + + X_kpca_train2 = ( + KernelPCA( + 4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0 + ) + .fit(np.dot(X_fit, X_fit.T)) + .transform(np.dot(X_fit, X_fit.T)) + ) + + assert_array_almost_equal(np.abs(X_kpca), np.abs(X_kpca2)) + + assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2)) + + +@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) +def test_kernel_pca_precomputed_non_symmetric(solver): + """Check that the kernel centerer works. + + Tests that a non symmetric precomputed kernel is actually accepted + because the kernel centerer does its job correctly. 
+ """ + + # a non symmetric gram matrix + K = [[1, 2], [3, 40]] + kpca = KernelPCA( + kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0 + ) + kpca.fit(K) # no error + + # same test with centered kernel + Kc = [[9, -9], [-9, 9]] + kpca_c = KernelPCA( + kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0 + ) + kpca_c.fit(Kc) + + # comparison between the non-centered and centered versions + assert_array_equal(kpca.eigenvectors_, kpca_c.eigenvectors_) + assert_array_equal(kpca.eigenvalues_, kpca_c.eigenvalues_) + + +def test_gridsearch_pipeline(): + """Check that kPCA works as expected in a grid search pipeline + + Test if we can do a grid-search to find parameters to separate + circles with a perceptron model. + """ + X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) + kpca = KernelPCA(kernel="rbf", n_components=2) + pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) + param_grid = dict(kernel_pca__gamma=2.0 ** np.arange(-2, 2)) + grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) + grid_search.fit(X, y) + assert grid_search.best_score_ == 1 + + +def test_gridsearch_pipeline_precomputed(): + """Check that kPCA works as expected in a grid search pipeline (2) + + Test if we can do a grid-search to find parameters to separate + circles with a perceptron model. This test uses a precomputed kernel. + """ + X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) + kpca = KernelPCA(kernel="precomputed", n_components=2) + pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) + param_grid = dict(Perceptron__max_iter=np.arange(1, 5)) + grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) + X_kernel = rbf_kernel(X, gamma=2.0) + grid_search.fit(X_kernel, y) + assert grid_search.best_score_ == 1 + + +def test_nested_circles(): + """Check that kPCA projects in a space where nested circles are separable + + Tests that 2D nested circles become separable with a perceptron when + projected in the first 2 kPCA using an RBF kernel, while raw samples + are not directly separable in the original space. + """ + X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0) + + # 2D nested circles are not linearly separable + train_score = Perceptron(max_iter=5).fit(X, y).score(X, y) + assert train_score < 0.8 + + # Project the circles data into the first 2 components of a RBF Kernel + # PCA model. + # Note that the gamma value is data dependent. If this test breaks + # and the gamma value has to be updated, the Kernel PCA example will + # have to be updated too. + kpca = KernelPCA( + kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.0 + ) + X_kpca = kpca.fit_transform(X) + + # The data is perfectly linearly separable in that space + train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y) + assert train_score == 1.0 + + +def test_kernel_conditioning(): + """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA + + Non-regression test for issue #12140 (PR #12145). 
+ """ + + # create a pathological X leading to small non-zero eigenvalue + X = [[5, 1], [5 + 1e-8, 1e-8], [5 + 1e-8, 0]] + kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True) + kpca.fit(X) + + # check that the small non-zero eigenvalue was correctly set to zero + assert kpca.eigenvalues_.min() == 0 + assert np.all(kpca.eigenvalues_ == _check_psd_eigenvalues(kpca.eigenvalues_)) + + +@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"]) +def test_precomputed_kernel_not_psd(solver): + """Check how KernelPCA works with non-PSD kernels depending on n_components + + Tests for all methods what happens with a non PSD gram matrix (this + can happen in an isomap scenario, or with custom kernel functions, or + maybe with ill-posed datasets). + + When ``n_component`` is large enough to capture a negative eigenvalue, an + error should be raised. Otherwise, KernelPCA should run without error + since the negative eigenvalues are not selected. + """ + + # a non PSD kernel with large eigenvalues, already centered + # it was captured from an isomap call and multiplied by 100 for compacity + K = [ + [4.48, -1.0, 8.07, 2.33, 2.33, 2.33, -5.76, -12.78], + [-1.0, -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49], + [8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23], + [2.33, -1.24, 2.09, 4.0, -3.65, -3.65, 1.02, -0.9], + [2.33, -1.24, 2.09, -3.65, 4.0, -3.65, 1.02, -0.9], + [2.33, -1.24, 2.09, -3.65, -3.65, 4.0, 1.02, -0.9], + [-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75], + [-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46], + ] + # this gram matrix has 5 positive eigenvalues and 3 negative ones + # [ 52.72, 7.65, 7.65, 5.02, 0. , -0. , -6.13, -15.11] + + # 1. ask for enough components to get a significant negative one + kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7) + # make sure that the appropriate error is raised + with pytest.raises(ValueError, match="There are significant negative eigenvalues"): + kpca.fit(K) + + # 2. ask for a small enough n_components to get only positive ones + kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2) + if solver == "randomized": + # the randomized method is still inconsistent with the others on this + # since it selects the eigenvalues based on the largest 2 modules, not + # on the largest 2 values. 
+ # + # At least we can ensure that we return an error instead of returning + # the wrong eigenvalues + with pytest.raises( + ValueError, match="There are significant negative eigenvalues" + ): + kpca.fit(K) + else: + # general case: make sure that it works + kpca.fit(K) + + +@pytest.mark.parametrize("n_components", [4, 10, 20]) +def test_kernel_pca_solvers_equivalence(n_components): + """Check that 'dense' 'arpack' & 'randomized' solvers give similar results""" + + # Generate random data + n_train, n_test = 1_000, 100 + X, _ = make_circles( + n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0 + ) + X_fit, X_pred = X[:n_train, :], X[n_train:, :] + + # reference (full) + ref_pred = ( + KernelPCA(n_components, eigen_solver="dense", random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + + # arpack + a_pred = ( + KernelPCA(n_components, eigen_solver="arpack", random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + # check that the result is still correct despite the approx + assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred)) + + # randomized + r_pred = ( + KernelPCA(n_components, eigen_solver="randomized", random_state=0) + .fit(X_fit) + .transform(X_pred) + ) + # check that the result is still correct despite the approximation + assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred)) + + +def test_kernel_pca_inverse_transform_reconstruction(): + """Test if the reconstruction is a good approximation. + + Note that in general it is not possible to get an arbitrarily good + reconstruction because of kernel centering that does not + preserve all the information of the original data. + """ + X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) + + kpca = KernelPCA( + n_components=20, kernel="rbf", fit_inverse_transform=True, alpha=1e-3 + ) + X_trans = kpca.fit_transform(X) + X_reconst = kpca.inverse_transform(X_trans) + assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1 + + +def test_kernel_pca_raise_not_fitted_error(): + X = np.random.randn(15).reshape(5, 3) + kpca = KernelPCA() + kpca.fit(X) + with pytest.raises(NotFittedError): + kpca.inverse_transform(X) + + +def test_32_64_decomposition_shape(): + """Test that the decomposition is similar for 32 and 64 bits data + + Non regression test for + https://github.com/scikit-learn/scikit-learn/issues/18146 + """ + X, y = make_blobs( + n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, cluster_std=0.1 + ) + X = StandardScaler().fit_transform(X) + X -= X.min() + + # Compare the shapes (corresponds to the number of non-zero eigenvalues) + kpca = KernelPCA() + assert kpca.fit_transform(X).shape == kpca.fit_transform(X.astype(np.float32)).shape + + +def test_kernel_pca_feature_names_out(): + """Check feature names out for KernelPCA.""" + X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0) + kpca = KernelPCA(n_components=2).fit(X) + + names = kpca.get_feature_names_out() + assert_array_equal([f"kernelpca{i}" for i in range(2)], names) + + +def test_kernel_pca_inverse_correct_gamma(global_random_seed): + """Check that gamma is set correctly when not provided. 
+ + Non-regression test for #26280 + """ + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((5, 4)) + + kwargs = { + "n_components": 2, + "random_state": rng, + "fit_inverse_transform": True, + "kernel": "rbf", + } + + expected_gamma = 1 / X.shape[1] + kpca1 = KernelPCA(gamma=None, **kwargs).fit(X) + kpca2 = KernelPCA(gamma=expected_gamma, **kwargs).fit(X) + + assert kpca1.gamma_ == expected_gamma + assert kpca2.gamma_ == expected_gamma + + X1_recon = kpca1.inverse_transform(kpca1.transform(X)) + X2_recon = kpca2.inverse_transform(kpca1.transform(X)) + + assert_allclose(X1_recon, X2_recon) + + +def test_kernel_pca_pandas_output(): + """Check that KernelPCA works with pandas output when the solver is arpack. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27579 + """ + pytest.importorskip("pandas") + X, _ = load_iris(as_frame=True, return_X_y=True) + with sklearn.config_context(transform_output="pandas"): + KernelPCA(n_components=2, eigen_solver="arpack").fit_transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_nmf.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_nmf.py new file mode 100644 index 0000000000000000000000000000000000000000..17be798b3f3921460cb2378c798209de60c963a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_nmf.py @@ -0,0 +1,1010 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest +from scipy import linalg + +from sklearn.base import clone +from sklearn.decomposition import NMF, MiniBatchNMF, non_negative_factorization +from sklearn.decomposition import _nmf as nmf # For testing internals +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import squared_norm +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_convergence_warning(Estimator, solver): + convergence_warning = ( + "Maximum number of iterations 1 reached. Increase it to improve convergence." + ) + A = np.ones((2, 2)) + with pytest.warns(ConvergenceWarning, match=convergence_warning): + Estimator(max_iter=1, n_components="auto", **solver).fit(A) + + +def test_initialize_nn_output(): + # Test that initialization does not return negative values + rng = np.random.mtrand.RandomState(42) + data = np.abs(rng.randn(10, 10)) + for init in ("random", "nndsvd", "nndsvda", "nndsvdar"): + W, H = nmf._initialize_nmf(data, 10, init=init, random_state=0) + assert not ((W < 0).any() or (H < 0).any()) + + +@pytest.mark.filterwarnings( + r"ignore:The multiplicative update \('mu'\) solver cannot update zeros present in" + r" the initialization", +) +def test_parameter_checking(): + # Here we only check for invalid parameter values that are not already + # automatically tested in the common tests. 
+ + A = np.ones((2, 2)) + + msg = "Invalid beta_loss parameter: solver 'cd' does not handle beta_loss = 1.0" + with pytest.raises(ValueError, match=msg): + NMF(solver="cd", beta_loss=1.0).fit(A) + msg = "Negative values in data passed to" + with pytest.raises(ValueError, match=msg): + NMF().fit(-A) + clf = NMF(2, tol=0.1).fit(A) + with pytest.raises(ValueError, match=msg): + clf.transform(-A) + with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(-A, 2, "nndsvd") + + for init in ["nndsvd", "nndsvda", "nndsvdar"]: + msg = re.escape( + "init = '{}' can only be used when " + "n_components <= min(n_samples, n_features)".format(init) + ) + with pytest.raises(ValueError, match=msg): + NMF(3, init=init).fit(A) + with pytest.raises(ValueError, match=msg): + MiniBatchNMF(3, init=init).fit(A) + with pytest.raises(ValueError, match=msg): + nmf._initialize_nmf(A, 3, init) + + +def test_initialize_close(): + # Test NNDSVD error + # Test that _initialize_nmf error is less than the standard deviation of + # the entries in the matrix. + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(10, 10)) + W, H = nmf._initialize_nmf(A, 10, init="nndsvd") + error = linalg.norm(np.dot(W, H) - A) + sdev = linalg.norm(A - A.mean()) + assert error <= sdev + + +def test_initialize_variants(): + # Test NNDSVD variants correctness + # Test that the variants 'nndsvda' and 'nndsvdar' differ from basic + # 'nndsvd' only where the basic version has zeros. + rng = np.random.mtrand.RandomState(42) + data = np.abs(rng.randn(10, 10)) + W0, H0 = nmf._initialize_nmf(data, 10, init="nndsvd") + Wa, Ha = nmf._initialize_nmf(data, 10, init="nndsvda") + War, Har = nmf._initialize_nmf(data, 10, init="nndsvdar", random_state=0) + + for ref, evl in ((W0, Wa), (W0, War), (H0, Ha), (H0, Har)): + assert_almost_equal(evl[ref != 0], ref[ref != 0]) + + +# ignore UserWarning raised when both solver='mu' and init='nndsvd' +@pytest.mark.filterwarnings( + r"ignore:The multiplicative update \('mu'\) solver cannot update zeros present in" + r" the initialization" +) +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +@pytest.mark.parametrize("init", (None, "nndsvd", "nndsvda", "nndsvdar", "random")) +@pytest.mark.parametrize("alpha_W", (0.0, 1.0)) +@pytest.mark.parametrize("alpha_H", (0.0, 1.0, "same")) +def test_nmf_fit_nn_output(Estimator, solver, init, alpha_W, alpha_H): + # Test that the decomposition does not contain negative values + A = np.c_[5.0 - np.arange(1, 6), 5.0 + np.arange(1, 6)] + model = Estimator( + n_components=2, + init=init, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=0, + **solver, + ) + transf = model.fit_transform(A) + assert not ((model.components_ < 0).any() or (transf < 0).any()) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_fit_close(Estimator, solver): + rng = np.random.mtrand.RandomState(42) + # Test that the fit is not too far away + pnmf = Estimator( + 5, + init="nndsvdar", + random_state=0, + max_iter=600, + **solver, + ) + X = np.abs(rng.randn(6, 5)) + assert pnmf.fit(X).reconstruction_err_ < 0.1 + + +def test_nmf_true_reconstruction(): + # Test that the fit is not too far away from an exact solution + # (by construction) + n_samples = 15 + n_features = 10 + n_components = 5 + beta_loss = 1 + batch_size = 3 + max_iter = 1000 + + rng = np.random.mtrand.RandomState(42) + W_true = np.zeros([n_samples, 
n_components]) + W_array = np.abs(rng.randn(n_samples)) + for j in range(n_components): + W_true[j % n_samples, j] = W_array[j % n_samples] + H_true = np.zeros([n_components, n_features]) + H_array = np.abs(rng.randn(n_components)) + for j in range(n_features): + H_true[j % n_components, j] = H_array[j % n_components] + X = np.dot(W_true, H_true) + + model = NMF( + n_components=n_components, + solver="mu", + beta_loss=beta_loss, + max_iter=max_iter, + random_state=0, + ) + transf = model.fit_transform(X) + X_calc = np.dot(transf, model.components_) + + assert model.reconstruction_err_ < 0.1 + assert_allclose(X, X_calc) + + mbmodel = MiniBatchNMF( + n_components=n_components, + beta_loss=beta_loss, + batch_size=batch_size, + random_state=0, + max_iter=max_iter, + ) + transf = mbmodel.fit_transform(X) + X_calc = np.dot(transf, mbmodel.components_) + + assert mbmodel.reconstruction_err_ < 0.1 + assert_allclose(X, X_calc, atol=1) + + +@pytest.mark.parametrize("solver", ["cd", "mu"]) +def test_nmf_transform(solver): + # Test that fit_transform is equivalent to fit.transform for NMF + # Test that NMF.transform returns close values + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + m = NMF( + solver=solver, + n_components=3, + init="random", + random_state=0, + tol=1e-6, + ) + ft = m.fit_transform(A) + t = m.transform(A) + assert_allclose(ft, t, atol=1e-1) + + +def test_minibatch_nmf_transform(): + # Test that fit_transform is equivalent to fit.transform for MiniBatchNMF + # Only guaranteed with fresh restarts + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + m = MiniBatchNMF( + n_components=3, + random_state=0, + tol=1e-3, + fresh_restarts=True, + ) + ft = m.fit_transform(A) + t = m.transform(A) + assert_allclose(ft, t) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_transform_custom_init(Estimator, solver): + # Smoke test that checks if NMF.transform works with custom initialization + random_state = np.random.RandomState(0) + A = np.abs(random_state.randn(6, 5)) + n_components = 4 + avg = np.sqrt(A.mean() / n_components) + H_init = np.abs(avg * random_state.randn(n_components, 5)) + W_init = np.abs(avg * random_state.randn(6, n_components)) + + m = Estimator( + n_components=n_components, init="custom", random_state=0, tol=1e-3, **solver + ) + m.fit_transform(A, W=W_init, H=H_init) + m.transform(A) + + +@pytest.mark.parametrize("solver", ("cd", "mu")) +def test_nmf_inverse_transform(solver): + # Test that NMF.inverse_transform returns close values + random_state = np.random.RandomState(0) + A = np.abs(random_state.randn(6, 4)) + m = NMF( + solver=solver, + n_components=4, + init="random", + random_state=0, + max_iter=1000, + ) + ft = m.fit_transform(A) + A_new = m.inverse_transform(ft) + assert_array_almost_equal(A, A_new, decimal=2) + + +def test_mbnmf_inverse_transform(): + # Test that MiniBatchNMF.transform followed by MiniBatchNMF.inverse_transform + # is close to the identity + rng = np.random.RandomState(0) + A = np.abs(rng.randn(6, 4)) + nmf = MiniBatchNMF( + random_state=rng, + max_iter=500, + init="nndsvdar", + fresh_restarts=True, + ) + ft = nmf.fit_transform(A) + A_new = nmf.inverse_transform(ft) + assert_allclose(A, A_new, rtol=1e-3, atol=1e-2) + + +@pytest.mark.parametrize("Estimator", [NMF, MiniBatchNMF]) +def test_n_components_greater_n_features(Estimator): + # Smoke test for the case of more components than features. 
+ rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(30, 10)) + Estimator(n_components=15, random_state=0, tol=1e-2).fit(A) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("alpha_W", (0.0, 1.0)) +@pytest.mark.parametrize("alpha_H", (0.0, 1.0, "same")) +def test_nmf_sparse_input(Estimator, solver, sparse_container, alpha_W, alpha_H): + # Test that sparse matrices are accepted as input + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(10, 10)) + A[:, 2 * np.arange(5)] = 0 + A_sparse = sparse_container(A) + + est1 = Estimator( + n_components=5, + init="random", + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=0, + tol=0, + max_iter=100, + **solver, + ) + est2 = clone(est1) + + W1 = est1.fit_transform(A) + W2 = est2.fit_transform(A_sparse) + H1 = est1.components_ + H2 = est2.components_ + + assert_allclose(W1, W2) + assert_allclose(H1, H2) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_nmf_sparse_transform(Estimator, solver, csc_container): + # Test that transform works on sparse data. Issue #2124 + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(3, 2)) + A[1, 1] = 0 + A = csc_container(A) + + model = Estimator(random_state=0, n_components=2, max_iter=400, **solver) + A_fit_tr = model.fit_transform(A) + A_tr = model.transform(A) + assert_allclose(A_fit_tr, A_tr, atol=1e-1) + + +@pytest.mark.parametrize("init", ["random", "nndsvd"]) +@pytest.mark.parametrize("solver", ("cd", "mu")) +@pytest.mark.parametrize("alpha_W", (0.0, 1.0)) +@pytest.mark.parametrize("alpha_H", (0.0, 1.0, "same")) +def test_non_negative_factorization_consistency(init, solver, alpha_W, alpha_H): + # Test that the function is called in the same way, either directly + # or through the NMF class + max_iter = 500 + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(10, 10)) + A[:, 2 * np.arange(5)] = 0 + + W_nmf, H, _ = non_negative_factorization( + A, + init=init, + solver=solver, + max_iter=max_iter, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=1, + tol=1e-2, + ) + W_nmf_2, H, _ = non_negative_factorization( + A, + H=H, + update_H=False, + init=init, + solver=solver, + max_iter=max_iter, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=1, + tol=1e-2, + ) + + model_class = NMF( + init=init, + solver=solver, + max_iter=max_iter, + alpha_W=alpha_W, + alpha_H=alpha_H, + random_state=1, + tol=1e-2, + ) + W_cls = model_class.fit_transform(A) + W_cls_2 = model_class.transform(A) + + assert_allclose(W_nmf, W_cls) + assert_allclose(W_nmf_2, W_cls_2) + + +def test_non_negative_factorization_checking(): + # Note that the validity of parameter types and range of possible values + # for scalar numerical or str parameters is already checked in the common + # tests. Here we only check for problems that cannot be captured by simple + # declarative constraints on the valid parameter values. 
+ + A = np.ones((2, 2)) + # Test parameters checking in public function + nnmf = non_negative_factorization + msg = re.escape("Negative values in data passed to NMF (input H)") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, -A, 2, init="custom") + msg = re.escape("Negative values in data passed to NMF (input W)") + with pytest.raises(ValueError, match=msg): + nnmf(A, -A, A, 2, init="custom") + msg = re.escape("Array passed to NMF (input H) is full of zeros") + with pytest.raises(ValueError, match=msg): + nnmf(A, A, 0 * A, 2, init="custom") + + +def _beta_divergence_dense(X, W, H, beta): + """Compute the beta-divergence of X and W.H for dense array only. + + Used as a reference for testing nmf._beta_divergence. + """ + WH = np.dot(W, H) + + if beta == 2: + return squared_norm(X - WH) / 2 + + WH_Xnonzero = WH[X != 0] + X_nonzero = X[X != 0] + np.maximum(WH_Xnonzero, 1e-9, out=WH_Xnonzero) + + if beta == 1: + res = np.sum(X_nonzero * np.log(X_nonzero / WH_Xnonzero)) + res += WH.sum() - X.sum() + + elif beta == 0: + div = X_nonzero / WH_Xnonzero + res = np.sum(div) - X.size - np.sum(np.log(div)) + else: + res = (X_nonzero**beta).sum() + res += (beta - 1) * (WH**beta).sum() + res -= beta * (X_nonzero * (WH_Xnonzero ** (beta - 1))).sum() + res /= beta * (beta - 1) + + return res + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_beta_divergence(csr_container): + # Compare _beta_divergence with the reference _beta_divergence_dense + n_samples = 20 + n_features = 10 + n_components = 5 + beta_losses = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0] + + # initialization + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.clip(X, 0, None, out=X) + X_csr = csr_container(X) + W, H = nmf._initialize_nmf(X, n_components, init="random", random_state=42) + + for beta in beta_losses: + ref = _beta_divergence_dense(X, W, H, beta) + loss = nmf._beta_divergence(X, W, H, beta) + loss_csr = nmf._beta_divergence(X_csr, W, H, beta) + + assert_almost_equal(ref, loss, decimal=7) + assert_almost_equal(ref, loss_csr, decimal=7) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_special_sparse_dot(csr_container): + # Test the function that computes np.dot(W, H), only where X is non zero. 
+ n_samples = 10 + n_features = 5 + n_components = 3 + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.clip(X, 0, None, out=X) + X_csr = csr_container(X) + + W = np.abs(rng.randn(n_samples, n_components)) + H = np.abs(rng.randn(n_components, n_features)) + + WH_safe = nmf._special_sparse_dot(W, H, X_csr) + WH = nmf._special_sparse_dot(W, H, X) + + # test that both results have same values, in X_csr nonzero elements + ii, jj = X_csr.nonzero() + WH_safe_data = np.asarray(WH_safe[ii, jj]).ravel() + assert_array_almost_equal(WH_safe_data, WH[ii, jj], decimal=10) + + # test that WH_safe and X_csr have the same sparse structure + assert_array_equal(WH_safe.indices, X_csr.indices) + assert_array_equal(WH_safe.indptr, X_csr.indptr) + assert_array_equal(WH_safe.shape, X_csr.shape) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_nmf_multiplicative_update_sparse(csr_container): + # Compare sparse and dense input in multiplicative update NMF + # Also test continuity of the results with respect to beta_loss parameter + n_samples = 20 + n_features = 10 + n_components = 5 + alpha = 0.1 + l1_ratio = 0.5 + n_iter = 20 + + # initialization + rng = np.random.mtrand.RandomState(1337) + X = rng.randn(n_samples, n_features) + X = np.abs(X) + X_csr = csr_container(X) + W0, H0 = nmf._initialize_nmf(X, n_components, init="random", random_state=42) + + for beta_loss in (-1.2, 0, 0.2, 1.0, 2.0, 2.5): + # Reference with dense array X + W, H = W0.copy(), H0.copy() + W1, H1, _ = non_negative_factorization( + X, + W, + H, + n_components, + init="custom", + update_H=True, + solver="mu", + beta_loss=beta_loss, + max_iter=n_iter, + alpha_W=alpha, + l1_ratio=l1_ratio, + random_state=42, + ) + + # Compare with sparse X + W, H = W0.copy(), H0.copy() + W2, H2, _ = non_negative_factorization( + X_csr, + W, + H, + n_components, + init="custom", + update_H=True, + solver="mu", + beta_loss=beta_loss, + max_iter=n_iter, + alpha_W=alpha, + l1_ratio=l1_ratio, + random_state=42, + ) + + assert_allclose(W1, W2, atol=1e-7) + assert_allclose(H1, H2, atol=1e-7) + + # Compare with almost same beta_loss, since some values have a specific + # behavior, but the results should be continuous w.r.t beta_loss + beta_loss -= 1.0e-5 + W, H = W0.copy(), H0.copy() + W3, H3, _ = non_negative_factorization( + X_csr, + W, + H, + n_components, + init="custom", + update_H=True, + solver="mu", + beta_loss=beta_loss, + max_iter=n_iter, + alpha_W=alpha, + l1_ratio=l1_ratio, + random_state=42, + ) + + assert_allclose(W1, W3, atol=1e-4) + assert_allclose(H1, H3, atol=1e-4) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_nmf_negative_beta_loss(csr_container): + # Test that an error is raised if beta_loss < 0 and X contains zeros. + # Test that the output has not NaN values when the input contains zeros. + n_samples = 6 + n_features = 5 + n_components = 3 + + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.clip(X, 0, None, out=X) + X_csr = csr_container(X) + + def _assert_nmf_no_nan(X, beta_loss): + W, H, _ = non_negative_factorization( + X, + init="random", + n_components=n_components, + solver="mu", + beta_loss=beta_loss, + random_state=0, + max_iter=1000, + ) + assert not np.any(np.isnan(W)) + assert not np.any(np.isnan(H)) + + msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge." 
+ for beta_loss in (-0.6, 0.0): + with pytest.raises(ValueError, match=msg): + _assert_nmf_no_nan(X, beta_loss) + _assert_nmf_no_nan(X + 1e-9, beta_loss) + + for beta_loss in (0.2, 1.0, 1.2, 2.0, 2.5): + _assert_nmf_no_nan(X, beta_loss) + _assert_nmf_no_nan(X_csr, beta_loss) + + +@pytest.mark.parametrize("beta_loss", [-0.5, 0.0]) +def test_minibatch_nmf_negative_beta_loss(beta_loss): + """Check that an error is raised if beta_loss < 0 and X contains zeros.""" + rng = np.random.RandomState(0) + X = rng.normal(size=(6, 5)) + X[X < 0] = 0 + + nmf = MiniBatchNMF(beta_loss=beta_loss, random_state=0) + + msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge." + with pytest.raises(ValueError, match=msg): + nmf.fit(X) + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_regularization(Estimator, solver): + # Test the effect of L1 and L2 regularizations + n_samples = 6 + n_features = 5 + n_components = 3 + rng = np.random.mtrand.RandomState(42) + X = np.abs(rng.randn(n_samples, n_features)) + + # L1 regularization should increase the number of zeros + l1_ratio = 1.0 + regul = Estimator( + n_components=n_components, + alpha_W=0.5, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + model = Estimator( + n_components=n_components, + alpha_W=0.0, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + + W_regul = regul.fit_transform(X) + W_model = model.fit_transform(X) + + H_regul = regul.components_ + H_model = model.components_ + + eps = np.finfo(np.float64).eps + W_regul_n_zeros = W_regul[W_regul <= eps].size + W_model_n_zeros = W_model[W_model <= eps].size + H_regul_n_zeros = H_regul[H_regul <= eps].size + H_model_n_zeros = H_model[H_model <= eps].size + + assert W_regul_n_zeros > W_model_n_zeros + assert H_regul_n_zeros > H_model_n_zeros + + # L2 regularization should decrease the sum of the squared norm + # of the matrices W and H + l1_ratio = 0.0 + regul = Estimator( + n_components=n_components, + alpha_W=0.5, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + model = Estimator( + n_components=n_components, + alpha_W=0.0, + l1_ratio=l1_ratio, + random_state=42, + **solver, + ) + + W_regul = regul.fit_transform(X) + W_model = model.fit_transform(X) + + H_regul = regul.components_ + H_model = model.components_ + + assert (linalg.norm(W_model)) ** 2.0 + (linalg.norm(H_model)) ** 2.0 > ( + linalg.norm(W_regul) + ) ** 2.0 + (linalg.norm(H_regul)) ** 2.0 + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("solver", ("cd", "mu")) +def test_nmf_decreasing(solver): + # test that the objective function is decreasing at each iteration + n_samples = 20 + n_features = 15 + n_components = 10 + alpha = 0.1 + l1_ratio = 0.5 + tol = 0.0 + + # initialization + rng = np.random.mtrand.RandomState(42) + X = rng.randn(n_samples, n_features) + np.abs(X, X) + W0, H0 = nmf._initialize_nmf(X, n_components, init="random", random_state=42) + + for beta_loss in (-1.2, 0, 0.2, 1.0, 2.0, 2.5): + if solver != "mu" and beta_loss != 2: + # not implemented + continue + W, H = W0.copy(), H0.copy() + previous_loss = None + for _ in range(30): + # one more iteration starting from the previous results + W, H, _ = non_negative_factorization( + X, + W, + H, + beta_loss=beta_loss, + init="custom", + n_components=n_components, + max_iter=1, + alpha_W=alpha, + solver=solver, + tol=tol, + l1_ratio=l1_ratio, + verbose=0, + random_state=0, + update_H=True, + 
) + + loss = ( + nmf._beta_divergence(X, W, H, beta_loss) + + alpha * l1_ratio * n_features * W.sum() + + alpha * l1_ratio * n_samples * H.sum() + + alpha * (1 - l1_ratio) * n_features * (W**2).sum() + + alpha * (1 - l1_ratio) * n_samples * (H**2).sum() + ) + if previous_loss is not None: + assert previous_loss > loss + previous_loss = loss + + +def test_nmf_underflow(): + # Regression test for an underflow issue in _beta_divergence + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 10, 2, 2 + X = np.abs(rng.randn(n_samples, n_features)) * 10 + W = np.abs(rng.randn(n_samples, n_components)) * 10 + H = np.abs(rng.randn(n_components, n_features)) + + X[0, 0] = 0 + ref = nmf._beta_divergence(X, W, H, beta=1.0) + X[0, 0] = 1e-323 + res = nmf._beta_divergence(X, W, H, beta=1.0) + assert_almost_equal(res, ref) + + +@pytest.mark.parametrize( + "dtype_in, dtype_out", + [ + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + ], +) +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_dtype_match(Estimator, solver, dtype_in, dtype_out): + # Check that NMF preserves dtype (float32 and float64) + X = np.random.RandomState(0).randn(20, 15).astype(dtype_in, copy=False) + np.abs(X, out=X) + + nmf = Estimator( + alpha_W=1.0, + alpha_H=1.0, + tol=1e-2, + random_state=0, + **solver, + ) + + assert nmf.fit(X).transform(X).dtype == dtype_out + assert nmf.fit_transform(X).dtype == dtype_out + assert nmf.components_.dtype == dtype_out + + +@pytest.mark.parametrize( + ["Estimator", "solver"], + [[NMF, {"solver": "cd"}], [NMF, {"solver": "mu"}], [MiniBatchNMF, {}]], +) +def test_nmf_float32_float64_consistency(Estimator, solver): + # Check that the result of NMF is the same between float32 and float64 + X = np.random.RandomState(0).randn(50, 7) + np.abs(X, out=X) + nmf32 = Estimator(random_state=0, tol=1e-3, **solver) + W32 = nmf32.fit_transform(X.astype(np.float32)) + nmf64 = Estimator(random_state=0, tol=1e-3, **solver) + W64 = nmf64.fit_transform(X) + + assert_allclose(W32, W64, atol=1e-5) + + +@pytest.mark.parametrize("Estimator", [NMF, MiniBatchNMF]) +def test_nmf_custom_init_dtype_error(Estimator): + # Check that an error is raise if custom H and/or W don't have the same + # dtype as X. + rng = np.random.RandomState(0) + X = rng.random_sample((20, 15)) + H = rng.random_sample((15, 15)).astype(np.float32) + W = rng.random_sample((20, 15)) + + with pytest.raises(TypeError, match="should have the same dtype as X"): + Estimator(init="custom").fit(X, H=H, W=W) + + with pytest.raises(TypeError, match="should have the same dtype as X"): + non_negative_factorization(X, H=H, update_H=False) + + +@pytest.mark.parametrize("beta_loss", [-0.5, 0, 0.5, 1, 1.5, 2, 2.5]) +def test_nmf_minibatchnmf_equivalence(beta_loss): + # Test that MiniBatchNMF is equivalent to NMF when batch_size = n_samples and + # forget_factor 0.0 (stopping criterion put aside) + rng = np.random.mtrand.RandomState(42) + X = np.abs(rng.randn(48, 5)) + + nmf = NMF( + n_components=5, + beta_loss=beta_loss, + solver="mu", + random_state=0, + tol=0, + ) + mbnmf = MiniBatchNMF( + n_components=5, + beta_loss=beta_loss, + random_state=0, + tol=0, + max_no_improvement=None, + batch_size=X.shape[0], + forget_factor=0.0, + ) + W = nmf.fit_transform(X) + mbW = mbnmf.fit_transform(X) + assert_allclose(W, mbW) + + +def test_minibatch_nmf_partial_fit(): + # Check fit / partial_fit equivalence. 
Applicable only with fresh restarts. + rng = np.random.mtrand.RandomState(42) + X = np.abs(rng.randn(100, 5)) + + n_components = 5 + batch_size = 10 + max_iter = 2 + + mbnmf1 = MiniBatchNMF( + n_components=n_components, + init="custom", + random_state=0, + max_iter=max_iter, + batch_size=batch_size, + tol=0, + max_no_improvement=None, + fresh_restarts=False, + ) + mbnmf2 = MiniBatchNMF(n_components=n_components, init="custom", random_state=0) + + # Force the same init of H (W is recomputed anyway) to be able to compare results. + W, H = nmf._initialize_nmf( + X, n_components=n_components, init="random", random_state=0 + ) + + mbnmf1.fit(X, W=W, H=H) + for i in range(max_iter): + for j in range(batch_size): + mbnmf2.partial_fit(X[j : j + batch_size], W=W[:batch_size], H=H) + + assert mbnmf1.n_steps_ == mbnmf2.n_steps_ + assert_allclose(mbnmf1.components_, mbnmf2.components_) + + +def test_feature_names_out(): + """Check feature names out for NMF.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(10, 4)) + nmf = NMF(n_components=3).fit(X) + + names = nmf.get_feature_names_out() + assert_array_equal([f"nmf{i}" for i in range(3)], names) + + +def test_minibatch_nmf_verbose(): + # Check verbose mode of MiniBatchNMF for better coverage. + A = np.random.RandomState(0).random_sample((100, 10)) + nmf = MiniBatchNMF(tol=1e-2, random_state=0, verbose=1) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + nmf.fit(A) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("Estimator", [NMF, MiniBatchNMF]) +def test_nmf_n_components_auto(Estimator): + # Check that n_components is correctly inferred + # from the provided custom initialization. + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + W = rng.random_sample((6, 2)) + H = rng.random_sample((2, 5)) + est = Estimator( + n_components="auto", + init="custom", + random_state=0, + tol=1e-6, + ) + est.fit_transform(X, W=W, H=H) + assert est._n_components == H.shape[0] + + +def test_nmf_non_negative_factorization_n_components_auto(): + # Check that n_components is correctly inferred from the provided + # custom initialization. + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + W_init = rng.random_sample((6, 2)) + H_init = rng.random_sample((2, 5)) + W, H, _ = non_negative_factorization( + X, W=W_init, H=H_init, init="custom", n_components="auto" + ) + assert H.shape == H_init.shape + assert W.shape == W_init.shape + + +def test_nmf_n_components_auto_no_h_update(): + # Tests that non_negative_factorization does not fail when setting + # n_components="auto" also tests that the inferred n_component + # value is the right one. 
+ rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + H_true = rng.random_sample((2, 5)) + W, H, _ = non_negative_factorization( + X, H=H_true, n_components="auto", update_H=False + ) # should not fail + assert_allclose(H, H_true) + assert W.shape == (X.shape[0], H_true.shape[0]) + + +def test_nmf_w_h_not_used_warning(): + # Check that warnings are raised if user provided W and H are not used + # and initialization overrides value of W or H + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + W_init = rng.random_sample((6, 2)) + H_init = rng.random_sample((2, 5)) + with pytest.warns( + RuntimeWarning, + match="When init!='custom', provided W or H are ignored", + ): + non_negative_factorization(X, H=H_init, update_H=True, n_components="auto") + + with pytest.warns( + RuntimeWarning, + match="When init!='custom', provided W or H are ignored", + ): + non_negative_factorization( + X, W=W_init, H=H_init, update_H=True, n_components="auto" + ) + + with pytest.warns( + RuntimeWarning, match="When update_H=False, the provided initial W is not used." + ): + # When update_H is False, W is ignored regardless of init + # TODO: use the provided W when init="custom". + non_negative_factorization( + X, W=W_init, H=H_init, update_H=False, n_components="auto" + ) + + +def test_nmf_custom_init_shape_error(): + # Check that an informative error is raised when custom initialization does not + # have the right shape + rng = np.random.RandomState(0) + X = rng.random_sample((6, 5)) + H = rng.random_sample((2, 5)) + nmf = NMF(n_components=2, init="custom", random_state=0) + + with pytest.raises(ValueError, match="Array with wrong first dimension passed"): + nmf.fit(X, H=H, W=rng.random_sample((5, 2))) + + with pytest.raises(ValueError, match="Array with wrong second dimension passed"): + nmf.fit(X, H=H, W=rng.random_sample((6, 3))) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_online_lda.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_online_lda.py new file mode 100644 index 0000000000000000000000000000000000000000..c3dafa1912eba231d9e7a8aaeb719203483255ad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_online_lda.py @@ -0,0 +1,482 @@ +import sys +from io import StringIO + +import numpy as np +import pytest +from numpy.testing import assert_array_equal +from scipy.linalg import block_diag +from scipy.special import psi + +from sklearn.decomposition import LatentDirichletAllocation +from sklearn.decomposition._online_lda_fast import ( + _dirichlet_expectation_1d, + _dirichlet_expectation_2d, +) +from sklearn.exceptions import NotFittedError +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + if_safe_multiprocessing_with_blas, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def _build_sparse_array(csr_container): + # Create 3 topics and each topic has 3 distinct words. + # (Each word only belongs to a single topic.) 
+ n_components = 3 + block = np.full((3, 3), n_components, dtype=int) + blocks = [block] * n_components + X = block_diag(*blocks) + X = csr_container(X) + return (n_components, X) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_default_prior_params(csr_container): + # default prior parameter should be `1 / topics` + # and verbose params should not affect result + n_components, X = _build_sparse_array(csr_container) + prior = 1.0 / n_components + lda_1 = LatentDirichletAllocation( + n_components=n_components, + doc_topic_prior=prior, + topic_word_prior=prior, + random_state=0, + ) + lda_2 = LatentDirichletAllocation(n_components=n_components, random_state=0) + topic_distr_1 = lda_1.fit_transform(X) + topic_distr_2 = lda_2.fit_transform(X) + assert_almost_equal(topic_distr_1, topic_distr_2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_fit_batch(csr_container): + # Test LDA batch learning_offset (`fit` method with 'batch' learning) + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + evaluate_every=1, + learning_method="batch", + random_state=rng, + ) + lda.fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for component in lda.components_: + # Find top 3 words in each LDA component + top_idx = set(component.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_fit_online(csr_container): + # Test LDA online learning (`fit` method with 'online' learning) + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + learning_offset=10.0, + evaluate_every=1, + learning_method="online", + random_state=rng, + ) + lda.fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for component in lda.components_: + # Find top 3 words in each LDA component + top_idx = set(component.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_partial_fit(csr_container): + # Test LDA online learning (`partial_fit` method) + # (same as test_lda_batch) + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + learning_offset=10.0, + total_samples=100, + random_state=rng, + ) + for i in range(3): + lda.partial_fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for c in lda.components_: + top_idx = set(c.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_dense_input(csr_container): + # Test LDA with dense input. + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, learning_method="batch", random_state=rng + ) + lda.fit(X.toarray()) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for component in lda.components_: + # Find top 3 words in each LDA component + top_idx = set(component.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +def test_lda_transform(): + # Test LDA transform. 
+ # Transform result cannot be negative and should be normalized by default + rng = np.random.RandomState(0) + X = rng.randint(5, size=(20, 10)) + n_components = 3 + lda = LatentDirichletAllocation(n_components=n_components, random_state=rng) + X_trans = lda.fit_transform(X) + assert (X_trans > 0.0).any() + assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0])) + + X_trans_unnormalized = lda.transform(X, normalize=False) + assert_array_almost_equal( + X_trans, X_trans_unnormalized / X_trans_unnormalized.sum(axis=1)[:, np.newaxis] + ) + + +@pytest.mark.parametrize("method", ("online", "batch")) +def test_lda_fit_transform(method): + # Test LDA fit_transform & transform + # fit_transform and transform result should be the same + rng = np.random.RandomState(0) + X = rng.randint(10, size=(50, 20)) + lda = LatentDirichletAllocation( + n_components=5, learning_method=method, random_state=rng + ) + X_fit = lda.fit_transform(X) + X_trans = lda.transform(X) + assert_array_almost_equal(X_fit, X_trans, 4) + + +def test_lda_negative_input(): + # test pass dense matrix with sparse negative input. + X = np.full((5, 10), -1.0) + lda = LatentDirichletAllocation() + regex = r"^Negative values in data passed" + with pytest.raises(ValueError, match=regex): + lda.fit(X) + + +def test_lda_no_component_error(): + # test `perplexity` before `fit` + rng = np.random.RandomState(0) + X = rng.randint(4, size=(20, 10)) + lda = LatentDirichletAllocation() + regex = ( + "This LatentDirichletAllocation instance is not fitted yet. " + "Call 'fit' with appropriate arguments before using this " + "estimator." + ) + with pytest.raises(NotFittedError, match=regex): + lda.perplexity(X) + + +@if_safe_multiprocessing_with_blas +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("method", ("online", "batch")) +def test_lda_multi_jobs(method, csr_container): + n_components, X = _build_sparse_array(csr_container) + # Test LDA batch training with multi CPU + rng = np.random.RandomState(0) + lda = LatentDirichletAllocation( + n_components=n_components, + n_jobs=2, + learning_method=method, + evaluate_every=1, + random_state=rng, + ) + lda.fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for c in lda.components_: + top_idx = set(c.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +@if_safe_multiprocessing_with_blas +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_partial_fit_multi_jobs(csr_container): + # Test LDA online training with multi CPU + rng = np.random.RandomState(0) + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + n_jobs=2, + learning_offset=5.0, + total_samples=30, + random_state=rng, + ) + for i in range(2): + lda.partial_fit(X) + + correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)] + for c in lda.components_: + top_idx = set(c.argsort()[-3:][::-1]) + assert tuple(sorted(top_idx)) in correct_idx_grps + + +def test_lda_preplexity_mismatch(): + # test dimension mismatch in `perplexity` method + rng = np.random.RandomState(0) + n_components = rng.randint(3, 6) + n_samples = rng.randint(6, 10) + X = np.random.randint(4, size=(n_samples, 10)) + lda = LatentDirichletAllocation( + n_components=n_components, + learning_offset=5.0, + total_samples=20, + random_state=rng, + ) + lda.fit(X) + # invalid samples + invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components)) + with pytest.raises(ValueError, match=r"Number of 
samples"): + lda._perplexity_precomp_distr(X, invalid_n_samples) + # invalid topic number + invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1)) + with pytest.raises(ValueError, match=r"Number of topics"): + lda._perplexity_precomp_distr(X, invalid_n_components) + + +@pytest.mark.parametrize("method", ("online", "batch")) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_perplexity(method, csr_container): + # Test LDA perplexity for batch training + # perplexity should be lower after each iteration + n_components, X = _build_sparse_array(csr_container) + lda_1 = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_2 = LatentDirichletAllocation( + n_components=n_components, + max_iter=10, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_1.fit(X) + perp_1 = lda_1.perplexity(X, sub_sampling=False) + + lda_2.fit(X) + perp_2 = lda_2.perplexity(X, sub_sampling=False) + assert perp_1 >= perp_2 + + perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True) + perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True) + assert perp_1_subsampling >= perp_2_subsampling + + +@pytest.mark.parametrize("method", ("online", "batch")) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_score(method, csr_container): + # Test LDA score for batch training + # score should be higher after each iteration + n_components, X = _build_sparse_array(csr_container) + lda_1 = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_2 = LatentDirichletAllocation( + n_components=n_components, + max_iter=10, + learning_method=method, + total_samples=100, + random_state=0, + ) + lda_1.fit_transform(X) + score_1 = lda_1.score(X) + + lda_2.fit_transform(X) + score_2 = lda_2.score(X) + assert score_2 >= score_1 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_perplexity_input_format(csr_container): + # Test LDA perplexity for sparse and dense input + # score should be the same for both dense and sparse input + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method="batch", + total_samples=100, + random_state=0, + ) + lda.fit(X) + perp_1 = lda.perplexity(X) + perp_2 = lda.perplexity(X.toarray()) + assert_almost_equal(perp_1, perp_2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_score_perplexity(csr_container): + # Test the relationship between LDA score and perplexity + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, max_iter=10, random_state=0 + ) + lda.fit(X) + perplexity_1 = lda.perplexity(X, sub_sampling=False) + + score = lda.score(X) + perplexity_2 = np.exp(-1.0 * (score / np.sum(X.data))) + assert_almost_equal(perplexity_1, perplexity_2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_fit_perplexity(csr_container): + # Test that the perplexity computed during fit is consistent with what is + # returned by the perplexity method + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + max_iter=1, + learning_method="batch", + random_state=0, + evaluate_every=1, + ) + lda.fit(X) + + # Perplexity computed at end of fit method + perplexity1 = lda.bound_ 
+ + # Result of perplexity method on the train set + perplexity2 = lda.perplexity(X) + + assert_almost_equal(perplexity1, perplexity2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_empty_docs(csr_container): + """Test LDA on empty document (all-zero rows).""" + Z = np.zeros((5, 4)) + for X in [Z, csr_container(Z)]: + lda = LatentDirichletAllocation(max_iter=750).fit(X) + assert_almost_equal( + lda.components_.sum(axis=0), np.ones(lda.components_.shape[1]) + ) + + +def test_dirichlet_expectation(): + """Test Cython version of Dirichlet expectation calculation.""" + x = np.logspace(-100, 10, 10000) + expectation = np.empty_like(x) + _dirichlet_expectation_1d(x, 0, expectation) + assert_allclose(expectation, np.exp(psi(x) - psi(np.sum(x))), atol=1e-19) + + x = x.reshape(100, 100) + assert_allclose( + _dirichlet_expectation_2d(x), + psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]), + rtol=1e-11, + atol=3e-9, + ) + + +def check_verbosity( + verbose, evaluate_every, expected_lines, expected_perplexities, csr_container +): + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation( + n_components=n_components, + max_iter=3, + learning_method="batch", + verbose=verbose, + evaluate_every=evaluate_every, + random_state=0, + ) + out = StringIO() + old_out, sys.stdout = sys.stdout, out + try: + lda.fit(X) + finally: + sys.stdout = old_out + + n_lines = out.getvalue().count("\n") + n_perplexity = out.getvalue().count("perplexity") + assert expected_lines == n_lines + assert expected_perplexities == n_perplexity + + +@pytest.mark.parametrize( + "verbose,evaluate_every,expected_lines,expected_perplexities", + [ + (False, 1, 0, 0), + (False, 0, 0, 0), + (True, 0, 3, 0), + (True, 1, 3, 3), + (True, 2, 3, 1), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_verbosity( + verbose, evaluate_every, expected_lines, expected_perplexities, csr_container +): + check_verbosity( + verbose, evaluate_every, expected_lines, expected_perplexities, csr_container + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_lda_feature_names_out(csr_container): + """Check feature names out for LatentDirichletAllocation.""" + n_components, X = _build_sparse_array(csr_container) + lda = LatentDirichletAllocation(n_components=n_components).fit(X) + + names = lda.get_feature_names_out() + assert_array_equal( + [f"latentdirichletallocation{i}" for i in range(n_components)], names + ) + + +@pytest.mark.parametrize("learning_method", ("batch", "online")) +def test_lda_dtype_match(learning_method, global_dtype): + """Check data type preservation of fitted attributes.""" + rng = np.random.RandomState(0) + X = rng.uniform(size=(20, 10)).astype(global_dtype, copy=False) + + lda = LatentDirichletAllocation( + n_components=5, random_state=0, learning_method=learning_method + ) + lda.fit(X) + assert lda.components_.dtype == global_dtype + assert lda.exp_dirichlet_component_.dtype == global_dtype + + +@pytest.mark.parametrize("learning_method", ("batch", "online")) +def test_lda_numerical_consistency(learning_method, global_random_seed): + """Check numerical consistency between np.float32 and np.float64.""" + rng = np.random.RandomState(global_random_seed) + X64 = rng.uniform(size=(20, 10)) + X32 = X64.astype(np.float32) + + lda_64 = LatentDirichletAllocation( + n_components=5, random_state=global_random_seed, learning_method=learning_method + ).fit(X64) + lda_32 = LatentDirichletAllocation( + n_components=5, 
random_state=global_random_seed, learning_method=learning_method + ).fit(X32) + + assert_allclose(lda_32.components_, lda_64.components_) + assert_allclose(lda_32.transform(X32), lda_64.transform(X64)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..2b97138c4dea385b55fd91c6a1ec9f0d8298226b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_pca.py @@ -0,0 +1,1154 @@ +import os +import re +import warnings + +import numpy as np +import pytest +import scipy as sp +from numpy.testing import assert_array_equal + +from sklearn import config_context, datasets +from sklearn.base import clone +from sklearn.datasets import load_iris, make_classification, make_low_rank_matrix +from sklearn.decomposition import PCA +from sklearn.decomposition._pca import _assess_dimension, _infer_dimension +from sklearn.utils._array_api import ( + _atol_for_type, + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import device as array_device +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids +from sklearn.utils._testing import _array_api_for_tests, assert_allclose +from sklearn.utils.estimator_checks import ( + check_array_api_input_and_values, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +iris = datasets.load_iris() +PCA_SOLVERS = ["full", "covariance_eigh", "arpack", "randomized", "auto"] + +# `SPARSE_M` and `SPARSE_N` could be larger, but be aware: +# * SciPy's generation of random sparse matrix can be costly +# * A (SPARSE_M, SPARSE_N) dense array is allocated to compare against +SPARSE_M, SPARSE_N = 1000, 300 # arbitrary +SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N) + + +def _check_fitted_pca_close(pca1, pca2, rtol=1e-7, atol=1e-12): + assert_allclose(pca1.components_, pca2.components_, rtol=rtol, atol=atol) + assert_allclose( + pca1.explained_variance_, pca2.explained_variance_, rtol=rtol, atol=atol + ) + assert_allclose(pca1.singular_values_, pca2.singular_values_, rtol=rtol, atol=atol) + assert_allclose(pca1.mean_, pca2.mean_, rtol=rtol, atol=atol) + assert_allclose(pca1.noise_variance_, pca2.noise_variance_, rtol=rtol, atol=atol) + + assert pca1.n_components_ == pca2.n_components_ + assert pca1.n_samples_ == pca2.n_samples_ + assert pca1.n_features_in_ == pca2.n_features_in_ + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +@pytest.mark.parametrize("n_components", range(1, iris.data.shape[1])) +def test_pca(svd_solver, n_components): + X = iris.data + pca = PCA(n_components=n_components, svd_solver=svd_solver) + + # check the shape of fit.transform + X_r = pca.fit(X).transform(X) + assert X_r.shape[1] == n_components + + # check the equivalence of fit.transform and fit_transform + X_r2 = pca.fit_transform(X) + assert_allclose(X_r, X_r2) + X_r = pca.transform(X) + assert_allclose(X_r, X_r2) + + # Test get_covariance and get_precision + cov = pca.get_covariance() + precision = pca.get_precision() + assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12) + + +@pytest.mark.parametrize("density", [0.01, 0.1, 0.30]) +@pytest.mark.parametrize("n_components", [1, 2, 10]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +@pytest.mark.parametrize("svd_solver", ["arpack", "covariance_eigh"]) 
+@pytest.mark.parametrize("scale", [1, 10, 100]) +def test_pca_sparse( + global_random_seed, svd_solver, sparse_container, n_components, density, scale +): + """Check that the results are the same for sparse and dense input.""" + + # Set atol in addition of the default rtol to account for the very wide range of + # result values (1e-8 to 1e0). + atol = 1e-12 + transform_atol = 1e-10 + + random_state = np.random.default_rng(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=density, + ) + ) + # Scale the data + vary the column means + scale_vector = random_state.random(X.shape[1]) * scale + X = X.multiply(scale_vector) + + pca = PCA( + n_components=n_components, + svd_solver=svd_solver, + random_state=global_random_seed, + ) + pca.fit(X) + + Xd = X.toarray() + pcad = PCA( + n_components=n_components, + svd_solver=svd_solver, + random_state=global_random_seed, + ) + pcad.fit(Xd) + + # Fitted attributes equality + _check_fitted_pca_close(pca, pcad, atol=atol) + + # Test transform + X2 = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=density, + ) + ) + X2d = X2.toarray() + + assert_allclose(pca.transform(X2), pca.transform(X2d), atol=transform_atol) + assert_allclose(pca.transform(X2), pcad.transform(X2d), atol=transform_atol) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_pca_sparse_fit_transform(global_random_seed, sparse_container): + random_state = np.random.default_rng(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=0.01, + ) + ) + X2 = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + density=0.01, + ) + ) + + pca_fit = PCA(n_components=10, svd_solver="arpack", random_state=global_random_seed) + pca_fit_transform = PCA( + n_components=10, svd_solver="arpack", random_state=global_random_seed + ) + + pca_fit.fit(X) + transformed_X = pca_fit_transform.fit_transform(X) + + _check_fitted_pca_close(pca_fit, pca_fit_transform) + assert_allclose(transformed_X, pca_fit_transform.transform(X)) + assert_allclose(transformed_X, pca_fit.transform(X)) + assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2)) + + +@pytest.mark.parametrize("svd_solver", ["randomized", "full"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container): + random_state = np.random.RandomState(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + ) + ) + pca = PCA(n_components=30, svd_solver=svd_solver) + error_msg_pattern = ( + 'PCA only support sparse inputs with the "arpack" and "covariance_eigh"' + f' solvers, while "{svd_solver}" was passed' + ) + with pytest.raises(TypeError, match=error_msg_pattern): + pca.fit(X) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_pca_auto_arpack_singluar_values_consistency( + global_random_seed, sparse_container +): + """Check that "auto" and "arpack" solvers are equivalent for sparse inputs.""" + random_state = np.random.RandomState(global_random_seed) + X = sparse_container( + sp.sparse.random( + SPARSE_M, + SPARSE_N, + random_state=random_state, + ) + ) + pca_arpack = PCA(n_components=10, svd_solver="arpack").fit(X) + pca_auto = PCA(n_components=10, 
svd_solver="auto").fit(X) + assert_allclose(pca_arpack.singular_values_, pca_auto.singular_values_, rtol=5e-3) + + +def test_no_empty_slice_warning(): + # test if we avoid numpy warnings for computing over empty arrays + n_components = 10 + n_features = n_components + 2 # anything > n_comps triggered it in 0.16 + X = np.random.uniform(-1, 1, size=(n_components, n_features)) + pca = PCA(n_components=n_components) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + pca.fit(X) + + +@pytest.mark.parametrize("copy", [True, False]) +@pytest.mark.parametrize("solver", PCA_SOLVERS) +def test_whitening(solver, copy): + # Check that PCA output has unit-variance + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 80 + n_components = 30 + rank = 50 + + # some low rank data with correlated features + X = np.dot( + rng.randn(n_samples, rank), + np.dot(np.diag(np.linspace(10.0, 1.0, rank)), rng.randn(rank, n_features)), + ) + # the component-wise variance of the first 50 features is 3 times the + # mean component-wise variance of the remaining 30 features + X[:, :50] *= 3 + + assert X.shape == (n_samples, n_features) + + # the component-wise variance is thus highly varying: + assert X.std(axis=0).std() > 43.8 + + # whiten the data while projecting to the lower dim subspace + X_ = X.copy() # make sure we keep an original across iterations. + pca = PCA( + n_components=n_components, + whiten=True, + copy=copy, + svd_solver=solver, + random_state=0, + iterated_power=7, + ) + # test fit_transform + X_whitened = pca.fit_transform(X_.copy()) + assert X_whitened.shape == (n_samples, n_components) + X_whitened2 = pca.transform(X_) + assert_allclose(X_whitened, X_whitened2, rtol=5e-4) + + assert_allclose(X_whitened.std(ddof=1, axis=0), np.ones(n_components)) + assert_allclose(X_whitened.mean(axis=0), np.zeros(n_components), atol=1e-12) + + X_ = X.copy() + pca = PCA( + n_components=n_components, whiten=False, copy=copy, svd_solver=solver + ).fit(X_.copy()) + X_unwhitened = pca.transform(X_) + assert X_unwhitened.shape == (n_samples, n_components) + + # in that case the output components still have varying variances + assert X_unwhitened.std(axis=0).std() == pytest.approx(74.1, rel=1e-1) + # we always center, so no test for non-centering. + + +@pytest.mark.parametrize( + "other_svd_solver", sorted(list(set(PCA_SOLVERS) - {"full", "auto"})) +) +@pytest.mark.parametrize("data_shape", ["tall", "wide"]) +@pytest.mark.parametrize("rank_deficient", [False, True]) +@pytest.mark.parametrize("whiten", [False, True]) +def test_pca_solver_equivalence( + other_svd_solver, + data_shape, + rank_deficient, + whiten, + global_random_seed, + global_dtype, +): + if data_shape == "tall": + n_samples, n_features = 100, 30 + else: + n_samples, n_features = 30, 100 + n_samples_test = 10 + + if rank_deficient: + rng = np.random.default_rng(global_random_seed) + rank = min(n_samples, n_features) // 2 + X = rng.standard_normal( + size=(n_samples + n_samples_test, rank) + ) @ rng.standard_normal(size=(rank, n_features)) + else: + X = make_low_rank_matrix( + n_samples=n_samples + n_samples_test, + n_features=n_features, + tail_strength=0.5, + random_state=global_random_seed, + ) + # With a non-zero tail strength, the data is actually full-rank. 
+ rank = min(n_samples, n_features) + + X = X.astype(global_dtype, copy=False) + X_train, X_test = X[:n_samples], X[n_samples:] + + if global_dtype == np.float32: + tols = dict(atol=3e-2, rtol=1e-5) + variance_threshold = 1e-5 + else: + tols = dict(atol=1e-10, rtol=1e-12) + variance_threshold = 1e-12 + + extra_other_kwargs = {} + if other_svd_solver == "randomized": + # Only check for a truncated result with a large number of iterations + # to make sure that we can recover precise results. + n_components = 10 + extra_other_kwargs = {"iterated_power": 50} + elif other_svd_solver == "arpack": + # Test all components except the last one which cannot be estimated by + # arpack. + n_components = np.minimum(n_samples, n_features) - 1 + else: + # Test all components to high precision. + n_components = None + + pca_full = PCA(n_components=n_components, svd_solver="full", whiten=whiten) + pca_other = PCA( + n_components=n_components, + svd_solver=other_svd_solver, + whiten=whiten, + random_state=global_random_seed, + **extra_other_kwargs, + ) + X_trans_full_train = pca_full.fit_transform(X_train) + assert np.isfinite(X_trans_full_train).all() + assert X_trans_full_train.dtype == global_dtype + X_trans_other_train = pca_other.fit_transform(X_train) + assert np.isfinite(X_trans_other_train).all() + assert X_trans_other_train.dtype == global_dtype + + assert (pca_full.explained_variance_ >= 0).all() + assert_allclose(pca_full.explained_variance_, pca_other.explained_variance_, **tols) + assert_allclose( + pca_full.explained_variance_ratio_, + pca_other.explained_variance_ratio_, + **tols, + ) + reference_components = pca_full.components_ + assert np.isfinite(reference_components).all() + other_components = pca_other.components_ + assert np.isfinite(other_components).all() + + # For some choice of n_components and data distribution, some components + # might be pure noise, let's ignore them in the comparison: + stable = pca_full.explained_variance_ > variance_threshold + assert stable.sum() > 1 + assert_allclose(reference_components[stable], other_components[stable], **tols) + + # As a result the output of fit_transform should be the same: + assert_allclose( + X_trans_other_train[:, stable], X_trans_full_train[:, stable], **tols + ) + + # And similarly for the output of transform on new data (except for the + # last component that can be underdetermined): + X_trans_full_test = pca_full.transform(X_test) + assert np.isfinite(X_trans_full_test).all() + assert X_trans_full_test.dtype == global_dtype + X_trans_other_test = pca_other.transform(X_test) + assert np.isfinite(X_trans_other_test).all() + assert X_trans_other_test.dtype == global_dtype + assert_allclose(X_trans_other_test[:, stable], X_trans_full_test[:, stable], **tols) + + # Check that inverse transform reconstructions for both solvers are + # compatible. + X_recons_full_test = pca_full.inverse_transform(X_trans_full_test) + assert np.isfinite(X_recons_full_test).all() + assert X_recons_full_test.dtype == global_dtype + X_recons_other_test = pca_other.inverse_transform(X_trans_other_test) + assert np.isfinite(X_recons_other_test).all() + assert X_recons_other_test.dtype == global_dtype + + if pca_full.components_.shape[0] == pca_full.components_.shape[1]: + # In this case, the models should have learned the same invertible + # transform. They should therefore both be able to reconstruct the test + # data. 
+ assert_allclose(X_recons_full_test, X_test, **tols) + assert_allclose(X_recons_other_test, X_test, **tols) + elif pca_full.components_.shape[0] < rank: + # In the absence of noisy components, both models should be able to + # reconstruct the same low-rank approximation of the original data. + assert pca_full.explained_variance_.min() > variance_threshold + assert_allclose(X_recons_full_test, X_recons_other_test, **tols) + else: + # When n_features > n_samples and n_components is larger than the rank + # of the training set, the output of the `inverse_transform` function + # is ill-defined. We can only check that we reach the same fixed point + # after another round of transform: + assert_allclose( + pca_full.transform(X_recons_full_test)[:, stable], + pca_other.transform(X_recons_other_test)[:, stable], + **tols, + ) + + +@pytest.mark.parametrize( + "X", + [ + np.random.RandomState(0).randn(100, 80), + datasets.make_classification(100, 80, n_informative=78, random_state=0)[0], + np.random.RandomState(0).randn(10, 100), + ], + ids=["random-tall", "correlated-tall", "random-wide"], +) +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_explained_variance_empirical(X, svd_solver): + pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0) + X_pca = pca.fit_transform(X) + assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0)) + + expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] + expected_result = sorted(expected_result, reverse=True)[:2] + assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) +def test_pca_singular_values_consistency(svd_solver): + rng = np.random.RandomState(0) + n_samples, n_features = 100, 80 + X = rng.randn(n_samples, n_features) + + pca_full = PCA(n_components=2, svd_solver="full", random_state=rng) + pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) + + pca_full.fit(X) + pca_other.fit(X) + + assert_allclose(pca_full.singular_values_, pca_other.singular_values_, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_singular_values(svd_solver): + rng = np.random.RandomState(0) + n_samples, n_features = 100, 80 + X = rng.randn(n_samples, n_features) + + pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) + X_trans = pca.fit_transform(X) + + # compare to the Frobenius norm + assert_allclose( + np.sum(pca.singular_values_**2), np.linalg.norm(X_trans, "fro") ** 2 + ) + # Compare to the 2-norms of the score vectors + assert_allclose(pca.singular_values_, np.sqrt(np.sum(X_trans**2, axis=0))) + + # set the singular values and see what we get back + n_samples, n_features = 100, 110 + X = rng.randn(n_samples, n_features) + + pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng) + X_trans = pca.fit_transform(X) + X_trans /= np.sqrt(np.sum(X_trans**2, axis=0)) + X_trans[:, 0] *= 3.142 + X_trans[:, 1] *= 2.718 + X_hat = np.dot(X_trans, pca.components_) + pca.fit(X_hat) + assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0]) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_check_projection(svd_solver): + # Test that the projection of data is correct + rng = np.random.RandomState(0) + n, p = 100, 3 + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5]) + Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5]) + + Yt = PCA(n_components=2, svd_solver=svd_solver).fit(X).transform(Xt) + Yt /= np.sqrt((Yt**2).sum()) + + assert_allclose(np.abs(Yt[0][0]),
1.0, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_check_projection_list(svd_solver): + # Test that the projection of data is correct + X = [[1.0, 0.0], [0.0, 1.0]] + pca = PCA(n_components=1, svd_solver=svd_solver, random_state=0) + X_trans = pca.fit_transform(X) + assert X_trans.shape == (2, 1) + assert_allclose(X_trans.mean(), 0.00, atol=1e-12) + assert_allclose(X_trans.std(), 0.71, rtol=5e-3) + + +@pytest.mark.parametrize("svd_solver", ["full", "arpack", "randomized"]) +@pytest.mark.parametrize("whiten", [False, True]) +def test_pca_inverse(svd_solver, whiten): + # Test that the projection of data can be inverted + rng = np.random.RandomState(0) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + # same check that we can find the original data from the transformed + # signal (since the data is almost of rank n_components) + pca = PCA(n_components=2, svd_solver=svd_solver, whiten=whiten).fit(X) + Y = pca.transform(X) + Y_inverse = pca.inverse_transform(Y) + assert_allclose(X, Y_inverse, rtol=5e-6) + + +@pytest.mark.parametrize( + "data", [np.array([[0, 1, 0], [1, 0, 0]]), np.array([[0, 1, 0], [1, 0, 0]]).T] +) +@pytest.mark.parametrize( + "svd_solver, n_components, err_msg", + [ + ("arpack", 0, r"must be between 1 and min\(n_samples, n_features\)"), + ("randomized", 0, r"must be between 1 and min\(n_samples, n_features\)"), + ("arpack", 2, r"must be strictly less than min"), + ( + "auto", + 3, + ( + r"n_components=3 must be between 0 and min\(n_samples, " + r"n_features\)=2 with svd_solver='full'" + ), + ), + ], +) +def test_pca_validation(svd_solver, data, n_components, err_msg): + # Ensures that solver-specific extreme inputs for the n_components + # parameter raise errors + smallest_d = 2 # The smallest dimension + pca_fitted = PCA(n_components, svd_solver=svd_solver) + + with pytest.raises(ValueError, match=err_msg): + pca_fitted.fit(data) + + # Additional case for arpack + if svd_solver == "arpack": + n_components = smallest_d + + err_msg = ( + "n_components={}L? must be strictly less than " + r"min\(n_samples, n_features\)={}L?
with " + "svd_solver='arpack'".format(n_components, smallest_d) + ) + with pytest.raises(ValueError, match=err_msg): + PCA(n_components, svd_solver=svd_solver).fit(data) + + +@pytest.mark.parametrize( + "solver, n_components_", + [ + ("full", min(iris.data.shape)), + ("arpack", min(iris.data.shape) - 1), + ("randomized", min(iris.data.shape)), + ], +) +@pytest.mark.parametrize("data", [iris.data, iris.data.T]) +def test_n_components_none(data, solver, n_components_): + pca = PCA(svd_solver=solver) + pca.fit(data) + assert pca.n_components_ == n_components_ + + +@pytest.mark.parametrize("svd_solver", ["auto", "full"]) +def test_n_components_mle(svd_solver): + # Ensure that n_components == 'mle' doesn't raise error for auto/full + rng = np.random.RandomState(0) + n_samples, n_features = 600, 10 + X = rng.randn(n_samples, n_features) + pca = PCA(n_components="mle", svd_solver=svd_solver) + pca.fit(X) + assert pca.n_components_ == 1 + + +@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) +def test_n_components_mle_error(svd_solver): + # Ensure that n_components == 'mle' will raise an error for unsupported + # solvers + rng = np.random.RandomState(0) + n_samples, n_features = 600, 10 + X = rng.randn(n_samples, n_features) + pca = PCA(n_components="mle", svd_solver=svd_solver) + err_msg = "n_components='mle' cannot be a string with svd_solver='{}'".format( + svd_solver + ) + with pytest.raises(ValueError, match=err_msg): + pca.fit(X) + + +def test_pca_dim(): + # Check automated dimensionality setting + rng = np.random.RandomState(0) + n, p = 100, 5 + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5, 1, 2]) + pca = PCA(n_components="mle", svd_solver="full").fit(X) + assert pca.n_components == "mle" + assert pca.n_components_ == 1 + + +def test_infer_dim_1(): + # TODO: explain what this is testing + # Or at least use explicit variable names... + n, p = 1000, 5 + rng = np.random.RandomState(0) + X = ( + rng.randn(n, p) * 0.1 + + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) + + np.array([1, 0, 7, 4, 6]) + ) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_ + ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)]) + assert ll[1] > ll.max() - 0.01 * n + + +def test_infer_dim_2(): + # TODO: explain what this is testing + # Or at least use explicit variable names... 
+ n, p = 1000, 5 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5, 1, 2]) + X[10:20] += np.array([6, 0, 7, 2, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_ + assert _infer_dimension(spect, n) > 1 + + +def test_infer_dim_3(): + n, p = 100, 5 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + X[:10] += np.array([3, 4, 5, 1, 2]) + X[10:20] += np.array([6, 0, 7, 2, -1]) + X[30:40] += 2 * np.array([-1, 1, -1, 1, -1]) + pca = PCA(n_components=p, svd_solver="full") + pca.fit(X) + spect = pca.explained_variance_ + assert _infer_dimension(spect, n) > 2 + + +@pytest.mark.parametrize( + "X, n_components, n_components_validated", + [ + (iris.data, 0.95, 2), # row > col + (iris.data, 0.01, 1), # row > col + (np.random.RandomState(0).rand(5, 20), 0.5, 2), + ], # row < col +) +def test_infer_dim_by_explained_variance(X, n_components, n_components_validated): + pca = PCA(n_components=n_components, svd_solver="full") + pca.fit(X) + assert pca.n_components == pytest.approx(n_components) + assert pca.n_components_ == n_components_validated + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_score(svd_solver): + # Test that probabilistic PCA scoring yields a reasonable score + n, p = 1000, 3 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5]) + pca = PCA(n_components=2, svd_solver=svd_solver) + pca.fit(X) + + ll1 = pca.score(X) + h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1**2) * p + assert_allclose(ll1 / h, 1, rtol=5e-2) + + ll2 = pca.score(rng.randn(n, p) * 0.2 + np.array([3, 4, 5])) + assert ll1 > ll2 + + pca = PCA(n_components=2, whiten=True, svd_solver=svd_solver) + pca.fit(X) + ll2 = pca.score(X) + assert ll1 > ll2 + + +def test_pca_score3(): + # Check that probabilistic PCA selects the right model + n, p = 200, 3 + rng = np.random.RandomState(0) + Xl = rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + Xt = rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7]) + ll = np.zeros(p) + for k in range(p): + pca = PCA(n_components=k, svd_solver="full") + pca.fit(Xl) + ll[k] = pca.score(Xt) + + assert ll.argmax() == 1 + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_sanity_noise_variance(svd_solver): + # Sanity check for the noise_variance_. 
For more details see + # https://github.com/scikit-learn/scikit-learn/issues/7568 + # https://github.com/scikit-learn/scikit-learn/issues/8541 + # https://github.com/scikit-learn/scikit-learn/issues/8544 + X, _ = datasets.load_digits(return_X_y=True) + pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0) + pca.fit(X) + assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0) + + +@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) +def test_pca_score_consistency_solvers(svd_solver): + # Check the consistency of score between solvers + X, _ = datasets.load_digits(return_X_y=True) + pca_full = PCA(n_components=30, svd_solver="full", random_state=0) + pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0) + pca_full.fit(X) + pca_other.fit(X) + assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6) + + +# arpack raises ValueError for n_components == min(n_samples, n_features) +@pytest.mark.parametrize("svd_solver", ["full", "randomized"]) +def test_pca_zero_noise_variance_edge_cases(svd_solver): + # ensure that noise_variance_ is 0 in edge cases + # when n_components == min(n_samples, n_features) + n, p = 100, 3 + rng = np.random.RandomState(0) + X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5]) + + pca = PCA(n_components=p, svd_solver=svd_solver) + pca.fit(X) + assert pca.noise_variance_ == 0 + # Non-regression test for gh-12489 + # ensure no divide-by-zero error for n_components == n_features < n_samples + pca.score(X) + + pca.fit(X.T) + assert pca.noise_variance_ == 0 + # Non-regression test for gh-12489 + # ensure no divide-by-zero error for n_components == n_samples < n_features + pca.score(X.T) + + +@pytest.mark.parametrize( + "n_samples, n_features, n_components, expected_solver", + [ + # case: n_samples < 10 * n_features and max(X.shape) <= 500 => 'full' + (10, 50, 5, "full"), + # case: n_samples > 10 * n_features and n_features < 500 => 'covariance_eigh' + (1000, 50, 50, "covariance_eigh"), + # case: n_components >= .8 * min(X.shape) => 'full' + (1000, 500, 400, "full"), + # n_components >= 1 and n_components < .8*min(X.shape) => 'randomized' + (1000, 500, 10, "randomized"), + # case: n_components in (0,1) => 'full' + (1000, 500, 0.5, "full"), + ], +) +def test_pca_svd_solver_auto(n_samples, n_features, n_components, expected_solver): + data = np.random.RandomState(0).uniform(size=(n_samples, n_features)) + pca_auto = PCA(n_components=n_components, random_state=0) + pca_test = PCA( + n_components=n_components, svd_solver=expected_solver, random_state=0 + ) + pca_auto.fit(data) + assert pca_auto._fit_svd_solver == expected_solver + pca_test.fit(data) + assert_allclose(pca_auto.components_, pca_test.components_) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_deterministic_output(svd_solver): + rng = np.random.RandomState(0) + X = rng.rand(10, 10) + + transformed_X = np.zeros((20, 2)) + for i in range(20): + pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) + transformed_X[i, :] = pca.fit_transform(X)[0] + assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)) + + +@pytest.mark.parametrize("svd_solver", PCA_SOLVERS) +def test_pca_dtype_preservation(svd_solver, global_random_seed): + check_pca_float_dtype_preservation(svd_solver, global_random_seed) + check_pca_int_dtype_upcast_to_double(svd_solver) + + +def check_pca_float_dtype_preservation(svd_solver, seed): + # Ensure that PCA does not upscale the dtype when input is float32 + X = 
np.random.RandomState(seed).rand(1000, 4) + X_float64 = X.astype(np.float64, copy=False) + X_float32 = X.astype(np.float32) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=seed).fit( + X_float64 + ) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=seed).fit( + X_float32 + ) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float32 + assert pca_64.transform(X_float64).dtype == np.float64 + assert pca_32.transform(X_float32).dtype == np.float32 + + # The atol and rtol are set such that the test passes for all random seeds + # on all supported platforms on our CI and conda-forge with the default + # random seed. + assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-3, atol=1e-3) + + +def check_pca_int_dtype_upcast_to_double(svd_solver): + # Ensure that all int types will be upcast to float64 + X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4)) + X_i64 = X_i64.astype(np.int64, copy=False) + X_i32 = X_i64.astype(np.int32, copy=False) + + pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i64) + pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i32) + + assert pca_64.components_.dtype == np.float64 + assert pca_32.components_.dtype == np.float64 + assert pca_64.transform(X_i64).dtype == np.float64 + assert pca_32.transform(X_i32).dtype == np.float64 + + assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4) + + +def test_pca_n_components_mostly_explained_variance_ratio(): + # when n_components is the second highest cumulative sum of the + # explained_variance_ratio_, then n_components_ should equal the + # number of features in the dataset #15669 + X, y = load_iris(return_X_y=True) + pca1 = PCA().fit(X, y) + + n_components = pca1.explained_variance_ratio_.cumsum()[-2] + pca2 = PCA(n_components=n_components).fit(X, y) + assert pca2.n_components_ == X.shape[1] + + +def test_assess_dimension_bad_rank(): + # Test error when tested rank not in [1, n_features - 1] + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + n_samples = 10 + for rank in (0, 5): + with pytest.raises(ValueError, match=r"should be in \[1, n_features - 1\]"): + _assess_dimension(spectrum, rank, n_samples) + + +def test_small_eigenvalues_mle(): + # Test rank associated with tiny eigenvalues are given a log-likelihood of + # -inf. 
The inferred rank will be 1 + spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) + + assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf + + for rank in (2, 3): + assert _assess_dimension(spectrum, rank, 10) == -np.inf + + assert _infer_dimension(spectrum, 10) == 1 + + +def test_mle_redundant_data(): + # Test 'mle' with pathological X: only one relevant feature should give a + # rank of 1 + X, _ = datasets.make_classification( + n_features=20, + n_informative=1, + n_repeated=18, + n_redundant=1, + n_clusters_per_class=1, + random_state=42, + ) + pca = PCA(n_components="mle").fit(X) + assert pca.n_components_ == 1 + + +def test_fit_mle_too_few_samples(): + # Tests that an error is raised when the number of samples is smaller + # than the number of features during an mle fit + X, _ = datasets.make_classification(n_samples=20, n_features=21, random_state=42) + + pca = PCA(n_components="mle", svd_solver="full") + with pytest.raises( + ValueError, + match="n_components='mle' is only supported if n_samples >= n_features", + ): + pca.fit(X) + + +def test_mle_simple_case(): + # non-regression test for issue + # https://github.com/scikit-learn/scikit-learn/issues/16730 + n_samples, n_dim = 1000, 10 + X = np.random.RandomState(0).randn(n_samples, n_dim) + X[:, -1] = np.mean(X[:, :-1], axis=-1) # true X dim is ndim - 1 + pca_skl = PCA("mle", svd_solver="full") + pca_skl.fit(X) + assert pca_skl.n_components_ == n_dim - 1 + + +def test_assess_dimesion_rank_one(): + # Make sure assess_dimension works properly on a matrix of rank 1 + n_samples, n_features = 9, 6 + X = np.ones((n_samples, n_features)) # rank 1 matrix + _, s, _ = np.linalg.svd(X, full_matrices=True) + # except for rank 1, all eigenvalues are 0 resp. close to 0 (FP) + assert_allclose(s[1:], np.zeros(n_features - 1), atol=1e-12) + + assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples)) + for rank in range(2, n_features): + assert _assess_dimension(s, rank, n_samples) == -np.inf + + +def test_pca_randomized_svd_n_oversamples(): + """Check that exposing and setting `n_oversamples` will provide accurate results + even when `X` has a large number of features. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20589 + """ + rng = np.random.RandomState(0) + n_features = 100 + X = rng.randn(1_000, n_features) + + # The default value of `n_oversamples` will lead to inaccurate results + # We force it to the number of features.
+ pca_randomized = PCA( + n_components=1, + svd_solver="randomized", + n_oversamples=n_features, + random_state=0, + ).fit(X) + pca_full = PCA(n_components=1, svd_solver="full").fit(X) + pca_arpack = PCA(n_components=1, svd_solver="arpack", random_state=0).fit(X) + + assert_allclose(np.abs(pca_full.components_), np.abs(pca_arpack.components_)) + assert_allclose(np.abs(pca_randomized.components_), np.abs(pca_arpack.components_)) + + +def test_feature_names_out(): + """Check feature names out for PCA.""" + pca = PCA(n_components=2).fit(iris.data) + + names = pca.get_feature_names_out() + assert_array_equal([f"pca{i}" for i in range(2)], names) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_variance_correctness(copy): + """Check the accuracy of PCA's internal variance calculation""" + rng = np.random.RandomState(0) + X = rng.randn(1000, 200) + pca = PCA().fit(X) + pca_var = pca.explained_variance_ / pca.explained_variance_ratio_ + true_var = np.var(X, ddof=1, axis=0).sum() + np.testing.assert_allclose(pca_var, true_var) + + +def check_array_api_get_precision(name, estimator, array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + iris_np = iris.data.astype(dtype_name) + iris_xp = xp.asarray(iris_np, device=device) + + estimator.fit(iris_np) + precision_np = estimator.get_precision() + covariance_np = estimator.get_covariance() + + rtol = 2e-4 if iris_np.dtype == "float32" else 2e-7 + with config_context(array_api_dispatch=True): + estimator_xp = clone(estimator).fit(iris_xp) + precision_xp = estimator_xp.get_precision() + assert precision_xp.shape == (4, 4) + assert precision_xp.dtype == iris_xp.dtype + + assert_allclose( + _convert_to_numpy(precision_xp, xp=xp), + precision_np, + rtol=rtol, + atol=_atol_for_type(dtype_name), + ) + covariance_xp = estimator_xp.get_covariance() + assert covariance_xp.shape == (4, 4) + assert covariance_xp.dtype == iris_xp.dtype + + assert_allclose( + _convert_to_numpy(covariance_xp, xp=xp), + covariance_np, + rtol=rtol, + atol=_atol_for_type(dtype_name), + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values, check_array_api_get_precision], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [ + PCA(n_components=2, svd_solver="full"), + PCA(n_components=2, svd_solver="full", whiten=True), + PCA(n_components=0.1, svd_solver="full", whiten=True), + PCA(n_components=2, svd_solver="covariance_eigh"), + PCA(n_components=2, svd_solver="covariance_eigh", whiten=True), + PCA( + n_components=2, + svd_solver="randomized", + power_iteration_normalizer="QR", + random_state=0, # how to use global_random_seed here? + ), + ], + ids=_get_check_estimator_ids, +) +def test_pca_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_get_precision], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [ + # PCA with mle cannot use check_array_api_input_and_values because of + # rounding errors in the noisy (low variance) components. 
Even checking + # the shape of the `components_` is problematic because the number of + # components depends on trimming threshold of the mle algorithm which + # can depend on device-specific rounding errors. + PCA(n_components="mle", svd_solver="full"), + ], + ids=_get_check_estimator_ids, +) +def test_pca_mle_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) + + # Simpler variant of the generic check_array_api_input checker tailored for + # the specific case of PCA with mle-trimmed components. + xp = _array_api_for_tests(array_namespace, device) + + X, y = make_classification(random_state=42) + X = X.astype(dtype_name, copy=False) + atol = _atol_for_type(X.dtype) + + est = clone(estimator) + + X_xp = xp.asarray(X, device=device) + y_xp = xp.asarray(y, device=device) + + est.fit(X, y) + + components_np = est.components_ + explained_variance_np = est.explained_variance_ + + est_xp = clone(est) + with config_context(array_api_dispatch=True): + est_xp.fit(X_xp, y_xp) + components_xp = est_xp.components_ + assert array_device(components_xp) == array_device(X_xp) + components_xp_np = _convert_to_numpy(components_xp, xp=xp) + + explained_variance_xp = est_xp.explained_variance_ + assert array_device(explained_variance_xp) == array_device(X_xp) + explained_variance_xp_np = _convert_to_numpy(explained_variance_xp, xp=xp) + + assert components_xp_np.dtype == components_np.dtype + assert components_xp_np.shape[1] == components_np.shape[1] + assert explained_variance_xp_np.dtype == explained_variance_np.dtype + + # Check that the explained variance values match for the + # common components: + min_components = min(components_xp_np.shape[0], components_np.shape[0]) + assert_allclose( + explained_variance_xp_np[:min_components], + explained_variance_np[:min_components], + atol=atol, + ) + + # If the number of components differ, check that the explained variance of + # the trimmed components is very small. + if components_xp_np.shape[0] != components_np.shape[0]: + reference_variance = explained_variance_np[-1] + extra_variance_np = explained_variance_np[min_components:] + extra_variance_xp_np = explained_variance_xp_np[min_components:] + assert all(np.abs(extra_variance_np - reference_variance) < atol) + assert all(np.abs(extra_variance_xp_np - reference_variance) < atol) + + +@pytest.mark.skipif( + os.environ.get("SCIPY_ARRAY_API") != "1", reason="SCIPY_ARRAY_API not set to 1." +) +def test_array_api_error_and_warnings_on_unsupported_params(): + xp = pytest.importorskip("array_api_strict") + iris_xp = xp.asarray(iris.data) + + pca = PCA(n_components=2, svd_solver="arpack", random_state=0) + expected_msg = re.escape( + "PCA with svd_solver='arpack' is not supported for Array API inputs." + ) + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + pca.fit(iris_xp) + + pca.set_params(svd_solver="randomized", power_iteration_normalizer="LU") + expected_msg = re.escape( + "Array API does not support LU factorization. Set" + " `power_iteration_normalizer='QR'` instead." + ) + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + pca.fit(iris_xp) + + pca.set_params(svd_solver="randomized", power_iteration_normalizer="auto") + expected_msg = re.escape( + "Array API does not support LU factorization, falling back to QR instead. 
Set" + " `power_iteration_normalizer='QR'` explicitly to silence this warning." + ) + with pytest.warns(UserWarning, match=expected_msg): + with config_context(array_api_dispatch=True): + pca.fit(iris_xp) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_sparse_pca.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_sparse_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c71a5d0e752580dd90b6670804bf91b8ab0b72 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_sparse_pca.py @@ -0,0 +1,347 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.datasets import make_low_rank_matrix +from sklearn.decomposition import PCA, MiniBatchSparsePCA, SparsePCA +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + if_safe_multiprocessing_with_blas, +) +from sklearn.utils.extmath import svd_flip + + +def generate_toy_data(n_components, n_samples, image_size, random_state=None): + n_features = image_size[0] * image_size[1] + + rng = check_random_state(random_state) + U = rng.randn(n_samples, n_components) + V = rng.randn(n_components, n_features) + + centers = [(3, 3), (6, 7), (8, 1)] + sz = [1, 2, 1] + for k in range(n_components): + img = np.zeros(image_size) + xmin, xmax = centers[k][0] - sz[k], centers[k][0] + sz[k] + ymin, ymax = centers[k][1] - sz[k], centers[k][1] + sz[k] + img[xmin:xmax][:, ymin:ymax] = 1.0 + V[k, :] = img.ravel() + + # Y is defined by : Y = UV + noise + Y = np.dot(U, V) + Y += 0.1 * rng.randn(Y.shape[0], Y.shape[1]) # Add noise + return Y, U, V + + +# SparsePCA can be a bit slow. 
To avoid having test times go up, we +# test different aspects of the code in the same test + + +def test_correct_shapes(): + rng = np.random.RandomState(0) + X = rng.randn(12, 10) + spca = SparsePCA(n_components=8, random_state=rng) + U = spca.fit_transform(X) + assert spca.components_.shape == (8, 10) + assert U.shape == (12, 8) + # test overcomplete decomposition + spca = SparsePCA(n_components=13, random_state=rng) + U = spca.fit_transform(X) + assert spca.components_.shape == (13, 10) + assert U.shape == (12, 13) + + +def test_fit_transform(global_random_seed): + alpha = 1 + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array + spca_lars = SparsePCA( + n_components=3, method="lars", alpha=alpha, random_state=global_random_seed + ) + spca_lars.fit(Y) + + # Test that CD gives similar results + spca_lasso = SparsePCA( + n_components=3, method="cd", random_state=global_random_seed, alpha=alpha + ) + spca_lasso.fit(Y) + assert_array_almost_equal(spca_lasso.components_, spca_lars.components_) + + +@if_safe_multiprocessing_with_blas +def test_fit_transform_parallel(global_random_seed): + alpha = 1 + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array + spca_lars = SparsePCA( + n_components=3, method="lars", alpha=alpha, random_state=global_random_seed + ) + spca_lars.fit(Y) + U1 = spca_lars.transform(Y) + # Test multiple CPUs + spca = SparsePCA( + n_components=3, + n_jobs=2, + method="lars", + alpha=alpha, + random_state=global_random_seed, + ).fit(Y) + U2 = spca.transform(Y) + assert not np.all(spca_lars.components_ == 0) + assert_array_almost_equal(U1, U2) + + +def test_transform_nan(global_random_seed): + # Test that SparsePCA won't return NaN when there is 0 feature in all + # samples. 
+ rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array + Y[:, 0] = 0 + estimator = SparsePCA(n_components=8, random_state=global_random_seed) + assert not np.any(np.isnan(estimator.fit_transform(Y))) + + +def test_fit_transform_tall(global_random_seed): + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng) # tall array + spca_lars = SparsePCA(n_components=3, method="lars", random_state=rng) + U1 = spca_lars.fit_transform(Y) + spca_lasso = SparsePCA(n_components=3, method="cd", random_state=rng) + U2 = spca_lasso.fit(Y).transform(Y) + assert_array_almost_equal(U1, U2) + + +def test_initialization(global_random_seed): + rng = np.random.RandomState(global_random_seed) + U_init = rng.randn(5, 3) + V_init = rng.randn(3, 4) + model = SparsePCA( + n_components=3, U_init=U_init, V_init=V_init, max_iter=0, random_state=rng + ) + model.fit(rng.randn(5, 4)) + + expected_components = V_init / np.linalg.norm(V_init, axis=1, keepdims=True) + expected_components = svd_flip(u=expected_components.T, v=None)[0].T + assert_allclose(model.components_, expected_components) + + +def test_mini_batch_correct_shapes(): + rng = np.random.RandomState(0) + X = rng.randn(12, 10) + pca = MiniBatchSparsePCA(n_components=8, max_iter=1, random_state=rng) + U = pca.fit_transform(X) + assert pca.components_.shape == (8, 10) + assert U.shape == (12, 8) + # test overcomplete decomposition + pca = MiniBatchSparsePCA(n_components=13, max_iter=1, random_state=rng) + U = pca.fit_transform(X) + assert pca.components_.shape == (13, 10) + assert U.shape == (12, 13) + + +def test_scaling_fit_transform(global_random_seed): + alpha = 1 + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng) + spca_lars = SparsePCA(n_components=3, method="lars", alpha=alpha, random_state=rng) + results_train = spca_lars.fit_transform(Y) + results_test = spca_lars.transform(Y[:10]) + assert_allclose(results_train[0], results_test[0]) + + +def test_pca_vs_spca(global_random_seed): + rng = np.random.RandomState(global_random_seed) + Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng) + Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) + spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2, random_state=rng) + pca = PCA(n_components=2, random_state=rng) + pca.fit(Y) + spca.fit(Y) + results_test_pca = pca.transform(Z) + results_test_spca = spca.transform(Z) + assert_allclose( + np.abs(spca.components_.dot(pca.components_.T)), np.eye(2), atol=1e-4 + ) + results_test_pca *= np.sign(results_test_pca[0, :]) + results_test_spca *= np.sign(results_test_spca[0, :]) + assert_allclose(results_test_pca, results_test_spca, atol=1e-4) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +@pytest.mark.parametrize("n_components", [None, 3]) +def test_spca_n_components_(SPCA, n_components): + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=n_components).fit(X) + + if n_components is not None: + assert model.n_components_ == n_components + else: + assert model.n_components_ == n_features + + +@pytest.mark.parametrize("SPCA", (SparsePCA, MiniBatchSparsePCA)) +@pytest.mark.parametrize("method", ("lars", "cd")) +@pytest.mark.parametrize( + "data_type, expected_type", + ( + (np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, 
np.float64), + ), +) +def test_sparse_pca_dtype_match(SPCA, method, data_type, expected_type): + # Verify output matrix dtype + n_samples, n_features, n_components = 12, 10, 3 + rng = np.random.RandomState(0) + input_array = rng.randn(n_samples, n_features).astype(data_type) + model = SPCA(n_components=n_components, method=method) + transformed = model.fit_transform(input_array) + + assert transformed.dtype == expected_type + assert model.components_.dtype == expected_type + + +@pytest.mark.parametrize("SPCA", (SparsePCA, MiniBatchSparsePCA)) +@pytest.mark.parametrize("method", ("lars", "cd")) +def test_sparse_pca_numerical_consistency(SPCA, method, global_random_seed): + # Verify numerical consistency between np.float32 and np.float64 + n_samples, n_features, n_components = 20, 20, 5 + input_array = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=n_components, + random_state=global_random_seed, + ) + + model_32 = SPCA( + n_components=n_components, + method=method, + random_state=global_random_seed, + ) + transformed_32 = model_32.fit_transform(input_array.astype(np.float32)) + + model_64 = SPCA( + n_components=n_components, + method=method, + random_state=global_random_seed, + ) + transformed_64 = model_64.fit_transform(input_array.astype(np.float64)) + assert_allclose(transformed_64, transformed_32) + assert_allclose(model_64.components_, model_32.components_) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +def test_spca_feature_names_out(SPCA): + """Check feature names out for *SparsePCA.""" + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=4).fit(X) + names = model.get_feature_names_out() + + estimator_name = SPCA.__name__.lower() + assert_array_equal([f"{estimator_name}{i}" for i in range(4)], names) + + +def test_spca_early_stopping(global_random_seed): + """Check that `tol` and `max_no_improvement` act as early stopping.""" + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 50, 10 + X = rng.randn(n_samples, n_features) + + # vary the tolerance to force the early stopping of one of the models + model_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=0.5, random_state=global_random_seed + ).fit(X) + model_not_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=1e-3, random_state=global_random_seed + ).fit(X) + assert model_early_stopped.n_iter_ < model_not_early_stopped.n_iter_ + + # force the max number of no improvement to a large value to check that + # it does help to early stop + model_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=1e-6, max_no_improvement=2, random_state=global_random_seed + ).fit(X) + model_not_early_stopped = MiniBatchSparsePCA( + max_iter=100, tol=1e-6, max_no_improvement=100, random_state=global_random_seed + ).fit(X) + assert model_early_stopped.n_iter_ < model_not_early_stopped.n_iter_ + + +def test_equivalence_components_pca_spca(global_random_seed): + """Check the equivalence of the components found by PCA and SparsePCA.
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/23932 + """ + rng = np.random.RandomState(global_random_seed) + X = rng.randn(50, 4) + + n_components = 2 + pca = PCA( + n_components=n_components, + svd_solver="randomized", + random_state=0, + ).fit(X) + spca = SparsePCA( + n_components=n_components, + method="lars", + ridge_alpha=0, + alpha=0, + random_state=0, + ).fit(X) + + assert_allclose(pca.components_, spca.components_) + + +def test_sparse_pca_inverse_transform(global_random_seed): + """Check that `inverse_transform` in `SparsePCA` and `PCA` are similar.""" + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 10, 5 + X = rng.randn(n_samples, n_features) + + n_components = 2 + spca = SparsePCA( + n_components=n_components, + alpha=1e-12, + ridge_alpha=1e-12, + random_state=global_random_seed, + ) + pca = PCA(n_components=n_components, random_state=global_random_seed) + X_trans_spca = spca.fit_transform(X) + X_trans_pca = pca.fit_transform(X) + assert_allclose( + spca.inverse_transform(X_trans_spca), pca.inverse_transform(X_trans_pca) + ) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +def test_transform_inverse_transform_round_trip(SPCA, global_random_seed): + """Check the `transform` and `inverse_transform` round trip with no loss of + information. + """ + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 10, 5 + X = rng.randn(n_samples, n_features) + + n_components = n_features + spca = SPCA( + n_components=n_components, + alpha=1e-12, + ridge_alpha=1e-12, + random_state=global_random_seed, + ) + X_trans_spca = spca.fit_transform(X) + assert_allclose(spca.inverse_transform(X_trans_spca), X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_truncated_svd.py b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_truncated_svd.py new file mode 100644 index 0000000000000000000000000000000000000000..07b35c873ee3e2faad40808bcf3337e81f78ff8a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/decomposition/tests/test_truncated_svd.py @@ -0,0 +1,212 @@ +"""Test truncated SVD transformer.""" + +import numpy as np +import pytest +import scipy.sparse as sp + +from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_less + +SVD_SOLVERS = ["arpack", "randomized"] + + +@pytest.fixture(scope="module") +def X_sparse(): + # Make an X that looks somewhat like a small tf-idf matrix. + rng = check_random_state(42) + X = sp.random(60, 55, density=0.2, format="csr", random_state=rng) + X.data[:] = 1 + np.log(X.data) + return X + + +@pytest.mark.parametrize("solver", ["randomized"]) +@pytest.mark.parametrize("kind", ("dense", "sparse")) +def test_solvers(X_sparse, solver, kind): + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd_a = TruncatedSVD(30, algorithm="arpack") + svd = TruncatedSVD(30, algorithm=solver, random_state=42, n_oversamples=100) + + Xa = svd_a.fit_transform(X)[:, :6] + Xr = svd.fit_transform(X)[:, :6] + assert_allclose(Xa, Xr, rtol=2e-3) + + comp_a = np.abs(svd_a.components_) + comp = np.abs(svd.components_) + # All elements are equal, but some elements are more equal than others. 
+ assert_allclose(comp_a[:9], comp[:9], rtol=1e-3) + assert_allclose(comp_a[9:], comp[9:], atol=1e-2) + + +@pytest.mark.parametrize("n_components", (10, 25, 41, 55)) +def test_attributes(n_components, X_sparse): + n_features = X_sparse.shape[1] + tsvd = TruncatedSVD(n_components).fit(X_sparse) + assert tsvd.n_components == n_components + assert tsvd.components_.shape == (n_components, n_features) + + +@pytest.mark.parametrize( + "algorithm, n_components", + [ + ("arpack", 55), + ("arpack", 56), + ("randomized", 56), + ], +) +def test_too_many_components(X_sparse, algorithm, n_components): + tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm) + with pytest.raises(ValueError): + tsvd.fit(X_sparse) + + +@pytest.mark.parametrize("fmt", ("array", "csr", "csc", "coo", "lil")) +def test_sparse_formats(fmt, X_sparse): + n_samples = X_sparse.shape[0] + Xfmt = X_sparse.toarray() if fmt == "dense" else getattr(X_sparse, "to" + fmt)() + tsvd = TruncatedSVD(n_components=11) + Xtrans = tsvd.fit_transform(Xfmt) + assert Xtrans.shape == (n_samples, 11) + Xtrans = tsvd.transform(Xfmt) + assert Xtrans.shape == (n_samples, 11) + + +@pytest.mark.parametrize("algo", SVD_SOLVERS) +def test_inverse_transform(algo, X_sparse): + # We need a lot of components for the reconstruction to be "almost + # equal" in all positions. XXX Test means or sums instead? + tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo) + Xt = tsvd.fit_transform(X_sparse) + Xinv = tsvd.inverse_transform(Xt) + assert_allclose(Xinv, X_sparse.toarray(), rtol=1e-1, atol=2e-1) + + +def test_integers(X_sparse): + n_samples = X_sparse.shape[0] + Xint = X_sparse.astype(np.int64) + tsvd = TruncatedSVD(n_components=6) + Xtrans = tsvd.fit_transform(Xint) + assert Xtrans.shape == (n_samples, tsvd.n_components) + + +@pytest.mark.parametrize("kind", ("dense", "sparse")) +@pytest.mark.parametrize("n_components", [10, 20]) +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_explained_variance(X_sparse, kind, n_components, solver): + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd = TruncatedSVD(n_components, algorithm=solver) + X_tr = svd.fit_transform(X) + # Assert that all the values are greater than 0 + assert_array_less(0.0, svd.explained_variance_ratio_) + + # Assert that total explained variance is less than 1 + assert_array_less(svd.explained_variance_ratio_.sum(), 1.0) + + # Test that explained_variance is correct + total_variance = np.var(X_sparse.toarray(), axis=0).sum() + variances = np.var(X_tr, axis=0) + true_explained_variance_ratio = variances / total_variance + + assert_allclose( + svd.explained_variance_ratio_, + true_explained_variance_ratio, + ) + + +@pytest.mark.parametrize("kind", ("dense", "sparse")) +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_explained_variance_components_10_20(X_sparse, kind, solver): + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd_10 = TruncatedSVD(10, algorithm=solver, n_iter=10).fit(X) + svd_20 = TruncatedSVD(20, algorithm=solver, n_iter=10).fit(X) + + # Assert the 1st component is equal + assert_allclose( + svd_10.explained_variance_ratio_, + svd_20.explained_variance_ratio_[:10], + rtol=5e-3, + ) + + # Assert that 20 components has higher explained variance than 10 + assert ( + svd_20.explained_variance_ratio_.sum() > svd_10.explained_variance_ratio_.sum() + ) + + +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_singular_values_consistency(solver, global_random_seed): + # Check that the TruncatedSVD output has the 
correct singular values + rng = np.random.RandomState(global_random_seed) + n_samples, n_features = 100, 80 + X = rng.randn(n_samples, n_features) + + pca = TruncatedSVD(n_components=2, algorithm=solver, random_state=rng).fit(X) + + # Compare to the Frobenius norm + X_pca = pca.transform(X) + assert_allclose( + np.sum(pca.singular_values_**2.0), + np.linalg.norm(X_pca, "fro") ** 2.0, + rtol=1e-2, + ) + + # Compare to the 2-norms of the score vectors + assert_allclose( + pca.singular_values_, np.sqrt(np.sum(X_pca**2.0, axis=0)), rtol=1e-2 + ) + + +@pytest.mark.parametrize("solver", SVD_SOLVERS) +def test_singular_values_expected(solver, global_random_seed): + # Set the singular values and see what we get back + rng = np.random.RandomState(global_random_seed) + n_samples = 100 + n_features = 110 + + X = rng.randn(n_samples, n_features) + + pca = TruncatedSVD(n_components=3, algorithm=solver, random_state=rng) + X_pca = pca.fit_transform(X) + + X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) + X_pca[:, 0] *= 3.142 + X_pca[:, 1] *= 2.718 + + X_hat_pca = np.dot(X_pca, pca.components_) + pca.fit(X_hat_pca) + assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0], rtol=1e-14) + + +def test_truncated_svd_eq_pca(X_sparse): + # TruncatedSVD should be equal to PCA on centered data + + X_dense = X_sparse.toarray() + + X_c = X_dense - X_dense.mean(axis=0) + + params = dict(n_components=10, random_state=42) + + svd = TruncatedSVD(algorithm="arpack", **params) + pca = PCA(svd_solver="arpack", **params) + + Xt_svd = svd.fit_transform(X_c) + Xt_pca = pca.fit_transform(X_c) + + assert_allclose(Xt_svd, Xt_pca, rtol=1e-9) + assert_allclose(pca.mean_, 0, atol=1e-9) + assert_allclose(svd.components_, pca.components_) + + +@pytest.mark.parametrize( + "algorithm, tol", [("randomized", 0.0), ("arpack", 1e-6), ("arpack", 0.0)] +) +@pytest.mark.parametrize("kind", ("dense", "sparse")) +def test_fit_transform(X_sparse, algorithm, tol, kind): + # fit_transform(X) should equal fit(X).transform(X) + X = X_sparse if kind == "sparse" else X_sparse.toarray() + svd = TruncatedSVD( + n_components=5, n_iter=7, random_state=42, algorithm=algorithm, tol=tol + ) + X_transformed_1 = svd.fit_transform(X) + X_transformed_2 = svd.fit(X).transform(X) + assert_allclose(X_transformed_1, X_transformed_2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..62a538d340318f3eeb745e77e1b13a1a5ea809af --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/__init__.py @@ -0,0 +1,45 @@ +"""Ensemble-based methods for classification, regression and anomaly detection.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bagging import BaggingClassifier, BaggingRegressor +from ._base import BaseEnsemble +from ._forest import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, +) +from ._gb import GradientBoostingClassifier, GradientBoostingRegressor +from ._hist_gradient_boosting.gradient_boosting import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from ._iforest import IsolationForest +from ._stacking import StackingClassifier, StackingRegressor +from ._voting import VotingClassifier, VotingRegressor +from ._weight_boosting import AdaBoostClassifier, AdaBoostRegressor + +__all__ = [ + "AdaBoostClassifier", + "AdaBoostRegressor", + 
"BaggingClassifier", + "BaggingRegressor", + "BaseEnsemble", + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "GradientBoostingClassifier", + "GradientBoostingRegressor", + "HistGradientBoostingClassifier", + "HistGradientBoostingRegressor", + "IsolationForest", + "RandomForestClassifier", + "RandomForestRegressor", + "RandomTreesEmbedding", + "StackingClassifier", + "StackingRegressor", + "VotingClassifier", + "VotingRegressor", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_bagging.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_bagging.py new file mode 100644 index 0000000000000000000000000000000000000000..34b613b15281aa946fac14178afed662dbbf3449 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_bagging.py @@ -0,0 +1,1480 @@ +"""Bagging meta-estimator.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import numbers +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral +from warnings import warn + +import numpy as np + +from ..base import ClassifierMixin, RegressorMixin, _fit_context +from ..metrics import accuracy_score, r2_score +from ..tree import DecisionTreeClassifier, DecisionTreeRegressor +from ..utils import ( + Bunch, + _safe_indexing, + check_random_state, + column_or_1d, +) +from ..utils._mask import indices_to_mask +from ..utils._param_validation import HasMethods, Interval, RealNotInt +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + get_routing_for_object, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.random import sample_without_replacement +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + _estimator_has, + check_is_fitted, + has_fit_parameter, + validate_data, +) +from ._base import BaseEnsemble, _partition_estimators + +__all__ = ["BaggingClassifier", "BaggingRegressor"] + +MAX_INT = np.iinfo(np.int32).max + + +def _generate_indices(random_state, bootstrap, n_population, n_samples): + """Draw randomly sampled indices.""" + # Draw sample indices + if bootstrap: + indices = random_state.randint(0, n_population, n_samples) + else: + indices = sample_without_replacement( + n_population, n_samples, random_state=random_state + ) + + return indices + + +def _generate_bagging_indices( + random_state, + bootstrap_features, + bootstrap_samples, + n_features, + n_samples, + max_features, + max_samples, +): + """Randomly draw feature and sample indices.""" + # Get valid random state + random_state = check_random_state(random_state) + + # Draw indices + feature_indices = _generate_indices( + random_state, bootstrap_features, n_features, max_features + ) + sample_indices = _generate_indices( + random_state, bootstrap_samples, n_samples, max_samples + ) + + return feature_indices, sample_indices + + +def _parallel_build_estimators( + n_estimators, + ensemble, + X, + y, + seeds, + total_n_estimators, + verbose, + check_input, + fit_params, +): + """Private function used to build a batch of estimators within a job.""" + # Retrieve settings + n_samples, n_features = X.shape + max_features = ensemble._max_features + max_samples = ensemble._max_samples + bootstrap = ensemble.bootstrap + bootstrap_features = ensemble.bootstrap_features + 
has_check_input = has_fit_parameter(ensemble.estimator_, "check_input") + requires_feature_indexing = bootstrap_features or max_features != n_features + + # Build estimators + estimators = [] + estimators_features = [] + + # TODO: (slep6) remove if condition for unrouted sample_weight when metadata + # routing can't be disabled. + support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight") + if not _routing_enabled() and ( + not support_sample_weight and fit_params.get("sample_weight") is not None + ): + raise ValueError( + "The base estimator doesn't support sample weight, but sample_weight is " + "passed to the fit method." + ) + + for i in range(n_estimators): + if verbose > 1: + print( + "Building estimator %d of %d for this parallel run (total %d)..." + % (i + 1, n_estimators, total_n_estimators) + ) + + random_state = seeds[i] + estimator = ensemble._make_estimator(append=False, random_state=random_state) + + if has_check_input: + estimator_fit = partial(estimator.fit, check_input=check_input) + else: + estimator_fit = estimator.fit + + # Draw random feature, sample indices + features, indices = _generate_bagging_indices( + random_state, + bootstrap_features, + bootstrap, + n_features, + n_samples, + max_features, + max_samples, + ) + + fit_params_ = fit_params.copy() + + # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata + # routing can't be disabled. + # 1. If routing is enabled, we will check if the routing supports sample + # weight and use it if it does. + # 2. If routing is not enabled, we will check if the base + # estimator supports sample_weight and use it if it does. + + # Note: Row sampling can be achieved either through setting sample_weight or + # by indexing. The former is more efficient. Therefore, use this method + # if possible, otherwise use indexing. 
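+ # Illustrative sketch (hypothetical values): with n_samples=5 and a
+ # bootstrap draw of indices [0, 2, 2, 3],
+ #     np.bincount([0, 2, 2, 3], minlength=5) == array([1, 0, 2, 1, 0]),
+ # so multiplying the sample weights by these counts is, for estimators
+ # that honour sample_weight, equivalent to fitting on the resampled rows
+ # X[[0, 2, 2, 3]] without copying X.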
+ if _routing_enabled(): + request_or_router = get_routing_for_object(ensemble.estimator_) + consumes_sample_weight = request_or_router.consumes( + "fit", ("sample_weight",) + ) + else: + consumes_sample_weight = support_sample_weight + if consumes_sample_weight: + # Draw sub samples, using sample weights, and then fit + curr_sample_weight = _check_sample_weight( + fit_params_.pop("sample_weight", None), X + ).copy() + + if bootstrap: + sample_counts = np.bincount(indices, minlength=n_samples) + curr_sample_weight *= sample_counts + else: + not_indices_mask = ~indices_to_mask(indices, n_samples) + curr_sample_weight[not_indices_mask] = 0 + + fit_params_["sample_weight"] = curr_sample_weight + X_ = X[:, features] if requires_feature_indexing else X + estimator_fit(X_, y, **fit_params_) + else: + # cannot use sample_weight, so use indexing + y_ = _safe_indexing(y, indices) + X_ = _safe_indexing(X, indices) + fit_params_ = _check_method_params(X, params=fit_params_, indices=indices) + if requires_feature_indexing: + X_ = X_[:, features] + estimator_fit(X_, y_, **fit_params_) + + estimators.append(estimator) + estimators_features.append(features) + + return estimators, estimators_features + + +def _parallel_predict_proba( + estimators, + estimators_features, + X, + n_classes, + predict_params=None, + predict_proba_params=None, +): + """Private function used to compute (proba-)predictions within a job.""" + n_samples = X.shape[0] + proba = np.zeros((n_samples, n_classes)) + + for estimator, features in zip(estimators, estimators_features): + if hasattr(estimator, "predict_proba"): + proba_estimator = estimator.predict_proba( + X[:, features], **(predict_params or {}) + ) + + if n_classes == len(estimator.classes_): + proba += proba_estimator + + else: + proba[:, estimator.classes_] += proba_estimator[ + :, range(len(estimator.classes_)) + ] + + else: + # Resort to voting + predictions = estimator.predict( + X[:, features], **(predict_proba_params or {}) + ) + + for i in range(n_samples): + proba[i, predictions[i]] += 1 + + return proba + + +def _parallel_predict_log_proba(estimators, estimators_features, X, n_classes, params): + """Private function used to compute log probabilities within a job.""" + n_samples = X.shape[0] + log_proba = np.empty((n_samples, n_classes)) + log_proba.fill(-np.inf) + all_classes = np.arange(n_classes, dtype=int) + + for estimator, features in zip(estimators, estimators_features): + log_proba_estimator = estimator.predict_log_proba(X[:, features], **params) + + if n_classes == len(estimator.classes_): + log_proba = np.logaddexp(log_proba, log_proba_estimator) + + else: + log_proba[:, estimator.classes_] = np.logaddexp( + log_proba[:, estimator.classes_], + log_proba_estimator[:, range(len(estimator.classes_))], + ) + + missing = np.setdiff1d(all_classes, estimator.classes_) + log_proba[:, missing] = np.logaddexp(log_proba[:, missing], -np.inf) + + return log_proba + + +def _parallel_decision_function(estimators, estimators_features, X, params): + """Private function used to compute decisions within a job.""" + return sum( + estimator.decision_function(X[:, features], **params) + for estimator, features in zip(estimators, estimators_features) + ) + + +def _parallel_predict_regression(estimators, estimators_features, X, params): + """Private function used to compute predictions within a job.""" + return sum( + estimator.predict(X[:, features], **params) + for estimator, features in zip(estimators, estimators_features) + ) + + +class BaseBagging(BaseEnsemble, 
metaclass=ABCMeta): + """Base class for Bagging meta-estimator. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit", "predict"]), None], + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "max_samples": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + "max_features": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + "bootstrap": ["boolean"], + "bootstrap_features": ["boolean"], + "oob_score": ["boolean"], + "warm_start": ["boolean"], + "n_jobs": [None, Integral], + "random_state": ["random_state"], + "verbose": ["verbose"], + } + + @abstractmethod + def __init__( + self, + estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + ) + self.max_samples = max_samples + self.max_features = max_features + self.bootstrap = bootstrap + self.bootstrap_features = bootstrap_features + self.oob_score = oob_score + self.warm_start = warm_start + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + + @_fit_context( + # BaseBagging.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None, **fit_params): + """Build a Bagging ensemble of estimators from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.5 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(fit_params, self, "fit") + + # Convert data (X is required to be 2d and indexable) + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + multi_output=True, + ) + + return self._fit( + X, + y, + max_samples=self.max_samples, + sample_weight=sample_weight, + **fit_params, + ) + + def _parallel_args(self): + return {} + + def _fit( + self, + X, + y, + max_samples=None, + max_depth=None, + check_input=True, + sample_weight=None, + **fit_params, + ): + """Build a Bagging ensemble of estimators from the training + set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). 
+ + max_samples : int or float, default=None + Argument to use instead of self.max_samples. + + max_depth : int, default=None + Override value used when constructing base estimator. Only + supported if the base estimator has a max_depth parameter. + + check_input : bool, default=True + Override value used when fitting base estimator. Only supported + if the base estimator has a check_input parameter for fit function. + If the meta-estimator already checks the input, set this value to + False to prevent redundant input validation. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Note that this is supported only if the base estimator supports + sample weighting. + + **fit_params : dict, default=None + Parameters to pass to the :term:`fit` method of the underlying + estimator. + + Returns + ------- + self : object + Fitted estimator. + """ + random_state = check_random_state(self.random_state) + + # Remap output + n_samples = X.shape[0] + self._n_samples = n_samples + y = self._validate_y(y) + + # Check parameters + self._validate_estimator(self._get_estimator()) + + if sample_weight is not None: + fit_params["sample_weight"] = sample_weight + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=fit_params) + if "sample_weight" in fit_params: + routed_params.estimator.fit["sample_weight"] = fit_params[ + "sample_weight" + ] + + if max_depth is not None: + self.estimator_.max_depth = max_depth + + # Validate max_samples + if max_samples is None: + max_samples = self.max_samples + elif not isinstance(max_samples, numbers.Integral): + max_samples = int(max_samples * X.shape[0]) + + if max_samples > X.shape[0]: + raise ValueError("max_samples must be <= n_samples") + + # Store validated integer row sampling value + self._max_samples = max_samples + + # Validate max_features + if isinstance(self.max_features, numbers.Integral): + max_features = self.max_features + elif isinstance(self.max_features, float): + max_features = int(self.max_features * self.n_features_in_) + + if max_features > self.n_features_in_: + raise ValueError("max_features must be <= n_features") + + max_features = max(1, int(max_features)) + + # Store validated integer feature sampling value + self._max_features = max_features + + # Other checks + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + if self.warm_start and self.oob_score: + raise ValueError("Out of bag estimate only available if warm_start=False") + + if hasattr(self, "oob_score_") and self.warm_start: + del self.oob_score_ + + if not self.warm_start or not hasattr(self, "estimators_"): + # Free allocated memory, if any + self.estimators_ = [] + self.estimators_features_ = [] + + n_more_estimators = self.n_estimators - len(self.estimators_) + + if n_more_estimators < 0: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) + + elif n_more_estimators == 0: + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." 
+ ) + return self + + # Parallel loop + n_jobs, n_estimators, starts = _partition_estimators( + n_more_estimators, self.n_jobs + ) + total_n_estimators = sum(n_estimators) + + # Advance random state to state after training + # the first n_estimators + if self.warm_start and len(self.estimators_) > 0: + random_state.randint(MAX_INT, size=len(self.estimators_)) + + seeds = random_state.randint(MAX_INT, size=n_more_estimators) + self._seeds = seeds + + all_results = Parallel( + n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args() + )( + delayed(_parallel_build_estimators)( + n_estimators[i], + self, + X, + y, + seeds[starts[i] : starts[i + 1]], + total_n_estimators, + verbose=self.verbose, + check_input=check_input, + fit_params=routed_params.estimator.fit, + ) + for i in range(n_jobs) + ) + + # Reduce + self.estimators_ += list( + itertools.chain.from_iterable(t[0] for t in all_results) + ) + self.estimators_features_ += list( + itertools.chain.from_iterable(t[1] for t in all_results) + ) + + if self.oob_score: + self._set_oob_score(X, y) + + return self + + @abstractmethod + def _set_oob_score(self, X, y): + """Calculate out of bag predictions and score.""" + + def _validate_y(self, y): + if len(y.shape) == 1 or y.shape[1] == 1: + return column_or_1d(y, warn=True) + return y + + def _get_estimators_indices(self): + # Get drawn indices along both sample and feature axes + for seed in self._seeds: + # Operations accessing random_state must be performed identically + # to those in `_parallel_build_estimators()` + feature_indices, sample_indices = _generate_bagging_indices( + seed, + self.bootstrap_features, + self.bootstrap, + self.n_features_in_, + self._n_samples, + self._max_features, + self._max_samples, + ) + + yield feature_indices, sample_indices + + @property + def estimators_samples_(self): + """ + The subset of drawn samples for each base estimator. + + Returns a dynamically generated list of indices identifying + the samples used for fitting each member of the ensemble, i.e., + the in-bag samples. + + Note: the list is re-created at each call to the property in order + to reduce the object memory footprint by not storing the sampling + data. Thus fetching the property may be slower than expected. + """ + return [sample_indices for _, sample_indices in self._get_estimators_indices()] + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + + method_mapping = MethodMapping() + method_mapping.add(caller="fit", callee="fit").add( + caller="decision_function", callee="decision_function" + ) + + # the router needs to be built depending on whether the sub-estimator has a + # `predict_proba` method (as BaggingClassifier decides dynamically at runtime): + if hasattr(self._get_estimator(), "predict_proba"): + ( + method_mapping.add(caller="predict", callee="predict_proba").add( + caller="predict_proba", callee="predict_proba" + ) + ) + + else: + ( + method_mapping.add(caller="predict", callee="predict").add( + caller="predict_proba", callee="predict" + ) + ) + + # the router needs to be built depending on whether the sub-estimator has a + # `predict_log_proba` method (as BaggingClassifier decides dynamically at + # runtime): + if hasattr(self._get_estimator(), "predict_log_proba"): + method_mapping.add(caller="predict_log_proba", callee="predict_log_proba") + + else: + # if `predict_log_proba` is not available in BaggingClassifier's + # sub-estimator, the routing should go to its `predict_proba` if it is + # available or else to its `predict` method; according to how + # `sample_weight` is passed to the respective methods dynamically at + # runtime: + if hasattr(self._get_estimator(), "predict_proba"): + method_mapping.add(caller="predict_log_proba", callee="predict_proba") + + else: + method_mapping.add(caller="predict_log_proba", callee="predict") + + router.add(estimator=self._get_estimator(), method_mapping=method_mapping) + return router + + @abstractmethod + def _get_estimator(self): + """Resolve which estimator to return.""" + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self._get_estimator()).input_tags.sparse + tags.input_tags.allow_nan = get_tags(self._get_estimator()).input_tags.allow_nan + return tags + + +class BaggingClassifier(ClassifierMixin, BaseBagging): + """A Bagging classifier. + + A Bagging classifier is an ensemble meta-estimator that fits base + classifiers each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeClassifier`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. 
+ + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max(1, int(max_features * n_features_in_))` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate + the generalization error. Only available if bootstrap=True. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + .. versionadded:: 0.17 + *warm_start* constructor parameter. + + n_jobs : int, default=None + The number of jobs to run in parallel for both :meth:`fit` and + :meth:`predict`. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + estimators_ : list of estimators + The collection of fitted base estimators. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int or list + The number of classes. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + See Also + -------- + BaggingRegressor : A Bagging regressor. 
+ + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVC + >>> from sklearn.ensemble import BaggingClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = BaggingClassifier(estimator=SVC(), + ... n_estimators=10, random_state=0).fit(X, y) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + """ + + def __init__( + self, + estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + oob_score=oob_score, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def _get_estimator(self): + """Resolve which estimator to return (default is DecisionTreeClassifier)""" + if self.estimator is None: + return DecisionTreeClassifier() + return self.estimator + + def _set_oob_score(self, X, y): + n_samples = y.shape[0] + n_classes_ = self.n_classes_ + + predictions = np.zeros((n_samples, n_classes_)) + + for estimator, samples, features in zip( + self.estimators_, self.estimators_samples_, self.estimators_features_ + ): + # Create mask for OOB samples + mask = ~indices_to_mask(samples, n_samples) + + if hasattr(estimator, "predict_proba"): + predictions[mask, :] += estimator.predict_proba( + (X[mask, :])[:, features] + ) + + else: + p = estimator.predict((X[mask, :])[:, features]) + j = 0 + + for i in range(n_samples): + if mask[i]: + predictions[i, p[j]] += 1 + j += 1 + + if (predictions.sum(axis=1) == 0).any(): + warn( + "Some inputs do not have OOB scores. " + "This probably means too few estimators were used " + "to compute any reliable oob estimates." + ) + + oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis] + oob_score = accuracy_score(y, np.argmax(predictions, axis=1)) + + self.oob_decision_function_ = oob_decision_function + self.oob_score_ = oob_score + + def _validate_y(self, y): + y = column_or_1d(y, warn=True) + check_classification_targets(y) + self.classes_, y = np.unique(y, return_inverse=True) + self.n_classes_ = len(self.classes_) + + return y + + def predict(self, X, **params): + """Predict class for X. + + The predicted class of an input sample is computed as the class with + the highest mean predicted probability. If base estimators do not + implement a ``predict_proba`` method, then it resorts to voting. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. 
+ + **params : dict + Parameters routed to the `predict_proba` (if available) or the `predict` + method (otherwise) of the sub-estimators via the metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted classes. + """ + _raise_for_params(params, self, "predict") + + predicted_probabilitiy = self.predict_proba(X, **params) + return self.classes_.take((np.argmax(predicted_probabilitiy, axis=1)), axis=0) + + def predict_proba(self, X, **params): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the mean predicted class probabilities of the base estimators in the + ensemble. If base estimators do not implement a ``predict_proba`` + method, then it resorts to voting and the predicted class probabilities + of an input sample represents the proportion of estimators predicting + each class. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `predict_proba` (if available) or the `predict` + method (otherwise) of the sub-estimators via the metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + _raise_for_params(params, self, "predict_proba") + + check_is_fitted(self) + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "predict_proba", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(predict_proba=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_proba = Parallel( + n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args() + )( + delayed(_parallel_predict_proba)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + self.n_classes_, + predict_params=routed_params.estimator.get("predict", None), + predict_proba_params=routed_params.estimator.get("predict_proba", None), + ) + for i in range(n_jobs) + ) + + # Reduce + proba = sum(all_proba) / self.n_estimators + + return proba + + def predict_log_proba(self, X, **params): + """Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the log of the mean predicted class probabilities of the base + estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `predict_log_proba`, the `predict_proba` or the + `proba` method of the sub-estimators via the metadata routing API. 
The + routing is tried in the mentioned order depending on whether this method is + available on the sub-estimator. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + _raise_for_params(params, self, "predict_log_proba") + + check_is_fitted(self) + + if hasattr(self.estimator_, "predict_log_proba"): + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "predict_log_proba", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(predict_log_proba=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_log_proba)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + self.n_classes_, + params=routed_params.estimator.predict_log_proba, + ) + for i in range(n_jobs) + ) + + # Reduce + log_proba = all_log_proba[0] + + for j in range(1, len(all_log_proba)): + log_proba = np.logaddexp(log_proba, all_log_proba[j]) + + log_proba -= np.log(self.n_estimators) + + else: + log_proba = np.log(self.predict_proba(X, **params)) + + return log_proba + + @available_if( + _estimator_has("decision_function", delegates=("estimators_", "estimator")) + ) + def decision_function(self, X, **params): + """Average of the decision functions of the base classifiers. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `decision_function` method of the sub-estimators + via the metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + score : ndarray of shape (n_samples, k) + The decision function of the input samples. The columns correspond + to the classes in sorted order, as they appear in the attribute + ``classes_``. Regression and binary classification are special + cases with ``k == 1``, otherwise ``k==n_classes``. 
+ """ + _raise_for_params(params, self, "decision_function") + + check_is_fitted(self) + + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "decision_function", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(decision_function=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_parallel_decision_function)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + params=routed_params.estimator.decision_function, + ) + for i in range(n_jobs) + ) + + # Reduce + decisions = sum(all_decisions) / self.n_estimators + + return decisions + + +class BaggingRegressor(RegressorMixin, BaseBagging): + """A Bagging regressor. + + A Bagging regressor is an ensemble meta-estimator that fits base + regressors each on random subsets of the original dataset and then + aggregate their individual predictions (either by voting or by averaging) + to form a final prediction. Such a meta-estimator can typically be used as + a way to reduce the variance of a black-box estimator (e.g., a decision + tree), by introducing randomization into its construction procedure and + then making an ensemble out of it. + + This algorithm encompasses several works from the literature. When random + subsets of the dataset are drawn as random subsets of the samples, then + this algorithm is known as Pasting [1]_. If samples are drawn with + replacement, then the method is known as Bagging [2]_. When random subsets + of the dataset are drawn as random subsets of the features, then the method + is known as Random Subspaces [3]_. Finally, when base estimators are built + on subsets of both samples and features, then the method is known as + Random Patches [4]_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + estimator : object, default=None + The base estimator to fit on random subsets of the dataset. + If None, then the base estimator is a + :class:`~sklearn.tree.DecisionTreeRegressor`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=10 + The number of base estimators in the ensemble. + + max_samples : int or float, default=1.0 + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). + + - If int, then draw `max_features` features. + - If float, then draw `max(1, int(max_features * n_features_in_))` features. + + bootstrap : bool, default=True + Whether samples are drawn with replacement. If False, sampling + without replacement is performed. + + bootstrap_features : bool, default=False + Whether features are drawn with replacement. + + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate + the generalization error. Only available if bootstrap=True. 
+ + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit + a whole new ensemble. See :term:`the Glossary `. + + n_jobs : int, default=None + The number of jobs to run in parallel for both :meth:`fit` and + :meth:`predict`. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + estimators_ : list of estimators + The collection of fitted sub-estimators. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + estimators_features_ : list of arrays + The subset of drawn features for each base estimator. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_prediction_ : ndarray of shape (n_samples,) + Prediction computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_prediction_` might contain NaN. This attribute exists only + when ``oob_score`` is True. + + See Also + -------- + BaggingClassifier : A Bagging classifier. + + References + ---------- + + .. [1] L. Breiman, "Pasting small votes for classification in large + databases and on-line", Machine Learning, 36(1), 85-103, 1999. + + .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140, + 1996. + + .. [3] T. Ho, "The random subspace method for constructing decision + forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844, + 1998. + + .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine + Learning and Knowledge Discovery in Databases, 346-361, 2012. + + Examples + -------- + >>> from sklearn.svm import SVR + >>> from sklearn.ensemble import BaggingRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=100, n_features=4, + ... n_informative=2, n_targets=1, + ... random_state=0, shuffle=False) + >>> regr = BaggingRegressor(estimator=SVR(), + ... 
n_estimators=10, random_state=0).fit(X, y) + >>> regr.predict([[0, 0, 0, 0]]) + array([-2.8720]) + """ + + def __init__( + self, + estimator=None, + n_estimators=10, + *, + max_samples=1.0, + max_features=1.0, + bootstrap=True, + bootstrap_features=False, + oob_score=False, + warm_start=False, + n_jobs=None, + random_state=None, + verbose=0, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + bootstrap=bootstrap, + bootstrap_features=bootstrap_features, + oob_score=oob_score, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def predict(self, X, **params): + """Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only if + they are supported by the base estimator. + + **params : dict + Parameters routed to the `predict` method of the sub-estimators via the + metadata routing API. + + .. versionadded:: 1.7 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + _raise_for_params(params, self, "predict") + + check_is_fitted(self) + # Check data + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + dtype=None, + ensure_all_finite=False, + reset=False, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "predict", **params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(predict=Bunch()) + + # Parallel loop + n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs) + + all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_parallel_predict_regression)( + self.estimators_[starts[i] : starts[i + 1]], + self.estimators_features_[starts[i] : starts[i + 1]], + X, + params=routed_params.estimator.predict, + ) + for i in range(n_jobs) + ) + + # Reduce + y_hat = sum(all_y_hat) / self.n_estimators + + return y_hat + + def _set_oob_score(self, X, y): + n_samples = y.shape[0] + + predictions = np.zeros((n_samples,)) + n_predictions = np.zeros((n_samples,)) + + for estimator, samples, features in zip( + self.estimators_, self.estimators_samples_, self.estimators_features_ + ): + # Create mask for OOB samples + mask = ~indices_to_mask(samples, n_samples) + + predictions[mask] += estimator.predict((X[mask, :])[:, features]) + n_predictions[mask] += 1 + + if (n_predictions == 0).any(): + warn( + "Some inputs do not have OOB scores. " + "This probably means too few estimators were used " + "to compute any reliable oob estimates." 
+ ) + n_predictions[n_predictions == 0] = 1 + + predictions /= n_predictions + + self.oob_prediction_ = predictions + self.oob_score_ = r2_score(y, predictions) + + def _get_estimator(self): + """Resolve which estimator to return (default is DecisionTreeClassifier)""" + if self.estimator is None: + return DecisionTreeRegressor() + return self.estimator diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_base.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..e04645eec174f8d6adad6346ad2e0729577f0b5e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_base.py @@ -0,0 +1,307 @@ +"""Base class for ensemble-based estimators.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from joblib import effective_n_jobs + +from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor +from ..utils import Bunch, check_random_state +from ..utils._tags import get_tags +from ..utils._user_interface import _print_elapsed_time +from ..utils.metadata_routing import _routing_enabled +from ..utils.metaestimators import _BaseComposition + + +def _fit_single_estimator( + estimator, X, y, fit_params, message_clsname=None, message=None +): + """Private function used to fit an estimator within a job.""" + # TODO(SLEP6): remove if-condition for unrouted sample_weight when metadata + # routing can't be disabled. + if not _routing_enabled() and "sample_weight" in fit_params: + try: + with _print_elapsed_time(message_clsname, message): + estimator.fit(X, y, sample_weight=fit_params["sample_weight"]) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise TypeError( + "Underlying estimator {} does not support sample weights.".format( + estimator.__class__.__name__ + ) + ) from exc + raise + else: + with _print_elapsed_time(message_clsname, message): + estimator.fit(X, y, **fit_params) + return estimator + + +def _set_random_states(estimator, random_state=None): + """Set fixed random_state parameters for an estimator. + + Finds all parameters ending ``random_state`` and sets them to integers + derived from ``random_state``. + + Parameters + ---------- + estimator : estimator supporting get/set_params + Estimator with potential randomness managed by random_state + parameters. + + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the generation of the random + integers. Pass an int for reproducible output across multiple function + calls. + See :term:`Glossary `. + + Notes + ----- + This does not necessarily set *all* ``random_state`` attributes that + control an estimator's randomness, only those accessible through + ``estimator.get_params()``. ``random_state``s not controlled include + those belonging to: + + * cross-validation splitters + * ``scipy.stats`` rvs + """ + random_state = check_random_state(random_state) + to_set = {} + for key in sorted(estimator.get_params(deep=True)): + if key == "random_state" or key.endswith("__random_state"): + to_set[key] = random_state.randint(np.iinfo(np.int32).max) + + if to_set: + estimator.set_params(**to_set) + + +class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for all ensemble classes. + + Warning: This class should not be used directly. Use derived classes + instead. 
+ + Parameters + ---------- + estimator : object + The base estimator from which the ensemble is built. + + n_estimators : int, default=10 + The number of estimators in the ensemble. + + estimator_params : list of str, default=tuple() + The list of attributes to use as parameters when instantiating a + new base estimator. If none are given, default parameters are used. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + estimators_ : list of estimators + The collection of fitted base estimators. + """ + + @abstractmethod + def __init__( + self, + estimator=None, + *, + n_estimators=10, + estimator_params=tuple(), + ): + # Set parameters + self.estimator = estimator + self.n_estimators = n_estimators + self.estimator_params = estimator_params + + # Don't instantiate estimators now! Parameters of estimator might + # still change. Eg., when grid-searching with the nested object syntax. + # self.estimators_ needs to be filled by the derived classes in fit. + + def _validate_estimator(self, default=None): + """Check the base estimator. + + Sets the `estimator_` attributes. + """ + if self.estimator is not None: + self.estimator_ = self.estimator + else: + self.estimator_ = default + + def _make_estimator(self, append=True, random_state=None): + """Make and configure a copy of the `estimator_` attribute. + + Warning: This method should be used to properly instantiate new + sub-estimators. + """ + estimator = clone(self.estimator_) + estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params}) + + if random_state is not None: + _set_random_states(estimator, random_state) + + if append: + self.estimators_.append(estimator) + + return estimator + + def __len__(self): + """Return the number of estimators in the ensemble.""" + return len(self.estimators_) + + def __getitem__(self, index): + """Return the index'th estimator in the ensemble.""" + return self.estimators_[index] + + def __iter__(self): + """Return iterator over estimators in the ensemble.""" + return iter(self.estimators_) + + +def _partition_estimators(n_estimators, n_jobs): + """Private function used to partition estimators between jobs.""" + # Compute the number of jobs + n_jobs = min(effective_n_jobs(n_jobs), n_estimators) + + # Partition estimators between jobs + n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs, dtype=int) + n_estimators_per_job[: n_estimators % n_jobs] += 1 + starts = np.cumsum(n_estimators_per_job) + + return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist() + + +class _BaseHeterogeneousEnsemble( + MetaEstimatorMixin, _BaseComposition, metaclass=ABCMeta +): + """Base class for heterogeneous ensemble of learners. + + Parameters + ---------- + estimators : list of (str, estimator) tuples + The ensemble of estimators to use in the ensemble. Each element of the + list is defined as a tuple of string (i.e. name of the estimator) and + an estimator instance. An estimator can be set to `'drop'` using + `set_params`. + + Attributes + ---------- + estimators_ : list of estimators + The elements of the estimators parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it will not + appear in `estimators_`. + """ + + @property + def named_estimators(self): + """Dictionary to access any fitted sub-estimators by name. 
+ + Returns + ------- + :class:`~sklearn.utils.Bunch` + """ + return Bunch(**dict(self.estimators)) + + @abstractmethod + def __init__(self, estimators): + self.estimators = estimators + + def _validate_estimators(self): + if len(self.estimators) == 0 or not all( + isinstance(item, (tuple, list)) and isinstance(item[0], str) + for item in self.estimators + ): + raise ValueError( + "Invalid 'estimators' attribute, 'estimators' should be a " + "non-empty list of (string, estimator) tuples." + ) + names, estimators = zip(*self.estimators) + # defined by MetaEstimatorMixin + self._validate_names(names) + + has_estimator = any(est != "drop" for est in estimators) + if not has_estimator: + raise ValueError( + "All estimators are dropped. At least one is required " + "to be an estimator." + ) + + is_estimator_type = is_classifier if is_classifier(self) else is_regressor + + for est in estimators: + if est != "drop" and not is_estimator_type(est): + raise ValueError( + "The estimator {} should be a {}.".format( + est.__class__.__name__, is_estimator_type.__name__[3:] + ) + ) + + return names, estimators + + def set_params(self, **params): + """ + Set the parameters of an estimator from the ensemble. + + Valid parameter keys can be listed with `get_params()`. Note that you + can directly set the parameters of the estimators contained in + `estimators`. + + Parameters + ---------- + **params : keyword arguments + Specific parameters using e.g. + `set_params(parameter_name=new_value)`. In addition, to setting the + parameters of the estimator, the individual estimator of the + estimators can also be set, or can be removed by setting them to + 'drop'. + + Returns + ------- + self : object + Estimator instance. + """ + super()._set_params("estimators", **params) + return self + + def get_params(self, deep=True): + """ + Get the parameters of an estimator from the ensemble. + + Returns the parameters given in the constructor as well as the + estimators contained within the `estimators` parameter. + + Parameters + ---------- + deep : bool, default=True + Setting it to True gets the various estimators and the parameters + of the estimators as well. + + Returns + ------- + params : dict + Parameter and estimator names mapped to their values or parameter + names mapped to their values. + """ + return super()._get_params("estimators", deep=deep) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + try: + tags.input_tags.allow_nan = all( + get_tags(est[1]).input_tags.allow_nan if est[1] != "drop" else True + for est in self.estimators + ) + tags.input_tags.sparse = all( + get_tags(est[1]).input_tags.sparse if est[1] != "drop" else True + for est in self.estimators + ) + except Exception: + # If `estimators` does not comply with our API (list of tuples) then it will + # fail. In this case, we assume that `allow_nan` and `sparse` are False but + # the parameter validation will raise an error during `fit`. + pass # pragma: no cover + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_forest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_forest.py new file mode 100644 index 0000000000000000000000000000000000000000..5b27e789b1d137126336f62955bd50895d7fa4f7 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_forest.py @@ -0,0 +1,3045 @@ +""" +Forest of trees-based ensemble methods. + +Those methods include random forests and extremely randomized trees. 
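# Hedged usage example of the heterogeneous-ensemble conventions documented above,
# shown through the public VotingClassifier (a _BaseHeterogeneousEnsemble
# subclass): members are addressed by name and can be removed with 'drop'.
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
vote = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000)),
        ("tree", DecisionTreeClassifier(random_state=0)),
    ]
)
vote.set_params(tree="drop")   # drop a member by name via set_params
vote.fit(X, y)
print(len(vote.estimators_))   # 1: the dropped member is not fitted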
+ +The module structure is the following: + +- The ``BaseForest`` base class implements a common ``fit`` method for all + the estimators in the module. The ``fit`` method of the base ``Forest`` + class calls the ``fit`` method of each sub-estimator on random samples + (with replacement, a.k.a. bootstrap) of the training set. + + The init of the sub-estimator is further delegated to the + ``BaseEnsemble`` constructor. + +- The ``ForestClassifier`` and ``ForestRegressor`` base classes further + implement the prediction logic by computing an average of the predicted + outcomes of the sub-estimators. + +- The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived + classes provide the user with concrete implementations of + the forest ensemble method using classical, deterministic + ``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as + sub-estimator implementations. + +- The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived + classes provide the user with concrete implementations of the + forest ensemble method using the extremely randomized trees + ``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as + sub-estimator implementations. + +Single and multi-output problems are both handled. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import threading +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real +from warnings import catch_warnings, simplefilter, warn + +import numpy as np +from scipy.sparse import hstack as sparse_hstack +from scipy.sparse import issparse + +from ..base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + is_classifier, +) +from ..exceptions import DataConversionWarning +from ..metrics import accuracy_score, r2_score +from ..preprocessing import OneHotEncoder +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) +from ..tree._tree import DOUBLE, DTYPE +from ..utils import check_random_state, compute_sample_weight +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils._tags import get_tags +from ..utils.multiclass import check_classification_targets, type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import BaseEnsemble, _partition_estimators + +__all__ = [ + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "RandomForestClassifier", + "RandomForestRegressor", + "RandomTreesEmbedding", +] + +MAX_INT = np.iinfo(np.int32).max + + +def _get_n_samples_bootstrap(n_samples, max_samples): + """ + Get the number of samples in a bootstrap sample. + + Parameters + ---------- + n_samples : int + Number of samples in the dataset. + max_samples : int or float + The maximum number of samples to draw from the total available: + - if float, this indicates a fraction of the total and should be + the interval `(0.0, 1.0]`; + - if int, this indicates the exact number of samples; + - if None, this indicates the total number of samples. + + Returns + ------- + n_samples_bootstrap : int + The total number of samples to draw for the bootstrap sample. 
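# Sketch (illustrative name, not the private helper itself) of how `max_samples`
# is resolved into a bootstrap size, following the rules documented above:
# None -> all rows, int -> exact count, float -> fraction of the training set.
def resolve_n_samples_bootstrap(n_samples, max_samples):
    if max_samples is None:
        return n_samples
    if isinstance(max_samples, int):
        return max_samples                           # must not exceed n_samples
    return max(round(n_samples * max_samples), 1)    # float in (0.0, 1.0]

print(resolve_n_samples_bootstrap(1000, None))  # 1000
print(resolve_n_samples_bootstrap(1000, 256))   # 256
print(resolve_n_samples_bootstrap(1000, 0.1))   # 100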
+ """ + if max_samples is None: + return n_samples + + if isinstance(max_samples, Integral): + if max_samples > n_samples: + msg = "`max_samples` must be <= n_samples={} but got value {}" + raise ValueError(msg.format(n_samples, max_samples)) + return max_samples + + if isinstance(max_samples, Real): + return max(round(n_samples * max_samples), 1) + + +def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): + """ + Private function used to _parallel_build_trees function.""" + + random_instance = check_random_state(random_state) + sample_indices = random_instance.randint( + 0, n_samples, n_samples_bootstrap, dtype=np.int32 + ) + + return sample_indices + + +def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): + """ + Private function used to forest._set_oob_score function.""" + sample_indices = _generate_sample_indices( + random_state, n_samples, n_samples_bootstrap + ) + sample_counts = np.bincount(sample_indices, minlength=n_samples) + unsampled_mask = sample_counts == 0 + indices_range = np.arange(n_samples) + unsampled_indices = indices_range[unsampled_mask] + + return unsampled_indices + + +def _parallel_build_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + missing_values_in_feature_mask=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("building tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + if sample_weight is None: + curr_sample_weight = np.ones((n_samples,), dtype=np.float64) + else: + curr_sample_weight = sample_weight.copy() + + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + sample_counts = np.bincount(indices, minlength=n_samples) + curr_sample_weight *= sample_counts + + if class_weight == "subsample": + with catch_warnings(): + simplefilter("ignore", DeprecationWarning) + curr_sample_weight *= compute_sample_weight("auto", y, indices=indices) + elif class_weight == "balanced_subsample": + curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) + + tree._fit( + X, + y, + sample_weight=curr_sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + else: + tree._fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + + return tree + + +class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): + """ + Base class for forests of trees. + + Warning: This class should not be used directly. Use derived classes + instead. 
+ """ + + _parameter_constraints: dict = { + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "bootstrap": ["boolean"], + "oob_score": ["boolean", callable], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "max_samples": [ + None, + Interval(RealNotInt, 0.0, 1.0, closed="right"), + Interval(Integral, 1, None, closed="left"), + ], + } + + @abstractmethod + def __init__( + self, + estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + ) + + self.bootstrap = bootstrap + self.oob_score = oob_score + self.n_jobs = n_jobs + self.random_state = random_state + self.verbose = verbose + self.warm_start = warm_start + self.class_weight = class_weight + self.max_samples = max_samples + + def apply(self, X): + """ + Apply trees in the forest to X, return leaf indices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + X_leaves : ndarray of shape (n_samples, n_estimators) + For each datapoint x in X and for each tree in the forest, + return the index of the leaf x ends up in. + """ + X = self._validate_X_predict(X) + results = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )(delayed(tree.apply)(X, check_input=False) for tree in self.estimators_) + + return np.array(results).T + + def decision_path(self, X): + """ + Return the decision path in the forest. + + .. versionadded:: 0.18 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + indicator : sparse matrix of shape (n_samples, n_nodes) + Return a node indicator matrix where non zero elements indicates + that the samples goes through the nodes. The matrix is of CSR + format. + + n_nodes_ptr : ndarray of shape (n_estimators + 1,) + The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]] + gives the indicator value for the i-th estimator. + """ + X = self._validate_X_predict(X) + indicators = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(tree.decision_path)(X, check_input=False) + for tree in self.estimators_ + ) + + n_nodes = [0] + n_nodes.extend([i.shape[1] for i in indicators]) + n_nodes_ptr = np.array(n_nodes).cumsum() + + return sparse_hstack(indicators).tocsr(), n_nodes_ptr + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """ + Build a forest of trees from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, its dtype will be converted + to ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csc_matrix``. 
+ + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + Returns + ------- + self : object + Fitted estimator. + """ + # Validate or convert input data + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + ensure_all_finite=False, + ) + # _compute_missing_values_in_feature_mask checks if X has missing values and + # will raise an error if the underlying tree base estimator can't handle missing + # values. Only the criterion is required to determine if the tree supports + # missing values. + estimator = type(self.estimator)(criterion=self.criterion) + missing_values_in_feature_mask = ( + estimator._compute_missing_values_in_feature_mask( + X, estimator_name=self.__class__.__name__ + ) + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self._n_samples, self.n_outputs_ = y.shape + + y, expanded_class_weight = self._validate_y_class_weight(y) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." 
+ ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._n_samples_bootstrap = n_samples_bootstrap + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if not self.warm_start or not hasattr(self, "estimators_"): + # Free allocated memory, if any + self.estimators_ = [] + + n_more_estimators = self.n_estimators - len(self.estimators_) + + if n_more_estimators < 0: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) + + elif n_more_estimators == 0: + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." + ) + else: + if self.warm_start and len(self.estimators_) > 0: + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + + if self.oob_score and ( + n_more_estimators > 0 or not hasattr(self, "oob_score_") + ): + y_type = type_of_target(y) + if y_type == "unknown" or ( + is_classifier(self) and y_type == "multiclass-multioutput" + ): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + + return self + + @abstractmethod + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. 
Default depends on whether + this is a regression (R2 score) or classification problem + (accuracy score). + """ + + def _compute_oob_predictions(self, X, y): + """Compute and set the OOB score. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + + Returns + ------- + oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \ + (n_samples, 1, n_outputs) + The OOB predictions. + """ + # Prediction requires X to be in CSR format + if issparse(X): + X = X.tocsr() + + n_samples = y.shape[0] + n_outputs = self.n_outputs_ + if is_classifier(self) and hasattr(self, "n_classes_"): + # n_classes_ is a ndarray at this stage + # all the supported type of target will have the same number of + # classes in all outputs + oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs) + else: + # for regression, n_classes_ does not exist and we create an empty + # axis to be consistent with the classification case and make + # the array operations compatible with the 2 settings + oob_pred_shape = (n_samples, 1, n_outputs) + + oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64) + n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64) + + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples, + self.max_samples, + ) + for estimator in self.estimators_: + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, + n_samples, + n_samples_bootstrap, + ) + + y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :]) + oob_pred[unsampled_indices, ...] += y_pred + n_oob_pred[unsampled_indices, :] += 1 + + for k in range(n_outputs): + if (n_oob_pred == 0).any(): + warn( + ( + "Some inputs do not have OOB scores. This probably means " + "too few trees were used to compute any reliable OOB " + "estimates." + ), + UserWarning, + ) + n_oob_pred[n_oob_pred == 0] = 1 + oob_pred[..., k] /= n_oob_pred[..., [k]] + + return oob_pred + + def _validate_y_class_weight(self, y): + # Default implementation + return y, None + + def _validate_X_predict(self, X): + """ + Validate X whenever one tries to predict, apply, predict_proba.""" + check_is_fitted(self) + if self.estimators_[0]._support_missing_values(X): + ensure_all_finite = "allow-nan" + else: + ensure_all_finite = True + + X = validate_data( + self, + X, + dtype=DTYPE, + accept_sparse="csr", + reset=False, + ensure_all_finite=ensure_all_finite, + ) + if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): + raise ValueError("No support for np.int64 index based sparse matrices") + return X + + @property + def feature_importances_(self): + """ + The impurity-based feature importances. + + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + The values of this array sum to 1, unless all trees are single node + trees consisting of only the root node, in which case it will be an + array of zeros. 
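# Quick illustration of the OOB machinery described above: with bootstrap=True and
# oob_score=True the forest scores itself on the rows each tree did not see.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=8, random_state=0)
forest = RandomForestClassifier(
    n_estimators=100, bootstrap=True, oob_score=True, random_state=0
).fit(X, y)
print(forest.oob_score_)                     # accuracy on out-of-bag predictions
print(forest.oob_decision_function_.shape)   # (300, 2)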
+ """ + check_is_fitted(self) + + all_importances = Parallel(n_jobs=self.n_jobs, prefer="threads")( + delayed(getattr)(tree, "feature_importances_") + for tree in self.estimators_ + if tree.tree_.node_count > 1 + ) + + if not all_importances: + return np.zeros(self.n_features_in_, dtype=np.float64) + + all_importances = np.mean(all_importances, axis=0, dtype=np.float64) + return all_importances / np.sum(all_importances) + + def _get_estimators_indices(self): + # Get drawn indices along both sample and feature axes + for tree in self.estimators_: + if not self.bootstrap: + yield np.arange(self._n_samples, dtype=np.int32) + else: + # tree.random_state is actually an immutable integer seed rather + # than a mutable RandomState instance, so it's safe to use it + # repeatedly when calling this property. + seed = tree.random_state + # Operations accessing random_state must be performed identically + # to those in `_parallel_build_trees()` + yield _generate_sample_indices( + seed, self._n_samples, self._n_samples_bootstrap + ) + + @property + def estimators_samples_(self): + """The subset of drawn samples for each base estimator. + + Returns a dynamically generated list of indices identifying + the samples used for fitting each member of the ensemble, i.e., + the in-bag samples. + + Note: the list is re-created at each call to the property in order + to reduce the object memory footprint by not storing the sampling + data. Thus fetching the property may be slower than expected. + """ + return [sample_indices for sample_indices in self._get_estimators_indices()] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Only the criterion is required to determine if the tree supports + # missing values + estimator = type(self.estimator)(criterion=self.criterion) + tags.input_tags.allow_nan = get_tags(estimator).input_tags.allow_nan + return tags + + +def _accumulate_prediction(predict, X, out, lock): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + prediction = predict(X, check_input=False) + with lock: + if len(out) == 1: + out[0] += prediction + else: + for i in range(len(out)): + out[i] += prediction[i] + + +class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): + """ + Base class for forest of trees-based classifiers. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + @abstractmethod + def __init__( + self, + estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeClassifier object + A single decision tree classifier. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, n_classes, n_outputs) + The OOB associated predictions. 
+ """ + y_pred = tree.predict_proba(X, check_input=False) + y_pred = np.asarray(y_pred) + if y_pred.ndim == 2: + # binary and multiclass + y_pred = y_pred[..., np.newaxis] + else: + # Roll the first `n_outputs` axis to the last axis. We will reshape + # from a shape of (n_outputs, n_samples, n_classes) to a shape of + # (n_samples, n_classes, n_outputs). + y_pred = np.rollaxis(y_pred, axis=0, start=3) + return y_pred + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. Defaults to `accuracy_score`. + """ + self.oob_decision_function_ = super()._compute_oob_predictions(X, y) + if self.oob_decision_function_.shape[-1] == 1: + # drop the n_outputs axis if there is a single output + self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1) + + if scoring_function is None: + scoring_function = accuracy_score + + self.oob_score_ = scoring_function( + y, np.argmax(self.oob_decision_function_, axis=1) + ) + + def _validate_y_class_weight(self, y): + check_classification_targets(y) + + y = np.copy(y) + expanded_class_weight = None + + if self.class_weight is not None: + y_original = np.copy(y) + + self.classes_ = [] + self.n_classes_ = [] + + y_store_unique_indices = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices + + if self.class_weight is not None: + valid_presets = ("balanced", "balanced_subsample") + if isinstance(self.class_weight, str): + if self.class_weight not in valid_presets: + raise ValueError( + "Valid presets for class_weight include " + '"balanced" and "balanced_subsample".' + 'Given "%s".' % self.class_weight + ) + if self.warm_start: + warn( + 'class_weight presets "balanced" or ' + '"balanced_subsample" are ' + "not recommended for warm_start if the fitted data " + "differs from the full dataset. In order to use " + '"balanced" weights, use compute_class_weight ' + '("balanced", classes, y). In place of y you can use ' + "a large enough sample of the full training set " + "target to properly estimate the class frequency " + "distributions. Pass the resulting weights as the " + "class_weight parameter." + ) + + if self.class_weight != "balanced_subsample" or not self.bootstrap: + if self.class_weight == "balanced_subsample": + class_weight = "balanced" + else: + class_weight = self.class_weight + expanded_class_weight = compute_sample_weight(class_weight, y_original) + + return y, expanded_class_weight + + def predict(self, X): + """ + Predict class for X. + + The predicted class of an input sample is a vote by the trees in + the forest, weighted by their probability estimates. That is, + the predicted class is the one with highest mean probability + estimate across the trees. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The predicted classes. 
+ """ + proba = self.predict_proba(X) + + if self.n_outputs_ == 1: + return self.classes_.take(np.argmax(proba, axis=1), axis=0) + + else: + n_samples = proba[0].shape[0] + # all dtypes should be the same, so just take the first + class_type = self.classes_[0].dtype + predictions = np.empty((n_samples, self.n_outputs_), dtype=class_type) + + for k in range(self.n_outputs_): + predictions[:, k] = self.classes_[k].take( + np.argmax(proba[k], axis=1), axis=0 + ) + + return predictions + + def predict_proba(self, X): + """ + Predict class probabilities for X. + + The predicted class probabilities of an input sample are computed as + the mean predicted class probabilities of the trees in the forest. + The class probability of a single tree is the fraction of samples of + the same class in a leaf. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes), or a list of such arrays + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + all_proba = [ + np.zeros((X.shape[0], j), dtype=np.float64) + for j in np.atleast_1d(self.n_classes_) + ] + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( + delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock) + for e in self.estimators_ + ) + + for proba in all_proba: + proba /= len(self.estimators_) + + if len(all_proba) == 1: + return all_proba[0] + else: + return all_proba + + def predict_log_proba(self, X): + """ + Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the log of the mean predicted class probabilities of the trees in the + forest. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes), or a list of such arrays + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + proba = self.predict_proba(X) + + if self.n_outputs_ == 1: + return np.log(proba) + + else: + for k in range(self.n_outputs_): + proba[k] = np.log(proba[k]) + + return proba + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + tags.input_tags.sparse = True + return tags + + +class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): + """ + Base class for forest of trees-based regressors. + + Warning: This class should not be used directly. Use derived classes + instead. 
+ """ + + @abstractmethod + def __init__( + self, + estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + max_samples=None, + ): + super().__init__( + estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + def predict(self, X): + """ + Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the trees in the forest. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) + else: + y_hat = np.zeros((X.shape[0]), dtype=np.float64) + + # Parallel loop + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")( + delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) + for e in self.estimators_ + ) + + y_hat /= len(self.estimators_) + + return y_hat + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeRegressor object + A single decision tree regressor. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, 1, n_outputs) + The OOB associated predictions. + """ + y_pred = tree.predict(X, check_input=False) + if y_pred.ndim == 1: + # single output regression + y_pred = y_pred[:, np.newaxis, np.newaxis] + else: + # multioutput regression + y_pred = y_pred[:, np.newaxis, :] + return y_pred + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + """Compute and set the OOB score and attributes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + y : ndarray of shape (n_samples, n_outputs) + The target matrix. + scoring_function : callable, default=None + Scoring function for OOB score. Defaults to `r2_score`. + """ + self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1) + if self.oob_prediction_.shape[-1] == 1: + # drop the n_outputs axis if there is a single output + self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) + + if scoring_function is None: + scoring_function = r2_score + + self.oob_score_ = scoring_function(y, self.oob_prediction_) + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray of shape (n_samples, n_target_features), dtype=DTYPE + The grid points on which the partial dependence should be + evaluated. 
+ target_features : ndarray of shape (n_target_features), dtype=np.intp + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray of shape (n_samples,) + The value of the partial dependence function on each grid point. + """ + grid = np.asarray(grid, dtype=DTYPE, order="C") + target_features = np.asarray(target_features, dtype=np.intp, order="C") + averaged_predictions = np.zeros( + shape=grid.shape[0], dtype=np.float64, order="C" + ) + + for tree in self.estimators_: + # Note: we don't sum in parallel because the GIL isn't released in + # the fast method. + tree.tree_.compute_partial_dependence( + grid, target_features, averaged_predictions + ) + # Average over the forest + averaged_predictions /= len(self.estimators_) + + return averaged_predictions + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class RandomForestClassifier(ForestClassifier): + """ + A random forest classifier. + + A random forest is a meta estimator that fits a number of decision tree + classifiers on various sub-samples of the dataset and uses averaging to + improve the predictive accuracy and control over-fitting. + Trees in the forest use the best split strategy, i.e. equivalent to passing + `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeClassifier`. + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. + + For a comparison between tree-based ensemble models see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`. + + This estimator has native support for missing values (NaNs). During training, + the tree grower learns at each split point whether samples with missing values + should go to the left or right child, based on the potential gain. When predicting, + samples with missing values are assigned to the left or right child consequently. + If no missing values were encountered for a given feature during training, then + samples with missing values are mapped to whichever child has the most samples. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"gini", "entropy", "log_loss"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "log_loss" and "entropy" both for the + Shannon information gain, see :ref:`tree_mathematical_formulation`. + Note: This parameter is tree-specific. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. 
+ A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default="sqrt" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to `"sqrt"`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=True + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.accuracy_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. 
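# Worked instance of the weighted impurity-decrease formula quoted above, for a
# node holding 100 of N=400 samples that splits into children of 60 and 40 samples
# (the numbers here are made up for illustration).
N, N_t, N_t_L, N_t_R = 400, 100, 60, 40
impurity, left_impurity, right_impurity = 0.48, 0.30, 0.20

decrease = N_t / N * (
    impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity
)
print(round(decrease, 4))  # 0.055 -> the split is kept if min_impurity_decrease <= 0.055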
+ + random_state : int, RandomState instance or None, default=None + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + The "balanced_subsample" mode is the same as "balanced" except that + weights are computed based on the bootstrap sample for every tree + grown. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. 
versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeClassifier + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) or a list of such arrays + The classes labels (single output problem), or a list of arrays of + class labels (multi-output problem). + + n_classes_ : int or list + The number of classes (single output problem), or a list containing the + number of classes for each output (multi-output problem). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ + (n_samples, n_classes, n_outputs) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + sklearn.tree.DecisionTreeClassifier : A decision tree classifier. + sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized + tree classifiers. + sklearn.ensemble.HistGradientBoostingClassifier : A Histogram-based Gradient + Boosting Classification Tree, very fast for big datasets (n_samples >= + 10_000). + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data, + ``max_features=n_features`` and ``bootstrap=False``, if the improvement + of the criterion is identical for several splits enumerated during the + search of the best split. To obtain a deterministic behaviour during + fitting, ``random_state`` has to be fixed. + + References + ---------- + .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. 
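# Hedged illustration of the `monotonic_cst` option documented above: constrain
# the predicted probability of the positive class to be non-decreasing in the
# first feature and leave the others unconstrained.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=3, n_informative=3,
                           n_redundant=0, random_state=0)
clf = RandomForestClassifier(
    n_estimators=50, monotonic_cst=[1, 0, 0], random_state=0
).fit(X, y)
print(clf.predict_proba(X[:3])[:, 1])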
+ + Examples + -------- + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=1000, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = RandomForestClassifier(max_depth=2, random_state=0) + >>> clf.fit(X, y) + RandomForestClassifier(...) + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + """ + + _parameter_constraints: dict = { + **ForestClassifier._parameter_constraints, + **DecisionTreeClassifier._parameter_constraints, + "class_weight": [ + StrOptions({"balanced_subsample", "balanced"}), + dict, + list, + None, + ], + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=DecisionTreeClassifier(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.monotonic_cst = monotonic_cst + self.ccp_alpha = ccp_alpha + + +class RandomForestRegressor(ForestRegressor): + """ + A random forest regressor. + + A random forest is a meta estimator that fits a number of decision tree + regressors on various sub-samples of the dataset and uses averaging to + improve the predictive accuracy and control over-fitting. + Trees in the forest use the best split strategy, i.e. equivalent to passing + `splitter="best"` to the underlying :class:`~sklearn.tree.DecisionTreeRegressor`. + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. + + This estimator has native support for missing values (NaNs). During training, + the tree grower learns at each split point whether samples with missing values + should go to the left or right child, based on the potential gain. When predicting, + samples with missing values are assigned to the left or right child consequently. + If no missing values were encountered for a given feature during training, then + samples with missing values are mapped to whichever child has the most samples. + + For a comparison between tree-based ensemble models see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. 
versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \ + default="squared_error" + The function to measure the quality of a split. Supported criteria + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, "friedman_mse", which uses + mean squared error with Friedman's improvement score for potential + splits, "absolute_error" for the mean absolute error, which minimizes + the L1 loss using the median of each terminal node, and "poisson" which + uses reduction in Poisson deviance to find splits. + Training using "absolute_error" is significantly slower + than when using "squared_error". + + .. versionadded:: 0.18 + Mean Absolute Error (MAE) criterion. + + .. versionadded:: 1.0 + Poisson criterion. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default=1.0 + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None or 1.0, then `max_features=n_features`. + + .. note:: + The default of 1.0 is equivalent to bagged trees and more + randomness can be achieved by setting smaller values, e.g. 0.3. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to 1.0. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. 
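# Hedged sketch of the criterion choices listed above: "poisson" is only valid for
# non-negative targets whose sum is strictly positive, as enforced in fit().
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = rng.poisson(lam=3.0, size=200).astype(float)  # count targets, all >= 0

reg = RandomForestRegressor(n_estimators=30, criterion="poisson", random_state=0)
reg.fit(X, y)
print(reg.predict(X[:3]))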
+ + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=True + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.r2_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. 
when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeRegressor + The collection of fitted sub-estimators. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Prediction computed with out-of-bag estimate on the training set. + This attribute exists only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized + tree regressors. + sklearn.ensemble.HistGradientBoostingRegressor : A Histogram-based Gradient + Boosting Regression Tree, very fast for big datasets (n_samples >= + 10_000). + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data, + ``max_features=n_features`` and ``bootstrap=False``, if the improvement + of the criterion is identical for several splits enumerated during the + search of the best split. To obtain a deterministic behaviour during + fitting, ``random_state`` has to be fixed. + + The default value ``max_features=1.0`` uses ``n_features`` + rather than ``n_features / 3``. The latter was originally suggested in + [1], whereas the former was more recently justified empirically in [2]. + + References + ---------- + .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. + + .. [2] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized + trees", Machine Learning, 63(1), 3-42, 2006. 
+ + Examples + -------- + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, n_informative=2, + ... random_state=0, shuffle=False) + >>> regr = RandomForestRegressor(max_depth=2, random_state=0) + >>> regr.fit(X, y) + RandomForestRegressor(...) + >>> print(regr.predict([[0, 0, 0, 0]])) + [-8.32987858] + """ + + _parameter_constraints: dict = { + **ForestRegressor._parameter_constraints, + **DecisionTreeRegressor._parameter_constraints, + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=DecisionTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst + + +class ExtraTreesClassifier(ForestClassifier): + """ + An extra-trees classifier. + + This class implements a meta estimator that fits a number of + randomized decision trees (a.k.a. extra-trees) on various sub-samples + of the dataset and uses averaging to improve the predictive accuracy + and control over-fitting. + + This estimator has native support for missing values (NaNs) for + random splits. During training, a random threshold will be chosen + to split the non-missing values on. Then the non-missing values will be sent + to the left and right child based on the randomly selected threshold, while + the missing values will also be randomly sent to the left or right child. + This is repeated for every feature considered at each split. The best split + among these is chosen. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"gini", "entropy", "log_loss"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "log_loss" and "entropy" both for the + Shannon information gain, see :ref:`tree_mathematical_formulation`. + Note: This parameter is tree-specific. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. 
+ + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default="sqrt" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to `"sqrt"`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=False + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.accuracy_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. 
+ + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + The "balanced_subsample" mode is the same as "balanced" except that + weights are computed based on the bootstrap sample for every tree + grown. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multiclass classifications (i.e. when `n_classes > 2`), + - multioutput classifications (i.e. 
when `n_outputs_ > 1`), + - classifications trained on data with missing values. + + The constraints hold over the probability of the positive class. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeClassifier + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) or a list of such arrays + The classes labels (single output problem), or a list of arrays of + class labels (multi-output problem). + + n_classes_ : int or list + The number of classes (single output problem), or a list containing the + number of classes for each output (multi-output problem). + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ + (n_samples, n_classes, n_outputs) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + ExtraTreesRegressor : An extra-trees regressor with random splits. + RandomForestClassifier : A random forest classifier with optimal splits. + RandomForestRegressor : Ensemble regressor using trees with optimal splits. + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized + trees", Machine Learning, 63(1), 3-42, 2006. 
+ + Examples + -------- + >>> from sklearn.ensemble import ExtraTreesClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_features=4, random_state=0) + >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0) + >>> clf.fit(X, y) + ExtraTreesClassifier(random_state=0) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + """ + + _parameter_constraints: dict = { + **ForestClassifier._parameter_constraints, + **DecisionTreeClassifier._parameter_constraints, + "class_weight": [ + StrOptions({"balanced_subsample", "balanced"}), + dict, + list, + None, + ], + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=ExtraTreeClassifier(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst + + +class ExtraTreesRegressor(ForestRegressor): + """ + An extra-trees regressor. + + This class implements a meta estimator that fits a number of + randomized decision trees (a.k.a. extra-trees) on various sub-samples + of the dataset and uses averaging to improve the predictive accuracy + and control over-fitting. + + This estimator has native support for missing values (NaNs) for + random splits. During training, a random threshold will be chosen + to split the non-missing values on. Then the non-missing values will be sent + to the left and right child based on the randomly selected threshold, while + the missing values will also be randomly sent to the left or right child. + This is repeated for every feature considered at each split. The best split + among these is chosen. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \ + default="squared_error" + The function to measure the quality of a split. 
Supported criteria + are "squared_error" for the mean squared error, which is equal to + variance reduction as feature selection criterion and minimizes the L2 + loss using the mean of each terminal node, "friedman_mse", which uses + mean squared error with Friedman's improvement score for potential + splits, "absolute_error" for the mean absolute error, which minimizes + the L1 loss using the median of each terminal node, and "poisson" which + uses reduction in Poisson deviance to find splits. + Training using "absolute_error" is significantly slower + than when using "squared_error". + + .. versionadded:: 0.18 + Mean Absolute Error (MAE) criterion. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default=1.0 + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `max(1, int(max_features * n_features_in_))` features are considered at each + split. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None or 1.0, then `max_features=n_features`. + + .. note:: + The default of 1.0 is equivalent to bagged trees and more + randomness can be achieved by setting smaller values, e.g. 0.3. + + .. versionchanged:: 1.1 + The default of `max_features` changed from `"auto"` to 1.0. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. 
+ + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + bootstrap : bool, default=False + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. + + oob_score : bool or callable, default=False + Whether to use out-of-bag samples to estimate the generalization score. + By default, :func:`~sklearn.metrics.r2_score` is used. + Provide a callable with signature `metric(y_true, y_pred)` to use a + custom metric. Only available if `bootstrap=True`. + + For an illustration of out-of-bag (OOB) error estimation, see the example + :ref:`sphx_glr_auto_examples_ensemble_plot_ensemble_oob.py`. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls 3 sources of randomness: + + - the bootstrapping of the samples used when building trees + (if ``bootstrap=True``) + - the sampling of the features to consider when looking for the best + split at each node (if ``max_features < n_features``) + - the draw of the splits for each of the `max_features` + + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. See + :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + .. versionadded:: 0.22 + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonically increasing + - 0: no constraint + - -1: monotonically decreasing + + If monotonic_cst is None, no constraints are applied. + + Monotonicity constraints are not supported for: + - multioutput regressions (i.e. when `n_outputs_ > 1`), + - regressions trained on data with missing values. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.4 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of DecisionTreeRegressor + The collection of fitted sub-estimators. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Prediction computed with out-of-bag estimate on the training set. + This attribute exists only when ``oob_score`` is True. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + ExtraTreesClassifier : An extra-trees classifier with random splits. + RandomForestClassifier : A random forest classifier with optimal splits. + RandomForestRegressor : Ensemble regressor using trees with optimal splits. + + Notes + ----- + The default values for the parameters controlling the size of the trees + (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and + unpruned trees which can potentially be very large on some data sets. To + reduce memory consumption, the complexity and size of the trees should be + controlled by setting those parameter values. + + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", + Machine Learning, 63(1), 3-42, 2006. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.ensemble import ExtraTreesRegressor + >>> X, y = load_diabetes(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit( + ... X_train, y_train) + >>> reg.score(X_test, y_test) + 0.2727... 
+ """ + + _parameter_constraints: dict = { + **ForestRegressor._parameter_constraints, + **DecisionTreeRegressor._parameter_constraints, + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + monotonic_cst=None, + ): + super().__init__( + estimator=ExtraTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + "monotonic_cst", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.monotonic_cst = monotonic_cst + + +class RandomTreesEmbedding(TransformerMixin, BaseForest): + """ + An ensemble of totally random trees. + + An unsupervised transformation of a dataset to a high-dimensional + sparse representation. A datapoint is coded according to which leaf of + each tree it is sorted into. Using a one-hot encoding of the leaves, + this leads to a binary coding with as many ones as there are trees in + the forest. + + The dimensionality of the resulting representation is + ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``, + the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``. + + For an example of applying Random Trees Embedding to non-linear + classification, see + :ref:`sphx_glr_auto_examples_ensemble_plot_random_forest_embedding.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_estimators : int, default=100 + Number of trees in the forest. + + .. versionchanged:: 0.22 + The default value of ``n_estimators`` changed from 10 to 100 + in 0.22. + + max_depth : int, default=5 + The maximum depth of each tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` is the minimum + number of samples for each split. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. 
+ - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` is the minimum + number of samples for each node. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + sparse_output : bool, default=True + Whether or not to return a sparse CSR matrix, as default behavior, + or to return a dense array compatible with dense pipeline operators. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the generation of the random `y` used to fit the trees + and the draw of the splits for each feature at the trees' nodes. + See :term:`Glossary ` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`Glossary ` and + :ref:`tree_ensemble_warm_start` for details. + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance + The child estimator template used to create the collection of fitted + sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of :class:`~sklearn.tree.ExtraTreeRegressor` instances + The collection of fitted sub-estimators. + + feature_importances_ : ndarray of shape (n_features,) + The feature importances (the higher, the more important the feature). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + one_hot_encoder_ : OneHotEncoder instance + One-hot encoder used to create the sparse embedding. + + estimators_samples_ : list of arrays + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. 
Each subset is defined by an array of the indices selected. + + .. versionadded:: 1.4 + + See Also + -------- + ExtraTreesClassifier : An extra-trees classifier. + ExtraTreesRegressor : An extra-trees regressor. + RandomForestClassifier : A random forest classifier. + RandomForestRegressor : A random forest regressor. + sklearn.tree.ExtraTreeClassifier: An extremely randomized + tree classifier. + sklearn.tree.ExtraTreeRegressor : An extremely randomized + tree regressor. + + References + ---------- + .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees", + Machine Learning, 63(1), 3-42, 2006. + .. [2] Moosmann, F. and Triggs, B. and Jurie, F. "Fast discriminative + visual codebooks using randomized clustering forests" + NIPS 2007 + + Examples + -------- + >>> from sklearn.ensemble import RandomTreesEmbedding + >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]] + >>> random_trees = RandomTreesEmbedding( + ... n_estimators=5, random_state=0, max_depth=1).fit(X) + >>> X_sparse_embedding = random_trees.transform(X) + >>> X_sparse_embedding.toarray() + array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], + [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.], + [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.], + [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.], + [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]]) + """ + + _parameter_constraints: dict = { + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "warm_start": ["boolean"], + **BaseDecisionTree._parameter_constraints, + "sparse_output": ["boolean"], + } + for param in ("max_features", "ccp_alpha", "splitter", "monotonic_cst"): + _parameter_constraints.pop(param) + + criterion = "squared_error" + max_features = 1 + + def __init__( + self, + n_estimators=100, + *, + max_depth=5, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + sparse_output=True, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ): + super().__init__( + estimator=ExtraTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + ), + bootstrap=False, + oob_score=False, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + max_samples=None, + ) + + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.sparse_output = sparse_output + + def _set_oob_score_and_attributes(self, X, y, scoring_function=None): + raise NotImplementedError("OOB score not supported by tree embedding") + + def fit(self, X, y=None, sample_weight=None): + """ + Fit estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csc_matrix`` for maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. 
Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + Returns + ------- + self : object + Returns the instance itself. + """ + # Parameters are validated in fit_transform + self.fit_transform(X, y, sample_weight=sample_weight) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, sample_weight=None): + """ + Fit estimator and transform dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data used to build forests. Use ``dtype=np.float32`` for + maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + Returns + ------- + X_transformed : sparse matrix of shape (n_samples, n_out) + Transformed dataset. + """ + rnd = check_random_state(self.random_state) + y = rnd.uniform(size=_num_samples(X)) + super().fit(X, y, sample_weight=sample_weight) + + self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) + output = self.one_hot_encoder_.fit_transform(self.apply(X)) + self._n_features_out = output.shape[1] + return output + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Only used to validate feature names with the names seen in :meth:`fit`. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names, in the format of + `randomtreesembedding_{tree}_{leaf}`, where `tree` is the tree used + to generate the leaf and `leaf` is the index of a leaf node + in that tree. Note that the node indexing scheme is used to + index both nodes with children (split nodes) and leaf nodes. + Only the latter can be present as output features. + As a consequence, there are missing indices in the output + feature names. + """ + check_is_fitted(self, "_n_features_out") + _check_feature_names_in( + self, input_features=input_features, generate_names=False + ) + + feature_names = [ + f"randomtreesembedding_{tree}_{leaf}" + for tree in range(self.n_estimators) + for leaf in self.one_hot_encoder_.categories_[tree] + ] + return np.asarray(feature_names, dtype=object) + + def transform(self, X): + """ + Transform dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data to be transformed. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csr_matrix`` for maximum efficiency. + + Returns + ------- + X_transformed : sparse matrix of shape (n_samples, n_out) + Transformed dataset. 
+        """
+        check_is_fitted(self)
+        return self.one_hot_encoder_.transform(self.apply(X))
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.sparse = True
+        return tags
diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gb.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gb.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c8e79e062dfd41be23162d9bdb90afc71b4381
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gb.py
@@ -0,0 +1,2196 @@
+"""Gradient Boosted Regression Trees.
+
+This module contains methods for fitting gradient boosted regression trees for
+both classification and regression.
+
+The module structure is the following:
+
+- The ``BaseGradientBoosting`` base class implements a common ``fit`` method
+  for all the estimators in the module. Regression and classification
+  only differ in the concrete ``LossFunction`` used.
+
+- ``GradientBoostingClassifier`` implements gradient boosting for
+  classification problems.
+
+- ``GradientBoostingRegressor`` implements gradient boosting for
+  regression problems.
+"""
+
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import math
+import warnings
+from abc import ABCMeta, abstractmethod
+from numbers import Integral, Real
+from time import time
+
+import numpy as np
+from scipy.sparse import csc_matrix, csr_matrix, issparse
+
+from .._loss.loss import (
+    _LOSSES,
+    AbsoluteError,
+    ExponentialLoss,
+    HalfBinomialLoss,
+    HalfMultinomialLoss,
+    HalfSquaredError,
+    HuberLoss,
+    PinballLoss,
+)
+from ..base import ClassifierMixin, RegressorMixin, _fit_context, is_classifier
+from ..dummy import DummyClassifier, DummyRegressor
+from ..exceptions import NotFittedError
+from ..model_selection import train_test_split
+from ..preprocessing import LabelEncoder
+from ..tree import DecisionTreeRegressor
+from ..tree._tree import DOUBLE, DTYPE, TREE_LEAF
+from ..utils import check_array, check_random_state, column_or_1d
+from ..utils._param_validation import HasMethods, Interval, StrOptions
+from ..utils.multiclass import check_classification_targets
+from ..utils.stats import _weighted_percentile
+from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data
+from ._base import BaseEnsemble
+from ._gradient_boosting import _random_sample_mask, predict_stage, predict_stages
+
+_LOSSES = _LOSSES.copy()
+_LOSSES.update(
+    {
+        "quantile": PinballLoss,
+        "huber": HuberLoss,
+    }
+)
+
+
+def _safe_divide(numerator, denominator):
+    """Prevents overflow and division by zero."""
+    # This is used for classifiers where the denominator might become zero exactly.
+    # For instance for log loss, HalfBinomialLoss, if proba=0 or proba=1 exactly, then
+    # denominator = hessian = 0, and we should set the node value in the line search to
+    # zero as there is no improvement of the loss possible.
+    # For numerical safety, we do this already for extremely tiny values.
+    if abs(denominator) < 1e-150:
+        return 0.0
+    else:
+        # Cast to Python float to trigger a ZeroDivisionError without relying
+        # on `np.errstate` that is not supported by Pyodide.
+ result = float(numerator) / float(denominator) + if math.isinf(result): + warnings.warn("overflow encountered in _safe_divide", RuntimeWarning) + return result + + +def _init_raw_predictions(X, estimator, loss, use_predict_proba): + """Return the initial raw predictions. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data array. + estimator : object + The estimator to use to compute the predictions. + loss : BaseLoss + An instance of a loss function class. + use_predict_proba : bool + Whether estimator.predict_proba is used instead of estimator.predict. + + Returns + ------- + raw_predictions : ndarray of shape (n_samples, K) + The initial raw predictions. K is equal to 1 for binary + classification and regression, and equal to the number of classes + for multiclass classification. ``raw_predictions`` is casted + into float64. + """ + # TODO: Use loss.fit_intercept_only where appropriate instead of + # DummyRegressor which is the default given by the `init` parameter, + # see also _init_state. + if use_predict_proba: + # Our parameter validation, set via _fit_context and _parameter_constraints + # already guarantees that estimator has a predict_proba method. + predictions = estimator.predict_proba(X) + if not loss.is_multiclass: + predictions = predictions[:, 1] # probability of positive class + eps = np.finfo(np.float32).eps # FIXME: This is quite large! + predictions = np.clip(predictions, eps, 1 - eps, dtype=np.float64) + else: + predictions = estimator.predict(X).astype(np.float64) + + if predictions.ndim == 1: + return loss.link.link(predictions).reshape(-1, 1) + else: + return loss.link.link(predictions) + + +def _update_terminal_regions( + loss, + tree, + X, + y, + neg_gradient, + raw_prediction, + sample_weight, + sample_mask, + learning_rate=0.1, + k=0, +): + """Update the leaf values to be predicted by the tree and raw_prediction. + + The current raw predictions of the model (of this stage) are updated. + + Additionally, the terminal regions (=leaves) of the given tree are updated as well. + This corresponds to the line search step in "Greedy Function Approximation" by + Friedman, Algorithm 1 step 5. + + Update equals: + argmin_{x} loss(y_true, raw_prediction_old + x * tree.value) + + For non-trivial cases like the Binomial loss, the update has no closed formula and + is an approximation, again, see the Friedman paper. + + Also note that the update formula for the SquaredError is the identity. Therefore, + in this case, the leaf values don't need an update and only the raw_predictions are + updated (with the learning rate included). + + Parameters + ---------- + loss : BaseLoss + tree : tree.Tree + The tree object. + X : ndarray of shape (n_samples, n_features) + The data array. + y : ndarray of shape (n_samples,) + The target labels. + neg_gradient : ndarray of shape (n_samples,) + The negative gradient. + raw_prediction : ndarray of shape (n_samples, n_trees_per_iteration) + The raw predictions (i.e. values from the tree leaves) of the + tree ensemble at iteration ``i - 1``. + sample_weight : ndarray of shape (n_samples,) + The weight of each sample. + sample_mask : ndarray of shape (n_samples,) + The sample mask to be used. + learning_rate : float, default=0.1 + Learning rate shrinks the contribution of each tree by + ``learning_rate``. + k : int, default=0 + The index of the estimator being updated. + """ + # compute leaf for each sample in ``X``. 
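+    # `tree.apply(X)` returns, for every sample, the id of the leaf node it falls
+    # into; these ids are used below to index `tree.value` when the leaf values are
+    # re-estimated by the line search, and to group the in-bag samples per leaf.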
+ terminal_regions = tree.apply(X) + + if not isinstance(loss, HalfSquaredError): + # mask all which are not in sample mask. + masked_terminal_regions = terminal_regions.copy() + masked_terminal_regions[~sample_mask] = -1 + + if isinstance(loss, HalfBinomialLoss): + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + # Make a single Newton-Raphson step, see "Additive Logistic Regression: + # A Statistical View of Boosting" FHT00 and note that we use a slightly + # different version (factor 2) of "F" with proba=expit(raw_prediction). + # Our node estimate is given by: + # sum(w * (y - prob)) / sum(w * prob * (1 - prob)) + # we take advantage that: y - prob = neg_gradient + neg_g = neg_gradient.take(indices, axis=0) + prob = y_ - neg_g + # numerator = negative gradient = y - prob + numerator = np.average(neg_g, weights=sw) + # denominator = hessian = prob * (1 - prob) + denominator = np.average(prob * (1 - prob), weights=sw) + return _safe_divide(numerator, denominator) + + elif isinstance(loss, HalfMultinomialLoss): + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + # we take advantage that: y - prob = neg_gradient + neg_g = neg_gradient.take(indices, axis=0) + prob = y_ - neg_g + K = loss.n_classes + # numerator = negative gradient * (k - 1) / k + # Note: The factor (k - 1)/k appears in the original papers "Greedy + # Function Approximation" by Friedman and "Additive Logistic + # Regression" by Friedman, Hastie, Tibshirani. This factor is, however, + # wrong or at least arbitrary as it directly multiplies the + # learning_rate. We keep it for backward compatibility. + numerator = np.average(neg_g, weights=sw) + numerator *= (K - 1) / K + # denominator = (diagonal) hessian = prob * (1 - prob) + denominator = np.average(prob * (1 - prob), weights=sw) + return _safe_divide(numerator, denominator) + + elif isinstance(loss, ExponentialLoss): + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + neg_g = neg_gradient.take(indices, axis=0) + # numerator = negative gradient = y * exp(-raw) - (1-y) * exp(raw) + numerator = np.average(neg_g, weights=sw) + # denominator = hessian = y * exp(-raw) + (1-y) * exp(raw) + # if y=0: hessian = exp(raw) = -neg_g + # y=1: hessian = exp(-raw) = neg_g + hessian = neg_g.copy() + hessian[y_ == 0] *= -1 + denominator = np.average(hessian, weights=sw) + return _safe_divide(numerator, denominator) + + else: + + def compute_update(y_, indices, neg_gradient, raw_prediction, k): + return loss.fit_intercept_only( + y_true=y_ - raw_prediction[indices, k], + sample_weight=sw, + ) + + # update each leaf (= perform line search) + for leaf in np.nonzero(tree.children_left == TREE_LEAF)[0]: + indices = np.nonzero(masked_terminal_regions == leaf)[ + 0 + ] # of terminal regions + y_ = y.take(indices, axis=0) + sw = None if sample_weight is None else sample_weight[indices] + update = compute_update(y_, indices, neg_gradient, raw_prediction, k) + + # TODO: Multiply here by learning rate instead of everywhere else. + tree.value[leaf, 0, 0] = update + + # update predictions (both in-bag and out-of-bag) + raw_prediction[:, k] += learning_rate * tree.value[:, 0, 0].take( + terminal_regions, axis=0 + ) + + +def set_huber_delta(loss, y_true, raw_prediction, sample_weight=None): + """Calculate and set self.closs.delta based on self.quantile.""" + abserr = np.abs(y_true - raw_prediction.squeeze()) + # sample_weight is always a ndarray, never None. 
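+    # `delta` is the threshold at which the Huber loss switches from its quadratic
+    # to its linear branch; here it is estimated from the current absolute residuals
+    # as their (100 * loss.quantile)-th weighted percentile, so the largest residuals
+    # fall into the outlier-robust linear regime.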
+ delta = _weighted_percentile(abserr, sample_weight, 100 * loss.quantile) + loss.closs.delta = float(delta) + + +class VerboseReporter: + """Reports verbose output to stdout. + + Parameters + ---------- + verbose : int + Verbosity level. If ``verbose==1`` output is printed once in a while + (when iteration mod verbose_mod is zero).; if larger than 1 then output + is printed for each update. + """ + + def __init__(self, verbose): + self.verbose = verbose + + def init(self, est, begin_at_stage=0): + """Initialize reporter + + Parameters + ---------- + est : Estimator + The estimator + + begin_at_stage : int, default=0 + stage at which to begin reporting + """ + # header fields and line format str + header_fields = ["Iter", "Train Loss"] + verbose_fmt = ["{iter:>10d}", "{train_score:>16.4f}"] + # do oob? + if est.subsample < 1: + header_fields.append("OOB Improve") + verbose_fmt.append("{oob_impr:>16.4f}") + header_fields.append("Remaining Time") + verbose_fmt.append("{remaining_time:>16s}") + + # print the header line + print(("%10s " + "%16s " * (len(header_fields) - 1)) % tuple(header_fields)) + + self.verbose_fmt = " ".join(verbose_fmt) + # plot verbose info each time i % verbose_mod == 0 + self.verbose_mod = 1 + self.start_time = time() + self.begin_at_stage = begin_at_stage + + def update(self, j, est): + """Update reporter with new iteration. + + Parameters + ---------- + j : int + The new iteration. + est : Estimator + The estimator. + """ + do_oob = est.subsample < 1 + # we need to take into account if we fit additional estimators. + i = j - self.begin_at_stage # iteration relative to the start iter + if (i + 1) % self.verbose_mod == 0: + oob_impr = est.oob_improvement_[j] if do_oob else 0 + remaining_time = ( + (est.n_estimators - (j + 1)) * (time() - self.start_time) / float(i + 1) + ) + if remaining_time > 60: + remaining_time = "{0:.2f}m".format(remaining_time / 60.0) + else: + remaining_time = "{0:.2f}s".format(remaining_time) + print( + self.verbose_fmt.format( + iter=j + 1, + train_score=est.train_score_[j], + oob_impr=oob_impr, + remaining_time=remaining_time, + ) + ) + if self.verbose == 1 and ((i + 1) // (self.verbose_mod * 10) > 0): + # adjust verbose frequency (powers of 10) + self.verbose_mod *= 10 + + +class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): + """Abstract base class for Gradient Boosting.""" + + _parameter_constraints: dict = { + **DecisionTreeRegressor._parameter_constraints, + "learning_rate": [Interval(Real, 0.0, None, closed="left")], + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "criterion": [StrOptions({"friedman_mse", "squared_error"})], + "subsample": [Interval(Real, 0.0, 1.0, closed="right")], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "validation_fraction": [Interval(Real, 0.0, 1.0, closed="neither")], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0.0, None, closed="left")], + } + _parameter_constraints.pop("splitter") + _parameter_constraints.pop("monotonic_cst") + + @abstractmethod + def __init__( + self, + *, + loss, + learning_rate, + n_estimators, + criterion, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_depth, + min_impurity_decrease, + init, + subsample, + max_features, + ccp_alpha, + random_state, + alpha=0.9, + verbose=0, + max_leaf_nodes=None, + warm_start=False, + validation_fraction=0.1, + n_iter_no_change=None, + tol=1e-4, + ): + self.n_estimators = n_estimators + self.learning_rate = learning_rate + self.loss 
= loss + self.criterion = criterion + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.subsample = subsample + self.max_features = max_features + self.max_depth = max_depth + self.min_impurity_decrease = min_impurity_decrease + self.ccp_alpha = ccp_alpha + self.init = init + self.random_state = random_state + self.alpha = alpha + self.verbose = verbose + self.max_leaf_nodes = max_leaf_nodes + self.warm_start = warm_start + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.tol = tol + + @abstractmethod + def _encode_y(self, y=None, sample_weight=None): + """Called by fit to validate and encode y.""" + + @abstractmethod + def _get_loss(self, sample_weight): + """Get loss object from sklearn._loss.loss.""" + + def _fit_stage( + self, + i, + X, + y, + raw_predictions, + sample_weight, + sample_mask, + random_state, + X_csc=None, + X_csr=None, + ): + """Fit another stage of ``n_trees_per_iteration_`` trees.""" + original_y = y + + if isinstance(self._loss, HuberLoss): + set_huber_delta( + loss=self._loss, + y_true=y, + raw_prediction=raw_predictions, + sample_weight=sample_weight, + ) + # TODO: Without oob, i.e. with self.subsample = 1.0, we could call + # self._loss.loss_gradient and use it to set train_score_. + # But note that train_score_[i] is the score AFTER fitting the i-th tree. + # Note: We need the negative gradient! + neg_gradient = -self._loss.gradient( + y_true=y, + raw_prediction=raw_predictions, + sample_weight=None, # We pass sample_weights to the tree directly. + ) + # 2-d views of shape (n_samples, n_trees_per_iteration_) or (n_samples, 1) + # on neg_gradient to simplify the loop over n_trees_per_iteration_. + if neg_gradient.ndim == 1: + neg_g_view = neg_gradient.reshape((-1, 1)) + else: + neg_g_view = neg_gradient + + for k in range(self.n_trees_per_iteration_): + if self._loss.is_multiclass: + y = np.array(original_y == k, dtype=np.float64) + + # induce regression tree on the negative gradient + tree = DecisionTreeRegressor( + criterion=self.criterion, + splitter="best", + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + min_impurity_decrease=self.min_impurity_decrease, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + random_state=random_state, + ccp_alpha=self.ccp_alpha, + ) + + if self.subsample < 1.0: + # no inplace multiplication! 
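+ # (an in-place update would zero the out-of-bag weights in the caller's
+ # array, which is reused across boosting stages; the product below makes
+ # a fresh per-stage copy with out-of-bag samples weighted zero)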
+ sample_weight = sample_weight * sample_mask.astype(np.float64) + + X = X_csc if X_csc is not None else X + tree.fit( + X, neg_g_view[:, k], sample_weight=sample_weight, check_input=False + ) + + # update tree leaves + X_for_tree_update = X_csr if X_csr is not None else X + _update_terminal_regions( + self._loss, + tree.tree_, + X_for_tree_update, + y, + neg_g_view[:, k], + raw_predictions, + sample_weight, + sample_mask, + learning_rate=self.learning_rate, + k=k, + ) + + # add tree to ensemble + self.estimators_[i, k] = tree + + return raw_predictions + + def _set_max_features(self): + """Set self.max_features_.""" + if isinstance(self.max_features, str): + if self.max_features == "auto": + if is_classifier(self): + max_features = max(1, int(np.sqrt(self.n_features_in_))) + else: + max_features = self.n_features_in_ + elif self.max_features == "sqrt": + max_features = max(1, int(np.sqrt(self.n_features_in_))) + else: # self.max_features == "log2" + max_features = max(1, int(np.log2(self.n_features_in_))) + elif self.max_features is None: + max_features = self.n_features_in_ + elif isinstance(self.max_features, Integral): + max_features = self.max_features + else: # float + max_features = max(1, int(self.max_features * self.n_features_in_)) + + self.max_features_ = max_features + + def _init_state(self): + """Initialize model state and allocate model state data structures.""" + + self.init_ = self.init + if self.init_ is None: + if is_classifier(self): + self.init_ = DummyClassifier(strategy="prior") + elif isinstance(self._loss, (AbsoluteError, HuberLoss)): + self.init_ = DummyRegressor(strategy="quantile", quantile=0.5) + elif isinstance(self._loss, PinballLoss): + self.init_ = DummyRegressor(strategy="quantile", quantile=self.alpha) + else: + self.init_ = DummyRegressor(strategy="mean") + + self.estimators_ = np.empty( + (self.n_estimators, self.n_trees_per_iteration_), dtype=object + ) + self.train_score_ = np.zeros((self.n_estimators,), dtype=np.float64) + # do oob? 
+ if self.subsample < 1.0: + self.oob_improvement_ = np.zeros((self.n_estimators), dtype=np.float64) + self.oob_scores_ = np.zeros((self.n_estimators), dtype=np.float64) + self.oob_score_ = np.nan + + def _clear_state(self): + """Clear the state of the gradient boosting model.""" + if hasattr(self, "estimators_"): + self.estimators_ = np.empty((0, 0), dtype=object) + if hasattr(self, "train_score_"): + del self.train_score_ + if hasattr(self, "oob_improvement_"): + del self.oob_improvement_ + if hasattr(self, "oob_scores_"): + del self.oob_scores_ + if hasattr(self, "oob_score_"): + del self.oob_score_ + if hasattr(self, "init_"): + del self.init_ + if hasattr(self, "_rng"): + del self._rng + + def _resize_state(self): + """Add additional ``n_estimators`` entries to all attributes.""" + # self.n_estimators is the number of additional est to fit + total_n_estimators = self.n_estimators + if total_n_estimators < self.estimators_.shape[0]: + raise ValueError( + "resize with smaller n_estimators %d < %d" + % (total_n_estimators, self.estimators_[0]) + ) + + self.estimators_ = np.resize( + self.estimators_, (total_n_estimators, self.n_trees_per_iteration_) + ) + self.train_score_ = np.resize(self.train_score_, total_n_estimators) + if self.subsample < 1 or hasattr(self, "oob_improvement_"): + # if do oob resize arrays or create new if not available + if hasattr(self, "oob_improvement_"): + self.oob_improvement_ = np.resize( + self.oob_improvement_, total_n_estimators + ) + self.oob_scores_ = np.resize(self.oob_scores_, total_n_estimators) + self.oob_score_ = np.nan + else: + self.oob_improvement_ = np.zeros( + (total_n_estimators,), dtype=np.float64 + ) + self.oob_scores_ = np.zeros((total_n_estimators,), dtype=np.float64) + self.oob_score_ = np.nan + + def _is_fitted(self): + return len(getattr(self, "estimators_", [])) > 0 + + def _check_initialized(self): + """Check that the estimator is initialized, raising an error if not.""" + check_is_fitted(self) + + @_fit_context( + # GradientBoosting*.init is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None, monitor=None): + """Fit the gradient boosting model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + y : array-like of shape (n_samples,) + Target values (strings or integers in classification, real numbers + in regression) + For classification, labels must correspond to classes. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. In the case of + classification, splits are also ignored if they would result in any + single class carrying a negative weight in either child node. + + monitor : callable, default=None + The monitor is called after each iteration with the current + iteration, a reference to the estimator and the local variables of + ``_fit_stages`` as keyword arguments ``callable(i, self, + locals())``. If the callable returns ``True`` the fitting procedure + is stopped. The monitor can be used for various things such as + computing held-out estimates, early stopping, model introspect, and + snapshotting. + + Returns + ------- + self : object + Fitted estimator. 
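+ 
+ A minimal ``monitor`` sketch (illustrative only; the callback name is
+ arbitrary and ``oob_improvement_`` is only populated when ``subsample < 1``)
+ that stops fitting once the out-of-bag improvement turns negative::
+ 
+ def stop_on_oob_decline(i, est, locals_dict):
+     return i > 0 and est.oob_improvement_[i] < 0
+ 
+ clf = GradientBoostingClassifier(subsample=0.5)
+ clf.fit(X, y, monitor=stop_on_oob_decline)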
+ """ + if not self.warm_start: + self._clear_state() + + # Check input + # Since check_array converts both X and y to the same dtype, but the + # trees use different types for X and y, checking them separately. + + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc", "coo"], + dtype=DTYPE, + multi_output=True, + ) + sample_weight_is_none = sample_weight is None + sample_weight = _check_sample_weight(sample_weight, X) + if sample_weight_is_none: + y = self._encode_y(y=y, sample_weight=None) + else: + y = self._encode_y(y=y, sample_weight=sample_weight) + y = column_or_1d(y, warn=True) # TODO: Is this still required? + + self._set_max_features() + + # self.loss is guaranteed to be a string + self._loss = self._get_loss(sample_weight=sample_weight) + + if self.n_iter_no_change is not None: + stratify = y if is_classifier(self) else None + ( + X_train, + X_val, + y_train, + y_val, + sample_weight_train, + sample_weight_val, + ) = train_test_split( + X, + y, + sample_weight, + random_state=self.random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) + if is_classifier(self): + if self.n_classes_ != np.unique(y_train).shape[0]: + # We choose to error here. The problem is that the init + # estimator would be trained on y, which has some missing + # classes now, so its predictions would not have the + # correct shape. + raise ValueError( + "The training data after the early stopping split " + "is missing some classes. Try using another random " + "seed." + ) + else: + X_train, y_train, sample_weight_train = X, y, sample_weight + X_val = y_val = sample_weight_val = None + + n_samples = X_train.shape[0] + + # First time calling fit. + if not self._is_fitted(): + # init state + self._init_state() + + # fit initial model and initialize raw predictions + if self.init_ == "zero": + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=np.float64, + ) + else: + # XXX clean this once we have a support_sample_weight tag + if sample_weight_is_none: + self.init_.fit(X_train, y_train) + else: + msg = ( + "The initial estimator {} does not support sample " + "weights.".format(self.init_.__class__.__name__) + ) + try: + self.init_.fit( + X_train, y_train, sample_weight=sample_weight_train + ) + except TypeError as e: + if "unexpected keyword argument 'sample_weight'" in str(e): + # regular estimator without SW support + raise ValueError(msg) from e + else: # regular estimator whose input checking failed + raise + except ValueError as e: + if ( + "pass parameters to specific steps of " + "your pipeline using the " + "stepname__parameter" in str(e) + ): # pipeline + raise ValueError(msg) from e + else: # regular estimator whose input checking failed + raise + + raw_predictions = _init_raw_predictions( + X_train, self.init_, self._loss, is_classifier(self) + ) + + begin_at_stage = 0 + + # The rng state must be preserved if warm_start is True + self._rng = check_random_state(self.random_state) + + # warm start: this is not the first time fit was called + else: + # add more estimators to fitted model + # invariant: warm_start = True + if self.n_estimators < self.estimators_.shape[0]: + raise ValueError( + "n_estimators=%d must be larger or equal to " + "estimators_.shape[0]=%d when " + "warm_start==True" % (self.n_estimators, self.estimators_.shape[0]) + ) + begin_at_stage = self.estimators_.shape[0] + # The requirements of _raw_predict + # are more constrained than fit. It accepts only CSR + # matrices. 
Finite values have already been checked in _validate_data. + X_train = check_array( + X_train, + dtype=DTYPE, + order="C", + accept_sparse="csr", + ensure_all_finite=False, + ) + raw_predictions = self._raw_predict(X_train) + self._resize_state() + + # fit the boosting stages + n_stages = self._fit_stages( + X_train, + y_train, + raw_predictions, + sample_weight_train, + self._rng, + X_val, + y_val, + sample_weight_val, + begin_at_stage, + monitor, + ) + + # change shape of arrays after fit (early-stopping or additional ests) + if n_stages != self.estimators_.shape[0]: + self.estimators_ = self.estimators_[:n_stages] + self.train_score_ = self.train_score_[:n_stages] + if hasattr(self, "oob_improvement_"): + # OOB scores were computed + self.oob_improvement_ = self.oob_improvement_[:n_stages] + self.oob_scores_ = self.oob_scores_[:n_stages] + self.oob_score_ = self.oob_scores_[-1] + self.n_estimators_ = n_stages + return self + + def _fit_stages( + self, + X, + y, + raw_predictions, + sample_weight, + random_state, + X_val, + y_val, + sample_weight_val, + begin_at_stage=0, + monitor=None, + ): + """Iteratively fits the stages. + + For each stage it computes the progress (OOB, train score) + and delegates to ``_fit_stage``. + Returns the number of stages fit; might differ from ``n_estimators`` + due to early stopping. + """ + n_samples = X.shape[0] + do_oob = self.subsample < 1.0 + sample_mask = np.ones((n_samples,), dtype=bool) + n_inbag = max(1, int(self.subsample * n_samples)) + + if self.verbose: + verbose_reporter = VerboseReporter(verbose=self.verbose) + verbose_reporter.init(self, begin_at_stage) + + X_csc = csc_matrix(X) if issparse(X) else None + X_csr = csr_matrix(X) if issparse(X) else None + + if self.n_iter_no_change is not None: + loss_history = np.full(self.n_iter_no_change, np.inf) + # We create a generator to get the predictions for X_val after + # the addition of each successive stage + y_val_pred_iter = self._staged_raw_predict(X_val, check_input=False) + + # Older versions of GBT had its own loss functions. With the new common + # private loss function submodule _loss, we often are a factor of 2 + # away from the old version. Here we keep backward compatibility for + # oob_scores_ and oob_improvement_, even if the old way is quite + # inconsistent (sometimes the gradient is half the gradient, sometimes + # not). 
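+ # (i.e. factor == 2 for the "half" losses below, so that train_score_ and
+ # the oob_* attributes keep the historic deviance / squared error scale)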
+ if isinstance( + self._loss, + ( + HalfSquaredError, + HalfBinomialLoss, + ), + ): + factor = 2 + else: + factor = 1 + + # perform boosting iterations + i = begin_at_stage + for i in range(begin_at_stage, self.n_estimators): + # subsampling + if do_oob: + sample_mask = _random_sample_mask(n_samples, n_inbag, random_state) + y_oob_masked = y[~sample_mask] + sample_weight_oob_masked = sample_weight[~sample_mask] + if i == 0: # store the initial loss to compute the OOB score + initial_loss = factor * self._loss( + y_true=y_oob_masked, + raw_prediction=raw_predictions[~sample_mask], + sample_weight=sample_weight_oob_masked, + ) + + # fit next stage of trees + raw_predictions = self._fit_stage( + i, + X, + y, + raw_predictions, + sample_weight, + sample_mask, + random_state, + X_csc=X_csc, + X_csr=X_csr, + ) + + # track loss + if do_oob: + self.train_score_[i] = factor * self._loss( + y_true=y[sample_mask], + raw_prediction=raw_predictions[sample_mask], + sample_weight=sample_weight[sample_mask], + ) + self.oob_scores_[i] = factor * self._loss( + y_true=y_oob_masked, + raw_prediction=raw_predictions[~sample_mask], + sample_weight=sample_weight_oob_masked, + ) + previous_loss = initial_loss if i == 0 else self.oob_scores_[i - 1] + self.oob_improvement_[i] = previous_loss - self.oob_scores_[i] + self.oob_score_ = self.oob_scores_[-1] + else: + # no need to fancy index w/ no subsampling + self.train_score_[i] = factor * self._loss( + y_true=y, + raw_prediction=raw_predictions, + sample_weight=sample_weight, + ) + + if self.verbose > 0: + verbose_reporter.update(i, self) + + if monitor is not None: + early_stopping = monitor(i, self, locals()) + if early_stopping: + break + + # We also provide an early stopping based on the score from + # validation set (X_val, y_val), if n_iter_no_change is set + if self.n_iter_no_change is not None: + # By calling next(y_val_pred_iter), we get the predictions + # for X_val after the addition of the current stage + validation_loss = factor * self._loss( + y_val, next(y_val_pred_iter), sample_weight_val + ) + + # Require validation_score to be better (less) than at least + # one of the last n_iter_no_change evaluations + if np.any(validation_loss + self.tol < loss_history): + loss_history[i % len(loss_history)] = validation_loss + else: + break + + return i + 1 + + def _make_estimator(self, append=True): + # we don't need _make_estimator + raise NotImplementedError() + + def _raw_predict_init(self, X): + """Check input and compute raw predictions of the init estimator.""" + self._check_initialized() + X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) + if self.init_ == "zero": + raw_predictions = np.zeros( + shape=(X.shape[0], self.n_trees_per_iteration_), dtype=np.float64 + ) + else: + raw_predictions = _init_raw_predictions( + X, self.init_, self._loss, is_classifier(self) + ) + return raw_predictions + + def _raw_predict(self, X): + """Return the sum of the trees raw predictions (+ init estimator).""" + check_is_fitted(self) + raw_predictions = self._raw_predict_init(X) + predict_stages(self.estimators_, X, self.learning_rate, raw_predictions) + return raw_predictions + + def _staged_raw_predict(self, X, check_input=True): + """Compute raw predictions of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. 
Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + If False, the input arrays X will not be checked. + + Returns + ------- + raw_predictions : generator of ndarray of shape (n_samples, k) + The raw predictions of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + Regression and binary classification are special cases with + ``k == 1``, otherwise ``k==n_classes``. + """ + if check_input: + X = validate_data( + self, X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False + ) + raw_predictions = self._raw_predict_init(X) + for i in range(self.estimators_.shape[0]): + predict_stage(self.estimators_, i, X, self.learning_rate, raw_predictions) + yield raw_predictions.copy() + + @property + def feature_importances_(self): + """The impurity-based feature importances. + + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + The values of this array sum to 1, unless all trees are single node + trees consisting of only the root node, in which case it will be an + array of zeros. + """ + self._check_initialized() + + relevant_trees = [ + tree + for stage in self.estimators_ + for tree in stage + if tree.tree_.node_count > 1 + ] + if not relevant_trees: + # degenerate case where all trees have only one node + return np.zeros(shape=self.n_features_in_, dtype=np.float64) + + relevant_feature_importances = [ + tree.tree_.compute_feature_importances(normalize=False) + for tree in relevant_trees + ] + avg_feature_importances = np.mean( + relevant_feature_importances, axis=0, dtype=np.float64 + ) + return avg_feature_importances / np.sum(avg_feature_importances) + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray of shape (n_samples, n_target_features), dtype=np.float32 + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray of shape (n_target_features,), dtype=np.intp + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray of shape \ + (n_trees_per_iteration_, n_samples) + The value of the partial dependence function on each grid point. + """ + if self.init is not None: + warnings.warn( + "Using recursion method with a non-constant init predictor " + "will lead to incorrect partial dependence values. " + "Got init=%s." 
% self.init, + UserWarning, + ) + grid = np.asarray(grid, dtype=DTYPE, order="C") + n_estimators, n_trees_per_stage = self.estimators_.shape + averaged_predictions = np.zeros( + (n_trees_per_stage, grid.shape[0]), dtype=np.float64, order="C" + ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") + + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + tree = self.estimators_[stage, k].tree_ + tree.compute_partial_dependence( + grid, target_features, averaged_predictions[k] + ) + averaged_predictions *= self.learning_rate + + return averaged_predictions + + def apply(self, X): + """Apply trees in the ensemble to X, return leaf indices. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will + be converted to a sparse ``csr_matrix``. + + Returns + ------- + X_leaves : array-like of shape (n_samples, n_estimators, n_classes) + For each datapoint x in X and for each tree in the ensemble, + return the index of the leaf x ends up in each estimator. + In the case of binary classification n_classes is 1. + """ + + self._check_initialized() + X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True) + + # n_classes will be equal to 1 in the binary classification or the + # regression case. + n_estimators, n_classes = self.estimators_.shape + leaves = np.zeros((X.shape[0], n_estimators, n_classes)) + + for i in range(n_estimators): + for j in range(n_classes): + estimator = self.estimators_[i, j] + leaves[:, i, j] = estimator.apply(X, check_input=False) + + return leaves + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): + """Gradient Boosting for classification. + + This algorithm builds an additive model in a forward stage-wise fashion; it + allows for the optimization of arbitrary differentiable loss functions. In + each stage ``n_classes_`` regression trees are fit on the negative gradient + of the loss function, e.g. binary or multiclass log loss. Binary + classification is a special case where only a single regression tree is + induced. + + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is a much faster variant + of this algorithm for intermediate and large datasets (`n_samples >= 10_000`) and + supports monotonic constraints. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : {'log_loss', 'exponential'}, default='log_loss' + The loss function to be optimized. 'log_loss' refers to binomial and + multinomial deviance, the same as used in logistic regression. + It is a good choice for classification with probabilistic outputs. + For loss 'exponential', gradient boosting recovers the AdaBoost algorithm. + + learning_rate : float, default=0.1 + Learning rate shrinks the contribution of each tree by `learning_rate`. + There is a trade-off between learning_rate and n_estimators. + Values must be in the range `[0.0, inf)`. + + For an example of the effects of this parameter and its interaction with + ``subsample``, see + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regularization.py`. + + n_estimators : int, default=100 + The number of boosting stages to perform. 
Gradient boosting + is fairly robust to over-fitting so a large number usually + results in better performance. + Values must be in the range `[1, inf)`. + + subsample : float, default=1.0 + The fraction of samples to be used for fitting the individual base + learners. If smaller than 1.0 this results in Stochastic Gradient + Boosting. `subsample` interacts with the parameter `n_estimators`. + Choosing `subsample < 1.0` leads to a reduction of variance + and an increase in bias. + Values must be in the range `(0.0, 1.0]`. + + criterion : {'friedman_mse', 'squared_error'}, default='friedman_mse' + The function to measure the quality of a split. Supported criteria are + 'friedman_mse' for the mean squared error with improvement score by + Friedman, 'squared_error' for mean squared error. The default value of + 'friedman_mse' is generally the best as it can provide a better + approximation in some cases. + + .. versionadded:: 0.18 + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, values must be in the range `[2, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split` + will be `ceil(min_samples_split * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf` + will be `ceil(min_samples_leaf * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + Values must be in the range `[0.0, 0.5]`. + + max_depth : int or None, default=3 + Maximum depth of the individual regression estimators. The maximum + depth limits the number of nodes in the tree. Tune this parameter + for best performance; the best value depends on the interaction + of the input variables. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + If int, values must be in the range `[1, inf)`. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + Values must be in the range `[0.0, inf)`. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + init : estimator or 'zero', default=None + An estimator object that is used to compute the initial predictions. + ``init`` has to provide :term:`fit` and :term:`predict_proba`. 
If + 'zero', the initial raw predictions are set to zero. By default, a + ``DummyEstimator`` predicting the classes priors is used. + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to each Tree estimator at each + boosting iteration. + In addition, it controls the random permutation of the features at + each split (see Notes for more details). + It also controls the random splitting of the training data to obtain a + validation set if `n_iter_no_change` is not None. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_features : {'sqrt', 'log2'}, int or float, default=None + The number of features to consider when looking for the best split: + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and the features + considered at each split will be `max(1, int(max_features * n_features_in_))`. + - If 'sqrt', then `max_features=sqrt(n_features)`. + - If 'log2', then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Choosing `max_features < n_features` leads to a reduction of variance + and an increase in bias. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints progress and performance + once in a while (the more trees the lower the frequency). If greater + than 1 then it prints progress and performance for every tree. + Values must be in the range `[0, inf)`. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + Values must be in the range `[2, inf)`. + If `None`, then unlimited number of leaf nodes. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Values must be in the range `(0.0, 1.0)`. + Only used if ``n_iter_no_change`` is set to an integer. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=None + ``n_iter_no_change`` is used to decide if early stopping will be used + to terminate training when validation score is not improving. By + default it is set to None to disable early stopping. If set to a + number, it will set aside ``validation_fraction`` size of the training + data as validation and terminate training when validation score is not + improving in all of the previous ``n_iter_no_change`` numbers of + iterations. The split is stratified. + Values must be in the range `[1, inf)`. + See + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_early_stopping.py`. + + .. versionadded:: 0.20 + + tol : float, default=1e-4 + Tolerance for the early stopping. When the loss is not improving + by at least tol for ``n_iter_no_change`` iterations (if set to a + number), the training stops. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.20 + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. 
By default, no pruning is performed. + Values must be in the range `[0.0, inf)`. + See :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + Attributes + ---------- + n_estimators_ : int + The number of estimators as selected by early stopping (if + ``n_iter_no_change`` is specified). Otherwise it is set to + ``n_estimators``. + + .. versionadded:: 0.20 + + n_trees_per_iteration_ : int + The number of trees that are built at each iteration. For binary classifiers, + this is always 1. + + .. versionadded:: 1.4.0 + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_improvement_ : ndarray of shape (n_estimators,) + The improvement in loss on the out-of-bag samples + relative to the previous iteration. + ``oob_improvement_[0]`` is the improvement in + loss of the first stage over the ``init`` estimator. + Only available if ``subsample < 1.0``. + + oob_scores_ : ndarray of shape (n_estimators,) + The full history of the loss values on the out-of-bag + samples. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + oob_score_ : float + The last value of the loss on the out-of-bag samples. It is + the same as `oob_scores_[-1]`. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + train_score_ : ndarray of shape (n_estimators,) + The i-th score ``train_score_[i]`` is the loss of the + model at iteration ``i`` on the in-bag sample. + If ``subsample == 1`` this is the loss on the training data. + + init_ : estimator + The estimator that provides the initial predictions. Set via the ``init`` + argument. + + estimators_ : ndarray of DecisionTreeRegressor of \ + shape (n_estimators, ``n_trees_per_iteration_``) + The collection of fitted sub-estimators. ``n_trees_per_iteration_`` is 1 for + binary classification, otherwise ``n_classes``. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_classes_ : int + The number of classes. + + max_features_ : int + The inferred value of max_features. + + See Also + -------- + HistGradientBoostingClassifier : Histogram-based Gradient Boosting + Classification Tree. + sklearn.tree.DecisionTreeClassifier : A decision tree classifier. + RandomForestClassifier : A meta-estimator that fits a number of decision + tree classifiers on various sub-samples of the dataset and uses + averaging to improve the predictive accuracy and control over-fitting. + AdaBoostClassifier : A meta-estimator that begins by fitting a classifier + on the original dataset and then fits additional copies of the + classifier on the same dataset where the weights of incorrectly + classified instances are adjusted such that subsequent classifiers + focus more on difficult cases. 
+ + Notes + ----- + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data and + ``max_features=n_features``, if the improvement of the criterion is + identical for several splits enumerated during the search of the best + split. To obtain a deterministic behaviour during fitting, + ``random_state`` has to be fixed. + + References + ---------- + J. Friedman, Greedy Function Approximation: A Gradient Boosting + Machine, The Annals of Statistics, Vol. 29, No. 5, 2001. + + J. Friedman, Stochastic Gradient Boosting, 1999 + + T. Hastie, R. Tibshirani and J. Friedman. + Elements of Statistical Learning Ed. 2, Springer, 2009. + + Examples + -------- + The following example shows how to fit a gradient boosting classifier with + 100 decision stumps as weak learners. + + >>> from sklearn.datasets import make_hastie_10_2 + >>> from sklearn.ensemble import GradientBoostingClassifier + + >>> X, y = make_hastie_10_2(random_state=0) + >>> X_train, X_test = X[:2000], X[2000:] + >>> y_train, y_test = y[:2000], y[2000:] + + >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, + ... max_depth=1, random_state=0).fit(X_train, y_train) + >>> clf.score(X_test, y_test) + 0.913 + """ + + _parameter_constraints: dict = { + **BaseGradientBoosting._parameter_constraints, + "loss": [StrOptions({"log_loss", "exponential"})], + "init": [StrOptions({"zero"}), None, HasMethods(["fit", "predict_proba"])], + } + + def __init__( + self, + *, + loss="log_loss", + learning_rate=0.1, + n_estimators=100, + subsample=1.0, + criterion="friedman_mse", + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_depth=3, + min_impurity_decrease=0.0, + init=None, + random_state=None, + max_features=None, + verbose=0, + max_leaf_nodes=None, + warm_start=False, + validation_fraction=0.1, + n_iter_no_change=None, + tol=1e-4, + ccp_alpha=0.0, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + n_estimators=n_estimators, + criterion=criterion, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_depth=max_depth, + init=init, + subsample=subsample, + max_features=max_features, + random_state=random_state, + verbose=verbose, + max_leaf_nodes=max_leaf_nodes, + min_impurity_decrease=min_impurity_decrease, + warm_start=warm_start, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + ccp_alpha=ccp_alpha, + ) + + def _encode_y(self, y, sample_weight): + # encode classes into 0 ... n_classes - 1 and sets attributes classes_ + # and n_trees_per_iteration_ + check_classification_targets(y) + + label_encoder = LabelEncoder() + encoded_y_int = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + n_classes = self.classes_.shape[0] + # only 1 tree for binary classification. For multiclass classification, + # we build 1 tree per class. + self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes + encoded_y = encoded_y_int.astype(float, copy=False) + + # From here on, it is additional to the HGBT case. 
+ # expose n_classes_ attribute + self.n_classes_ = n_classes + if sample_weight is None: + n_trim_classes = n_classes + else: + n_trim_classes = np.count_nonzero(np.bincount(encoded_y_int, sample_weight)) + + if n_trim_classes < 2: + raise ValueError( + "y contains %d class after sample_weight " + "trimmed classes with zero weights, while a " + "minimum of 2 classes are required." % n_trim_classes + ) + return encoded_y + + def _get_loss(self, sample_weight): + if self.loss == "log_loss": + if self.n_classes_ == 2: + return HalfBinomialLoss(sample_weight=sample_weight) + else: + return HalfMultinomialLoss( + sample_weight=sample_weight, n_classes=self.n_classes_ + ) + elif self.loss == "exponential": + if self.n_classes_ > 2: + raise ValueError( + f"loss='{self.loss}' is only suitable for a binary classification " + f"problem, you have n_classes={self.n_classes_}. " + "Please use loss='log_loss' instead." + ) + else: + return ExponentialLoss(sample_weight=sample_weight) + + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + score : ndarray of shape (n_samples, n_classes) or (n_samples,) + The decision function of the input samples, which corresponds to + the raw values predicted from the trees of the ensemble . The + order of the classes corresponds to that in the attribute + :term:`classes_`. Regression and binary classification produce an + array of shape (n_samples,). + """ + X = validate_data( + self, X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False + ) + raw_predictions = self._raw_predict(X) + if raw_predictions.shape[1] == 1: + return raw_predictions.ravel() + return raw_predictions + + def staged_decision_function(self, X): + """Compute decision function of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + score : generator of ndarray of shape (n_samples, k) + The decision function of the input samples, which corresponds to + the raw values predicted from the trees of the ensemble . The + classes corresponds to that in the attribute :term:`classes_`. + Regression and binary classification are special cases with + ``k == 1``, otherwise ``k==n_classes``. + """ + yield from self._staged_raw_predict(X) + + def predict(self, X): + """Predict class for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + raw_predictions = self.decision_function(X) + if raw_predictions.ndim == 1: # decision_function already squeezed it + encoded_classes = (raw_predictions >= 0).astype(int) + else: + encoded_classes = np.argmax(raw_predictions, axis=1) + return self.classes_[encoded_classes] + + def staged_predict(self, X): + """Predict class at each stage for X. + + This method allows monitoring (i.e. 
determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted value of the input samples. + """ + if self.n_classes_ == 2: # n_trees_per_iteration_ = 1 + for raw_predictions in self._staged_raw_predict(X): + encoded_classes = (raw_predictions.squeeze() >= 0).astype(int) + yield self.classes_.take(encoded_classes, axis=0) + else: + for raw_predictions in self._staged_raw_predict(X): + encoded_classes = np.argmax(raw_predictions, axis=1) + yield self.classes_.take(encoded_classes, axis=0) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + + Raises + ------ + AttributeError + If the ``loss`` does not support probabilities. + """ + raw_predictions = self.decision_function(X) + return self._loss.predict_proba(raw_predictions) + + def predict_log_proba(self, X): + """Predict class log-probabilities for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + + Raises + ------ + AttributeError + If the ``loss`` does not support probabilities. + """ + proba = self.predict_proba(X) + return np.log(proba) + + def staged_predict_proba(self, X): + """Predict class probabilities at each stage for X. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted value of the input samples. + """ + try: + for raw_predictions in self._staged_raw_predict(X): + yield self._loss.predict_proba(raw_predictions) + except NotFittedError: + raise + except AttributeError as e: + raise AttributeError( + "loss=%r does not support predict_proba" % self.loss + ) from e + + +class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): + """Gradient Boosting for regression. + + This estimator builds an additive model in a forward stage-wise fashion; it + allows for the optimization of arbitrary differentiable loss functions. In + each stage a regression tree is fit on the negative gradient of the given + loss function. + + :class:`~sklearn.ensemble.HistGradientBoostingRegressor` is a much faster variant + of this algorithm for intermediate and large datasets (`n_samples >= 10_000`) and + supports monotonic constraints. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : {'squared_error', 'absolute_error', 'huber', 'quantile'}, \ + default='squared_error' + Loss function to be optimized. 'squared_error' refers to the squared + error for regression. 'absolute_error' refers to the absolute error of + regression and is a robust loss function. 'huber' is a + combination of the two. 'quantile' allows quantile regression (use + `alpha` to specify the quantile). + See + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_quantile.py` + for an example that demonstrates quantile regression for creating + prediction intervals with `loss='quantile'`. + + learning_rate : float, default=0.1 + Learning rate shrinks the contribution of each tree by `learning_rate`. + There is a trade-off between learning_rate and n_estimators. + Values must be in the range `[0.0, inf)`. + + n_estimators : int, default=100 + The number of boosting stages to perform. Gradient boosting + is fairly robust to over-fitting so a large number usually + results in better performance. + Values must be in the range `[1, inf)`. + + subsample : float, default=1.0 + The fraction of samples to be used for fitting the individual base + learners. If smaller than 1.0 this results in Stochastic Gradient + Boosting. `subsample` interacts with the parameter `n_estimators`. + Choosing `subsample < 1.0` leads to a reduction of variance + and an increase in bias. + Values must be in the range `(0.0, 1.0]`. + + criterion : {'friedman_mse', 'squared_error'}, default='friedman_mse' + The function to measure the quality of a split. Supported criteria are + "friedman_mse" for the mean squared error with improvement score by + Friedman, "squared_error" for mean squared error. The default value of + "friedman_mse" is generally the best as it can provide a better + approximation in some cases. + + .. versionadded:: 0.18 + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, values must be in the range `[2, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split` + will be `ceil(min_samples_split * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf` + will be `ceil(min_samples_leaf * n_samples)`. + + .. versionchanged:: 0.18 + Added float values for fractions. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + Values must be in the range `[0.0, 0.5]`. + + max_depth : int or None, default=3 + Maximum depth of the individual regression estimators. The maximum + depth limits the number of nodes in the tree. Tune this parameter + for best performance; the best value depends on the interaction + of the input variables. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. 
+ If int, values must be in the range `[1, inf)`. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + Values must be in the range `[0.0, inf)`. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + .. versionadded:: 0.19 + + init : estimator or 'zero', default=None + An estimator object that is used to compute the initial predictions. + ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the + initial raw predictions are set to zero. By default a + ``DummyEstimator`` is used, predicting either the average target value + (for loss='squared_error'), or a quantile for the other losses. + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to each Tree estimator at each + boosting iteration. + In addition, it controls the random permutation of the features at + each split (see Notes for more details). + It also controls the random splitting of the training data to obtain a + validation set if `n_iter_no_change` is not None. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_features : {'sqrt', 'log2'}, int or float, default=None + The number of features to consider when looking for the best split: + + - If int, values must be in the range `[1, inf)`. + - If float, values must be in the range `(0.0, 1.0]` and the features + considered at each split will be `max(1, int(max_features * n_features_in_))`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Choosing `max_features < n_features` leads to a reduction of variance + and an increase in bias. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + alpha : float, default=0.9 + The alpha-quantile of the huber loss function and the quantile + loss function. Only if ``loss='huber'`` or ``loss='quantile'``. + Values must be in the range `(0.0, 1.0)`. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints progress and performance + once in a while (the more trees the lower the frequency). If greater + than 1 then it prints progress and performance for every tree. + Values must be in the range `[0, inf)`. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + Values must be in the range `[2, inf)`. + If None, then unlimited number of leaf nodes. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Values must be in the range `(0.0, 1.0)`. 
+ Only used if ``n_iter_no_change`` is set to an integer. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=None + ``n_iter_no_change`` is used to decide if early stopping will be used + to terminate training when validation score is not improving. By + default it is set to None to disable early stopping. If set to a + number, it will set aside ``validation_fraction`` size of the training + data as validation and terminate training when validation score is not + improving in all of the previous ``n_iter_no_change`` numbers of + iterations. + Values must be in the range `[1, inf)`. + See + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_early_stopping.py`. + + .. versionadded:: 0.20 + + tol : float, default=1e-4 + Tolerance for the early stopping. When the loss is not improving + by at least tol for ``n_iter_no_change`` iterations (if set to a + number), the training stops. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.20 + + ccp_alpha : non-negative float, default=0.0 + Complexity parameter used for Minimal Cost-Complexity Pruning. The + subtree with the largest cost complexity that is smaller than + ``ccp_alpha`` will be chosen. By default, no pruning is performed. + Values must be in the range `[0.0, inf)`. + See :ref:`minimal_cost_complexity_pruning` for details. See + :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` + for an example of such pruning. + + .. versionadded:: 0.22 + + Attributes + ---------- + n_estimators_ : int + The number of estimators as selected by early stopping (if + ``n_iter_no_change`` is specified). Otherwise it is set to + ``n_estimators``. + + n_trees_per_iteration_ : int + The number of trees that are built at each iteration. For regressors, this is + always 1. + + .. versionadded:: 1.4.0 + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_improvement_ : ndarray of shape (n_estimators,) + The improvement in loss on the out-of-bag samples + relative to the previous iteration. + ``oob_improvement_[0]`` is the improvement in + loss of the first stage over the ``init`` estimator. + Only available if ``subsample < 1.0``. + + oob_scores_ : ndarray of shape (n_estimators,) + The full history of the loss values on the out-of-bag + samples. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + oob_score_ : float + The last value of the loss on the out-of-bag samples. It is + the same as `oob_scores_[-1]`. Only available if `subsample < 1.0`. + + .. versionadded:: 1.3 + + train_score_ : ndarray of shape (n_estimators,) + The i-th score ``train_score_[i]`` is the loss of the + model at iteration ``i`` on the in-bag sample. + If ``subsample == 1`` this is the loss on the training data. + + init_ : estimator + The estimator that provides the initial predictions. Set via the ``init`` + argument. + + estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, 1) + The collection of fitted sub-estimators. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. 
versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + max_features_ : int + The inferred value of max_features. + + See Also + -------- + HistGradientBoostingRegressor : Histogram-based Gradient Boosting + Classification Tree. + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + sklearn.ensemble.RandomForestRegressor : A random forest regressor. + + Notes + ----- + The features are always randomly permuted at each split. Therefore, + the best found split may vary, even with the same training data and + ``max_features=n_features``, if the improvement of the criterion is + identical for several splits enumerated during the search of the best + split. To obtain a deterministic behaviour during fitting, + ``random_state`` has to be fixed. + + References + ---------- + J. Friedman, Greedy Function Approximation: A Gradient Boosting + Machine, The Annals of Statistics, Vol. 29, No. 5, 2001. + + J. Friedman, Stochastic Gradient Boosting, 1999 + + T. Hastie, R. Tibshirani and J. Friedman. + Elements of Statistical Learning Ed. 2, Springer, 2009. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_regression(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> reg = GradientBoostingRegressor(random_state=0) + >>> reg.fit(X_train, y_train) + GradientBoostingRegressor(random_state=0) + >>> reg.predict(X_test[1:2]) + array([-61.1]) + >>> reg.score(X_test, y_test) + 0.4... + + For a detailed example of utilizing + :class:`~sklearn.ensemble.GradientBoostingRegressor` + to fit an ensemble of weak predictive models, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regression.py`. 
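    A minimal early-stopping sketch (illustrative only, reusing ``X_train`` and
    ``y_train`` from the example above; the parameter values are arbitrary):

    >>> reg_es = GradientBoostingRegressor(
    ...     n_estimators=500, n_iter_no_change=5, validation_fraction=0.1,
    ...     tol=1e-4, random_state=0)
    >>> reg_es = reg_es.fit(X_train, y_train)
    >>> reg_es.n_estimators_ <= 500
    True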
+ """ + + _parameter_constraints: dict = { + **BaseGradientBoosting._parameter_constraints, + "loss": [StrOptions({"squared_error", "absolute_error", "huber", "quantile"})], + "init": [StrOptions({"zero"}), None, HasMethods(["fit", "predict"])], + "alpha": [Interval(Real, 0.0, 1.0, closed="neither")], + } + + def __init__( + self, + *, + loss="squared_error", + learning_rate=0.1, + n_estimators=100, + subsample=1.0, + criterion="friedman_mse", + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_depth=3, + min_impurity_decrease=0.0, + init=None, + random_state=None, + max_features=None, + alpha=0.9, + verbose=0, + max_leaf_nodes=None, + warm_start=False, + validation_fraction=0.1, + n_iter_no_change=None, + tol=1e-4, + ccp_alpha=0.0, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + n_estimators=n_estimators, + criterion=criterion, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_depth=max_depth, + init=init, + subsample=subsample, + max_features=max_features, + min_impurity_decrease=min_impurity_decrease, + random_state=random_state, + alpha=alpha, + verbose=verbose, + max_leaf_nodes=max_leaf_nodes, + warm_start=warm_start, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + ccp_alpha=ccp_alpha, + ) + + def _encode_y(self, y=None, sample_weight=None): + # Just convert y to the expected dtype + self.n_trees_per_iteration_ = 1 + y = y.astype(DOUBLE, copy=False) + return y + + def _get_loss(self, sample_weight): + if self.loss in ("quantile", "huber"): + return _LOSSES[self.loss](sample_weight=sample_weight, quantile=self.alpha) + else: + return _LOSSES[self.loss](sample_weight=sample_weight) + + def predict(self, X): + """Predict regression target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + X = validate_data( + self, X, dtype=DTYPE, order="C", accept_sparse="csr", reset=False + ) + # In regression we can directly return the raw value from the trees. + return self._raw_predict(X).ravel() + + def staged_predict(self, X): + """Predict regression target at each stage for X. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted value of the input samples. + """ + for raw_predictions in self._staged_raw_predict(X): + yield raw_predictions.ravel() + + def apply(self, X): + """Apply trees in the ensemble to X, return leaf indices. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will + be converted to a sparse ``csr_matrix``. 
+ + Returns + ------- + X_leaves : array-like of shape (n_samples, n_estimators) + For each datapoint x in X and for each tree in the ensemble, + return the index of the leaf x ends up in each estimator. + """ + + leaves = super().apply(X) + leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0]) + return leaves diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gradient_boosting.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gradient_boosting.pyx new file mode 100644 index 0000000000000000000000000000000000000000..cd9845a217c7d505ff227637ec7c3f092a432849 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_gradient_boosting.pyx @@ -0,0 +1,262 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.stdlib cimport free +from libc.string cimport memset + +import numpy as np +from scipy.sparse import issparse + +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t +# Note: _tree uses cimport numpy, cnp.import_array, so we need to include +# numpy headers in the build configuration of this extension +from ..tree._tree cimport Node +from ..tree._tree cimport Tree +from ..tree._utils cimport safe_realloc + + +# no namespace lookup for numpy dtype and array creation +from numpy import zeros as np_zeros + + +# constant to mark tree leafs +cdef intp_t TREE_LEAF = -1 + +cdef void _predict_regression_tree_inplace_fast_dense( + const float32_t[:, ::1] X, + Node* root_node, + double *value, + double scale, + Py_ssize_t k, + float64_t[:, :] out +) noexcept nogil: + """Predicts output for regression tree and stores it in ``out[i, k]``. + + This function operates directly on the data arrays of the tree + data structures. This is 5x faster than the variant above because + it allows us to avoid buffer validation. + + The function assumes that the ndarray that wraps ``X`` is + c-continuous. + + Parameters + ---------- + X : float32_t 2d memory view + The memory view on the data ndarray of the input ``X``. + Assumes that the array is c-continuous. + root_node : tree Node pointer + Pointer to the main node array of the :class:``sklearn.tree.Tree``. + value : np.float64_t pointer + The pointer to the data array of the ``value`` array attribute + of the :class:``sklearn.tree.Tree``. + scale : double + A constant to scale the predictions. + k : int + The index of the tree output to be predicted. Must satisfy + 0 <= ``k`` < ``K``. + out : memory view on array of type np.float64_t + The data array where the predictions are stored. + ``out`` is assumed to be a two-dimensional array of + shape ``(n_samples, K)``. + """ + cdef intp_t n_samples = X.shape[0] + cdef Py_ssize_t i + cdef Node *node + for i in range(n_samples): + node = root_node + # While node not a leaf + while node.left_child != TREE_LEAF: + if X[i, node.feature] <= node.threshold: + node = root_node + node.left_child + else: + node = root_node + node.right_child + out[i, k] += scale * value[node - root_node] + + +def _predict_regression_tree_stages_sparse( + object[:, :] estimators, + object X, + double scale, + float64_t[:, :] out +): + """Predicts output for regression tree inplace and adds scaled value to ``out[i, k]``. + + The function assumes that the ndarray that wraps ``X`` is csr_matrix. 
+ """ + cdef const float32_t[::1] X_data = X.data + cdef const int32_t[::1] X_indices = X.indices + cdef const int32_t[::1] X_indptr = X.indptr + + cdef intp_t n_samples = X.shape[0] + cdef intp_t n_features = X.shape[1] + cdef intp_t n_stages = estimators.shape[0] + cdef intp_t n_outputs = estimators.shape[1] + + # Indices and temporary variables + cdef intp_t sample_i + cdef intp_t feature_i + cdef intp_t stage_i + cdef intp_t output_i + cdef Node *root_node = NULL + cdef Node *node = NULL + cdef double *value = NULL + + cdef Tree tree + cdef Node** nodes = NULL + cdef double** values = NULL + safe_realloc(&nodes, n_stages * n_outputs) + safe_realloc(&values, n_stages * n_outputs) + for stage_i in range(n_stages): + for output_i in range(n_outputs): + tree = estimators[stage_i, output_i].tree_ + nodes[stage_i * n_outputs + output_i] = tree.nodes + values[stage_i * n_outputs + output_i] = tree.value + + # Initialize auxiliary data-structure + cdef float32_t feature_value = 0. + cdef float32_t* X_sample = NULL + + # feature_to_sample as a data structure records the last seen sample + # for each feature; functionally, it is an efficient way to identify + # which features are nonzero in the present sample. + cdef intp_t* feature_to_sample = NULL + + safe_realloc(&X_sample, n_features) + safe_realloc(&feature_to_sample, n_features) + + memset(feature_to_sample, -1, n_features * sizeof(intp_t)) + + # Cycle through all samples + for sample_i in range(n_samples): + for feature_i in range(X_indptr[sample_i], X_indptr[sample_i + 1]): + feature_to_sample[X_indices[feature_i]] = sample_i + X_sample[X_indices[feature_i]] = X_data[feature_i] + + # Cycle through all stages + for stage_i in range(n_stages): + # Cycle through all trees + for output_i in range(n_outputs): + root_node = nodes[stage_i * n_outputs + output_i] + value = values[stage_i * n_outputs + output_i] + node = root_node + + # While node not a leaf + while node.left_child != TREE_LEAF: + # ... and node.right_child != TREE_LEAF: + if feature_to_sample[node.feature] == sample_i: + feature_value = X_sample[node.feature] + else: + feature_value = 0. + + if feature_value <= node.threshold: + node = root_node + node.left_child + else: + node = root_node + node.right_child + out[sample_i, output_i] += scale * value[node - root_node] + + # Free auxiliary arrays + free(X_sample) + free(feature_to_sample) + free(nodes) + free(values) + + +def predict_stages( + object[:, :] estimators, + object X, + double scale, + float64_t[:, :] out +): + """Add predictions of ``estimators`` to ``out``. + + Each estimator is scaled by ``scale`` before its prediction + is added to ``out``. 
+ """ + cdef Py_ssize_t i + cdef Py_ssize_t k + cdef Py_ssize_t n_estimators = estimators.shape[0] + cdef Py_ssize_t K = estimators.shape[1] + cdef Tree tree + + if issparse(X): + if X.format != 'csr': + raise ValueError("When X is a sparse matrix, a CSR format is" + " expected, got {!r}".format(type(X))) + _predict_regression_tree_stages_sparse( + estimators=estimators, X=X, scale=scale, out=out + ) + else: + if not isinstance(X, np.ndarray) or np.isfortran(X): + raise ValueError(f"X should be C-ordered np.ndarray, got {type(X)}") + + for i in range(n_estimators): + for k in range(K): + tree = estimators[i, k].tree_ + + # avoid buffer validation by casting to ndarray + # and get data pointer + # need brackets because of casting operator priority + _predict_regression_tree_inplace_fast_dense( + X=X, + root_node=tree.nodes, + value=tree.value, + scale=scale, + k=k, + out=out + ) + # out[:, k] += scale * tree.predict(X).ravel() + + +def predict_stage( + object[:, :] estimators, + int stage, + object X, + double scale, + float64_t[:, :] out +): + """Add predictions of ``estimators[stage]`` to ``out``. + + Each estimator in the stage is scaled by ``scale`` before + its prediction is added to ``out``. + """ + return predict_stages( + estimators=estimators[stage:stage + 1], X=X, scale=scale, out=out + ) + + +def _random_sample_mask( + intp_t n_total_samples, + intp_t n_total_in_bag, + random_state +): + """Create a random sample mask where ``n_total_in_bag`` elements are set. + + Parameters + ---------- + n_total_samples : int + The length of the resulting mask. + + n_total_in_bag : int + The number of elements in the sample mask which are set to 1. + + random_state : RandomState + A numpy ``RandomState`` object. + + Returns + ------- + sample_mask : np.ndarray, shape=[n_total_samples] + An ndarray where ``n_total_in_bag`` elements are set to ``True`` + the others are ``False``. + """ + cdef float64_t[::1] rand = random_state.uniform(size=n_total_samples) + cdef uint8_t[::1] sample_mask = np_zeros((n_total_samples,), dtype=bool) + + cdef intp_t n_bagged = 0 + cdef intp_t i = 0 + + for i in range(n_total_samples): + if rand[i] * (n_total_samples - i) < (n_total_in_bag - n_bagged): + sample_mask[i] = 1 + n_bagged += 1 + + return sample_mask.base diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5939d83c8483812187c39d373e425630a9e44fe5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/__init__.py @@ -0,0 +1,8 @@ +"""This module implements histogram-based gradient boosting estimators. + +The implementation is a port from pygbm which is itself strongly inspired +from LightGBM. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..34ab20733dfd9310b157edfc130f774bf063959c Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx new file mode 100644 index 0000000000000000000000000000000000000000..f343ada64cdd0c0923184b5bafbaf1f0a9526a12 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -0,0 +1,85 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython.parallel import prange +from libc.math cimport isnan + +from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +def _map_to_bins(const X_DTYPE_C [:, :] data, + list binning_thresholds, + const uint8_t[::1] is_categorical, + const uint8_t missing_values_bin_idx, + int n_threads, + X_BINNED_DTYPE_C [::1, :] binned): + """Bin continuous and categorical values to discrete integer-coded levels. + + A given value x is mapped into bin value i iff + thresholds[i - 1] < x <= thresholds[i] + + Parameters + ---------- + data : ndarray, shape (n_samples, n_features) + The data to bin. + binning_thresholds : list of arrays + For each feature, stores the increasing numeric values that are + used to separate the bins. + is_categorical : ndarray of uint8_t of shape (n_features,) + Indicates categorical features. + n_threads : int + Number of OpenMP threads to use. + binned : ndarray, shape (n_samples, n_features) + Output array, must be fortran aligned. + """ + cdef: + int feature_idx + + for feature_idx in range(data.shape[1]): + _map_col_to_bins( + data[:, feature_idx], + binning_thresholds[feature_idx], + is_categorical[feature_idx], + missing_values_bin_idx, + n_threads, + binned[:, feature_idx] + ) + + +cdef void _map_col_to_bins( + const X_DTYPE_C [:] data, + const X_DTYPE_C [:] binning_thresholds, + const uint8_t is_categorical, + const uint8_t missing_values_bin_idx, + int n_threads, + X_BINNED_DTYPE_C [:] binned +): + """Binary search to find the bin index for each value in the data.""" + cdef: + int i + int left + int right + int middle + + for i in prange(data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): + if ( + isnan(data[i]) or + # To follow LightGBM's conventions, negative values for + # categorical features are considered as missing values. 
+ (is_categorical and data[i] < 0) + ): + binned[i] = missing_values_bin_idx + else: + # for known values, use binary search + left, right = 0, binning_thresholds.shape[0] + while left < right: + # equal to (right + left - 1) // 2 but avoids overflow + middle = left + (right - left - 1) // 2 + if data[i] <= binning_thresholds[middle]: + right = middle + else: + left = middle + 1 + + binned[i] = left diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..d68213df8f07d204ee46bfd482025551796bf324 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd new file mode 100644 index 0000000000000000000000000000000000000000..c44477cfa2300620c457152d86f8053ef44cf720 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd @@ -0,0 +1,20 @@ +from .common cimport X_BINNED_DTYPE_C +from .common cimport BITSET_DTYPE_C +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport X_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +cdef void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil + +cdef void set_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) noexcept nogil + +cdef uint8_t in_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) noexcept nogil + +cpdef uint8_t in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset, + X_BINNED_DTYPE_C val) noexcept nogil + +cdef uint8_t in_bitset_2d_memoryview( + const BITSET_INNER_DTYPE_C[:, :] bitset, + X_BINNED_DTYPE_C val, + unsigned int row) noexcept nogil diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx new file mode 100644 index 0000000000000000000000000000000000000000..cab20f7d5af05242102379d9e61ab7ca9a0e91f3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx @@ -0,0 +1,65 @@ +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport BITSET_DTYPE_C +from .common cimport X_DTYPE_C +from .common cimport X_BINNED_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +# A bitset is a data structure used to represent sets of integers in [0, n]. We +# use them to represent sets of features indices (e.g. features that go to the +# left child, or features that are categorical). 
For familiarity with bitsets +# and bitwise operations: +# https://en.wikipedia.org/wiki/Bit_array +# https://en.wikipedia.org/wiki/Bitwise_operation + + +cdef inline void init_bitset(BITSET_DTYPE_C bitset) noexcept nogil: # OUT + cdef: + unsigned int i + + for i in range(8): + bitset[i] = 0 + + +cdef inline void set_bitset(BITSET_DTYPE_C bitset, # OUT + X_BINNED_DTYPE_C val) noexcept nogil: + bitset[val // 32] |= (1 << (val % 32)) + + +cdef inline uint8_t in_bitset(BITSET_DTYPE_C bitset, + X_BINNED_DTYPE_C val) noexcept nogil: + return (bitset[val // 32] >> (val % 32)) & 1 + + +cpdef inline uint8_t in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset, + X_BINNED_DTYPE_C val) noexcept nogil: + return (bitset[val // 32] >> (val % 32)) & 1 + + +cdef inline uint8_t in_bitset_2d_memoryview(const BITSET_INNER_DTYPE_C[:, :] bitset, + X_BINNED_DTYPE_C val, + unsigned int row) noexcept nogil: + # Same as above but works on 2d memory views to avoid the creation of 1d + # memory views. See https://github.com/scikit-learn/scikit-learn/issues/17299 + return (bitset[row, val // 32] >> (val % 32)) & 1 + + +cpdef inline void set_bitset_memoryview(BITSET_INNER_DTYPE_C[:] bitset, # OUT + X_BINNED_DTYPE_C val): + bitset[val // 32] |= (1 << (val % 32)) + + +def set_raw_bitset_from_binned_bitset(BITSET_INNER_DTYPE_C[:] raw_bitset, # OUT + BITSET_INNER_DTYPE_C[:] binned_bitset, + X_DTYPE_C[:] categories): + """Set the raw_bitset from the values of the binned bitset + + categories is a mapping from binned category value to raw category value. + """ + cdef: + int binned_cat_value + X_DTYPE_C raw_cat_value + + for binned_cat_value, raw_cat_value in enumerate(categories): + if in_bitset_memoryview(binned_bitset, binned_cat_value): + set_bitset_memoryview(raw_bitset, raw_cat_value) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx new file mode 100644 index 0000000000000000000000000000000000000000..dcbbf733ebb51cf6f3ca426bd8d2955a20af3e50 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -0,0 +1,59 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython.parallel import prange +import numpy as np + +from .common import Y_DTYPE +from .common cimport Y_DTYPE_C + + +def _update_raw_predictions( + Y_DTYPE_C [::1] raw_predictions, # OUT + grower, + n_threads, +): + """Update raw_predictions with the predictions of the newest tree. + + This is equivalent to (and much faster than): + raw_predictions += last_estimator.predict(X_train) + + It's only possible for data X_train that is used to train the trees (it + isn't usable for e.g. X_val). 
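    A rough pure-Python sketch of the leaf-wise update performed below
    (illustrative only; it uses the same grower attributes as the code)::

        for leaf in grower.finalized_leaves:
            rows = grower.splitter.partition[leaf.partition_start:leaf.partition_stop]
            raw_predictions[rows] += leaf.value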
+ """ + cdef: + unsigned int [::1] starts # start of each leaf in partition + unsigned int [::1] stops # end of each leaf in partition + Y_DTYPE_C [::1] values # value of each leaf + const unsigned int [::1] partition = grower.splitter.partition + list leaves + + leaves = grower.finalized_leaves + starts = np.array([leaf.partition_start for leaf in leaves], + dtype=np.uint32) + stops = np.array([leaf.partition_stop for leaf in leaves], + dtype=np.uint32) + values = np.array([leaf.value for leaf in leaves], dtype=Y_DTYPE) + + _update_raw_predictions_helper(raw_predictions, starts, stops, partition, + values, n_threads) + + +cdef inline void _update_raw_predictions_helper( + Y_DTYPE_C [::1] raw_predictions, # OUT + const unsigned int [::1] starts, + const unsigned int [::1] stops, + const unsigned int [::1] partition, + const Y_DTYPE_C [::1] values, + int n_threads, +): + + cdef: + unsigned int position + int leaf_idx + int n_leaves = starts.shape[0] + + for leaf_idx in prange(n_leaves, schedule='static', nogil=True, + num_threads=n_threads): + for position in range(starts[leaf_idx], stops[leaf_idx]): + raw_predictions[partition[position]] += values[leaf_idx] diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx new file mode 100644 index 0000000000000000000000000000000000000000..8257fa974c4a00115180aaa7815389051f1d9db2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -0,0 +1,256 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython.parallel import prange +from libc.math cimport isnan +import numpy as np + +from ...utils._typedefs cimport intp_t, uint8_t +from .common cimport X_DTYPE_C +from .common cimport Y_DTYPE_C +from .common import Y_DTYPE +from .common cimport X_BINNED_DTYPE_C +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport node_struct +from ._bitset cimport in_bitset_2d_memoryview + + +def _predict_from_raw_data( # raw data = non-binned data + const node_struct [:] nodes, + const X_DTYPE_C [:, :] numeric_data, + const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, + const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, + const unsigned int [::1] f_idx_map, + int n_threads, + Y_DTYPE_C [:] out): + + cdef: + int i + + for i in prange(numeric_data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): + out[i] = _predict_one_from_raw_data( + nodes, numeric_data, raw_left_cat_bitsets, + known_cat_bitsets, + f_idx_map, i) + + +cdef inline Y_DTYPE_C _predict_one_from_raw_data( + const node_struct [:] nodes, + const X_DTYPE_C [:, :] numeric_data, + const BITSET_INNER_DTYPE_C [:, ::1] raw_left_cat_bitsets, + const BITSET_INNER_DTYPE_C [:, ::1] known_cat_bitsets, + const unsigned int [::1] f_idx_map, + const int row) noexcept nogil: + # Need to pass the whole array and the row index, else prange won't work. 
+ # See issue Cython #2798 + + cdef: + node_struct node = nodes[0] + unsigned int node_idx = 0 + X_DTYPE_C data_val + + while True: + if node.is_leaf: + return node.value + + data_val = numeric_data[row, node.feature_idx] + + if isnan(data_val): + if node.missing_go_to_left: + node_idx = node.left + else: + node_idx = node.right + elif node.is_categorical: + if data_val < 0: + # data_val is not in the accepted range, so it is treated as missing value + node_idx = node.left if node.missing_go_to_left else node.right + elif in_bitset_2d_memoryview( + raw_left_cat_bitsets, + data_val, + node.bitset_idx): + node_idx = node.left + elif in_bitset_2d_memoryview( + known_cat_bitsets, + data_val, + f_idx_map[node.feature_idx]): + node_idx = node.right + else: + # Treat unknown categories as missing. + node_idx = node.left if node.missing_go_to_left else node.right + else: + if data_val <= node.num_threshold: + node_idx = node.left + else: + node_idx = node.right + node = nodes[node_idx] + + +def _predict_from_binned_data( + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets, + const uint8_t missing_values_bin_idx, + int n_threads, + Y_DTYPE_C [:] out): + + cdef: + int i + + for i in prange(binned_data.shape[0], schedule='static', nogil=True, + num_threads=n_threads): + out[i] = _predict_one_from_binned_data(nodes, + binned_data, + binned_left_cat_bitsets, i, + missing_values_bin_idx) + + +cdef inline Y_DTYPE_C _predict_one_from_binned_data( + node_struct [:] nodes, + const X_BINNED_DTYPE_C [:, :] binned_data, + const BITSET_INNER_DTYPE_C [:, :] binned_left_cat_bitsets, + const int row, + const uint8_t missing_values_bin_idx) noexcept nogil: + # Need to pass the whole array and the row index, else prange won't work. + # See issue Cython #2798 + + cdef: + node_struct node = nodes[0] + unsigned int node_idx = 0 + X_BINNED_DTYPE_C data_val + + while True: + if node.is_leaf: + return node.value + + data_val = binned_data[row, node.feature_idx] + + if data_val == missing_values_bin_idx: + if node.missing_go_to_left: + node_idx = node.left + else: + node_idx = node.right + elif node.is_categorical: + if in_bitset_2d_memoryview( + binned_left_cat_bitsets, + data_val, + node.bitset_idx): + node_idx = node.left + else: + node_idx = node.right + else: + if data_val <= node.bin_threshold: + node_idx = node.left + else: + node_idx = node.right + node = nodes[node_idx] + + +def _compute_partial_dependence( + node_struct [:] nodes, + const X_DTYPE_C [:, ::1] X, + const intp_t [:] target_features, + Y_DTYPE_C [:] out +): + """Partial dependence of the response on the ``target_features`` set. + + For each sample in ``X`` a tree traversal is performed. + Each traversal starts from the root with weight 1.0. + + At each non-leaf node that splits on a target feature, either + the left child or the right child is visited based on the feature + value of the current sample, and the weight is not modified. + At each non-leaf node that splits on a complementary feature, + both children are visited and the weight is multiplied by the fraction + of training samples which went to each child. + + At each leaf, the value of the node is multiplied by the current + weight (weights sum to 1 for all visited terminal nodes). + + Parameters + ---------- + nodes : view on array of PREDICTOR_RECORD_DTYPE, shape (n_nodes) + The array representing the predictor tree. 
+ X : view on 2d ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : view on 1d ndarray of intp_t, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : view on 1d ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + + cdef: + unsigned int current_node_idx + unsigned int [:] node_idx_stack = np.zeros(shape=nodes.shape[0], + dtype=np.uint32) + Y_DTYPE_C [::1] weight_stack = np.zeros(shape=nodes.shape[0], + dtype=Y_DTYPE) + node_struct * current_node # pointer to avoid copying attributes + + unsigned int sample_idx + intp_t feature_idx + unsigned stack_size + Y_DTYPE_C left_sample_frac + Y_DTYPE_C current_weight + Y_DTYPE_C total_weight # used for sanity check only + bint is_target_feature + + for sample_idx in range(X.shape[0]): + # init stacks for current sample + stack_size = 1 + node_idx_stack[0] = 0 # root node + weight_stack[0] = 1 # all the samples are in the root node + total_weight = 0 + + while stack_size > 0: + + # pop the stack + stack_size -= 1 + current_node_idx = node_idx_stack[stack_size] + current_node = &nodes[current_node_idx] + + if current_node.is_leaf: + out[sample_idx] += (weight_stack[stack_size] * + current_node.value) + total_weight += weight_stack[stack_size] + else: + # determine if the split feature is a target feature + is_target_feature = False + for feature_idx in range(target_features.shape[0]): + if target_features[feature_idx] == current_node.feature_idx: + is_target_feature = True + break + + if is_target_feature: + # In this case, we push left or right child on stack + if X[sample_idx, feature_idx] <= current_node.num_threshold: + node_idx_stack[stack_size] = current_node.left + else: + node_idx_stack[stack_size] = current_node.right + stack_size += 1 + else: + # In this case, we push both children onto the stack, + # and give a weight proportional to the number of + # samples going through each branch. + + # push left child + node_idx_stack[stack_size] = current_node.left + left_sample_frac = ( + nodes[current_node.left].count / + current_node.count) + current_weight = weight_stack[stack_size] + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size += 1 + + # push right child + node_idx_stack[stack_size] = current_node.right + weight_stack[stack_size] = ( + current_weight * (1 - left_sample_frac)) + stack_size += 1 + + # Sanity check. Should never happen. + if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" %total_weight) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/binning.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/binning.py new file mode 100644 index 0000000000000000000000000000000000000000..eee26e68842b7925922340bf79badeaae19e4ae6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -0,0 +1,333 @@ +""" +This module contains the BinMapper class. + +BinMapper is used for mapping a real-valued dataset into integer-valued bins. +Bin thresholds are computed with the quantiles so that each bin contains +approximately the same number of samples. 
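A conceptual NumPy sketch of the quantile binning described above (illustrative
only; the feature values are hypothetical)::

    import numpy as np

    col = np.asarray([0.1, 0.4, 0.4, 0.7, 1.5, 2.0])        # one feature column
    max_bins = 4
    qs = np.linspace(0, 100, max_bins + 1)[1:-1]             # interior quantiles
    thresholds = np.percentile(col, qs, method="midpoint")   # max_bins - 1 thresholds
    binned = np.searchsorted(thresholds, col, side="left")   # bin index per value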
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ...base import BaseEstimator, TransformerMixin +from ...utils import check_array, check_random_state +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils.parallel import Parallel, delayed +from ...utils.validation import check_is_fitted +from ._binning import _map_to_bins +from ._bitset import set_bitset_memoryview +from .common import ALMOST_INF, X_BINNED_DTYPE, X_BITSET_INNER_DTYPE, X_DTYPE + + +def _find_binning_thresholds(col_data, max_bins): + """Extract quantiles from a continuous feature. + + Missing values are ignored for finding the thresholds. + + Parameters + ---------- + col_data : array-like, shape (n_samples,) + The continuous feature to bin. + max_bins: int + The maximum number of bins to use for non-missing values. If for a + given feature the number of unique values is less than ``max_bins``, + then those unique values will be used to compute the bin thresholds, + instead of the quantiles + + Return + ------ + binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,) + The increasing numeric values that can be used to separate the bins. + A given value x will be mapped into bin value i iff + bining_thresholds[i - 1] < x <= binning_thresholds[i] + """ + # ignore missing values when computing bin thresholds + missing_mask = np.isnan(col_data) + if missing_mask.any(): + col_data = col_data[~missing_mask] + # The data will be sorted anyway in np.unique and again in percentile, so we do it + # here. Sorting also returns a contiguous array. + col_data = np.sort(col_data) + distinct_values = np.unique(col_data).astype(X_DTYPE) + if len(distinct_values) <= max_bins: + midpoints = distinct_values[:-1] + distinct_values[1:] + midpoints *= 0.5 + else: + # We could compute approximate midpoint percentiles using the output of + # np.unique(col_data, return_counts) instead but this is more + # work and the performance benefit will be limited because we + # work on a fixed-size subsample of the full data. + percentiles = np.linspace(0, 100, num=max_bins + 1) + percentiles = percentiles[1:-1] + midpoints = np.percentile(col_data, percentiles, method="midpoint").astype( + X_DTYPE + ) + assert midpoints.shape[0] == max_bins - 1 + + # We avoid having +inf thresholds: +inf thresholds are only allowed in + # a "split on nan" situation. + np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints) + return midpoints + + +class _BinMapper(TransformerMixin, BaseEstimator): + """Transformer that maps a dataset into integer-valued bins. + + For continuous features, the bins are created in a feature-wise fashion, + using quantiles so that each bins contains approximately the same number + of samples. For large datasets, quantiles are computed on a subset of the + data to speed-up the binning, but the quantiles should remain stable. + + For categorical features, the raw categorical values are expected to be + in [0, 254] (this is not validated here though) and each category + corresponds to a bin. All categorical values must be known at + initialization: transform() doesn't know how to bin unknown categorical + values. Note that transform() is only used on non-training data in the + case of early stopping. + + Features with a small number of values may be binned into less than + ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved + for missing values. 
+ + Parameters + ---------- + n_bins : int, default=256 + The maximum number of bins to use (including the bin for missing + values). Should be in [3, 256]. Non-missing values are binned on + ``max_bins = n_bins - 1`` bins. The last bin is always reserved for + missing values. If for a given feature the number of unique values is + less than ``max_bins``, then those unique values will be used to + compute the bin thresholds, instead of the quantiles. For categorical + features indicated by ``is_categorical``, the docstring for + ``is_categorical`` details on this procedure. + subsample : int or None, default=2e5 + If ``n_samples > subsample``, then ``sub_samples`` samples will be + randomly chosen to compute the quantiles. If ``None``, the whole data + is used. + is_categorical : ndarray of bool of shape (n_features,), default=None + Indicates categorical features. By default, all features are + considered continuous. + known_categories : list of {ndarray, None} of shape (n_features,), \ + default=none + For each categorical feature, the array indicates the set of unique + categorical values. These should be the possible values over all the + data, not just the training data. For continuous features, the + corresponding entry should be None. + random_state: int, RandomState instance or None, default=None + Pseudo-random number generator to control the random sub-sampling. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads use, which takes cgroups CPU + quotes into account. See the docstring of `_openmp_effective_n_threads` + for details. + + Attributes + ---------- + bin_thresholds_ : list of ndarray + For each feature, each array indicates how to map a feature into a + binned feature. The semantic and size depends on the nature of the + feature: + - for real-valued features, the array corresponds to the real-valued + bin thresholds (the upper bound of each bin). There are ``max_bins + - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of + bins used for non-missing values. + - for categorical features, the array is a map from a binned category + value to the raw category value. The size of the array is equal to + ``min(max_bins, category_cardinality)`` where we ignore missing + values in the cardinality. + n_bins_non_missing_ : ndarray, dtype=np.uint32 + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this is + equal to ``n_bins - 1``. + is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8 + Indicator for categorical features. + missing_values_bin_idx_ : np.uint8 + The index of the bin where missing values are mapped. This is a + constant across all features. This corresponds to the last bin, and + it is always equal to ``n_bins - 1``. Note that if ``n_bins_non_missing_`` + is less than ``n_bins - 1`` for a given feature, then there are + empty (and unused) bins. 
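    Minimal usage sketch of this private helper (illustrative only)::

        import numpy as np

        X = np.array([[0.1], [0.2], [0.5], [np.nan]])
        mapper = _BinMapper(n_bins=4, random_state=0)
        X_binned = mapper.fit_transform(X)   # np.nan maps to bin n_bins - 1 == 3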
+ """ + + def __init__( + self, + n_bins=256, + subsample=int(2e5), + is_categorical=None, + known_categories=None, + random_state=None, + n_threads=None, + ): + self.n_bins = n_bins + self.subsample = subsample + self.is_categorical = is_categorical + self.known_categories = known_categories + self.random_state = random_state + self.n_threads = n_threads + + def fit(self, X, y=None): + """Fit data X by computing the binning thresholds. + + The last bin is reserved for missing values, whether missing values + are present in the data or not. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to bin. + y: None + Ignored. + + Returns + ------- + self : object + """ + if not (3 <= self.n_bins <= 256): + # min is 3: at least 2 distinct bins and a missing values bin + raise ValueError( + "n_bins={} should be no smaller than 3 and no larger than 256.".format( + self.n_bins + ) + ) + + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) + max_bins = self.n_bins - 1 + + rng = check_random_state(self.random_state) + if self.subsample is not None and X.shape[0] > self.subsample: + subset = rng.choice(X.shape[0], self.subsample, replace=False) + X = X.take(subset, axis=0) + + if self.is_categorical is None: + self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8) + else: + self.is_categorical_ = np.asarray(self.is_categorical, dtype=np.uint8) + + n_features = X.shape[1] + known_categories = self.known_categories + if known_categories is None: + known_categories = [None] * n_features + + # validate is_categorical and known_categories parameters + for f_idx in range(n_features): + is_categorical = self.is_categorical_[f_idx] + known_cats = known_categories[f_idx] + if is_categorical and known_cats is None: + raise ValueError( + f"Known categories for feature {f_idx} must be provided." + ) + if not is_categorical and known_cats is not None: + raise ValueError( + f"Feature {f_idx} isn't marked as a categorical feature, " + "but categories were passed." + ) + + self.missing_values_bin_idx_ = self.n_bins - 1 + + self.bin_thresholds_ = [None] * n_features + n_bins_non_missing = [None] * n_features + + non_cat_thresholds = Parallel(n_jobs=self.n_threads, backend="threading")( + delayed(_find_binning_thresholds)(X[:, f_idx], max_bins) + for f_idx in range(n_features) + if not self.is_categorical_[f_idx] + ) + + non_cat_idx = 0 + for f_idx in range(n_features): + if self.is_categorical_[f_idx]: + # Since categories are assumed to be encoded in + # [0, n_cats] and since n_cats <= max_bins, + # the thresholds *are* the unique categorical values. This will + # lead to the correct mapping in transform() + thresholds = known_categories[f_idx] + n_bins_non_missing[f_idx] = thresholds.shape[0] + self.bin_thresholds_[f_idx] = thresholds + else: + self.bin_thresholds_[f_idx] = non_cat_thresholds[non_cat_idx] + n_bins_non_missing[f_idx] = self.bin_thresholds_[f_idx].shape[0] + 1 + non_cat_idx += 1 + + self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32) + return self + + def transform(self, X): + """Bin data X. + + Missing values will be mapped to the last bin. + + For categorical features, the mapping will be incorrect for unknown + categories. Since the BinMapper is given known_categories of the + entire training data (i.e. before the call to train_test_split() in + case of early-stopping), this never happens. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to bin. 
+ + Returns + ------- + X_binned : array-like of shape (n_samples, n_features) + The binned data (fortran-aligned). + """ + X = check_array(X, dtype=[X_DTYPE], ensure_all_finite=False) + check_is_fitted(self) + if X.shape[1] != self.n_bins_non_missing_.shape[0]: + raise ValueError( + "This estimator was fitted with {} features but {} got passed " + "to transform()".format(self.n_bins_non_missing_.shape[0], X.shape[1]) + ) + + n_threads = _openmp_effective_n_threads(self.n_threads) + binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F") + _map_to_bins( + X, + self.bin_thresholds_, + self.is_categorical_, + self.missing_values_bin_idx_, + n_threads, + binned, + ) + return binned + + def make_known_categories_bitsets(self): + """Create bitsets of known categories. + + Returns + ------- + - known_cat_bitsets : ndarray of shape (n_categorical_features, 8) + Array of bitsets of known categories, for each categorical feature. + - f_idx_map : ndarray of shape (n_features,) + Map from original feature index to the corresponding index in the + known_cat_bitsets array. + """ + + categorical_features_indices = np.flatnonzero(self.is_categorical_) + + n_features = self.is_categorical_.size + n_categorical_features = categorical_features_indices.size + + f_idx_map = np.zeros(n_features, dtype=np.uint32) + f_idx_map[categorical_features_indices] = np.arange( + n_categorical_features, dtype=np.uint32 + ) + + known_categories = self.bin_thresholds_ + + known_cat_bitsets = np.zeros( + (n_categorical_features, 8), dtype=X_BITSET_INNER_DTYPE + ) + + # TODO: complexity is O(n_categorical_features * 255). Maybe this is + # worth cythonizing + for mapped_f_idx, f_idx in enumerate(categorical_features_indices): + for raw_cat_val in known_categories[f_idx]: + set_bitset_memoryview(known_cat_bitsets[mapped_f_idx], raw_cat_val) + + return known_cat_bitsets, f_idx_map diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..7ce6502fa120ecd189de882e537f440844f48021 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pxd new file mode 100644 index 0000000000000000000000000000000000000000..9ff9fc89800d7bcd04a0a9d202d828a2079a6f28 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -0,0 +1,43 @@ +from ...utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, uint32_t + + +ctypedef float64_t X_DTYPE_C +ctypedef uint8_t X_BINNED_DTYPE_C +ctypedef float64_t Y_DTYPE_C +ctypedef float32_t G_H_DTYPE_C +ctypedef uint32_t BITSET_INNER_DTYPE_C +ctypedef BITSET_INNER_DTYPE_C[8] BITSET_DTYPE_C + + +cdef packed struct hist_struct: + # Same as histogram dtype but we need a struct to declare views. It needs + # to be packed since by default numpy dtypes aren't aligned + Y_DTYPE_C sum_gradients + Y_DTYPE_C sum_hessians + unsigned int count + + +cdef packed struct node_struct: + # Equivalent struct to PREDICTOR_RECORD_DTYPE to use in memory views. 
It + # needs to be packed since by default numpy dtypes aren't aligned + Y_DTYPE_C value + unsigned int count + intp_t feature_idx + X_DTYPE_C num_threshold + uint8_t missing_go_to_left + unsigned int left + unsigned int right + Y_DTYPE_C gain + unsigned int depth + uint8_t is_leaf + X_BINNED_DTYPE_C bin_threshold + uint8_t is_categorical + # The index of the corresponding bitsets in the Predictor's bitset arrays. + # Only used if is_categorical is True + unsigned int bitset_idx + + +cpdef enum MonotonicConstraint: + NO_CST = 0 + POS = 1 + NEG = -1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6b20e32813d5b88e533936e7ace1693ac4e5d7ec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/common.pyx @@ -0,0 +1,44 @@ +import numpy as np + +# Y_DYTPE is the dtype to which the targets y are converted to. This is also +# dtype for leaf values, gains, and sums of gradients / hessians. The gradients +# and hessians arrays are stored as floats to avoid using too much memory. +Y_DTYPE = np.float64 +X_DTYPE = np.float64 +X_BINNED_DTYPE = np.uint8 # hence max_bins == 256 +# dtype for gradients and hessians arrays +G_H_DTYPE = np.float32 +X_BITSET_INNER_DTYPE = np.uint32 + +# Note that we use Y_DTYPE=float64 to avoid issues with floating point precision when +# summing gradients and hessians (both float32). Those are difficult to protect via +# tools like (Kahan-) Neumaier summation as in CPython, see +# https://github.com/python/cpython/issues/100425, or pairwise summation as numpy, see +# https://github.com/numpy/numpy/pull/3685, due to the way histograms are summed +# (number of additions per bin is not known in advance). See also comment in +# _subtract_histograms. +HISTOGRAM_DTYPE = np.dtype([ + ('sum_gradients', Y_DTYPE), # sum of sample gradients in bin + ('sum_hessians', Y_DTYPE), # sum of sample hessians in bin + ('count', np.uint32), # number of samples in bin +]) + +PREDICTOR_RECORD_DTYPE = np.dtype([ + ('value', Y_DTYPE), + ('count', np.uint32), + ('feature_idx', np.intp), + ('num_threshold', X_DTYPE), + ('missing_go_to_left', np.uint8), + ('left', np.uint32), + ('right', np.uint32), + ('gain', Y_DTYPE), + ('depth', np.uint32), + ('is_leaf', np.uint8), + ('bin_threshold', X_BINNED_DTYPE), + ('is_categorical', np.uint8), + # The index of the corresponding bitsets in the Predictor's bitset arrays. 
+ # Only used if is_categorical is True + ('bitset_idx', np.uint32) +]) + +ALMOST_INF = 1e300 # see LightGBM AvoidInf() diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..064391abab24d88d3c410bbdacde7ebc4db616d8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -0,0 +1,2371 @@ +"""Fast Gradient Boosting decision trees for classification and regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +from abc import ABC, abstractmethod +from contextlib import contextmanager, nullcontext, suppress +from functools import partial +from numbers import Integral, Real +from time import time + +import numpy as np + +from ..._loss.loss import ( + _LOSSES, + BaseLoss, + HalfBinomialLoss, + HalfGammaLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + PinballLoss, +) +from ...base import ( + BaseEstimator, + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, +) +from ...compose import ColumnTransformer +from ...metrics import check_scoring +from ...metrics._scorer import _SCORERS +from ...model_selection import train_test_split +from ...preprocessing import FunctionTransformer, LabelEncoder, OrdinalEncoder +from ...utils import check_random_state, compute_sample_weight, resample +from ...utils._missing import is_scalar_nan +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils._param_validation import Interval, RealNotInt, StrOptions +from ...utils.multiclass import check_classification_targets +from ...utils.validation import ( + _check_monotonic_cst, + _check_sample_weight, + _check_y, + _is_pandas_df, + check_array, + check_consistent_length, + check_is_fitted, + validate_data, +) +from ._gradient_boosting import _update_raw_predictions +from .binning import _BinMapper +from .common import G_H_DTYPE, X_DTYPE, Y_DTYPE +from .grower import TreeGrower + +_LOSSES = _LOSSES.copy() +_LOSSES.update( + { + "poisson": HalfPoissonLoss, + "gamma": HalfGammaLoss, + "quantile": PinballLoss, + } +) + + +def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): + """Update the leaf values to be predicted by the tree. + + Update equals: + loss.fit_intercept_only(y_true - raw_prediction) + + This is only applied if loss.differentiable is False. + Note: It only works, if the loss is a function of the residual, as is the + case for AbsoluteError and PinballLoss. Otherwise, one would need to get + the minimum of loss(y_true, raw_prediction + x) in x. A few examples: + - AbsoluteError: median(y_true - raw_prediction). + - PinballLoss: quantile(y_true - raw_prediction). + + More background: + For the standard gradient descent method according to "Greedy Function + Approximation: A Gradient Boosting Machine" by Friedman, all loss functions but the + squared loss need a line search step. BaseHistGradientBoosting, however, implements + a so called Newton boosting where the trees are fitted to a 2nd order + approximations of the loss in terms of gradients and hessians. In this case, the + line search step is only necessary if the loss is not smooth, i.e. not + differentiable, which renders the 2nd order approximation invalid. 
In fact, + non-smooth losses arbitrarily set hessians to 1 and effectively use the standard + gradient descent method with line search. + """ + # TODO: Ideally this should be computed in parallel over the leaves using something + # similar to _update_raw_predictions(), but this requires a cython version of + # median(). + for leaf in grower.finalized_leaves: + indices = leaf.sample_indices + if sample_weight is None: + sw = None + else: + sw = sample_weight[indices] + update = loss.fit_intercept_only( + y_true=y_true[indices] - raw_prediction[indices], + sample_weight=sw, + ) + leaf.value = grower.shrinkage * update + # Note that the regularization is ignored here + + +@contextmanager +def _patch_raw_predict(estimator, raw_predictions): + """Context manager that patches _raw_predict to return raw_predictions. + + `raw_predictions` is typically a precomputed array to avoid redundant + state-wise computations fitting with early stopping enabled: in this case + `raw_predictions` is incrementally updated whenever we add a tree to the + boosted ensemble. + + Note: this makes fitting HistGradientBoosting* models inherently non thread + safe at fit time. However thread-safety at fit time was never guaranteed nor + enforced for scikit-learn estimators in general. + + Thread-safety at prediction/transform time is another matter as those + operations are typically side-effect free and therefore often thread-safe by + default for most scikit-learn models and would like to keep it that way. + Therefore this context manager should only be used at fit time. + + TODO: in the future, we could explore the possibility to extend the scorer + public API to expose a way to compute vales from raw predictions. That would + probably require also making the scorer aware of the inverse link function + used by the estimator which is typically private API for now, hence the need + for this patching mechanism. 
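    A typical fit-time usage sketch (the names are illustrative, not the exact
    call sites)::

        with _patch_raw_predict(estimator, cached_raw_predictions):
            score = scorer(estimator, X_small_train, y_small_train)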
+ """ + orig_raw_predict = estimator._raw_predict + + def _patched_raw_predicts(*args, **kwargs): + return raw_predictions + + estimator._raw_predict = _patched_raw_predicts + yield estimator + estimator._raw_predict = orig_raw_predict + + +class BaseHistGradientBoosting(BaseEstimator, ABC): + """Base class for histogram-based gradient boosting estimators.""" + + _parameter_constraints: dict = { + "loss": [BaseLoss], + "learning_rate": [Interval(Real, 0, None, closed="neither")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], + "max_depth": [Interval(Integral, 1, None, closed="left"), None], + "min_samples_leaf": [Interval(Integral, 1, None, closed="left")], + "l2_regularization": [Interval(Real, 0, None, closed="left")], + "max_features": [Interval(RealNotInt, 0, 1, closed="right")], + "monotonic_cst": ["array-like", dict, None], + "interaction_cst": [ + list, + tuple, + StrOptions({"pairwise", "no_interactions"}), + None, + ], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left")], + "validation_fraction": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(Integral, 1, None, closed="left"), + None, + ], + "tol": [Interval(Real, 0, None, closed="left")], + "max_bins": [Interval(Integral, 2, 255, closed="both")], + "categorical_features": ["array-like", StrOptions({"from_dtype"}), None], + "warm_start": ["boolean"], + "early_stopping": [StrOptions({"auto"}), "boolean"], + "scoring": [str, callable, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + loss, + *, + learning_rate, + max_iter, + max_leaf_nodes, + max_depth, + min_samples_leaf, + l2_regularization, + max_features, + max_bins, + categorical_features, + monotonic_cst, + interaction_cst, + warm_start, + early_stopping, + scoring, + validation_fraction, + n_iter_no_change, + tol, + verbose, + random_state, + ): + self.loss = loss + self.learning_rate = learning_rate + self.max_iter = max_iter + self.max_leaf_nodes = max_leaf_nodes + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.l2_regularization = l2_regularization + self.max_features = max_features + self.max_bins = max_bins + self.monotonic_cst = monotonic_cst + self.interaction_cst = interaction_cst + self.categorical_features = categorical_features + self.warm_start = warm_start + self.early_stopping = early_stopping + self.scoring = scoring + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.tol = tol + self.verbose = verbose + self.random_state = random_state + + def _validate_parameters(self): + """Validate parameters passed to __init__. + + The parameters that are directly passed to the grower are checked in + TreeGrower.""" + if self.monotonic_cst is not None and self.n_trees_per_iteration_ != 1: + raise ValueError( + "monotonic constraints are not supported for multiclass classification." + ) + + def _finalize_sample_weight(self, sample_weight, y): + """Finalize sample weight. + + Used by subclasses to adjust sample_weights. This is useful for implementing + class weights. + """ + return sample_weight + + def _preprocess_X(self, X, *, reset): + """Preprocess and validate X. + + Parameters + ---------- + X : {array-like, pandas DataFrame} of shape (n_samples, n_features) + Input data. + + reset : bool + Whether to reset the `n_features_in_` and `feature_names_in_ attributes. 
+ + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Validated input data. + + known_categories : list of ndarray of shape (n_categories,) + List of known categories for each categorical feature. + """ + # If there is a preprocessor, we let the preprocessor handle the validation. + # Otherwise, we validate the data ourselves. + check_X_kwargs = dict(dtype=[X_DTYPE], ensure_all_finite=False) + if not reset: + if self._preprocessor is None: + return validate_data(self, X, reset=False, **check_X_kwargs) + return self._preprocessor.transform(X) + + # At this point, reset is True, i.e. we are processing the training data during `fit`. + self.is_categorical_ = self._check_categorical_features(X) + + if self.is_categorical_ is None: + self._preprocessor = None + self._is_categorical_remapped = None + + X = validate_data(self, X, **check_X_kwargs) + return X, None + + n_features = X.shape[1] + ordinal_encoder = OrdinalEncoder( + categories="auto", + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=np.nan, + dtype=X_DTYPE, + ) + + check_X = partial(check_array, **check_X_kwargs) + numerical_preprocessor = FunctionTransformer(check_X) + self._preprocessor = ColumnTransformer( + [ + ("encoder", ordinal_encoder, self.is_categorical_), + ("numerical", numerical_preprocessor, ~self.is_categorical_), + ] + ) + self._preprocessor.set_output(transform="default") + X = self._preprocessor.fit_transform(X) + # check categories found by the OrdinalEncoder and get their encoded values + known_categories = self._check_categories() + self.n_features_in_ = self._preprocessor.n_features_in_ + with suppress(AttributeError): + self.feature_names_in_ = self._preprocessor.feature_names_in_ + + # The ColumnTransformer's output places the categorical features at the + # beginning + categorical_remapped = np.zeros(n_features, dtype=bool) + categorical_remapped[self._preprocessor.output_indices_["encoder"]] = True + self._is_categorical_remapped = categorical_remapped + + return X, known_categories + + def _check_categories(self): + """Check categories found by the preprocessor and return their encoded values. + + Returns a list of length ``self.n_features_in_``, with one entry per + input feature. + + For non-categorical features, the corresponding entry is ``None``. + + For categorical features, the corresponding entry is an array + containing the categories as encoded by the preprocessor (an + ``OrdinalEncoder``), excluding missing values. The entry is therefore + ``np.arange(n_categories)`` where ``n_categories`` is the number of + unique values in the considered feature column, after removing missing + values. + + If ``n_categories > self.max_bins`` for any feature, a ``ValueError`` + is raised. + """ + encoder = self._preprocessor.named_transformers_["encoder"] + known_categories = [None] * self._preprocessor.n_features_in_ + categorical_column_indices = np.arange(self._preprocessor.n_features_in_)[ + self._preprocessor.output_indices_["encoder"] + ] + for feature_idx, categories in zip( + categorical_column_indices, encoder.categories_ + ): + # OrdinalEncoder always puts np.nan as the last category if the + # training data has missing values. Here we remove it because it is + # already added by the _BinMapper.
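+ # For example, a feature observed with values {"a", "b"} plus missing entries + # would typically yield categories_ == ["a", "b", nan] for that column; only + # ["a", "b"] are kept below and remapped to np.arange(2).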
+ if len(categories) and is_scalar_nan(categories[-1]): + categories = categories[:-1] + if categories.size > self.max_bins: + try: + feature_name = repr(encoder.feature_names_in_[feature_idx]) + except AttributeError: + feature_name = f"at index {feature_idx}" + raise ValueError( + f"Categorical feature {feature_name} is expected to " + f"have a cardinality <= {self.max_bins} but actually " + f"has a cardinality of {categories.size}." + ) + known_categories[feature_idx] = np.arange(len(categories), dtype=X_DTYPE) + return known_categories + + def _check_categorical_features(self, X): + """Check and validate categorical features in X + + Parameters + ---------- + X : {array-like, pandas DataFrame} of shape (n_samples, n_features) + Input data. + + Return + ------ + is_categorical : ndarray of shape (n_features,) or None, dtype=bool + Indicates whether a feature is categorical. If no feature is + categorical, this is None. + """ + # Special code for pandas because of a bug in recent pandas, which is + # fixed in main and maybe included in 2.2.1, see + # https://github.com/pandas-dev/pandas/pull/57173. + # Also pandas versions < 1.5.1 do not support the dataframe interchange + if _is_pandas_df(X): + X_is_dataframe = True + categorical_columns_mask = np.asarray(X.dtypes == "category") + elif hasattr(X, "__dataframe__"): + X_is_dataframe = True + categorical_columns_mask = np.asarray( + [ + c.dtype[0].name == "CATEGORICAL" + for c in X.__dataframe__().get_columns() + ] + ) + else: + X_is_dataframe = False + categorical_columns_mask = None + + categorical_features = self.categorical_features + + categorical_by_dtype = ( + isinstance(categorical_features, str) + and categorical_features == "from_dtype" + ) + no_categorical_dtype = categorical_features is None or ( + categorical_by_dtype and not X_is_dataframe + ) + + if no_categorical_dtype: + return None + + use_pandas_categorical = categorical_by_dtype and X_is_dataframe + if use_pandas_categorical: + categorical_features = categorical_columns_mask + else: + categorical_features = np.asarray(categorical_features) + + if categorical_features.size == 0: + return None + + if categorical_features.dtype.kind not in ("i", "b", "U", "O"): + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {categorical_features.dtype.name}." + ) + + if categorical_features.dtype.kind == "O": + types = set(type(f) for f in categorical_features) + if types != {str}: + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {', '.join(sorted(t.__name__ for t in types))}." + ) + + n_features = X.shape[1] + # At this point `validate_data` was not called yet because we use the original + # dtypes to discover the categorical features. Thus `feature_names_in_` + # is not defined yet. + feature_names_in_ = getattr(X, "columns", None) + + if categorical_features.dtype.kind in ("U", "O"): + # check for feature names + if feature_names_in_ is None: + raise ValueError( + "categorical_features should be passed as an array of " + "integers or as a boolean mask when the model is fitted " + "on data without feature names." + ) + is_categorical = np.zeros(n_features, dtype=bool) + feature_names = list(feature_names_in_) + for feature_name in categorical_features: + try: + is_categorical[feature_names.index(feature_name)] = True + except ValueError as e: + raise ValueError( + f"categorical_features has a item value '{feature_name}' " + "which is not a valid feature name of the training " + f"data. 
Observed feature names: {feature_names}" + ) from e + elif categorical_features.dtype.kind == "i": + # check for categorical features as indices + if ( + np.max(categorical_features) >= n_features + or np.min(categorical_features) < 0 + ): + raise ValueError( + "categorical_features set as integer " + "indices must be in [0, n_features - 1]" + ) + is_categorical = np.zeros(n_features, dtype=bool) + is_categorical[categorical_features] = True + else: + if categorical_features.shape[0] != n_features: + raise ValueError( + "categorical_features set as a boolean mask " + "must have shape (n_features,), got: " + f"{categorical_features.shape}" + ) + is_categorical = categorical_features + + if not np.any(is_categorical): + return None + return is_categorical + + def _check_interaction_cst(self, n_features): + """Check and validation for interaction constraints.""" + if self.interaction_cst is None: + return None + + if self.interaction_cst == "no_interactions": + interaction_cst = [[i] for i in range(n_features)] + elif self.interaction_cst == "pairwise": + interaction_cst = itertools.combinations(range(n_features), 2) + else: + interaction_cst = self.interaction_cst + + try: + constraints = [set(group) for group in interaction_cst] + except TypeError: + raise ValueError( + "Interaction constraints must be a sequence of tuples or lists, got:" + f" {self.interaction_cst!r}." + ) + + for group in constraints: + for x in group: + if not (isinstance(x, Integral) and 0 <= x < n_features): + raise ValueError( + "Interaction constraints must consist of integer indices in" + f" [0, n_features - 1] = [0, {n_features - 1}], specifying the" + " position of features, got invalid indices:" + f" {group!r}" + ) + + # Add all not listed features as own group by default. + rest = set(range(n_features)) - set().union(*constraints) + if len(rest) > 0: + constraints.append(rest) + + return constraints + + @_fit_context(prefer_skip_nested_validation=True) + def fit( + self, + X, + y, + sample_weight=None, + *, + X_val=None, + y_val=None, + sample_weight_val=None, + ): + """Fit the gradient boosting model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,) default=None + Weights of training data. + + .. versionadded:: 0.23 + + X_val : array-like of shape (n_val, n_features) + Additional sample of features for validation used in early stopping. + In a `Pipeline`, `X_val` can be transformed the same way as `X` with + `Pipeline(..., transform_input=["X_val"])`. + + .. versionadded:: 1.7 + + y_val : array-like of shape (n_samples,) + Additional sample of target values for validation used in early stopping. + + .. versionadded:: 1.7 + + sample_weight_val : array-like of shape (n_samples,) default=None + Additional weights for validation used in early stopping. + + .. versionadded:: 1.7 + + Returns + ------- + self : object + Fitted estimator. 
+ """ + fit_start_time = time() + acc_find_split_time = 0.0 # time spent finding the best splits + acc_apply_split_time = 0.0 # time spent splitting nodes + acc_compute_hist_time = 0.0 # time spent computing histograms + # time spent predicting X for gradient and hessians update + acc_prediction_time = 0.0 + X, known_categories = self._preprocess_X(X, reset=True) + y = _check_y(y, estimator=self) + y = self._encode_y(y) + check_consistent_length(X, y) + # Do not create unit sample weights by default to later skip some + # computation + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) + # TODO: remove when PDP supports sample weights + self._fitted_with_sw = True + + sample_weight = self._finalize_sample_weight(sample_weight, y) + + validation_data_provided = X_val is not None or y_val is not None + if validation_data_provided: + if y_val is None: + raise ValueError("X_val is provided, but y_val was not provided.") + if X_val is None: + raise ValueError("y_val is provided, but X_val was not provided.") + X_val = self._preprocess_X(X_val, reset=False) + y_val = _check_y(y_val, estimator=self) + y_val = self._encode_y_val(y_val) + check_consistent_length(X_val, y_val) + if sample_weight_val is not None: + sample_weight_val = _check_sample_weight( + sample_weight_val, X_val, dtype=np.float64 + ) + if self.early_stopping is False: + raise ValueError( + "X_val and y_val are passed to fit while at the same time " + "early_stopping is False. When passing X_val and y_val to fit," + "early_stopping should be set to either 'auto' or True." + ) + + # Note: At this point, we could delete self._label_encoder if it exists. + # But we don't to keep the code even simpler. + + rng = check_random_state(self.random_state) + + # When warm starting, we want to reuse the same seed that was used + # the first time fit was called (e.g. train/val split). + # For feature subsampling, we want to continue with the rng we started with. + if not self.warm_start or not self._is_fitted(): + self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8") + feature_subsample_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8") + self._feature_subsample_rng = np.random.default_rng(feature_subsample_seed) + + self._validate_parameters() + monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst) + # _preprocess_X places the categorical features at the beginning, + # change the order of monotonic_cst accordingly + if self.is_categorical_ is not None: + monotonic_cst_remapped = np.concatenate( + ( + monotonic_cst[self.is_categorical_], + monotonic_cst[~self.is_categorical_], + ) + ) + else: + monotonic_cst_remapped = monotonic_cst + + # used for validation in predict + n_samples, self._n_features = X.shape + + # Encode constraints into a list of sets of features indices (integers). + interaction_cst = self._check_interaction_cst(self._n_features) + + # we need this stateful variable to tell raw_predict() that it was + # called from fit() (this current method), and that the data it has + # received is pre-binned. + # predicting is faster on pre-binned data, so we want early stopping + # predictions to be made on pre-binned data. Unfortunately the _scorer + # can only call predict() or predict_proba(), not raw_predict(), and + # there's no way to tell the scorer that it needs to predict binned + # data. 
+ self._in_fit = True + + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + if isinstance(self.loss, str): + self._loss = self._get_loss(sample_weight=sample_weight) + elif isinstance(self.loss, BaseLoss): + self._loss = self.loss + + if self.early_stopping == "auto": + self.do_early_stopping_ = n_samples > 10_000 + else: + self.do_early_stopping_ = self.early_stopping + + # create validation data if needed + self._use_validation_data = ( + self.validation_fraction is not None or validation_data_provided + ) + if ( + self.do_early_stopping_ + and self._use_validation_data + and not validation_data_provided + ): + # stratify for classification + # instead of checking predict_proba, loss.n_classes >= 2 would also work + stratify = y if hasattr(self._loss, "predict_proba") else None + + # Save the state of the RNG for the training and validation split. + # This is needed in order to have the same split when using + # warm starting. + + if sample_weight is None: + X_train, X_val, y_train, y_val = train_test_split( + X, + y, + test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed, + ) + sample_weight_train = sample_weight_val = None + else: + # TODO: incorporate sample_weight in sampling here, as well as + # stratify + ( + X_train, + X_val, + y_train, + y_val, + sample_weight_train, + sample_weight_val, + ) = train_test_split( + X, + y, + sample_weight, + test_size=self.validation_fraction, + stratify=stratify, + random_state=self._random_seed, + ) + else: + X_train, y_train, sample_weight_train = X, y, sample_weight + if not validation_data_provided: + X_val = y_val = sample_weight_val = None + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + is_categorical=self._is_categorical_remapped, + known_categories=known_categories, + random_state=self._random_seed, + n_threads=n_threads, + ) + X_binned_train = self._bin_data(X_train, is_training_data=True) + if X_val is not None: + X_binned_val = self._bin_data(X_val, is_training_data=False) + else: + X_binned_val = None + + # Uses binned data to check for missing values + has_missing_values = ( + (X_binned_train == self._bin_mapper.missing_values_bin_idx_) + .any(axis=0) + .astype(np.uint8) + ) + + if self.verbose: + print("Fitting gradient boosted rounds:") + + n_samples = X_binned_train.shape[0] + scoring_is_predefined_string = self.scoring in _SCORERS + need_raw_predictions_val = X_binned_val is not None and ( + scoring_is_predefined_string or self.scoring == "loss" + ) + # First time calling fit, or no warm start + if not (self._is_fitted() and self.warm_start): + # Clear random state and score attributes + self._clear_state() + + # initialize raw_predictions: those are the accumulated values + # predicted by the trees for the training data. 
raw_predictions has + # shape (n_samples, n_trees_per_iteration) where + # n_trees_per_iterations is n_classes in multiclass classification, + # else 1. + # self._baseline_prediction has shape (1, n_trees_per_iteration) + self._baseline_prediction = self._loss.fit_intercept_only( + y_true=y_train, sample_weight=sample_weight_train + ).reshape((1, -1)) + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + raw_predictions += self._baseline_prediction + + # predictors is a matrix (list of lists) of TreePredictor objects + # with shape (n_iter_, n_trees_per_iteration) + self._predictors = predictors = [] + + # Initialize structures and attributes related to early stopping + self._scorer = None # set if scoring != loss + raw_predictions_val = None # set if use val and scoring is a string + self.train_score_ = [] + self.validation_score_ = [] + + if self.do_early_stopping_: + # populate train_score and validation_score with the + # predictions of the initial model (before the first tree) + + # Create raw_predictions_val for storing the raw predictions of + # the validation data. + if need_raw_predictions_val: + raw_predictions_val = np.zeros( + shape=(X_binned_val.shape[0], self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + + raw_predictions_val += self._baseline_prediction + + if self.scoring == "loss": + # we're going to compute scoring w.r.t the loss. As losses + # take raw predictions as input (unlike the scorers), we + # can optimize a bit and avoid repeating computing the + # predictions of the previous trees. We'll reuse + # raw_predictions (as it's needed for training anyway) for + # evaluating the training loss. + + self._check_early_stopping_loss( + raw_predictions=raw_predictions, + y_train=y_train, + sample_weight_train=sample_weight_train, + raw_predictions_val=raw_predictions_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + n_threads=n_threads, + ) + else: + self._scorer = check_scoring(self, self.scoring) + # _scorer is a callable with signature (est, X, y) and + # calls est.predict() or est.predict_proba() depending on + # its nature. + # Unfortunately, each call to _scorer() will compute + # the predictions of all the trees. So we use a subset of + # the training set to compute train scores. + + # Compute the subsample set + ( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + indices_small_train, + ) = self._get_small_trainset( + X_binned_train, + y_train, + sample_weight_train, + self._random_seed, + ) + + # If the scorer is a predefined string, then we optimize + # the evaluation by reusing the incrementally updated raw + # predictions. 
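+ # (predefined string scorers call predict()/predict_proba(), which rely on + # _raw_predict(); _score_with_raw_predictions then uses _patch_raw_predict so + # the cached raw predictions are returned instead of re-traversing every tree.)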
+ if scoring_is_predefined_string: + raw_predictions_small_train = raw_predictions[ + indices_small_train + ] + else: + raw_predictions_small_train = None + + self._check_early_stopping_scorer( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + X_binned_val, + y_val, + sample_weight_val, + raw_predictions_small_train=raw_predictions_small_train, + raw_predictions_val=raw_predictions_val, + ) + begin_at_stage = 0 + + # warm start: this is not the first time fit was called + else: + # Check that the maximum number of iterations is not smaller + # than the number of iterations from the previous fit + if self.max_iter < self.n_iter_: + raise ValueError( + "max_iter=%d must be larger than or equal to " + "n_iter_=%d when warm_start==True" % (self.max_iter, self.n_iter_) + ) + + # Convert array attributes to lists + self.train_score_ = self.train_score_.tolist() + self.validation_score_ = self.validation_score_.tolist() + + # Compute raw predictions + raw_predictions = self._raw_predict(X_binned_train, n_threads=n_threads) + if self.do_early_stopping_ and need_raw_predictions_val: + raw_predictions_val = self._raw_predict( + X_binned_val, n_threads=n_threads + ) + else: + raw_predictions_val = None + + if self.do_early_stopping_ and self.scoring != "loss": + # Compute the subsample set + ( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + indices_small_train, + ) = self._get_small_trainset( + X_binned_train, y_train, sample_weight_train, self._random_seed + ) + + # Get the predictors from the previous fit + predictors = self._predictors + + begin_at_stage = self.n_iter_ + + # initialize gradients and hessians (empty arrays). + # shape = (n_samples, n_trees_per_iteration). + gradient, hessian = self._loss.init_gradient_and_hessian( + n_samples=n_samples, dtype=G_H_DTYPE, order="F" + ) + + for iteration in range(begin_at_stage, self.max_iter): + if self.verbose >= 2: + iteration_start_time = time() + print( + "[{}/{}] ".format(iteration + 1, self.max_iter), end="", flush=True + ) + + # Update gradients and hessians, inplace + # Note that self._loss expects shape (n_samples,) for + # n_trees_per_iteration = 1 else shape (n_samples, n_trees_per_iteration). + if self._loss.constant_hessian: + self._loss.gradient( + y_true=y_train, + raw_prediction=raw_predictions, + sample_weight=sample_weight_train, + gradient_out=gradient, + n_threads=n_threads, + ) + else: + self._loss.gradient_hessian( + y_true=y_train, + raw_prediction=raw_predictions, + sample_weight=sample_weight_train, + gradient_out=gradient, + hessian_out=hessian, + n_threads=n_threads, + ) + + # Append a list since there may be more than 1 predictor per iter + predictors.append([]) + + # 2-d views of shape (n_samples, n_trees_per_iteration_) or (n_samples, 1) + # on gradient and hessian to simplify the loop over n_trees_per_iteration_. + if gradient.ndim == 1: + g_view = gradient.reshape((-1, 1)) + h_view = hessian.reshape((-1, 1)) + else: + g_view = gradient + h_view = hessian + + # Build `n_trees_per_iteration` trees. 
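+ # For instance, a 3-class classification problem grows 3 trees per boosting + # round, one per class, each fitted on column k of the gradient/hessian views; + # regression and binary classification grow a single tree per round.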
+ for k in range(self.n_trees_per_iteration_): + grower = TreeGrower( + X_binned=X_binned_train, + gradients=g_view[:, k], + hessians=h_view[:, k], + n_bins=n_bins, + n_bins_non_missing=self._bin_mapper.n_bins_non_missing_, + has_missing_values=has_missing_values, + is_categorical=self._is_categorical_remapped, + monotonic_cst=monotonic_cst_remapped, + interaction_cst=interaction_cst, + max_leaf_nodes=self.max_leaf_nodes, + max_depth=self.max_depth, + min_samples_leaf=self.min_samples_leaf, + l2_regularization=self.l2_regularization, + feature_fraction_per_split=self.max_features, + rng=self._feature_subsample_rng, + shrinkage=self.learning_rate, + n_threads=n_threads, + ) + grower.grow() + + acc_apply_split_time += grower.total_apply_split_time + acc_find_split_time += grower.total_find_split_time + acc_compute_hist_time += grower.total_compute_hist_time + + if not self._loss.differentiable: + _update_leaves_values( + loss=self._loss, + grower=grower, + y_true=y_train, + raw_prediction=raw_predictions[:, k], + sample_weight=sample_weight_train, + ) + + predictor = grower.make_predictor( + binning_thresholds=self._bin_mapper.bin_thresholds_ + ) + predictors[-1].append(predictor) + + # Update raw_predictions with the predictions of the newly + # created tree. + tic_pred = time() + _update_raw_predictions(raw_predictions[:, k], grower, n_threads) + toc_pred = time() + acc_prediction_time += toc_pred - tic_pred + + should_early_stop = False + if self.do_early_stopping_: + # Update raw_predictions_val with the newest tree(s) + if need_raw_predictions_val: + for k, pred in enumerate(self._predictors[-1]): + raw_predictions_val[:, k] += pred.predict_binned( + X_binned_val, + self._bin_mapper.missing_values_bin_idx_, + n_threads, + ) + + if self.scoring == "loss": + should_early_stop = self._check_early_stopping_loss( + raw_predictions=raw_predictions, + y_train=y_train, + sample_weight_train=sample_weight_train, + raw_predictions_val=raw_predictions_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + n_threads=n_threads, + ) + + else: + # If the scorer is a predefined string, then we optimize the + # evaluation by reusing the incrementally computed raw predictions. + if scoring_is_predefined_string: + raw_predictions_small_train = raw_predictions[ + indices_small_train + ] + else: + raw_predictions_small_train = None + + should_early_stop = self._check_early_stopping_scorer( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + X_binned_val, + y_val, + sample_weight_val, + raw_predictions_small_train=raw_predictions_small_train, + raw_predictions_val=raw_predictions_val, + ) + + if self.verbose >= 2: + self._print_iteration_stats(iteration_start_time) + + # maybe we could also early stop if all the trees are stumps? 
+ if should_early_stop: + break + + if self.verbose: + duration = time() - fit_start_time + n_total_leaves = sum( + predictor.get_n_leaf_nodes() + for predictors_at_ith_iteration in self._predictors + for predictor in predictors_at_ith_iteration + ) + n_predictors = sum( + len(predictors_at_ith_iteration) + for predictors_at_ith_iteration in self._predictors + ) + print( + "Fit {} trees in {:.3f} s, ({} total leaves)".format( + n_predictors, duration, n_total_leaves + ) + ) + print( + "{:<32} {:.3f}s".format( + "Time spent computing histograms:", acc_compute_hist_time + ) + ) + print( + "{:<32} {:.3f}s".format( + "Time spent finding best splits:", acc_find_split_time + ) + ) + print( + "{:<32} {:.3f}s".format( + "Time spent applying splits:", acc_apply_split_time + ) + ) + print( + "{:<32} {:.3f}s".format("Time spent predicting:", acc_prediction_time) + ) + + self.train_score_ = np.asarray(self.train_score_) + self.validation_score_ = np.asarray(self.validation_score_) + del self._in_fit # hard delete so we're sure it can't be used anymore + return self + + def _is_fitted(self): + return len(getattr(self, "_predictors", [])) > 0 + + def _clear_state(self): + """Clear the state of the gradient boosting model.""" + for var in ("train_score_", "validation_score_"): + if hasattr(self, var): + delattr(self, var) + + def _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, seed): + """Compute the indices of the subsample set and return this set. + + For efficiency, we need to subsample the training set to compute scores + with scorers. + """ + # TODO: incorporate sample_weights here in `resample` + subsample_size = 10000 + if X_binned_train.shape[0] > subsample_size: + indices = np.arange(X_binned_train.shape[0]) + stratify = y_train if is_classifier(self) else None + indices = resample( + indices, + n_samples=subsample_size, + replace=False, + random_state=seed, + stratify=stratify, + ) + X_binned_small_train = X_binned_train[indices] + y_small_train = y_train[indices] + if sample_weight_train is not None: + sample_weight_small_train = sample_weight_train[indices] + else: + sample_weight_small_train = None + X_binned_small_train = np.ascontiguousarray(X_binned_small_train) + return ( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + indices, + ) + else: + return X_binned_train, y_train, sample_weight_train, slice(None) + + def _check_early_stopping_scorer( + self, + X_binned_small_train, + y_small_train, + sample_weight_small_train, + X_binned_val, + y_val, + sample_weight_val, + raw_predictions_small_train=None, + raw_predictions_val=None, + ): + """Check if fitting should be early-stopped based on scorer. + + Scores are computed on validation data or on training data. 
+ """ + if is_classifier(self): + y_small_train = self.classes_[y_small_train.astype(int)] + + self.train_score_.append( + self._score_with_raw_predictions( + X_binned_small_train, + y_small_train, + sample_weight_small_train, + raw_predictions_small_train, + ) + ) + + if self._use_validation_data: + if is_classifier(self): + y_val = self.classes_[y_val.astype(int)] + self.validation_score_.append( + self._score_with_raw_predictions( + X_binned_val, y_val, sample_weight_val, raw_predictions_val + ) + ) + return self._should_stop(self.validation_score_) + else: + return self._should_stop(self.train_score_) + + def _score_with_raw_predictions(self, X, y, sample_weight, raw_predictions=None): + if raw_predictions is None: + patcher_raw_predict = nullcontext() + else: + patcher_raw_predict = _patch_raw_predict(self, raw_predictions) + + with patcher_raw_predict: + if sample_weight is None: + return self._scorer(self, X, y) + else: + return self._scorer(self, X, y, sample_weight=sample_weight) + + def _check_early_stopping_loss( + self, + raw_predictions, + y_train, + sample_weight_train, + raw_predictions_val, + y_val, + sample_weight_val, + n_threads=1, + ): + """Check if fitting should be early-stopped based on loss. + + Scores are computed on validation data or on training data. + """ + self.train_score_.append( + -self._loss( + y_true=y_train, + raw_prediction=raw_predictions, + sample_weight=sample_weight_train, + n_threads=n_threads, + ) + ) + + if self._use_validation_data: + self.validation_score_.append( + -self._loss( + y_true=y_val, + raw_prediction=raw_predictions_val, + sample_weight=sample_weight_val, + n_threads=n_threads, + ) + ) + return self._should_stop(self.validation_score_) + else: + return self._should_stop(self.train_score_) + + def _should_stop(self, scores): + """ + Return True (do early stopping) if the last n scores aren't better + than the (n-1)th-to-last score, up to some tolerance. + """ + reference_position = self.n_iter_no_change + 1 + if len(scores) < reference_position: + return False + + # A higher score is always better. Higher tol means that it will be + # harder for subsequent iteration to be considered an improvement upon + # the reference score, and therefore it is more likely to early stop + # because of the lack of significant improvement. + reference_score = scores[-reference_position] + self.tol + recent_scores = scores[-reference_position + 1 :] + recent_improvements = [score > reference_score for score in recent_scores] + return not any(recent_improvements) + + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. 
+ """ + + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def _print_iteration_stats(self, iteration_start_time): + """Print info about the current fitting iteration.""" + log_msg = "" + + predictors_of_ith_iteration = [ + predictors_list + for predictors_list in self._predictors[-1] + if predictors_list + ] + n_trees = len(predictors_of_ith_iteration) + max_depth = max( + predictor.get_max_depth() for predictor in predictors_of_ith_iteration + ) + n_leaves = sum( + predictor.get_n_leaf_nodes() for predictor in predictors_of_ith_iteration + ) + + if n_trees == 1: + log_msg += "{} tree, {} leaves, ".format(n_trees, n_leaves) + else: + log_msg += "{} trees, {} leaves ".format(n_trees, n_leaves) + log_msg += "({} on avg), ".format(int(n_leaves / n_trees)) + + log_msg += "max depth = {}, ".format(max_depth) + + if self.do_early_stopping_: + if self.scoring == "loss": + factor = -1 # score_ arrays contain the negative loss + name = "loss" + else: + factor = 1 + name = "score" + log_msg += "train {}: {:.5f}, ".format(name, factor * self.train_score_[-1]) + if self._use_validation_data: + log_msg += "val {}: {:.5f}, ".format( + name, factor * self.validation_score_[-1] + ) + + iteration_time = time() - iteration_start_time + log_msg += "in {:0.3f}s".format(iteration_time) + + print(log_msg) + + def _raw_predict(self, X, n_threads=None): + """Return the sum of the leaves values over all predictors. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads use, which takes cgroups CPU + quotes into account. See the docstring of `_openmp_effective_n_threads` + for details. + + Returns + ------- + raw_predictions : array, shape (n_samples, n_trees_per_iteration) + The raw predicted values. + """ + check_is_fitted(self) + is_binned = getattr(self, "_in_fit", False) + if not is_binned: + X = self._preprocess_X(X, reset=False) + + n_samples = X.shape[0] + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + raw_predictions += self._baseline_prediction + + # We intentionally decouple the number of threads used at prediction + # time from the number of threads used at fit time because the model + # can be deployed on a different machine for prediction purposes. 
+ n_threads = _openmp_effective_n_threads(n_threads) + self._predict_iterations( + X, self._predictors, raw_predictions, is_binned, n_threads + ) + return raw_predictions + + def _predict_iterations(self, X, predictors, raw_predictions, is_binned, n_threads): + """Add the predictions of the predictors to raw_predictions.""" + if not is_binned: + ( + known_cat_bitsets, + f_idx_map, + ) = self._bin_mapper.make_known_categories_bitsets() + + for predictors_of_ith_iteration in predictors: + for k, predictor in enumerate(predictors_of_ith_iteration): + if is_binned: + predict = partial( + predictor.predict_binned, + missing_values_bin_idx=self._bin_mapper.missing_values_bin_idx_, + n_threads=n_threads, + ) + else: + predict = partial( + predictor.predict, + known_cat_bitsets=known_cat_bitsets, + f_idx_map=f_idx_map, + n_threads=n_threads, + ) + raw_predictions[:, k] += predict(X) + + def _staged_raw_predict(self, X): + """Compute raw predictions of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + raw_predictions : generator of ndarray of shape \ + (n_samples, n_trees_per_iteration) + The raw predictions of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + X = self._preprocess_X(X, reset=False) + if X.shape[1] != self._n_features: + raise ValueError( + "X has {} features but this estimator was trained with " + "{} features.".format(X.shape[1], self._n_features) + ) + n_samples = X.shape[0] + raw_predictions = np.zeros( + shape=(n_samples, self.n_trees_per_iteration_), + dtype=self._baseline_prediction.dtype, + order="F", + ) + raw_predictions += self._baseline_prediction + + # We intentionally decouple the number of threads used at prediction + # time from the number of threads used at fit time because the model + # can be deployed on a different machine for prediction purposes. + n_threads = _openmp_effective_n_threads() + for iteration in range(len(self._predictors)): + self._predict_iterations( + X, + self._predictors[iteration : iteration + 1], + raw_predictions, + is_binned=False, + n_threads=n_threads, + ) + yield raw_predictions.copy() + + def _compute_partial_dependence_recursion(self, grid, target_features): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray, shape (n_samples, n_target_features), dtype=np.float32 + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray, shape (n_target_features), dtype=np.intp + The set of target features for which the partial dependence + should be evaluated. + + Returns + ------- + averaged_predictions : ndarray, shape \ + (n_trees_per_iteration, n_samples) + The value of the partial dependence function on each grid point. 
+ """ + + if getattr(self, "_fitted_with_sw", False): + raise NotImplementedError( + "{} does not support partial dependence " + "plots with the 'recursion' method when " + "sample weights were given during fit " + "time.".format(self.__class__.__name__) + ) + + grid = np.asarray(grid, dtype=X_DTYPE, order="C") + averaged_predictions = np.zeros( + (self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE + ) + target_features = np.asarray(target_features, dtype=np.intp, order="C") + + for predictors_of_ith_iteration in self._predictors: + for k, predictor in enumerate(predictors_of_ith_iteration): + predictor.compute_partial_dependence( + grid, target_features, averaged_predictions[k] + ) + # Note that the learning rate is already accounted for in the leaves + # values. + + return averaged_predictions + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + @abstractmethod + def _get_loss(self, sample_weight): + pass + + @abstractmethod + def _encode_y(self, y=None): + pass # pragma: no cover + + @abstractmethod + def _encode_y_val(self, y=None): + pass # pragma: no cover + + @property + def n_iter_(self): + """Number of iterations of the boosting process.""" + check_is_fitted(self) + return len(self._predictors) + + +class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): + """Histogram-based Gradient Boosting Regression Tree. + + This estimator is much faster than + :class:`GradientBoostingRegressor` + for big datasets (n_samples >= 10 000). + + This estimator has native support for missing values (NaNs). During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + See :ref:`sphx_glr_auto_examples_ensemble_plot_hgbt_regression.py` for a + usecase example of this feature. + + This implementation is inspired by + `LightGBM `_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + Parameters + ---------- + loss : {'squared_error', 'absolute_error', 'gamma', 'poisson', 'quantile'}, \ + default='squared_error' + The loss function to use in the boosting process. Note that the + "squared error", "gamma" and "poisson" losses actually implement + "half least squares loss", "half gamma deviance" and "half poisson + deviance" to simplify the computation of the gradient. Furthermore, + "gamma" and "poisson" losses internally use a log-link, "gamma" + requires ``y > 0`` and "poisson" requires ``y >= 0``. + "quantile" uses the pinball loss. + + .. versionchanged:: 0.23 + Added option 'poisson'. + + .. versionchanged:: 1.1 + Added option 'quantile'. + + .. versionchanged:: 1.3 + Added option 'gamma'. + + quantile : float, default=None + If loss is "quantile", this parameter specifies which quantile to be estimated + and must be between 0 and 1. + learning_rate : float, default=0.1 + The learning rate, also known as *shrinkage*. This is used as a + multiplicative factor for the leaves values. Use ``1`` for no + shrinkage. + max_iter : int, default=100 + The maximum number of iterations of the boosting process, i.e. the + maximum number of trees. 
+ max_leaf_nodes : int or None, default=31 + The maximum number of leaves for each tree. Must be strictly greater + than 1. If None, there is no maximum limit. + max_depth : int or None, default=None + The maximum depth of each tree. The depth of a tree is the number of + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. + min_samples_leaf : int, default=20 + The minimum number of samples per leaf. For small datasets with less + than a few hundred samples, it is recommended to lower this value + since only very shallow trees would be built. + l2_regularization : float, default=0 + The L2 regularization parameter penalizing leaves with small hessians. + Use ``0`` for no regularization (default). + max_features : float, default=1.0 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + If interaction constraints from `interaction_cst` are present, only allowed + features are taken into account for the subsampling. + + .. versionadded:: 1.4 + + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. + categorical_features : array-like of {bool, int, str} of shape (n_features) \ + or shape (n_categorical_features,), default='from_dtype' + Indicates the categorical features. + + - None : no feature will be considered categorical. + - boolean array-like : boolean mask indicating categorical features. + - integer array-like : integer indices indicating categorical + features. + - str array-like: names of categorical features (assuming the training + data has feature names). + - `"from_dtype"`: dataframe columns with dtype "category" are + considered to be categorical features. The input must be an object + exposing a ``__dataframe__`` method such as pandas or polars + DataFrames to use this feature. + + For each categorical feature, there must be at most `max_bins` unique + categories. Negative values for categorical features encoded as numeric + dtypes are treated as missing values. All categorical values are + converted to floating point numbers. This means that categorical values + of 1.0 and 1 are treated as the same category. + + Read more in the :ref:`User Guide ` and + :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`. + + .. versionadded:: 0.24 + + .. versionchanged:: 1.2 + Added support for feature names. + + .. versionchanged:: 1.4 + Added `"from_dtype"` option. + + .. versionchanged:: 1.6 + The default value changed from `None` to `"from_dtype"`. + + monotonic_cst : array-like of int of shape (n_features) or dict, default=None + Monotonic constraint to enforce on each feature are specified using the + following integer values: + + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If a dict with str keys, map feature to monotonic constraints by name. + If an array, the features are mapped to constraints by position. See + :ref:`monotonic_cst_features_names` for a usage example. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + .. 
versionchanged:: 1.2 + Accept dict of constraints with feature names as keys. + + interaction_cst : {"pairwise", "no_interactions"} or sequence of lists/tuples/sets \ + of int, default=None + Specify interaction constraints, the sets of features which can + interact with each other in child node splits. + + Each item specifies the set of feature indices that are allowed + to interact with each other. If there are more features than + specified in these constraints, they are treated as if they were + specified as an additional set. + + The strings "pairwise" and "no_interactions" are shorthands for + allowing only pairwise or no interactions, respectively. + + For instance, with 5 features in total, `interaction_cst=[{0, 1}]` + is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, + and specifies that each branch of a tree will either only split + on features 0 and 1 or only split on features 2, 3 and 4. + + See :ref:`this example` on how to use `interaction_cst`. + + .. versionadded:: 1.2 + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble. For results to be valid, the + estimator should be re-trained on the same data only. + See :term:`the Glossary `. + early_stopping : 'auto' or bool, default='auto' + If 'auto', early stopping is enabled if the sample size is larger than + 10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping + is enabled, otherwise early stopping is disabled. + + .. versionadded:: 0.23 + + scoring : str or callable or None, default='loss' + Scoring method to use for early stopping. Only used if `early_stopping` + is enabled. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the :ref:`coefficient of determination ` + (:math:`R^2`) is used. + - 'loss': early stopping is checked w.r.t the loss value. + + validation_fraction : int or float or None, default=0.1 + Proportion (or absolute size) of training data to set aside as + validation data for early stopping. If None, early stopping is done on + the training data. + The value is ignored if either early stopping is not performed, e.g. + `early_stopping=False`, or if `X_val` and `y_val` are passed to fit. + n_iter_no_change : int, default=10 + Used to determine when to "early stop". The fitting process is + stopped when none of the last ``n_iter_no_change`` scores are better + than the ``n_iter_no_change - 1`` -th-to-last one, up to some + tolerance. Only used if early stopping is performed. + tol : float, default=1e-7 + The absolute tolerance to use when comparing scores during early + stopping. The higher the tolerance, the more likely we are to early + stop: higher tolerance means that it will be harder for subsequent + iterations to be considered an improvement upon the reference score. + verbose : int, default=0 + The verbosity level. If not zero, print some information about the + fitting process. ``1`` prints only summary info, ``2`` prints info per + iteration. + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the subsampling in the + binning process, and the train/validation data split if early stopping + is enabled. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ + Attributes + ---------- + do_early_stopping_ : bool + Indicates whether early stopping is used during training. + n_iter_ : int + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. + n_trees_per_iteration_ : int + The number of trees that are built at each iteration. For regressors, + this is always 1. + train_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the training data. The first entry + is the score of the ensemble before the first iteration. Scores are + computed according to the ``scoring`` parameter. If ``scoring`` is + not 'loss', scores are computed on a subset of at most 10 000 + samples. Empty if no early stopping. + validation_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the held-out validation data. The + first entry is the score of the ensemble before the first iteration. + Scores are computed according to the ``scoring`` parameter. Empty if + no early stopping or if ``validation_fraction`` is None. + is_categorical_ : ndarray, shape (n_features, ) or None + Boolean mask for the categorical features. ``None`` if there are no + categorical features. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GradientBoostingRegressor : Exact gradient boosting method that does not + scale as well on datasets with a large number of samples. + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + RandomForestRegressor : A meta-estimator that fits a number of decision + tree regressors on various sub-samples of the dataset and uses + averaging to improve the statistical performance and control + over-fitting. + AdaBoostRegressor : A meta-estimator that begins by fitting a regressor + on the original dataset and then fits additional copies of the + regressor on the same dataset but where the weights of instances are + adjusted according to the error of the current prediction. As such, + subsequent regressors focus more on difficult cases. + + Examples + -------- + >>> from sklearn.ensemble import HistGradientBoostingRegressor + >>> from sklearn.datasets import load_diabetes + >>> X, y = load_diabetes(return_X_y=True) + >>> est = HistGradientBoostingRegressor().fit(X, y) + >>> est.score(X, y) + 0.92...
+ """ + + _parameter_constraints: dict = { + **BaseHistGradientBoosting._parameter_constraints, + "loss": [ + StrOptions( + { + "squared_error", + "absolute_error", + "poisson", + "gamma", + "quantile", + } + ), + BaseLoss, + ], + "quantile": [Interval(Real, 0, 1, closed="both"), None], + } + + def __init__( + self, + loss="squared_error", + *, + quantile=None, + learning_rate=0.1, + max_iter=100, + max_leaf_nodes=31, + max_depth=None, + min_samples_leaf=20, + l2_regularization=0.0, + max_features=1.0, + max_bins=255, + categorical_features="from_dtype", + monotonic_cst=None, + interaction_cst=None, + warm_start=False, + early_stopping="auto", + scoring="loss", + validation_fraction=0.1, + n_iter_no_change=10, + tol=1e-7, + verbose=0, + random_state=None, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + max_iter=max_iter, + max_leaf_nodes=max_leaf_nodes, + max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + l2_regularization=l2_regularization, + max_features=max_features, + max_bins=max_bins, + monotonic_cst=monotonic_cst, + interaction_cst=interaction_cst, + categorical_features=categorical_features, + early_stopping=early_stopping, + warm_start=warm_start, + scoring=scoring, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + verbose=verbose, + random_state=random_state, + ) + self.quantile = quantile + + def predict(self, X): + """Predict values for X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted values. + """ + check_is_fitted(self) + # Return inverse link of raw predictions after converting + # shape (n_samples, 1) to (n_samples,) + return self._loss.link.inverse(self._raw_predict(X).ravel()) + + def staged_predict(self, X): + """Predict regression target for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted values of the input samples, for each iteration. + """ + for raw_predictions in self._staged_raw_predict(X): + yield self._loss.link.inverse(raw_predictions.ravel()) + + def _encode_y(self, y): + # Just convert y to the expected dtype + self.n_trees_per_iteration_ = 1 + y = y.astype(Y_DTYPE, copy=False) + if self.loss == "gamma": + # Ensure y > 0 + if not np.all(y > 0): + raise ValueError("loss='gamma' requires strictly positive y.") + elif self.loss == "poisson": + # Ensure y >= 0 and sum(y) > 0 + if not (np.all(y >= 0) and np.sum(y) > 0): + raise ValueError( + "loss='poisson' requires non-negative y and sum(y) > 0." + ) + return y + + def _encode_y_val(self, y=None): + return self._encode_y(y) + + def _get_loss(self, sample_weight): + if self.loss == "quantile": + return _LOSSES[self.loss]( + sample_weight=sample_weight, quantile=self.quantile + ) + else: + return _LOSSES[self.loss](sample_weight=sample_weight) + + +class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): + """Histogram-based Gradient Boosting Classification Tree. + + This estimator is much faster than + :class:`GradientBoostingClassifier` + for big datasets (n_samples >= 10 000). + + This estimator has native support for missing values (NaNs). 
During + training, the tree grower learns at each split point whether samples + with missing values should go to the left or right child, based on the + potential gain. When predicting, samples with missing values are + assigned to the left or right child consequently. If no missing values + were encountered for a given feature during training, then samples with + missing values are mapped to whichever child has the most samples. + + This implementation is inspired by + `LightGBM `_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + Parameters + ---------- + loss : {'log_loss'}, default='log_loss' + The loss function to use in the boosting process. + + For binary classification problems, 'log_loss' is also known as logistic loss, + binomial deviance or binary crossentropy. Internally, the model fits one tree + per boosting iteration and uses the logistic sigmoid function (expit) as + inverse link function to compute the predicted positive class probability. + + For multiclass classification problems, 'log_loss' is also known as multinomial + deviance or categorical crossentropy. Internally, the model fits one tree per + boosting iteration and per class and uses the softmax function as inverse link + function to compute the predicted probabilities of the classes. + + learning_rate : float, default=0.1 + The learning rate, also known as *shrinkage*. This is used as a + multiplicative factor for the leaves values. Use ``1`` for no + shrinkage. + max_iter : int, default=100 + The maximum number of iterations of the boosting process, i.e. the + maximum number of trees for binary classification. For multiclass + classification, `n_classes` trees per iteration are built. + max_leaf_nodes : int or None, default=31 + The maximum number of leaves for each tree. Must be strictly greater + than 1. If None, there is no maximum limit. + max_depth : int or None, default=None + The maximum depth of each tree. The depth of a tree is the number of + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. + min_samples_leaf : int, default=20 + The minimum number of samples per leaf. For small datasets with less + than a few hundred samples, it is recommended to lower this value + since only very shallow trees would be built. + l2_regularization : float, default=0 + The L2 regularization parameter penalizing leaves with small hessians. + Use ``0`` for no regularization (default). + max_features : float, default=1.0 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + If interaction constraints from `interaction_cst` are present, only allowed + features are taken into account for the subsampling. + + .. versionadded:: 1.4 + + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Before + training, each feature of the input array `X` is binned into + integer-valued bins, which allows for a much faster training stage. + Features with a small number of unique values may use less than + ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin + is always reserved for missing values. Must be no larger than 255. + categorical_features : array-like of {bool, int, str} of shape (n_features) \ + or shape (n_categorical_features,), default='from_dtype' + Indicates the categorical features. + + - None : no feature will be considered categorical. 
+ - boolean array-like : boolean mask indicating categorical features. + - integer array-like : integer indices indicating categorical + features. + - str array-like: names of categorical features (assuming the training + data has feature names). + - `"from_dtype"`: dataframe columns with dtype "category" are + considered to be categorical features. The input must be an object + exposing a ``__dataframe__`` method such as pandas or polars + DataFrames to use this feature. + + For each categorical feature, there must be at most `max_bins` unique + categories. Negative values for categorical features encoded as numeric + dtypes are treated as missing values. All categorical values are + converted to floating point numbers. This means that categorical values + of 1.0 and 1 are treated as the same category. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + .. versionchanged:: 1.2 + Added support for feature names. + + .. versionchanged:: 1.4 + Added `"from_dtype"` option. + + .. versionchanged:: 1.6 + The default value changed from `None` to `"from_dtype"`. + + monotonic_cst : array-like of int of shape (n_features) or dict, default=None + Monotonic constraint to enforce on each feature are specified using the + following integer values: + + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + If a dict with str keys, map feature to monotonic constraints by name. + If an array, the features are mapped to constraints by position. See + :ref:`monotonic_cst_features_names` for a usage example. + + The constraints are only valid for binary classifications and hold + over the probability of the positive class. + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + .. versionchanged:: 1.2 + Accept dict of constraints with feature names as keys. + + interaction_cst : {"pairwise", "no_interactions"} or sequence of lists/tuples/sets \ + of int, default=None + Specify interaction constraints, the sets of features which can + interact with each other in child node splits. + + Each item specifies the set of feature indices that are allowed + to interact with each other. If there are more features than + specified in these constraints, they are treated as if they were + specified as an additional set. + + The strings "pairwise" and "no_interactions" are shorthands for + allowing only pairwise or no interactions, respectively. + + For instance, with 5 features in total, `interaction_cst=[{0, 1}]` + is equivalent to `interaction_cst=[{0, 1}, {2, 3, 4}]`, + and specifies that each branch of a tree will either only split + on features 0 and 1 or only split on features 2, 3 and 4. + + See :ref:`this example` on how to use `interaction_cst`. + + .. versionadded:: 1.2 + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble. For results to be valid, the + estimator should be re-trained on the same data only. + See :term:`the Glossary `. + early_stopping : 'auto' or bool, default='auto' + If 'auto', early stopping is enabled if the sample size is larger than + 10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping + is enabled, otherwise early stopping is disabled. + + .. versionadded:: 0.23 + + scoring : str or callable or None, default='loss' + Scoring method to use for early stopping. Only used if `early_stopping` + is enabled. Options: + + - str: see :ref:`scoring_string_names` for options. 
+ - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: :ref:`accuracy ` is used. + - 'loss': early stopping is checked w.r.t the loss value. + + validation_fraction : int or float or None, default=0.1 + Proportion (or absolute size) of training data to set aside as + validation data for early stopping. If None, early stopping is done on + the training data. + The value is ignored if either early stopping is not performed, e.g. + `early_stopping=False`, or if `X_val` and `y_val` are passed to fit. + n_iter_no_change : int, default=10 + Used to determine when to "early stop". The fitting process is + stopped when none of the last ``n_iter_no_change`` scores are better + than the ``n_iter_no_change - 1`` -th-to-last one, up to some + tolerance. Only used if early stopping is performed. + tol : float, default=1e-7 + The absolute tolerance to use when comparing scores. The higher the + tolerance, the more likely we are to early stop: higher tolerance + means that it will be harder for subsequent iterations to be + considered an improvement upon the reference score. + verbose : int, default=0 + The verbosity level. If not zero, print some information about the + fitting process. ``1`` prints only summary info, ``2`` prints info per + iteration. + random_state : int, RandomState instance or None, default=None + Pseudo-random number generator to control the subsampling in the + binning process, and the train/validation data split if early stopping + is enabled. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form `{class_label: weight}`. + If not given, all classes are supposed to have weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as `n_samples / (n_classes * np.bincount(y))`. + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if `sample_weight` is specified. + + .. versionadded:: 1.2 + + Attributes + ---------- + classes_ : array, shape = (n_classes,) + Class labels. + do_early_stopping_ : bool + Indicates whether early stopping is used during training. + n_iter_ : int + The number of iterations as selected by early stopping, depending on + the `early_stopping` parameter. Otherwise it corresponds to max_iter. + n_trees_per_iteration_ : int + The number of tree that are built at each iteration. This is equal to 1 + for binary classification, and to ``n_classes`` for multiclass + classification. + train_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the training data. The first entry + is the score of the ensemble before the first iteration. Scores are + computed according to the ``scoring`` parameter. If ``scoring`` is + not 'loss', scores are computed on a subset of at most 10 000 + samples. Empty if no early stopping. + validation_score_ : ndarray, shape (n_iter_+1,) + The scores at each iteration on the held-out validation data. The + first entry is the score of the ensemble before the first iteration. + Scores are computed according to the ``scoring`` parameter. Empty if + no early stopping or if ``validation_fraction`` is None. + is_categorical_ : ndarray, shape (n_features, ) or None + Boolean mask for the categorical features. ``None`` if there are no + categorical features. 
+ n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GradientBoostingClassifier : Exact gradient boosting method that does not + scale as good on datasets with a large number of samples. + sklearn.tree.DecisionTreeClassifier : A decision tree classifier. + RandomForestClassifier : A meta-estimator that fits a number of decision + tree classifiers on various sub-samples of the dataset and uses + averaging to improve the predictive accuracy and control over-fitting. + AdaBoostClassifier : A meta-estimator that begins by fitting a classifier + on the original dataset and then fits additional copies of the + classifier on the same dataset where the weights of incorrectly + classified instances are adjusted such that subsequent classifiers + focus more on difficult cases. + + Examples + -------- + >>> from sklearn.ensemble import HistGradientBoostingClassifier + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> clf = HistGradientBoostingClassifier().fit(X, y) + >>> clf.score(X, y) + 1.0 + """ + + _parameter_constraints: dict = { + **BaseHistGradientBoosting._parameter_constraints, + "loss": [StrOptions({"log_loss"}), BaseLoss], + "class_weight": [dict, StrOptions({"balanced"}), None], + } + + def __init__( + self, + loss="log_loss", + *, + learning_rate=0.1, + max_iter=100, + max_leaf_nodes=31, + max_depth=None, + min_samples_leaf=20, + l2_regularization=0.0, + max_features=1.0, + max_bins=255, + categorical_features="from_dtype", + monotonic_cst=None, + interaction_cst=None, + warm_start=False, + early_stopping="auto", + scoring="loss", + validation_fraction=0.1, + n_iter_no_change=10, + tol=1e-7, + verbose=0, + random_state=None, + class_weight=None, + ): + super().__init__( + loss=loss, + learning_rate=learning_rate, + max_iter=max_iter, + max_leaf_nodes=max_leaf_nodes, + max_depth=max_depth, + min_samples_leaf=min_samples_leaf, + l2_regularization=l2_regularization, + max_features=max_features, + max_bins=max_bins, + categorical_features=categorical_features, + monotonic_cst=monotonic_cst, + interaction_cst=interaction_cst, + warm_start=warm_start, + early_stopping=early_stopping, + scoring=scoring, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + tol=tol, + verbose=verbose, + random_state=random_state, + ) + self.class_weight = class_weight + + def _finalize_sample_weight(self, sample_weight, y): + """Adjust sample_weights with class_weights.""" + if self.class_weight is None: + return sample_weight + + expanded_class_weight = compute_sample_weight(self.class_weight, y) + + if sample_weight is not None: + return sample_weight * expanded_class_weight + else: + return expanded_class_weight + + def predict(self, X): + """Predict classes for X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted classes. + """ + # TODO: This could be done in parallel + raw_predictions = self._raw_predict(X) + if raw_predictions.shape[1] == 1: + # np.argmax([0.5, 0.5]) is 0, not 1. Therefore "> 0" not ">= 0" to be + # consistent with the multiclass case. 
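+            # For binary problems, the single raw prediction column holds the
+            # log-odds of the positive class, so "> 0" is equivalent to a
+            # predicted positive class probability strictly above 0.5.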
+ encoded_classes = (raw_predictions.ravel() > 0).astype(int) + else: + encoded_classes = np.argmax(raw_predictions, axis=1) + return self.classes_[encoded_classes] + + def staged_predict(self, X): + """Predict classes at each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted classes of the input samples, for each iteration. + """ + for raw_predictions in self._staged_raw_predict(X): + if raw_predictions.shape[1] == 1: + # np.argmax([0, 0]) is 0, not 1, therefore "> 0" not ">= 0" + encoded_classes = (raw_predictions.ravel() > 0).astype(int) + else: + encoded_classes = np.argmax(raw_predictions, axis=1) + yield self.classes_.take(encoded_classes, axis=0) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + p : ndarray, shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + raw_predictions = self._raw_predict(X) + return self._loss.predict_proba(raw_predictions) + + def staged_predict_proba(self, X): + """Predict class probabilities at each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted class probabilities of the input samples, + for each iteration. + """ + for raw_predictions in self._staged_raw_predict(X): + yield self._loss.predict_proba(raw_predictions) + + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + + Returns + ------- + decision : ndarray, shape (n_samples,) or \ + (n_samples, n_trees_per_iteration) + The raw predicted values (i.e. the sum of the trees leaves) for + each sample. n_trees_per_iteration is equal to the number of + classes in multiclass classification. + """ + decision = self._raw_predict(X) + if decision.shape[1] == 1: + decision = decision.ravel() + return decision + + def staged_decision_function(self, X): + """Compute decision function of ``X`` for each iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each stage. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Yields + ------ + decision : generator of ndarray of shape (n_samples,) or \ + (n_samples, n_trees_per_iteration) + The decision function of the input samples, which corresponds to + the raw values predicted from the trees of the ensemble . The + classes corresponds to that in the attribute :term:`classes_`. + """ + for staged_decision in self._staged_raw_predict(X): + if staged_decision.shape[1] == 1: + staged_decision = staged_decision.ravel() + yield staged_decision + + def _encode_y(self, y): + """Create self._label_encoder and encode y correspondingly.""" + # encode classes into 0 ... n_classes - 1 and sets attributes classes_ + # and n_trees_per_iteration_ + check_classification_targets(y) + + # We need to store the label encoder in case y_val needs to be label encoded, + # too. 
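+        # For example, y = ["spam", "ham", "spam"] is encoded as [1, 0, 1] with
+        # classes_ == ["ham", "spam"], since LabelEncoder sorts the labels.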
+ self._label_encoder = LabelEncoder() + encoded_y = self._label_encoder.fit_transform(y) + self.classes_ = self._label_encoder.classes_ + n_classes = self.classes_.shape[0] + # only 1 tree for binary classification. For multiclass classification, + # we build 1 tree per class. + self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes + encoded_y = encoded_y.astype(Y_DTYPE, copy=False) + return encoded_y + + def _encode_y_val(self, y): + encoded_y = self._label_encoder.transform(y) + return encoded_y.astype(Y_DTYPE, copy=False) + + def _get_loss(self, sample_weight): + # At this point self.loss == "log_loss" + if self.n_trees_per_iteration_ == 1: + return HalfBinomialLoss(sample_weight=sample_weight) + else: + return HalfMultinomialLoss( + sample_weight=sample_weight, n_classes=self.n_trees_per_iteration_ + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/grower.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/grower.py new file mode 100644 index 0000000000000000000000000000000000000000..c3dbbe7d82948412cf0c567cbc672cb018ee9817 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -0,0 +1,821 @@ +""" +This module contains the TreeGrower class. + +TreeGrower builds a regression tree fitting a Newton-Raphson step, based on +the gradients and hessians of the training data. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +from heapq import heappop, heappush +from timeit import default_timer as time + +import numpy as np + +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +from ._bitset import set_raw_bitset_from_binned_bitset +from .common import ( + PREDICTOR_RECORD_DTYPE, + X_BITSET_INNER_DTYPE, + MonotonicConstraint, +) +from .histogram import HistogramBuilder +from .predictor import TreePredictor +from .splitting import Splitter + + +class TreeNode: + """Tree Node class used in TreeGrower. + + This isn't used for prediction purposes, only for training (see + TreePredictor). + + Parameters + ---------- + depth : int + The depth of the node, i.e. its distance from the root. + sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint32 + The indices of the samples at the node. + partition_start : int + start position of the node's sample_indices in splitter.partition. + partition_stop : int + stop position of the node's sample_indices in splitter.partition. + sum_gradients : float + The sum of the gradients of the samples at the node. + sum_hessians : float + The sum of the hessians of the samples at the node. + + Attributes + ---------- + depth : int + The depth of the node, i.e. its distance from the root. + sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint32 + The indices of the samples at the node. + sum_gradients : float + The sum of the gradients of the samples at the node. + sum_hessians : float + The sum of the hessians of the samples at the node. + split_info : SplitInfo or None + The result of the split evaluation. + is_leaf : bool + True if node is a leaf + left_child : TreeNode or None + The left child of the node. None for leaves. + right_child : TreeNode or None + The right child of the node. None for leaves. + value : float or None + The value of the leaf, as computed in finalize_leaf(). None for + non-leaf nodes. + partition_start : int + start position of the node's sample_indices in splitter.partition. 
+ partition_stop : int + stop position of the node's sample_indices in splitter.partition. + allowed_features : None or ndarray, dtype=int + Indices of features allowed to split for children. + interaction_cst_indices : None or list of ints + Indices of the interaction sets that have to be applied on splits of + child nodes. The fewer sets the stronger the constraint as fewer sets + contain fewer features. + children_lower_bound : float + children_upper_bound : float + """ + + def __init__( + self, + *, + depth, + sample_indices, + partition_start, + partition_stop, + sum_gradients, + sum_hessians, + value=None, + ): + self.depth = depth + self.sample_indices = sample_indices + self.n_samples = sample_indices.shape[0] + self.sum_gradients = sum_gradients + self.sum_hessians = sum_hessians + self.value = value + self.is_leaf = False + self.allowed_features = None + self.interaction_cst_indices = None + self.set_children_bounds(float("-inf"), float("+inf")) + self.split_info = None + self.left_child = None + self.right_child = None + self.histograms = None + # start and stop indices of the node in the splitter.partition + # array. Concretely, + # self.sample_indices = view(self.splitter.partition[start:stop]) + # Please see the comments about splitter.partition and + # splitter.split_indices for more info about this design. + # These 2 attributes are only used in _update_raw_prediction, because we + # need to iterate over the leaves and I don't know how to efficiently + # store the sample_indices views because they're all of different sizes. + self.partition_start = partition_start + self.partition_stop = partition_stop + + def set_children_bounds(self, lower, upper): + """Set children values bounds to respect monotonic constraints.""" + + # These are bounds for the node's *children* values, not the node's + # value. The bounds are used in the splitter when considering potential + # left and right child. + self.children_lower_bound = lower + self.children_upper_bound = upper + + def __lt__(self, other_node): + """Comparison for priority queue. + + Nodes with high gain are higher priority than nodes with low gain. + + heapq.heappush only need the '<' operator. + heapq.heappop take the smallest item first (smaller is higher + priority). + + Parameters + ---------- + other_node : TreeNode + The node to compare with. + """ + return self.split_info.gain > other_node.split_info.gain + + +class TreeGrower: + """Tree grower class used to build a tree. + + The tree is fitted to predict the values of a Newton-Raphson step. The + splits are considered in a best-first fashion, and the quality of a + split is defined in splitting._split_gain. + + Parameters + ---------- + X_binned : ndarray of shape (n_samples, n_features), dtype=np.uint8 + The binned input samples. Must be Fortran-aligned. + gradients : ndarray of shape (n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration ``i - 1``. + hessians : ndarray of shape (n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration ``i - 1``. + max_leaf_nodes : int, default=None + The maximum number of leaves for each tree. If None, there is no + maximum limit. + max_depth : int, default=None + The maximum depth of each tree. The depth of a tree is the number of + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. 
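+        For instance, a tree made of a single split of the root has depth 1.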
+ min_samples_leaf : int, default=20 + The minimum number of samples per leaf. + min_gain_to_split : float, default=0. + The minimum gain needed to split a node. Splits with lower gain will + be ignored. + min_hessian_to_split : float, default=1e-3 + The minimum sum of hessians needed in each node. Splits that result in + at least one child having a sum of hessians less than + ``min_hessian_to_split`` are discarded. + n_bins : int, default=256 + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. + n_bins_non_missing : ndarray, dtype=np.uint32, default=None + For each feature, gives the number of bins actually used for + non-missing values. For features with a lot of unique values, this + is equal to ``n_bins - 1``. If it's an int, all features are + considered to have the same number of bins. If None, all features + are considered to have ``n_bins - 1`` bins. + has_missing_values : bool or ndarray, dtype=bool, default=False + Whether each feature contains missing values (in the training data). + If it's a bool, the same value is used for all features. + is_categorical : ndarray of bool of shape (n_features,), default=None + Indicates categorical features. + monotonic_cst : array-like of int of shape (n_features,), dtype=int, default=None + Indicates the monotonic constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + Read more in the :ref:`User Guide `. + interaction_cst : list of sets of integers, default=None + List of interaction constraints. + l2_regularization : float, default=0. + The L2 regularization parameter penalizing leaves with small hessians. + Use ``0`` for no regularization (default). + feature_fraction_per_split : float, default=1 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + rng : Generator + Numpy random Generator used for feature subsampling. + shrinkage : float, default=1. + The shrinkage parameter to apply to the leaves values, also known as + learning rate. + n_threads : int, default=None + Number of OpenMP threads to use. `_openmp_effective_n_threads` is called + to determine the effective number of threads use, which takes cgroups CPU + quotes into account. See the docstring of `_openmp_effective_n_threads` + for details. + + Attributes + ---------- + histogram_builder : HistogramBuilder + splitter : Splitter + root : TreeNode + finalized_leaves : list of TreeNode + splittable_nodes : list of TreeNode + missing_values_bin_idx : int + Equals n_bins - 1 + n_categorical_splits : int + n_features : int + n_nodes : int + total_find_split_time : float + Time spent finding the best splits + total_compute_hist_time : float + Time spent computing histograms + total_apply_split_time : float + Time spent splitting nodes + with_monotonic_cst : bool + Whether there are monotonic constraints that apply. False iff monotonic_cst is + None. 
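+
+    Notes
+    -----
+    A minimal sketch of the intended flow, assuming ``X_binned``, ``gradients``
+    and ``hessians`` have already been prepared by the caller::
+
+        grower = TreeGrower(X_binned, gradients, hessians, shrinkage=0.1)
+        grower.grow()
+        predictor = grower.make_predictor(binning_thresholds)
+
+    where ``binning_thresholds`` corresponds to the ``bin_thresholds_``
+    attribute of the bin mapper.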
+ """ + + def __init__( + self, + X_binned, + gradients, + hessians, + max_leaf_nodes=None, + max_depth=None, + min_samples_leaf=20, + min_gain_to_split=0.0, + min_hessian_to_split=1e-3, + n_bins=256, + n_bins_non_missing=None, + has_missing_values=False, + is_categorical=None, + monotonic_cst=None, + interaction_cst=None, + l2_regularization=0.0, + feature_fraction_per_split=1.0, + rng=np.random.default_rng(), + shrinkage=1.0, + n_threads=None, + ): + self._validate_parameters( + X_binned, + min_gain_to_split, + min_hessian_to_split, + ) + n_threads = _openmp_effective_n_threads(n_threads) + + if n_bins_non_missing is None: + n_bins_non_missing = n_bins - 1 + + if isinstance(n_bins_non_missing, numbers.Integral): + n_bins_non_missing = np.array( + [n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32 + ) + else: + n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32) + + if isinstance(has_missing_values, bool): + has_missing_values = [has_missing_values] * X_binned.shape[1] + has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) + + # `monotonic_cst` validation is done in _validate_monotonic_cst + # at the estimator level and therefore the following should not be + # needed when using the public API. + if monotonic_cst is None: + monotonic_cst = np.full( + shape=X_binned.shape[1], + fill_value=MonotonicConstraint.NO_CST, + dtype=np.int8, + ) + else: + monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8) + self.with_monotonic_cst = np.any(monotonic_cst != MonotonicConstraint.NO_CST) + + if is_categorical is None: + is_categorical = np.zeros(shape=X_binned.shape[1], dtype=np.uint8) + else: + is_categorical = np.asarray(is_categorical, dtype=np.uint8) + + if np.any( + np.logical_and( + is_categorical == 1, monotonic_cst != MonotonicConstraint.NO_CST + ) + ): + raise ValueError("Categorical features cannot have monotonic constraints.") + + hessians_are_constant = hessians.shape[0] == 1 + self.histogram_builder = HistogramBuilder( + X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads + ) + missing_values_bin_idx = n_bins - 1 + self.splitter = Splitter( + X_binned=X_binned, + n_bins_non_missing=n_bins_non_missing, + missing_values_bin_idx=missing_values_bin_idx, + has_missing_values=has_missing_values, + is_categorical=is_categorical, + monotonic_cst=monotonic_cst, + l2_regularization=l2_regularization, + min_hessian_to_split=min_hessian_to_split, + min_samples_leaf=min_samples_leaf, + min_gain_to_split=min_gain_to_split, + hessians_are_constant=hessians_are_constant, + feature_fraction_per_split=feature_fraction_per_split, + rng=rng, + n_threads=n_threads, + ) + self.X_binned = X_binned + self.max_leaf_nodes = max_leaf_nodes + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf + self.min_gain_to_split = min_gain_to_split + self.n_bins_non_missing = n_bins_non_missing + self.missing_values_bin_idx = missing_values_bin_idx + self.has_missing_values = has_missing_values + self.is_categorical = is_categorical + self.monotonic_cst = monotonic_cst + self.interaction_cst = interaction_cst + self.l2_regularization = l2_regularization + self.shrinkage = shrinkage + self.n_features = X_binned.shape[1] + self.n_threads = n_threads + self.splittable_nodes = [] + self.finalized_leaves = [] + self.total_find_split_time = 0.0 # time spent finding the best splits + self.total_compute_hist_time = 0.0 # time spent computing histograms + self.total_apply_split_time = 0.0 # time spent splitting nodes + self.n_categorical_splits = 0 + 
self._initialize_root() + self.n_nodes = 1 + + def _validate_parameters( + self, + X_binned, + min_gain_to_split, + min_hessian_to_split, + ): + """Validate parameters passed to __init__. + + Also validate parameters passed to splitter. + """ + if X_binned.dtype != np.uint8: + raise NotImplementedError("X_binned must be of type uint8.") + if not X_binned.flags.f_contiguous: + raise ValueError( + "X_binned should be passed as Fortran contiguous " + "array for maximum efficiency." + ) + if min_gain_to_split < 0: + raise ValueError( + "min_gain_to_split={} must be positive.".format(min_gain_to_split) + ) + if min_hessian_to_split < 0: + raise ValueError( + "min_hessian_to_split={} must be positive.".format(min_hessian_to_split) + ) + + def grow(self): + """Grow the tree, from root to leaves.""" + while self.splittable_nodes: + self.split_next() + + self._apply_shrinkage() + + def _apply_shrinkage(self): + """Multiply leaves values by shrinkage parameter. + + This must be done at the very end of the growing process. If this were + done during the growing process e.g. in finalize_leaf(), then a leaf + would be shrunk but its sibling would potentially not be (if it's a + non-leaf), which would lead to a wrong computation of the 'middle' + value needed to enforce the monotonic constraints. + """ + for leaf in self.finalized_leaves: + leaf.value *= self.shrinkage + + def _initialize_root(self): + """Initialize root node and finalize it if needed.""" + tic = time() + if self.interaction_cst is not None: + allowed_features = set().union(*self.interaction_cst) + allowed_features = np.fromiter( + allowed_features, dtype=np.uint32, count=len(allowed_features) + ) + arbitrary_feature = allowed_features[0] + else: + allowed_features = None + arbitrary_feature = 0 + + # TreeNode init needs the total sum of gradients and hessians. Therefore, we + # first compute the histograms and then compute the total grad/hess on an + # arbitrary feature histogram. This way we replace a loop over n_samples by a + # loop over n_bins. + histograms = self.histogram_builder.compute_histograms_brute( + self.splitter.partition, # =self.root.sample_indices + allowed_features, + ) + self.total_compute_hist_time += time() - tic + + tic = time() + n_samples = self.X_binned.shape[0] + depth = 0 + histogram_array = np.asarray(histograms[arbitrary_feature]) + sum_gradients = histogram_array["sum_gradients"].sum() + if self.histogram_builder.hessians_are_constant: + sum_hessians = self.histogram_builder.hessians[0] * n_samples + else: + sum_hessians = histogram_array["sum_hessians"].sum() + self.root = TreeNode( + depth=depth, + sample_indices=self.splitter.partition, + partition_start=0, + partition_stop=n_samples, + sum_gradients=sum_gradients, + sum_hessians=sum_hessians, + value=0, + ) + + if self.root.n_samples < 2 * self.min_samples_leaf: + # Do not even bother computing any splitting statistics. + self._finalize_leaf(self.root) + return + if sum_hessians < self.splitter.min_hessian_to_split: + self._finalize_leaf(self.root) + return + + if self.interaction_cst is not None: + self.root.interaction_cst_indices = range(len(self.interaction_cst)) + self.root.allowed_features = allowed_features + + self.root.histograms = histograms + + self._compute_best_split_and_push(self.root) + self.total_find_split_time += time() - tic + + def _compute_best_split_and_push(self, node): + """Compute the best possible split (SplitInfo) of a given node. + + Also push it in the heap of splittable nodes if gain isn't zero. 
+ The gain of a node is 0 if either all the leaves are pure + (best gain = 0), or if no split would satisfy the constraints, + (min_hessians_to_split, min_gain_to_split, min_samples_leaf) + """ + + node.split_info = self.splitter.find_node_split( + n_samples=node.n_samples, + histograms=node.histograms, + sum_gradients=node.sum_gradients, + sum_hessians=node.sum_hessians, + value=node.value, + lower_bound=node.children_lower_bound, + upper_bound=node.children_upper_bound, + allowed_features=node.allowed_features, + ) + + if node.split_info.gain <= 0: # no valid split + self._finalize_leaf(node) + else: + heappush(self.splittable_nodes, node) + + def split_next(self): + """Split the node with highest potential gain. + + Returns + ------- + left : TreeNode + The resulting left child. + right : TreeNode + The resulting right child. + """ + # Consider the node with the highest loss reduction (a.k.a. gain) + node = heappop(self.splittable_nodes) + + tic = time() + ( + sample_indices_left, + sample_indices_right, + right_child_pos, + ) = self.splitter.split_indices(node.split_info, node.sample_indices) + self.total_apply_split_time += time() - tic + + depth = node.depth + 1 + n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes) + n_leaf_nodes += 2 + + left_child_node = TreeNode( + depth=depth, + sample_indices=sample_indices_left, + partition_start=node.partition_start, + partition_stop=node.partition_start + right_child_pos, + sum_gradients=node.split_info.sum_gradient_left, + sum_hessians=node.split_info.sum_hessian_left, + value=node.split_info.value_left, + ) + right_child_node = TreeNode( + depth=depth, + sample_indices=sample_indices_right, + partition_start=left_child_node.partition_stop, + partition_stop=node.partition_stop, + sum_gradients=node.split_info.sum_gradient_right, + sum_hessians=node.split_info.sum_hessian_right, + value=node.split_info.value_right, + ) + + node.right_child = right_child_node + node.left_child = left_child_node + + # set interaction constraints (the indices of the constraints sets) + if self.interaction_cst is not None: + # Calculate allowed_features and interaction_cst_indices only once. Child + # nodes inherit them before they get split. + ( + left_child_node.allowed_features, + left_child_node.interaction_cst_indices, + ) = self._compute_interactions(node) + right_child_node.interaction_cst_indices = ( + left_child_node.interaction_cst_indices + ) + right_child_node.allowed_features = left_child_node.allowed_features + + if not self.has_missing_values[node.split_info.feature_idx]: + # If no missing values are encountered at fit time, then samples + # with missing values during predict() will go to whichever child + # has the most samples. 
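+            # For example, if the left child receives 70 samples and the right
+            # child 30, missing values at predict time are sent to the left
+            # child.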
+ node.split_info.missing_go_to_left = ( + left_child_node.n_samples > right_child_node.n_samples + ) + + self.n_nodes += 2 + self.n_categorical_splits += node.split_info.is_categorical + + if self.max_leaf_nodes is not None and n_leaf_nodes == self.max_leaf_nodes: + self._finalize_leaf(left_child_node) + self._finalize_leaf(right_child_node) + self._finalize_splittable_nodes() + return left_child_node, right_child_node + + if self.max_depth is not None and depth == self.max_depth: + self._finalize_leaf(left_child_node) + self._finalize_leaf(right_child_node) + return left_child_node, right_child_node + + if left_child_node.n_samples < self.min_samples_leaf * 2: + self._finalize_leaf(left_child_node) + if right_child_node.n_samples < self.min_samples_leaf * 2: + self._finalize_leaf(right_child_node) + + if self.with_monotonic_cst: + # Set value bounds for respecting monotonic constraints + # See test_nodes_values() for details + if ( + self.monotonic_cst[node.split_info.feature_idx] + == MonotonicConstraint.NO_CST + ): + lower_left = lower_right = node.children_lower_bound + upper_left = upper_right = node.children_upper_bound + else: + mid = (left_child_node.value + right_child_node.value) / 2 + if ( + self.monotonic_cst[node.split_info.feature_idx] + == MonotonicConstraint.POS + ): + lower_left, upper_left = node.children_lower_bound, mid + lower_right, upper_right = mid, node.children_upper_bound + else: # NEG + lower_left, upper_left = mid, node.children_upper_bound + lower_right, upper_right = node.children_lower_bound, mid + left_child_node.set_children_bounds(lower_left, upper_left) + right_child_node.set_children_bounds(lower_right, upper_right) + + # Compute histograms of children, and compute their best possible split + # (if needed) + should_split_left = not left_child_node.is_leaf + should_split_right = not right_child_node.is_leaf + if should_split_left or should_split_right: + # We will compute the histograms of both nodes even if one of them + # is a leaf, since computing the second histogram is very cheap + # (using histogram subtraction). + n_samples_left = left_child_node.sample_indices.shape[0] + n_samples_right = right_child_node.sample_indices.shape[0] + if n_samples_left < n_samples_right: + smallest_child = left_child_node + largest_child = right_child_node + else: + smallest_child = right_child_node + largest_child = left_child_node + + # We use the brute O(n_samples) method on the child that has the + # smallest number of samples, and the subtraction trick O(n_bins) + # on the other one. + # Note that both left and right child have the same allowed_features. + tic = time() + smallest_child.histograms = self.histogram_builder.compute_histograms_brute( + smallest_child.sample_indices, smallest_child.allowed_features + ) + largest_child.histograms = ( + self.histogram_builder.compute_histograms_subtraction( + node.histograms, + smallest_child.histograms, + smallest_child.allowed_features, + ) + ) + # node.histograms is reused in largest_child.histograms. To break cyclic + # memory references and help garbage collection, we set it to None. + node.histograms = None + self.total_compute_hist_time += time() - tic + + tic = time() + if should_split_left: + self._compute_best_split_and_push(left_child_node) + if should_split_right: + self._compute_best_split_and_push(right_child_node) + self.total_find_split_time += time() - tic + + # Release memory used by histograms as they are no longer needed + # for leaf nodes since they won't be split. 
+ for child in (left_child_node, right_child_node): + if child.is_leaf: + del child.histograms + + # Release memory used by histograms as they are no longer needed for + # internal nodes once children histograms have been computed. + del node.histograms + + return left_child_node, right_child_node + + def _compute_interactions(self, node): + r"""Compute features allowed by interactions to be inherited by child nodes. + + Example: Assume constraints [{0, 1}, {1, 2}]. + 1 <- Both constraint groups could be applied from now on + / \ + 1 2 <- Left split still fulfills both constraint groups. + / \ / \ Right split at feature 2 has only group {1, 2} from now on. + + LightGBM uses the same logic for overlapping groups. See + https://github.com/microsoft/LightGBM/issues/4481 for details. + + Parameters: + ---------- + node : TreeNode + A node that might have children. Based on its feature_idx, the interaction + constraints for possible child nodes are computed. + + Returns + ------- + allowed_features : ndarray, dtype=uint32 + Indices of features allowed to split for children. + interaction_cst_indices : list of ints + Indices of the interaction sets that have to be applied on splits of + child nodes. The fewer sets the stronger the constraint as fewer sets + contain fewer features. + """ + # Note: + # - Case of no interactions is already captured before function call. + # - This is for nodes that are already split and have a + # node.split_info.feature_idx. + allowed_features = set() + interaction_cst_indices = [] + for i in node.interaction_cst_indices: + if node.split_info.feature_idx in self.interaction_cst[i]: + interaction_cst_indices.append(i) + allowed_features.update(self.interaction_cst[i]) + return ( + np.fromiter(allowed_features, dtype=np.uint32, count=len(allowed_features)), + interaction_cst_indices, + ) + + def _finalize_leaf(self, node): + """Make node a leaf of the tree being grown.""" + + node.is_leaf = True + self.finalized_leaves.append(node) + + def _finalize_splittable_nodes(self): + """Transform all splittable nodes into leaves. + + Used when some constraint is met e.g. maximum number of leaves or + maximum depth.""" + while len(self.splittable_nodes) > 0: + node = self.splittable_nodes.pop() + self._finalize_leaf(node) + + def make_predictor(self, binning_thresholds): + """Make a TreePredictor object out of the current tree. + + Parameters + ---------- + binning_thresholds : array-like of floats + Corresponds to the bin_thresholds_ attribute of the BinMapper. + For each feature, this stores: + + - the bin frontiers for continuous features + - the unique raw category values for categorical features + + Returns + ------- + A TreePredictor object. 
+ """ + predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) + binned_left_cat_bitsets = np.zeros( + (self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE + ) + raw_left_cat_bitsets = np.zeros( + (self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE + ) + _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + self.root, + binning_thresholds, + self.n_bins_non_missing, + ) + return TreePredictor( + predictor_nodes, binned_left_cat_bitsets, raw_left_cat_bitsets + ) + + +def _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + grower_node, + binning_thresholds, + n_bins_non_missing, + next_free_node_idx=0, + next_free_bitset_idx=0, +): + """Helper used in make_predictor to set the TreePredictor fields.""" + node = predictor_nodes[next_free_node_idx] + node["count"] = grower_node.n_samples + node["depth"] = grower_node.depth + if grower_node.split_info is not None: + node["gain"] = grower_node.split_info.gain + else: + node["gain"] = -1 + + node["value"] = grower_node.value + + if grower_node.is_leaf: + # Leaf node + node["is_leaf"] = True + return next_free_node_idx + 1, next_free_bitset_idx + + split_info = grower_node.split_info + feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx + node["feature_idx"] = feature_idx + node["bin_threshold"] = bin_idx + node["missing_go_to_left"] = split_info.missing_go_to_left + node["is_categorical"] = split_info.is_categorical + + if split_info.bin_idx == n_bins_non_missing[feature_idx] - 1: + # Split is on the last non-missing bin: it's a "split on nans". + # All nans go to the right, the rest go to the left. + # Note: for categorical splits, bin_idx is 0 and we rely on the bitset + node["num_threshold"] = np.inf + elif split_info.is_categorical: + categories = binning_thresholds[feature_idx] + node["bitset_idx"] = next_free_bitset_idx + binned_left_cat_bitsets[next_free_bitset_idx] = split_info.left_cat_bitset + set_raw_bitset_from_binned_bitset( + raw_left_cat_bitsets[next_free_bitset_idx], + split_info.left_cat_bitset, + categories, + ) + next_free_bitset_idx += 1 + else: + node["num_threshold"] = binning_thresholds[feature_idx][bin_idx] + + next_free_node_idx += 1 + + node["left"] = next_free_node_idx + next_free_node_idx, next_free_bitset_idx = _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + grower_node.left_child, + binning_thresholds=binning_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_node_idx=next_free_node_idx, + next_free_bitset_idx=next_free_bitset_idx, + ) + + node["right"] = next_free_node_idx + return _fill_predictor_arrays( + predictor_nodes, + binned_left_cat_bitsets, + raw_left_cat_bitsets, + grower_node.right_child, + binning_thresholds=binning_thresholds, + n_bins_non_missing=n_bins_non_missing, + next_free_node_idx=next_free_node_idx, + next_free_bitset_idx=next_free_bitset_idx, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx new file mode 100644 index 0000000000000000000000000000000000000000..e204eec6b97850f696ef61898562bb65bc908ed6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -0,0 +1,520 @@ +"""This module contains routines for building histograms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: 
BSD-3-Clause + +cimport cython +from cython.parallel import prange +from libc.string cimport memset + +import numpy as np + +from .common import HISTOGRAM_DTYPE +from .common cimport hist_struct +from .common cimport X_BINNED_DTYPE_C +from .common cimport G_H_DTYPE_C +from ...utils._typedefs cimport uint8_t + + +# Notes: +# - IN views are read-only, OUT views are write-only +# - In a lot of functions here, we pass feature_idx and the whole 2d +# histograms arrays instead of just histograms[feature_idx]. This is because +# Cython generated C code will have strange Python interactions (likely +# related to the GIL release and the custom histogram dtype) when using 1d +# histogram arrays that come from 2d arrays. +# - The for loops are un-wrapped, for example: +# +# for i in range(n): +# array[i] = i +# +# will become +# +# for i in range(n // 4): +# array[i] = i +# array[i + 1] = i + 1 +# array[i + 2] = i + 2 +# array[i + 3] = i + 3 +# +# This is to hint gcc that it can auto-vectorize these 4 operations and +# perform them all at once. + + +@cython.final +cdef class HistogramBuilder: + """A Histogram builder... used to build histograms. + + A histogram is an array with n_bins entries of type HISTOGRAM_DTYPE. Each + feature has its own histogram. A histogram contains the sum of gradients + and hessians of all the samples belonging to each bin. + + There are different ways to build a histogram: + - by subtraction: hist(child) = hist(parent) - hist(sibling) + - from scratch. In this case we have routines that update the hessians + or not (not useful when hessians are constant for some losses e.g. + least squares). Also, there's a special case for the root which + contains all the samples, leading to some possible optimizations. + Overall all the implementations look the same, and are optimized for + cache hit. + + Parameters + ---------- + X_binned : ndarray of int, shape (n_samples, n_features) + The binned input samples. Must be Fortran-aligned. + n_bins : int + The total number of bins, including the bin for missing values. Used + to define the shape of the histograms. + gradients : ndarray, shape (n_samples,) + The gradients of each training sample. Those are the gradients of the + loss w.r.t the predictions, evaluated at iteration i - 1. + hessians : ndarray, shape (n_samples,) + The hessians of each training sample. Those are the hessians of the + loss w.r.t the predictions, evaluated at iteration i - 1. + hessians_are_constant : bool + Whether hessians are constant. + """ + cdef public: + const X_BINNED_DTYPE_C [::1, :] X_binned + unsigned int n_features + unsigned int n_bins + G_H_DTYPE_C [::1] gradients + G_H_DTYPE_C [::1] hessians + G_H_DTYPE_C [::1] ordered_gradients + G_H_DTYPE_C [::1] ordered_hessians + uint8_t hessians_are_constant + int n_threads + + def __init__(self, const X_BINNED_DTYPE_C [::1, :] X_binned, + unsigned int n_bins, G_H_DTYPE_C [::1] gradients, + G_H_DTYPE_C [::1] hessians, + uint8_t hessians_are_constant, + int n_threads): + + self.X_binned = X_binned + self.n_features = X_binned.shape[1] + # Note: all histograms will have bins, but some of the + # bins may be unused if a feature has a small number of unique values. 
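+        # The last bin (index n_bins - 1) is reserved for missing values.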
+ self.n_bins = n_bins + self.gradients = gradients + self.hessians = hessians + # for root node, gradients and hessians are already ordered + self.ordered_gradients = gradients.copy() + self.ordered_hessians = hessians.copy() + self.hessians_are_constant = hessians_are_constant + self.n_threads = n_threads + + def compute_histograms_brute( + HistogramBuilder self, + const unsigned int [::1] sample_indices, # IN + const unsigned int [:] allowed_features=None, # IN + ): + """Compute the histograms of the node by scanning through all the data. + + For a given feature, the complexity is O(n_samples) + + Parameters + ---------- + sample_indices : array of int, shape (n_samples_at_node,) + The indices of the samples at the node to split. + + allowed_features : None or ndarray, dtype=np.uint32 + Indices of the features that are allowed by interaction constraints to be + split. + + Returns + ------- + histograms : ndarray of HISTOGRAM_DTYPE, shape (n_features, n_bins) + The computed histograms of the current node. + """ + cdef: + int n_samples + int feature_idx + int f_idx + int i + # need local views to avoid python interactions + uint8_t hessians_are_constant = self.hessians_are_constant + int n_allowed_features = self.n_features + G_H_DTYPE_C [::1] ordered_gradients = self.ordered_gradients + G_H_DTYPE_C [::1] gradients = self.gradients + G_H_DTYPE_C [::1] ordered_hessians = self.ordered_hessians + G_H_DTYPE_C [::1] hessians = self.hessians + # Histograms will be initialized to zero later within a prange + hist_struct [:, ::1] histograms = np.empty( + shape=(self.n_features, self.n_bins), + dtype=HISTOGRAM_DTYPE + ) + bint has_interaction_cst = allowed_features is not None + int n_threads = self.n_threads + + if has_interaction_cst: + n_allowed_features = allowed_features.shape[0] + + with nogil: + n_samples = sample_indices.shape[0] + + # Populate ordered_gradients and ordered_hessians. (Already done + # for root) Ordering the gradients and hessians helps to improve + # cache hit. + if sample_indices.shape[0] != gradients.shape[0]: + if hessians_are_constant: + for i in prange(n_samples, schedule='static', + num_threads=n_threads): + ordered_gradients[i] = gradients[sample_indices[i]] + else: + for i in prange(n_samples, schedule='static', + num_threads=n_threads): + ordered_gradients[i] = gradients[sample_indices[i]] + ordered_hessians[i] = hessians[sample_indices[i]] + + # Compute histogram of each feature + for f_idx in prange( + n_allowed_features, schedule='static', num_threads=n_threads + ): + if has_interaction_cst: + feature_idx = allowed_features[f_idx] + else: + feature_idx = f_idx + + self._compute_histogram_brute_single_feature( + feature_idx, sample_indices, histograms + ) + + return histograms + + cdef void _compute_histogram_brute_single_feature( + HistogramBuilder self, + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + hist_struct [:, ::1] histograms) noexcept nogil: # OUT + """Compute the histogram for a given feature.""" + + cdef: + unsigned int n_samples = sample_indices.shape[0] + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, feature_idx] + unsigned int root_node = X_binned.shape[0] == n_samples + G_H_DTYPE_C [::1] ordered_gradients = \ + self.ordered_gradients[:n_samples] + G_H_DTYPE_C [::1] ordered_hessians = \ + self.ordered_hessians[:n_samples] + uint8_t hessians_are_constant = \ + self.hessians_are_constant + + # Set histograms to zero. 
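+        # Only this feature's row of the 2d histograms array is cleared here.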
+ memset(&histograms[feature_idx, 0], 0, self.n_bins * sizeof(hist_struct)) + + if root_node: + if hessians_are_constant: + _build_histogram_root_no_hessian(feature_idx, X_binned, + ordered_gradients, + histograms) + else: + _build_histogram_root(feature_idx, X_binned, + ordered_gradients, ordered_hessians, + histograms) + else: + if hessians_are_constant: + _build_histogram_no_hessian(feature_idx, + sample_indices, X_binned, + ordered_gradients, histograms) + else: + _build_histogram(feature_idx, sample_indices, + X_binned, ordered_gradients, + ordered_hessians, histograms) + + def compute_histograms_subtraction( + HistogramBuilder self, + hist_struct [:, ::1] parent_histograms, # IN and OUT + hist_struct [:, ::1] sibling_histograms, # IN + const unsigned int [:] allowed_features=None, # IN + ): + """Compute the histograms of the node using the subtraction trick. + + hist(parent) = hist(left_child) + hist(right_child) + + For a given feature, the complexity is O(n_bins). This is much more + efficient than compute_histograms_brute, but it's only possible for one + of the siblings. + + Parameters + ---------- + parent_histograms : ndarray of HISTOGRAM_DTYPE, \ + shape (n_features, n_bins) + The histograms of the parent. + sibling_histograms : ndarray of HISTOGRAM_DTYPE, \ + shape (n_features, n_bins) + The histograms of the sibling. + allowed_features : None or ndarray, dtype=np.uint32 + Indices of the features that are allowed by interaction constraints to be + split. + + Returns + ------- + histograms : ndarray of HISTOGRAM_DTYPE, shape(n_features, n_bins) + The computed histograms of the current node. + We repurpose parent_histograms for this and don't need to allocate new + memory. + """ + + cdef: + int feature_idx + int f_idx + int n_allowed_features = self.n_features + bint has_interaction_cst = allowed_features is not None + int n_threads = self.n_threads + + if has_interaction_cst: + n_allowed_features = allowed_features.shape[0] + + # Compute histogram of each feature + for f_idx in prange(n_allowed_features, schedule='static', nogil=True, + num_threads=n_threads): + if has_interaction_cst: + feature_idx = allowed_features[f_idx] + else: + feature_idx = f_idx + + _subtract_histograms( + feature_idx, + self.n_bins, + parent_histograms, + sibling_histograms, + ) + return parent_histograms + + +cpdef void _build_histogram_naive( + const int feature_idx, + unsigned int [:] sample_indices, # IN + X_BINNED_DTYPE_C [:] binned_feature, # IN + G_H_DTYPE_C [:] ordered_gradients, # IN + G_H_DTYPE_C [:] ordered_hessians, # IN + hist_struct [:, :] out) noexcept nogil: # OUT + """Build histogram in a naive way, without optimizing for cache hit. + + Used in tests to compare with the optimized version.""" + cdef: + unsigned int i + unsigned int n_samples = sample_indices.shape[0] + unsigned int sample_idx + unsigned int bin_idx + + for i in range(n_samples): + sample_idx = sample_indices[i] + bin_idx = binned_feature[sample_idx] + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _subtract_histograms( + const int feature_idx, + unsigned int n_bins, + hist_struct [:, ::1] hist_a, # IN and OUT + hist_struct [:, ::1] hist_b, # IN +) noexcept nogil: # OUT + """compute hist_a = hist_a - hist_b""" + # Note that subtraction of large sums of floating point numbers, as we have here, + # can exhibit catastrophic cancallation. 
This is in particular true for gradients + # as they can be positive and negative, while hessians are non-negative. + # Remember that gradients and hessians are originally computed in + # G_H_DTYPE_C = float32 precision. Therefore, if sum_gradients and sum_hessians are + # float64, we don't loose precision. But if we also used float32 for summation, we + # would need to take care of floating point errors. + # + # Note that we could protect for negative hessians by setting: + # sum_hessians = max(0, sum_hessians) + # But as we use float64 for summing float32, that's veeeery unlikely. + cdef: + unsigned int i = 0 + for i in range(n_bins): + hist_a[feature_idx, i].sum_gradients -= hist_b[feature_idx, i].sum_gradients + hist_a[feature_idx, i].sum_hessians -= hist_b[feature_idx, i].sum_hessians + hist_a[feature_idx, i].count -= hist_b[feature_idx, i].count + + +cpdef void _build_histogram( + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN + const G_H_DTYPE_C [::1] ordered_hessians, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Return histogram for a given feature.""" + cdef: + unsigned int i = 0 + unsigned int n_node_samples = sample_indices.shape[0] + unsigned int unrolled_upper = (n_node_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[sample_indices[i]] + bin_1 = binned_feature[sample_indices[i + 1]] + bin_2 = binned_feature[sample_indices[i + 2]] + bin_3 = binned_feature[sample_indices[i + 3]] + + out[feature_idx, bin_0].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3] + + out[feature_idx, bin_0].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_1].sum_hessians += ordered_hessians[i + 1] + out[feature_idx, bin_2].sum_hessians += ordered_hessians[i + 2] + out[feature_idx, bin_3].sum_hessians += ordered_hessians[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[sample_indices[i]] + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].sum_hessians += ordered_hessians[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _build_histogram_no_hessian( + const int feature_idx, + const unsigned int [::1] sample_indices, # IN + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] ordered_gradients, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Return histogram for a given feature, not updating hessians. + + Used when the hessians of the loss are constant (typically LS loss). 
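+    In that case every sample contributes the same constant hessian, so only
+    the gradients and the per-bin counts need to be accumulated.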
+ """ + + cdef: + unsigned int i = 0 + unsigned int n_node_samples = sample_indices.shape[0] + unsigned int unrolled_upper = (n_node_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[sample_indices[i]] + bin_1 = binned_feature[sample_indices[i + 1]] + bin_2 = binned_feature[sample_indices[i + 2]] + bin_3 = binned_feature[sample_indices[i + 3]] + + out[feature_idx, bin_0].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_1].sum_gradients += ordered_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += ordered_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += ordered_gradients[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_node_samples): + bin_idx = binned_feature[sample_indices[i]] + out[feature_idx, bin_idx].sum_gradients += ordered_gradients[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _build_histogram_root( + const int feature_idx, + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN + const G_H_DTYPE_C [::1] all_hessians, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Compute histogram of the root node. + + Unlike other nodes, the root node has to find the split among *all* the + samples from the training set. binned_feature and all_gradients / + all_hessians already have a consistent ordering. + """ + + cdef: + unsigned int i = 0 + unsigned int n_samples = binned_feature.shape[0] + unsigned int unrolled_upper = (n_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + + bin_0 = binned_feature[i] + bin_1 = binned_feature[i + 1] + bin_2 = binned_feature[i + 2] + bin_3 = binned_feature[i + 3] + + out[feature_idx, bin_0].sum_gradients += all_gradients[i] + out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3] + + out[feature_idx, bin_0].sum_hessians += all_hessians[i] + out[feature_idx, bin_1].sum_hessians += all_hessians[i + 1] + out[feature_idx, bin_2].sum_hessians += all_hessians[i + 2] + out[feature_idx, bin_3].sum_hessians += all_hessians[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_samples): + bin_idx = binned_feature[i] + out[feature_idx, bin_idx].sum_gradients += all_gradients[i] + out[feature_idx, bin_idx].sum_hessians += all_hessians[i] + out[feature_idx, bin_idx].count += 1 + + +cpdef void _build_histogram_root_no_hessian( + const int feature_idx, + const X_BINNED_DTYPE_C [::1] binned_feature, # IN + const G_H_DTYPE_C [::1] all_gradients, # IN + hist_struct [:, ::1] out) noexcept nogil: # OUT + """Compute histogram of the root node, not updating hessians. + + Used when the hessians of the loss are constant (typically LS loss). 
+ """ + + cdef: + unsigned int i = 0 + unsigned int n_samples = binned_feature.shape[0] + unsigned int unrolled_upper = (n_samples // 4) * 4 + + unsigned int bin_0 + unsigned int bin_1 + unsigned int bin_2 + unsigned int bin_3 + unsigned int bin_idx + + for i in range(0, unrolled_upper, 4): + bin_0 = binned_feature[i] + bin_1 = binned_feature[i + 1] + bin_2 = binned_feature[i + 2] + bin_3 = binned_feature[i + 3] + + out[feature_idx, bin_0].sum_gradients += all_gradients[i] + out[feature_idx, bin_1].sum_gradients += all_gradients[i + 1] + out[feature_idx, bin_2].sum_gradients += all_gradients[i + 2] + out[feature_idx, bin_3].sum_gradients += all_gradients[i + 3] + + out[feature_idx, bin_0].count += 1 + out[feature_idx, bin_1].count += 1 + out[feature_idx, bin_2].count += 1 + out[feature_idx, bin_3].count += 1 + + for i in range(unrolled_upper, n_samples): + bin_idx = binned_feature[i] + out[feature_idx, bin_idx].sum_gradients += all_gradients[i] + out[feature_idx, bin_idx].count += 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/meson.build b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..122a2102800f38f111af48d9ce009505dd5308ee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/meson.build @@ -0,0 +1,20 @@ +hist_gradient_boosting_extension_metadata = { + '_gradient_boosting': {'sources': [cython_gen.process('_gradient_boosting.pyx')], + 'dependencies': [openmp_dep]}, + 'histogram': {'sources': [cython_gen.process('histogram.pyx')], 'dependencies': [openmp_dep]}, + 'splitting': {'sources': [cython_gen.process('splitting.pyx')], 'dependencies': [openmp_dep]}, + '_binning': {'sources': [cython_gen.process('_binning.pyx')], 'dependencies': [openmp_dep]}, + '_predictor': {'sources': [cython_gen.process('_predictor.pyx')], 'dependencies': [openmp_dep]}, + '_bitset': {'sources': [cython_gen.process('_bitset.pyx')]}, + 'common': {'sources': [cython_gen.process('common.pyx')]}, +} + +foreach ext_name, ext_dict : hist_gradient_boosting_extension_metadata + py.extension_module( + ext_name, + ext_dict.get('sources'), + dependencies: ext_dict.get('dependencies', []), + subdir: 'sklearn/ensemble/_hist_gradient_boosting', + install: true + ) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/predictor.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..59bb6499c450114db3171342d7bb97111db64b81 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/predictor.py @@ -0,0 +1,146 @@ +""" +This module contains the TreePredictor class which is used for prediction. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ._predictor import ( + _compute_partial_dependence, + _predict_from_binned_data, + _predict_from_raw_data, +) +from .common import PREDICTOR_RECORD_DTYPE, Y_DTYPE + + +class TreePredictor: + """Tree class used for predictions. + + Parameters + ---------- + nodes : ndarray of PREDICTOR_RECORD_DTYPE + The nodes of the tree. + binned_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), dtype=uint32 + Array of bitsets for binned categories used in predict_binned when a + split is categorical. 
+ raw_left_cat_bitsets : ndarray of shape (n_categorical_splits, 8), dtype=uint32 + Array of bitsets for raw categories used in predict when a split is + categorical. + """ + + def __init__(self, nodes, binned_left_cat_bitsets, raw_left_cat_bitsets): + self.nodes = nodes + self.binned_left_cat_bitsets = binned_left_cat_bitsets + self.raw_left_cat_bitsets = raw_left_cat_bitsets + + def get_n_leaf_nodes(self): + """Return number of leaves.""" + return int(self.nodes["is_leaf"].sum()) + + def get_max_depth(self): + """Return maximum depth among all leaves.""" + return int(self.nodes["depth"].max()) + + def predict(self, X, known_cat_bitsets, f_idx_map, n_threads): + """Predict raw values for non-binned data. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + The input samples. + + known_cat_bitsets : ndarray of shape (n_categorical_features, 8) + Array of bitsets of known categories, for each categorical feature. + + f_idx_map : ndarray of shape (n_features,) + Map from original feature index to the corresponding index in the + known_cat_bitsets array. + + n_threads : int + Number of OpenMP threads to use. + + Returns + ------- + y : ndarray, shape (n_samples,) + The raw predicted values. + """ + out = np.empty(X.shape[0], dtype=Y_DTYPE) + + _predict_from_raw_data( + self.nodes, + X, + self.raw_left_cat_bitsets, + known_cat_bitsets, + f_idx_map, + n_threads, + out, + ) + return out + + def predict_binned(self, X, missing_values_bin_idx, n_threads): + """Predict raw values for binned data. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + The input samples. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the + index of the last bin and is always equal to max_bins (as passed + to the GBDT classes), or equivalently to n_bins - 1. + n_threads : int + Number of OpenMP threads to use. + + Returns + ------- + y : ndarray, shape (n_samples,) + The raw predicted values. + """ + out = np.empty(X.shape[0], dtype=Y_DTYPE) + _predict_from_binned_data( + self.nodes, + X, + self.binned_left_cat_bitsets, + missing_values_bin_idx, + n_threads, + out, + ) + return out + + def compute_partial_dependence(self, grid, target_features, out): + """Fast partial dependence computation. + + Parameters + ---------- + grid : ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : ndarray, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + _compute_partial_dependence(self.nodes, grid, target_features, out) + + def __setstate__(self, state): + try: + super().__setstate__(state) + except AttributeError: + self.__dict__.update(state) + + # The dtype of feature_idx is np.intp which is platform dependent. Here, we + # make sure that saving and loading on different bitness systems works without + # errors. For instance, on a 64 bit Python runtime, np.intp = np.int64, + # while on 32 bit np.intp = np.int32. + # + # TODO: consider always using platform agnostic dtypes for fitted + # estimator attributes. For this particular estimator, this would + # mean replacing the intp field of PREDICTOR_RECORD_DTYPE by an int32 + # field. Ideally this should be done consistently throughout + # scikit-learn along with a common test. 
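+        # Illustration (hypothetical scenario): a predictor pickled on a
+        # 64-bit build stores the intp-based field as int64; when that pickle
+        # is loaded on a 32-bit build, PREDICTOR_RECORD_DTYPE uses int32, the
+        # dtype comparison below fails, and the nodes array is converted to
+        # the local dtype.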
+ if self.nodes.dtype != PREDICTOR_RECORD_DTYPE: + self.nodes = self.nodes.astype(PREDICTOR_RECORD_DTYPE, casting="same_kind") diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx new file mode 100644 index 0000000000000000000000000000000000000000..c4cb22067cf376294e16067fe6c62dfbe215fb90 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -0,0 +1,1201 @@ +"""This module contains routines and data structures to: + +- Find the best possible split of a node. For a given node, a split is + characterized by a feature and a bin. +- Apply a split to a node, i.e. split the indices of the samples at the node + into the newly created left and right children. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +cimport cython +from cython.parallel import prange +import numpy as np +from libc.math cimport INFINITY, ceil +from libc.stdlib cimport malloc, free, qsort +from libc.string cimport memcpy + +from ...utils._typedefs cimport uint8_t +from .common cimport X_BINNED_DTYPE_C +from .common cimport Y_DTYPE_C +from .common cimport hist_struct +from .common cimport BITSET_INNER_DTYPE_C +from .common cimport BITSET_DTYPE_C +from .common cimport MonotonicConstraint +from ._bitset cimport init_bitset +from ._bitset cimport set_bitset +from ._bitset cimport in_bitset + + +cdef struct split_info_struct: + # Same as the SplitInfo class, but we need a C struct to use it in the + # nogil sections and to use in arrays. + Y_DTYPE_C gain + int feature_idx + unsigned int bin_idx + uint8_t missing_go_to_left + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + unsigned int n_samples_left + unsigned int n_samples_right + Y_DTYPE_C value_left + Y_DTYPE_C value_right + uint8_t is_categorical + BITSET_DTYPE_C left_cat_bitset + + +# used in categorical splits for sorting categories by increasing values of +# sum_gradients / sum_hessians +cdef struct categorical_info: + X_BINNED_DTYPE_C bin_idx + Y_DTYPE_C value + + +class SplitInfo: + """Pure data class to store information about a potential split. + + Parameters + ---------- + gain : float + The gain of the split. + feature_idx : int + The index of the feature to be split. + bin_idx : int + The index of the bin on which the split is made. Should be ignored if + `is_categorical` is True: `left_cat_bitset` will be used to determine + the split. + missing_go_to_left : bool + Whether missing values should go to the left child. This is used + whether the split is categorical or not. + sum_gradient_left : float + The sum of the gradients of all the samples in the left child. + sum_hessian_left : float + The sum of the hessians of all the samples in the left child. + sum_gradient_right : float + The sum of the gradients of all the samples in the right child. + sum_hessian_right : float + The sum of the hessians of all the samples in the right child. + n_samples_left : int, default=0 + The number of samples in the left child. + n_samples_right : int + The number of samples in the right child. + is_categorical : bool + Whether the split is done on a categorical feature. + left_cat_bitset : ndarray of shape=(8,), dtype=uint32 or None + Bitset representing the categories that go to the left. This is used + only when `is_categorical` is True. 
+ Note that missing values are part of that bitset if there are missing + values in the training data. For missing values, we rely on that + bitset for splitting, but at prediction time, we rely on + missing_go_to_left. + """ + def __init__(self, gain, feature_idx, bin_idx, + missing_go_to_left, sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, n_samples_left, + n_samples_right, value_left, value_right, + is_categorical, left_cat_bitset): + self.gain = gain + self.feature_idx = feature_idx + self.bin_idx = bin_idx + self.missing_go_to_left = missing_go_to_left + self.sum_gradient_left = sum_gradient_left + self.sum_hessian_left = sum_hessian_left + self.sum_gradient_right = sum_gradient_right + self.sum_hessian_right = sum_hessian_right + self.n_samples_left = n_samples_left + self.n_samples_right = n_samples_right + self.value_left = value_left + self.value_right = value_right + self.is_categorical = is_categorical + self.left_cat_bitset = left_cat_bitset + + +@cython.final +cdef class Splitter: + """Splitter used to find the best possible split at each node. + + A split (see SplitInfo) is characterized by a feature and a bin. + + The Splitter is also responsible for partitioning the samples among the + leaves of the tree (see split_indices() and the partition attribute). + + Parameters + ---------- + X_binned : ndarray of int, shape (n_samples, n_features) + The binned input samples. Must be Fortran-aligned. + n_bins_non_missing : ndarray, shape (n_features,) + For each feature, gives the number of bins actually used for + non-missing values. + missing_values_bin_idx : uint8 + Index of the bin that is used for missing values. This is the index of + the last bin and is always equal to max_bins (as passed to the GBDT + classes), or equivalently to n_bins - 1. + has_missing_values : ndarray, shape (n_features,) + Whether missing values were observed in the training data, for each + feature. + is_categorical : ndarray of bool of shape (n_features,) + Indicates categorical features. + monotonic_cst : ndarray of int of shape (n_features,), dtype=int + Indicates the monotonic constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + Read more in the :ref:`User Guide `. + l2_regularization : float + The L2 regularization parameter. + min_hessian_to_split : float, default=1e-3 + The minimum sum of hessians needed in each node. Splits that result in + at least one child having a sum of hessians less than + min_hessian_to_split are discarded. + min_samples_leaf : int, default=20 + The minimum number of samples per leaf. + min_gain_to_split : float, default=0.0 + The minimum gain needed to split a node. Splits with lower gain will + be ignored. + hessians_are_constant: bool, default is False + Whether hessians are constant. + feature_fraction_per_split : float, default=1 + Proportion of randomly chosen features in each and every node split. + This is a form of regularization, smaller values make the trees weaker + learners and might prevent overfitting. + rng : Generator + n_threads : int, default=1 + Number of OpenMP threads to use. 
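+
+    A minimal sketch of how the splitter is driven by the tree grower
+    (illustration only; the variable names are placeholders)::
+
+        split_info = splitter.find_node_split(
+            n_samples, histograms, sum_gradients, sum_hessians, value)
+        if split_info.gain > 0:
+            left, right, right_pos = splitter.split_indices(
+                split_info, sample_indices)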
+ """ + cdef public: + const X_BINNED_DTYPE_C [::1, :] X_binned + unsigned int n_features + const unsigned int [::1] n_bins_non_missing + uint8_t missing_values_bin_idx + const uint8_t [::1] has_missing_values + const uint8_t [::1] is_categorical + const signed char [::1] monotonic_cst + uint8_t hessians_are_constant + Y_DTYPE_C l2_regularization + Y_DTYPE_C min_hessian_to_split + unsigned int min_samples_leaf + Y_DTYPE_C min_gain_to_split + Y_DTYPE_C feature_fraction_per_split + rng + + unsigned int [::1] partition + unsigned int [::1] left_indices_buffer + unsigned int [::1] right_indices_buffer + int n_threads + + def __init__(self, + const X_BINNED_DTYPE_C [::1, :] X_binned, + const unsigned int [::1] n_bins_non_missing, + const uint8_t missing_values_bin_idx, + const uint8_t [::1] has_missing_values, + const uint8_t [::1] is_categorical, + const signed char [::1] monotonic_cst, + Y_DTYPE_C l2_regularization, + Y_DTYPE_C min_hessian_to_split=1e-3, + unsigned int min_samples_leaf=20, + Y_DTYPE_C min_gain_to_split=0., + uint8_t hessians_are_constant=False, + Y_DTYPE_C feature_fraction_per_split=1.0, + rng=np.random.RandomState(), + unsigned int n_threads=1): + + self.X_binned = X_binned + self.n_features = X_binned.shape[1] + self.n_bins_non_missing = n_bins_non_missing + self.missing_values_bin_idx = missing_values_bin_idx + self.has_missing_values = has_missing_values + self.is_categorical = is_categorical + self.monotonic_cst = monotonic_cst + self.l2_regularization = l2_regularization + self.min_hessian_to_split = min_hessian_to_split + self.min_samples_leaf = min_samples_leaf + self.min_gain_to_split = min_gain_to_split + self.hessians_are_constant = hessians_are_constant + self.feature_fraction_per_split = feature_fraction_per_split + self.rng = rng + self.n_threads = n_threads + + # The partition array maps each sample index into the leaves of the + # tree (a leaf in this context is a node that isn't split yet, not + # necessarily a 'finalized' leaf). Initially, the root contains all + # the indices, e.g.: + # partition = [abcdefghijkl] + # After a call to split_indices, it may look e.g. like this: + # partition = [cef|abdghijkl] + # we have 2 leaves, the left one is at position 0 and the second one at + # position 3. The order of the samples is irrelevant. + self.partition = np.arange(X_binned.shape[0], dtype=np.uint32) + # buffers used in split_indices to support parallel splitting. + self.left_indices_buffer = np.empty_like(self.partition) + self.right_indices_buffer = np.empty_like(self.partition) + + def split_indices(Splitter self, split_info, unsigned int [::1] + sample_indices): + """Split samples into left and right arrays. + + The split is performed according to the best possible split + (split_info). + + Ultimately, this is nothing but a partition of the sample_indices + array with a given pivot, exactly like a quicksort subroutine. + + Parameters + ---------- + split_info : SplitInfo + The SplitInfo of the node to split. + sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) + The indices of the samples at the node to split. This is a view + on self.partition, and it is modified inplace by placing the + indices of the left child at the beginning, and the indices of + the right child at the end. + + Returns + ------- + left_indices : ndarray of int, shape (n_left_samples,) + The indices of the samples in the left child. This is a view on + self.partition. 
+ right_indices : ndarray of int, shape (n_right_samples,) + The indices of the samples in the right child. This is a view on + self.partition. + right_child_position : int + The position of the right child in ``sample_indices``. + """ + # This is a multi-threaded implementation inspired by lightgbm. Here + # is a quick break down. Let's suppose we want to split a node with 24 + # samples named from a to x. self.partition looks like this (the * are + # indices in other leaves that we don't care about): + # partition = [*************abcdefghijklmnopqrstuvwx****************] + # ^ ^ + # node_position node_position + node.n_samples + + # Ultimately, we want to reorder the samples inside the boundaries of + # the leaf (which becomes a node) to now represent the samples in its + # left and right child. For example: + # partition = [*************abefilmnopqrtuxcdghjksvw*****************] + # ^ ^ + # left_child_pos right_child_pos + # Note that left_child_pos always takes the value of node_position, + # and right_child_pos = left_child_pos + left_child.n_samples. The + # order of the samples inside a leaf is irrelevant. + + # 1. sample_indices is a view on this region a..x. We conceptually + # divide it into n_threads regions. Each thread will be responsible + # for its own region. Here is an example with 4 threads: + # sample_indices = [abcdef|ghijkl|mnopqr|stuvwx] + # 2. Each thread processes 6 = 24 // 4 entries and maps them into + # left_indices_buffer or right_indices_buffer. For example, we could + # have the following mapping ('.' denotes an undefined entry): + # - left_indices_buffer = [abef..|il....|mnopqr|tux...] + # - right_indices_buffer = [cd....|ghjk..|......|svw...] + # 3. We keep track of the start positions of the regions (the '|') in + # ``offset_in_buffers`` as well as the size of each region. We also + # keep track of the number of samples put into the left/right child + # by each thread. Concretely: + # - left_counts = [4, 2, 6, 3] + # - right_counts = [2, 4, 0, 3] + # 4. Finally, we put left/right_indices_buffer back into the + # sample_indices, without any undefined entries and the partition + # looks as expected + # partition = [*************abefilmnopqrtuxcdghjksvw***************] + + # Note: We here show left/right_indices_buffer as being the same size + # as sample_indices for simplicity, but in reality they are of the + # same size as partition. 
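+
+        # Single-threaded reference (illustration only): up to the order of
+        # samples inside each child, which is irrelevant here, the net effect
+        # on sample_indices is equivalent to
+        #
+        #     left = [i for i in sample_indices if goes_left(i)]
+        #     right = [i for i in sample_indices if not goes_left(i)]
+        #     sample_indices[:] = left + right
+        #     right_child_position = len(left)
+        #
+        # where goes_left() stands for the sample_goes_left() helper defined
+        # at the end of this module.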
+ + cdef: + int n_samples = sample_indices.shape[0] + X_BINNED_DTYPE_C bin_idx = split_info.bin_idx + uint8_t missing_go_to_left = split_info.missing_go_to_left + uint8_t missing_values_bin_idx = self.missing_values_bin_idx + int feature_idx = split_info.feature_idx + const X_BINNED_DTYPE_C [::1] X_binned = \ + self.X_binned[:, feature_idx] + unsigned int [::1] left_indices_buffer = self.left_indices_buffer + unsigned int [::1] right_indices_buffer = self.right_indices_buffer + uint8_t is_categorical = split_info.is_categorical + # Cython is unhappy if we set left_cat_bitset to + # split_info.left_cat_bitset directly, so we need a tmp var + BITSET_INNER_DTYPE_C [:] cat_bitset_tmp = split_info.left_cat_bitset + BITSET_DTYPE_C left_cat_bitset + int n_threads = self.n_threads + + int [:] sizes = np.full(n_threads, n_samples // n_threads, + dtype=np.int32) + int [:] offset_in_buffers = np.zeros(n_threads, dtype=np.int32) + int [:] left_counts = np.empty(n_threads, dtype=np.int32) + int [:] right_counts = np.empty(n_threads, dtype=np.int32) + int left_count + int right_count + int start + int stop + int i + int thread_idx + int sample_idx + int right_child_position + uint8_t turn_left + int [:] left_offset = np.zeros(n_threads, dtype=np.int32) + int [:] right_offset = np.zeros(n_threads, dtype=np.int32) + + # only set left_cat_bitset when is_categorical is True + if is_categorical: + left_cat_bitset = &cat_bitset_tmp[0] + + with nogil: + for thread_idx in range(n_samples % n_threads): + sizes[thread_idx] += 1 + + for thread_idx in range(1, n_threads): + offset_in_buffers[thread_idx] = \ + offset_in_buffers[thread_idx - 1] + sizes[thread_idx - 1] + + # map indices from sample_indices to left/right_indices_buffer + for thread_idx in prange(n_threads, schedule='static', + chunksize=1, num_threads=n_threads): + left_count = 0 + right_count = 0 + + start = offset_in_buffers[thread_idx] + stop = start + sizes[thread_idx] + for i in range(start, stop): + sample_idx = sample_indices[i] + turn_left = sample_goes_left( + missing_go_to_left, + missing_values_bin_idx, bin_idx, + X_binned[sample_idx], is_categorical, + left_cat_bitset) + + if turn_left: + left_indices_buffer[start + left_count] = sample_idx + left_count = left_count + 1 + else: + right_indices_buffer[start + right_count] = sample_idx + right_count = right_count + 1 + + left_counts[thread_idx] = left_count + right_counts[thread_idx] = right_count + + # position of right child = just after the left child + right_child_position = 0 + for thread_idx in range(n_threads): + right_child_position += left_counts[thread_idx] + + # offset of each thread in sample_indices for left and right + # child, i.e. where each thread will start to write. + right_offset[0] = right_child_position + for thread_idx in range(1, n_threads): + left_offset[thread_idx] = \ + left_offset[thread_idx - 1] + left_counts[thread_idx - 1] + right_offset[thread_idx] = \ + right_offset[thread_idx - 1] + right_counts[thread_idx - 1] + + # map indices in left/right_indices_buffer back into + # sample_indices. This also updates self.partition since + # sample_indices is a view. + for thread_idx in prange(n_threads, schedule='static', + chunksize=1, num_threads=n_threads): + memcpy( + &sample_indices[left_offset[thread_idx]], + &left_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * left_counts[thread_idx] + ) + if right_counts[thread_idx] > 0: + # If we're splitting the rightmost node of the tree, i.e. 
the + # rightmost node in the partition array, and if n_threads >= 2, one + # might have right_counts[-1] = 0 and right_offset[-1] = len(sample_indices) + # leading to evaluating + # + # &sample_indices[right_offset[-1]] = &samples_indices[n_samples_at_node] + # = &partition[n_samples_in_tree] + # + # which is an out-of-bounds read access that can cause a segmentation fault. + # When boundscheck=True, removing this check produces this exception: + # + # IndexError: Out of bounds on buffer access + # + memcpy( + &sample_indices[right_offset[thread_idx]], + &right_indices_buffer[offset_in_buffers[thread_idx]], + sizeof(unsigned int) * right_counts[thread_idx] + ) + + return (sample_indices[:right_child_position], + sample_indices[right_child_position:], + right_child_position) + + def find_node_split( + Splitter self, + unsigned int n_samples, + hist_struct [:, ::1] histograms, # IN + const Y_DTYPE_C sum_gradients, + const Y_DTYPE_C sum_hessians, + const Y_DTYPE_C value, + const Y_DTYPE_C lower_bound=-INFINITY, + const Y_DTYPE_C upper_bound=INFINITY, + const unsigned int [:] allowed_features=None, + ): + """For each feature, find the best bin to split on at a given node. + + Return the best split info among all features. + + Parameters + ---------- + n_samples : int + The number of samples at the node. + histograms : ndarray of HISTOGRAM_DTYPE of \ + shape (n_features, max_bins) + The histograms of the current node. + sum_gradients : float + The sum of the gradients for each sample at the node. + sum_hessians : float + The sum of the hessians for each sample at the node. + value : float + The bounded value of the current node. We directly pass the value + instead of re-computing it from sum_gradients and sum_hessians, + because we need to compute the loss and the gain based on the + *bounded* value: computing the value from + sum_gradients / sum_hessians would give the unbounded value, and + the interaction with min_gain_to_split would not be correct + anymore. Side note: we can't use the lower_bound / upper_bound + parameters either because these refer to the bounds of the + children, not the bounds of the current node. + lower_bound : float + Lower bound for the children values for respecting the monotonic + constraints. + upper_bound : float + Upper bound for the children values for respecting the monotonic + constraints. + allowed_features : None or ndarray, dtype=np.uint32 + Indices of the features that are allowed by interaction constraints to be + split. + + Returns + ------- + best_split_info : SplitInfo + The info about the best possible split among all features. + """ + cdef: + int feature_idx + int split_info_idx + int best_split_info_idx + int n_allowed_features + split_info_struct split_info + split_info_struct * split_infos + const uint8_t [::1] has_missing_values = self.has_missing_values + const uint8_t [::1] is_categorical = self.is_categorical + const signed char [::1] monotonic_cst = self.monotonic_cst + int n_threads = self.n_threads + bint has_interaction_cst = False + Y_DTYPE_C feature_fraction_per_split = self.feature_fraction_per_split + uint8_t [:] subsample_mask # same as npy_bool + int n_subsampled_features + + has_interaction_cst = allowed_features is not None + if has_interaction_cst: + n_allowed_features = allowed_features.shape[0] + else: + n_allowed_features = self.n_features + + if feature_fraction_per_split < 1.0: + # We do all random sampling before the nogil and make sure that we sample + # exactly n_subsampled_features >= 1 features. 
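+            # Worked example (illustration): with n_allowed_features = 5 and
+            # feature_fraction_per_split = 0.5, ceil(0.5 * 5) = 3, so exactly
+            # 3 entries of subsample_mask are True after the shuffle, e.g.
+            # [True, False, True, True, False].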
+ n_subsampled_features = max( + 1, + int(ceil(feature_fraction_per_split * n_allowed_features)), + ) + subsample_mask_arr = np.full(n_allowed_features, False) + subsample_mask_arr[:n_subsampled_features] = True + self.rng.shuffle(subsample_mask_arr) + # https://github.com/numpy/numpy/issues/18273 + subsample_mask = subsample_mask_arr + + with nogil: + + split_infos = malloc( + n_allowed_features * sizeof(split_info_struct)) + + # split_info_idx is index of split_infos of size n_allowed_features. + # features_idx is the index of the feature column in X. + for split_info_idx in prange(n_allowed_features, schedule='static', + num_threads=n_threads): + if has_interaction_cst: + feature_idx = allowed_features[split_info_idx] + else: + feature_idx = split_info_idx + + split_infos[split_info_idx].feature_idx = feature_idx + + # For each feature, find best bin to split on + # Start with a gain of -1 if no better split is found, that + # means one of the constraints isn't respected + # (min_samples_leaf, etc.) and the grower will later turn the + # node into a leaf. + split_infos[split_info_idx].gain = -1 + split_infos[split_info_idx].is_categorical = is_categorical[feature_idx] + + # Note that subsample_mask is indexed by split_info_idx and not by + # feature_idx because we only need to exclude the same features again + # and again. We do NOT need to access the features directly by using + # allowed_features. + if feature_fraction_per_split < 1.0 and not subsample_mask[split_info_idx]: + continue + + if is_categorical[feature_idx]: + self._find_best_bin_to_split_category( + feature_idx, has_missing_values[feature_idx], + histograms, n_samples, sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], lower_bound, + upper_bound, &split_infos[split_info_idx]) + else: + # We will scan bins from left to right (in all cases), and + # if there are any missing values, we will also scan bins + # from right to left. This way, we can consider whichever + # case yields the best gain: either missing values go to + # the right (left to right scan) or to the left (right to + # left case). See algo 3 from the XGBoost paper + # https://arxiv.org/abs/1603.02754 + # Note: for the categorical features above, this isn't + # needed since missing values are considered a native + # category. 
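+                    # Small example (illustration): with non-missing bins
+                    # b0, b1, b2 and a missing-values bin, the left-to-right
+                    # scan considers {b0 | b1 b2 nan}, {b0 b1 | b2 nan} and
+                    # {b0 b1 b2 | nan}, while the right-to-left scan adds
+                    # {nan b0 b1 | b2} and {nan b0 | b1 b2}, so both possible
+                    # destinations for the missing values are evaluated.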
+ self._find_best_bin_to_split_left_to_right( + feature_idx, has_missing_values[feature_idx], + histograms, n_samples, sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[split_info_idx]) + + if has_missing_values[feature_idx]: + # We need to explore both directions to check whether + # sending the nans to the left child would lead to a higher + # gain + self._find_best_bin_to_split_right_to_left( + feature_idx, histograms, n_samples, + sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[split_info_idx]) + + # then compute best possible split among all features + # split_info is set to the best of split_infos + best_split_info_idx = self._find_best_feature_to_split_helper( + split_infos, n_allowed_features + ) + split_info = split_infos[best_split_info_idx] + + out = SplitInfo( + split_info.gain, + split_info.feature_idx, + split_info.bin_idx, + split_info.missing_go_to_left, + split_info.sum_gradient_left, + split_info.sum_hessian_left, + split_info.sum_gradient_right, + split_info.sum_hessian_right, + split_info.n_samples_left, + split_info.n_samples_right, + split_info.value_left, + split_info.value_right, + split_info.is_categorical, + None, # left_cat_bitset will only be set if the split is categorical + ) + # Only set bitset if the split is categorical + if split_info.is_categorical: + out.left_cat_bitset = np.asarray(split_info.left_cat_bitset, dtype=np.uint32) + + free(split_infos) + return out + + cdef int _find_best_feature_to_split_helper( + self, + split_info_struct * split_infos, # IN + int n_allowed_features, + ) noexcept nogil: + """Return the index of split_infos with the best feature split.""" + cdef: + int split_info_idx + int best_split_info_idx = 0 + + for split_info_idx in range(1, n_allowed_features): + if (split_infos[split_info_idx].gain > split_infos[best_split_info_idx].gain): + best_split_info_idx = split_info_idx + return best_split_info_idx + + cdef void _find_best_bin_to_split_left_to_right( + Splitter self, + unsigned int feature_idx, + uint8_t has_missing_values, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + signed char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + split_info_struct * split_info) noexcept nogil: # OUT + """Find best bin to split on for a given feature. + + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. + + We scan node from left to right. This version is called whether there + are missing values or not. If any, missing values are assigned to the + right node. + """ + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + # We set the 'end' variable such that the last non-missing-values + # bin never goes to the left child (which would result in and + # empty right child), unless there are missing values, since these + # would go to the right child. 
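+            # Concrete illustration: with n_bins_non_missing = 5 and no
+            # missing values, end = 4, so the last non-missing bin can never
+            # be sent to the left child; with missing values, end = 5 and
+            # that bin may go left because the missing-values bin still
+            # feeds the right child.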
+ unsigned int end = \ + self.n_bins_non_missing[feature_idx] - 1 + has_missing_values + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C loss_current_node + Y_DTYPE_C gain + uint8_t found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = -1 + hist_struct hist + + sum_gradient_left, sum_hessian_left = 0., 0. + n_samples_left = 0 + + loss_current_node = _loss_from_value(value, sum_gradients) + + for bin_idx in range(end): + hist = histograms[feature_idx, bin_idx] + n_samples_left += hist.count + n_samples_right = n_samples_ - n_samples_left + + if self.hessians_are_constant: + sum_hessian_left += hist.count + else: + sum_hessian_left += \ + hist.sum_hessians + sum_hessian_right = sum_hessians - sum_hessian_left + + sum_gradient_left += hist.sum_gradients + sum_gradient_right = sum_gradients - sum_gradient_left + + if n_samples_left < self.min_samples_leaf: + continue + if n_samples_right < self.min_samples_leaf: + # won't get any better + break + + if sum_hessian_left < self.min_hessian_to_split: + continue + if sum_hessian_right < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break + + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, + self.l2_regularization) + + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from left to right so missing values go to the right + split_info.missing_go_to_left = False + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + + cdef void _find_best_bin_to_split_right_to_left( + self, + unsigned int feature_idx, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + signed char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + split_info_struct * split_info) noexcept nogil: # OUT + """Find best bin to split on for a given feature. + + Splits that do not satisfy the splitting constraints + (min_gain_to_split, etc.) are discarded here. + + We scan node from right to left. This version is only called when + there are missing values. Missing values are assigned to the left + child. 
+ + If no missing value are present in the data this method isn't called + since only calling _find_best_bin_to_split_left_to_right is enough. + """ + + cdef: + unsigned int bin_idx + unsigned int n_samples_left + unsigned int n_samples_right + unsigned int n_samples_ = n_samples + Y_DTYPE_C sum_hessian_left + Y_DTYPE_C sum_hessian_right + Y_DTYPE_C sum_gradient_left + Y_DTYPE_C sum_gradient_right + Y_DTYPE_C loss_current_node + Y_DTYPE_C gain + unsigned int start = self.n_bins_non_missing[feature_idx] - 2 + uint8_t found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = split_info.gain # computed during previous scan + hist_struct hist + + sum_gradient_right, sum_hessian_right = 0., 0. + n_samples_right = 0 + + loss_current_node = _loss_from_value(value, sum_gradients) + + for bin_idx in range(start, -1, -1): + hist = histograms[feature_idx, bin_idx + 1] + n_samples_right += hist.count + n_samples_left = n_samples_ - n_samples_right + + if self.hessians_are_constant: + sum_hessian_right += hist.count + else: + sum_hessian_right += \ + hist.sum_hessians + sum_hessian_left = sum_hessians - sum_hessian_right + + sum_gradient_right += \ + hist.sum_gradients + sum_gradient_left = sum_gradients - sum_gradient_right + + if n_samples_right < self.min_samples_leaf: + continue + if n_samples_left < self.min_samples_leaf: + # won't get any better + break + + if sum_hessian_right < self.min_hessian_to_split: + continue + if sum_hessian_left < self.min_hessian_to_split: + # won't get any better (hessians are > 0 since loss is convex) + break + + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, + self.l2_regularization) + + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from right to left so missing values go to the left + split_info.missing_go_to_left = True + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + + cdef void _find_best_bin_to_split_category( + self, + unsigned int feature_idx, + uint8_t has_missing_values, + const hist_struct [:, ::1] histograms, # IN + unsigned int n_samples, + Y_DTYPE_C sum_gradients, + Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + split_info_struct * split_info) noexcept nogil: # OUT + """Find best split for categorical features. 
+ + Categories are first sorted according to their variance, and then + a scan is performed as if categories were ordered quantities. + + Ref: "On Grouping for Maximum Homogeneity", Walter D. Fisher + """ + + cdef: + unsigned int bin_idx + unsigned int n_bins_non_missing = self.n_bins_non_missing[feature_idx] + unsigned int missing_values_bin_idx = self.missing_values_bin_idx + categorical_info * cat_infos + unsigned int sorted_cat_idx + unsigned int n_used_bins = 0 + int [2] scan_direction + int direction = 0 + int best_direction = 0 + unsigned int middle + unsigned int i + const hist_struct[::1] feature_hist = histograms[feature_idx, :] + hist_struct hist + Y_DTYPE_C sum_gradients_bin + Y_DTYPE_C sum_hessians_bin + Y_DTYPE_C loss_current_node + Y_DTYPE_C sum_gradient_left, sum_hessian_left + Y_DTYPE_C sum_gradient_right, sum_hessian_right + unsigned int n_samples_left, n_samples_right + Y_DTYPE_C gain + Y_DTYPE_C best_gain = -1.0 + uint8_t found_better_split = False + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_n_samples_left + unsigned int best_cat_infos_thresh + # Reduces the effect of noises in categorical features, + # especially for categories with few data. Called cat_smooth in + # LightGBM. TODO: Make this user adjustable? + Y_DTYPE_C MIN_CAT_SUPPORT = 10. + # this is equal to 1 for losses where hessians are constant + Y_DTYPE_C support_factor = n_samples / sum_hessians + + # Details on the split finding: + # We first order categories by their sum_gradients / sum_hessians + # values, and we exclude categories that don't respect MIN_CAT_SUPPORT + # from this sorted array. Missing values are treated just like any + # other category. The low-support categories will always be mapped to + # the right child. We scan the sorted categories array from left to + # right and from right to left, and we stop at the middle. + + # Considering ordered categories A B C D, with E being a low-support + # category: A B C D + # ^ + # midpoint + # The scans will consider the following split-points: + # * left to right: + # A - B C D E + # A B - C D E + # * right to left: + # D - A B C E + # C D - A B E + + # Note that since we stop at the middle and since low-support + # categories (E) are always mapped to the right, the following splits + # aren't considered: + # A E - B C D + # D E - A B C + # Basically, we're forcing E to always be mapped to the child that has + # *at least half of the categories* (and this child is always the right + # child, by convention). + + # Also note that if we scanned in only one direction (e.g. 
left to + # right), we would only consider the following splits: + # A - B C D E + # A B - C D E + # A B C - D E + # and thus we would be missing on D - A B C E and on C D - A B E + + cat_infos = malloc( + (n_bins_non_missing + has_missing_values) * sizeof(categorical_info)) + + # fill cat_infos while filtering out categories based on MIN_CAT_SUPPORT + for bin_idx in range(n_bins_non_missing): + hist = feature_hist[bin_idx] + if self.hessians_are_constant: + sum_hessians_bin = hist.count + else: + sum_hessians_bin = hist.sum_hessians + if sum_hessians_bin * support_factor >= MIN_CAT_SUPPORT: + cat_infos[n_used_bins].bin_idx = bin_idx + sum_gradients_bin = hist.sum_gradients + + cat_infos[n_used_bins].value = ( + sum_gradients_bin / (sum_hessians_bin + MIN_CAT_SUPPORT) + ) + n_used_bins += 1 + + # Also add missing values bin so that nans are considered as a category + if has_missing_values: + hist = feature_hist[missing_values_bin_idx] + if self.hessians_are_constant: + sum_hessians_bin = hist.count + else: + sum_hessians_bin = hist.sum_hessians + if sum_hessians_bin * support_factor >= MIN_CAT_SUPPORT: + cat_infos[n_used_bins].bin_idx = missing_values_bin_idx + sum_gradients_bin = ( + hist.sum_gradients + ) + + cat_infos[n_used_bins].value = ( + sum_gradients_bin / (sum_hessians_bin + MIN_CAT_SUPPORT) + ) + n_used_bins += 1 + + # not enough categories to form a split + if n_used_bins <= 1: + free(cat_infos) + return + + qsort(cat_infos, n_used_bins, sizeof(categorical_info), + compare_cat_infos) + + loss_current_node = _loss_from_value(value, sum_gradients) + + scan_direction[0], scan_direction[1] = 1, -1 + for direction in scan_direction: + if direction == 1: + middle = (n_used_bins + 1) // 2 + else: + middle = (n_used_bins + 1) // 2 - 1 + + # The categories we'll consider will go to the left child + sum_gradient_left, sum_hessian_left = 0., 0. 
+ n_samples_left = 0 + + for i in range(middle): + sorted_cat_idx = i if direction == 1 else n_used_bins - 1 - i + bin_idx = cat_infos[sorted_cat_idx].bin_idx + hist = feature_hist[bin_idx] + + n_samples_left += hist.count + n_samples_right = n_samples - n_samples_left + + if self.hessians_are_constant: + sum_hessian_left += hist.count + else: + sum_hessian_left += hist.sum_hessians + sum_hessian_right = sum_hessians - sum_hessian_left + + sum_gradient_left += hist.sum_gradients + sum_gradient_right = sum_gradients - sum_gradient_left + + if ( + n_samples_left < self.min_samples_leaf or + sum_hessian_left < self.min_hessian_to_split + ): + continue + if ( + n_samples_right < self.min_samples_leaf or + sum_hessian_right < self.min_hessian_to_split + ): + break + + gain = _split_gain(sum_gradient_left, sum_hessian_left, + sum_gradient_right, sum_hessian_right, + loss_current_node, monotonic_cst, + lower_bound, upper_bound, + self.l2_regularization) + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_cat_infos_thresh = sorted_cat_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + best_direction = direction + + if found_better_split: + split_info.gain = best_gain + + # split_info.bin_idx is unused for categorical splits: left_cat_bitset + # is used instead and set below + split_info.bin_idx = 0 + + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + + # create bitset with values from best_cat_infos_thresh + init_bitset(split_info.left_cat_bitset) + if best_direction == 1: + for sorted_cat_idx in range(best_cat_infos_thresh + 1): + bin_idx = cat_infos[sorted_cat_idx].bin_idx + set_bitset(split_info.left_cat_bitset, bin_idx) + else: + for sorted_cat_idx in range(n_used_bins - 1, best_cat_infos_thresh - 1, -1): + bin_idx = cat_infos[sorted_cat_idx].bin_idx + set_bitset(split_info.left_cat_bitset, bin_idx) + + if has_missing_values: + split_info.missing_go_to_left = in_bitset( + split_info.left_cat_bitset, missing_values_bin_idx) + + free(cat_infos) + + +cdef int compare_cat_infos(const void * a, const void * b) noexcept nogil: + return -1 if (a).value < (b).value else 1 + +cdef inline Y_DTYPE_C _split_gain( + Y_DTYPE_C sum_gradient_left, + Y_DTYPE_C sum_hessian_left, + Y_DTYPE_C sum_gradient_right, + Y_DTYPE_C sum_hessian_right, + Y_DTYPE_C loss_current_node, + signed char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + Y_DTYPE_C l2_regularization) noexcept nogil: + """Loss reduction + + Compute the reduction in loss after taking a split, compared to keeping + the node a leaf of the tree. + + See Equation 7 of: + :arxiv:`T. Chen, C. 
Guestrin, (2016) XGBoost: A Scalable Tree Boosting System, + <1603.02754>.` + """ + cdef: + Y_DTYPE_C gain + Y_DTYPE_C value_left + Y_DTYPE_C value_right + + # Compute values of potential left and right children + value_left = compute_node_value(sum_gradient_left, sum_hessian_left, + lower_bound, upper_bound, + l2_regularization) + value_right = compute_node_value(sum_gradient_right, sum_hessian_right, + lower_bound, upper_bound, + l2_regularization) + + if ((monotonic_cst == MonotonicConstraint.POS and value_left > value_right) or + (monotonic_cst == MonotonicConstraint.NEG and value_left < value_right)): + # don't consider this split since it does not respect the monotonic + # constraints. Note that these comparisons need to be done on values + # that have already been clipped to take the monotonic constraints into + # account (if any). + return -1 + + gain = loss_current_node + gain -= _loss_from_value(value_left, sum_gradient_left) + gain -= _loss_from_value(value_right, sum_gradient_right) + # Note that for the gain to be correct (and for min_gain_to_split to work + # as expected), we need all values to be bounded (current node, left child + # and right child). + + return gain + +cdef inline Y_DTYPE_C _loss_from_value( + Y_DTYPE_C value, + Y_DTYPE_C sum_gradient) noexcept nogil: + """Return loss of a node from its (bounded) value + + See Equation 6 of: + :arxiv:`T. Chen, C. Guestrin, (2016) XGBoost: A Scalable Tree Boosting System, + <1603.02754>.` + """ + return sum_gradient * value + +cdef inline uint8_t sample_goes_left( + uint8_t missing_go_to_left, + uint8_t missing_values_bin_idx, + X_BINNED_DTYPE_C split_bin_idx, + X_BINNED_DTYPE_C bin_value, + uint8_t is_categorical, + BITSET_DTYPE_C left_cat_bitset) noexcept nogil: + """Helper to decide whether sample should go to left or right child.""" + + if is_categorical: + # note: if any, missing values are encoded in left_cat_bitset + return in_bitset(left_cat_bitset, bin_value) + else: + return ( + ( + missing_go_to_left and + bin_value == missing_values_bin_idx + ) + or ( + bin_value <= split_bin_idx + )) + + +cpdef inline Y_DTYPE_C compute_node_value( + Y_DTYPE_C sum_gradient, + Y_DTYPE_C sum_hessian, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + Y_DTYPE_C l2_regularization) noexcept nogil: + """Compute a node's value. + + The value is capped in the [lower_bound, upper_bound] interval to respect + monotonic constraints. Shrinkage is ignored. + + See Equation 5 of: + :arxiv:`T. Chen, C. 
Guestrin, (2016) XGBoost: A Scalable Tree Boosting System, + <1603.02754>.` + """ + + cdef: + Y_DTYPE_C value + + value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15) + + if value < lower_bound: + value = lower_bound + elif value > upper_bound: + value = upper_bound + + return value diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py new file mode 100644 index 0000000000000000000000000000000000000000..6f9fcd0057141a398611ff94d528b1317ba4a0fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py @@ -0,0 +1,489 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.ensemble._hist_gradient_boosting.binning import ( + _BinMapper, + _find_binning_thresholds, + _map_to_bins, +) +from sklearn.ensemble._hist_gradient_boosting.common import ( + ALMOST_INF, + X_BINNED_DTYPE, + X_DTYPE, +) +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() + + +DATA = ( + np.random.RandomState(42) + .normal(loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2)) + .astype(X_DTYPE) +) + + +def test_find_binning_thresholds_regular_data(): + data = np.linspace(0, 10, 1001) + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds, [2, 4, 6, 8]) + + +def test_find_binning_thresholds_small_regular_data(): + data = np.linspace(0, 10, 11) + + bin_thresholds = _find_binning_thresholds(data, max_bins=5) + assert_allclose(bin_thresholds, [2, 4, 6, 8]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=10) + assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9]) + + bin_thresholds = _find_binning_thresholds(data, max_bins=11) + assert_allclose(bin_thresholds, np.arange(10) + 0.5) + + bin_thresholds = _find_binning_thresholds(data, max_bins=255) + assert_allclose(bin_thresholds, np.arange(10) + 0.5) + + +def test_find_binning_thresholds_random_data(): + bin_thresholds = [ + _find_binning_thresholds(DATA[:, i], max_bins=255) for i in range(2) + ] + for i in range(len(bin_thresholds)): + assert bin_thresholds[i].shape == (254,) # 255 - 1 + assert bin_thresholds[i].dtype == DATA.dtype + + assert_allclose( + bin_thresholds[0][[64, 128, 192]], np.array([-0.7, 0.0, 0.7]), atol=1e-1 + ) + + assert_allclose( + bin_thresholds[1][[64, 128, 192]], np.array([9.99, 10.00, 10.01]), atol=1e-2 + ) + + +def test_find_binning_thresholds_low_n_bins(): + bin_thresholds = [ + _find_binning_thresholds(DATA[:, i], max_bins=128) for i in range(2) + ] + for i in range(len(bin_thresholds)): + assert bin_thresholds[i].shape == (127,) # 128 - 1 + assert bin_thresholds[i].dtype == DATA.dtype + + +@pytest.mark.parametrize("n_bins", (2, 257)) +def test_invalid_n_bins(n_bins): + err_msg = "n_bins={} should be no smaller than 3 and no larger than 256".format( + n_bins + ) + with pytest.raises(ValueError, match=err_msg): + 
_BinMapper(n_bins=n_bins).fit(DATA) + + +def test_bin_mapper_n_features_transform(): + mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA) + err_msg = "This estimator was fitted with 2 features but 4 got passed" + with pytest.raises(ValueError, match=err_msg): + mapper.transform(np.repeat(DATA, 2, axis=1)) + + +@pytest.mark.parametrize("max_bins", [16, 128, 255]) +def test_map_to_bins(max_bins): + bin_thresholds = [ + _find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2) + ] + binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F") + is_categorical = np.zeros(2, dtype=np.uint8) + last_bin_idx = max_bins + _map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned) + assert binned.shape == DATA.shape + assert binned.dtype == np.uint8 + assert binned.flags.f_contiguous + + min_indices = DATA.argmin(axis=0) + max_indices = DATA.argmax(axis=0) + + for feature_idx, min_idx in enumerate(min_indices): + assert binned[min_idx, feature_idx] == 0 + for feature_idx, max_idx in enumerate(max_indices): + assert binned[max_idx, feature_idx] == max_bins - 1 + + +@pytest.mark.parametrize("max_bins", [5, 10, 42]) +def test_bin_mapper_random_data(max_bins): + n_samples, n_features = DATA.shape + + expected_count_per_bin = n_samples // max_bins + tol = int(0.05 * expected_count_per_bin) + + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA) + binned = mapper.transform(DATA) + + assert binned.shape == (n_samples, n_features) + assert binned.dtype == np.uint8 + assert_array_equal(binned.min(axis=0), np.array([0, 0])) + assert_array_equal(binned.max(axis=0), np.array([max_bins - 1, max_bins - 1])) + assert len(mapper.bin_thresholds_) == n_features + for bin_thresholds_feature in mapper.bin_thresholds_: + assert bin_thresholds_feature.shape == (max_bins - 1,) + assert bin_thresholds_feature.dtype == DATA.dtype + assert np.all(mapper.n_bins_non_missing_ == max_bins) + + # Check that the binned data is approximately balanced across bins. 
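+    # For instance, with n_samples = 1e6 and max_bins = 42, each bin is
+    # expected to hold about 23809 samples, and the check tolerates a
+    # deviation of up to int(0.05 * 23809) = 1190 samples.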
+ for feature_idx in range(n_features): + for bin_idx in range(max_bins): + count = (binned[:, feature_idx] == bin_idx).sum() + assert abs(count - expected_count_per_bin) < tol + + +@pytest.mark.parametrize("n_samples, max_bins", [(5, 5), (5, 10), (5, 11), (42, 255)]) +def test_bin_mapper_small_random_data(n_samples, max_bins): + data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1) + assert len(np.unique(data)) == n_samples + + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + mapper = _BinMapper(n_bins=n_bins, random_state=42) + binned = mapper.fit_transform(data) + + assert binned.shape == data.shape + assert binned.dtype == np.uint8 + assert_array_equal(binned.ravel()[np.argsort(data.ravel())], np.arange(n_samples)) + + +@pytest.mark.parametrize( + "max_bins, n_distinct, multiplier", + [ + (5, 5, 1), + (5, 5, 3), + (255, 12, 42), + ], +) +def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier): + data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1) + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(data, binned) + + +@pytest.mark.parametrize("n_distinct", [2, 7, 42]) +def test_bin_mapper_repeated_values_invariance(n_distinct): + rng = np.random.RandomState(42) + distinct_values = rng.normal(size=n_distinct) + assert len(np.unique(distinct_values)) == n_distinct + + repeated_indices = rng.randint(low=0, high=n_distinct, size=1000) + data = distinct_values[repeated_indices] + rng.shuffle(data) + assert_array_equal(np.unique(data), np.sort(distinct_values)) + + data = data.reshape(-1, 1) + + mapper_1 = _BinMapper(n_bins=n_distinct + 1) + binned_1 = mapper_1.fit_transform(data) + assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct)) + + # Adding more bins to the mapper yields the same results (same thresholds) + mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1) + binned_2 = mapper_2.fit_transform(data) + + assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0]) + assert_array_equal(binned_1, binned_2) + + +@pytest.mark.parametrize( + "max_bins, scale, offset", + [ + (3, 2, -1), + (42, 1, 0), + (255, 0.3, 42), + ], +) +def test_bin_mapper_identity_small(max_bins, scale, offset): + data = np.arange(max_bins).reshape(-1, 1) * scale + offset + # max_bins is the number of bins for non-missing values + n_bins = max_bins + 1 + binned = _BinMapper(n_bins=n_bins).fit_transform(data) + assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1)) + + +@pytest.mark.parametrize( + "max_bins_small, max_bins_large", + [ + (2, 2), + (3, 3), + (4, 4), + (42, 42), + (255, 255), + (5, 17), + (42, 255), + ], +) +def test_bin_mapper_idempotence(max_bins_small, max_bins_large): + assert max_bins_large >= max_bins_small + data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1) + mapper_small = _BinMapper(n_bins=max_bins_small + 1) + mapper_large = _BinMapper(n_bins=max_bins_small + 1) + binned_small = mapper_small.fit_transform(data) + binned_large = mapper_large.fit_transform(binned_small) + assert_array_equal(binned_small, binned_large) + + +@pytest.mark.parametrize("n_bins", [10, 100, 256]) +@pytest.mark.parametrize("diff", [-5, 0, 5]) +def test_n_bins_non_missing(n_bins, diff): + # Check that n_bins_non_missing is n_unique_values when + # there are not a lot of unique values, else n_bins - 1. 
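+    # For example, with n_bins = 10 and diff = -5 there are 5 unique values,
+    # so n_bins_non_missing_ == 5; with diff = +5 there are 15 unique values
+    # and n_bins_non_missing_ is capped at n_bins - 1 == 9.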
+ + n_unique_values = n_bins + diff + X = list(range(n_unique_values)) * 2 + X = np.array(X).reshape(-1, 1) + mapper = _BinMapper(n_bins=n_bins).fit(X) + assert np.all(mapper.n_bins_non_missing_ == min(n_bins - 1, n_unique_values)) + + +def test_subsample(): + # Make sure bin thresholds are different when applying subsampling + mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA) + mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA) + + for feature in range(DATA.shape[1]): + assert not np.allclose( + mapper_no_subsample.bin_thresholds_[feature], + mapper_subsample.bin_thresholds_[feature], + rtol=1e-4, + ) + + +@pytest.mark.parametrize( + "n_bins, n_bins_non_missing, X_trans_expected", + [ + ( + 256, + [4, 2, 2], + [ + [0, 0, 0], # 255 <=> missing value + [255, 255, 0], + [1, 0, 0], + [255, 1, 1], + [2, 1, 1], + [3, 0, 0], + ], + ), + ( + 3, + [2, 2, 2], + [ + [0, 0, 0], # 2 <=> missing value + [2, 2, 0], + [0, 0, 0], + [2, 1, 1], + [1, 1, 1], + [1, 0, 0], + ], + ), + ], +) +def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected): + # check for missing values: make sure nans are mapped to the last bin + # and that the _BinMapper attributes are correct + + X = [ + [1, 1, 0], + [np.nan, np.nan, 0], + [2, 1, 0], + [np.nan, 2, 1], + [3, 2, 1], + [4, 1, 0], + ] + + X = np.array(X) + + mapper = _BinMapper(n_bins=n_bins) + mapper.fit(X) + + assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing) + + for feature_idx in range(X.shape[1]): + assert ( + len(mapper.bin_thresholds_[feature_idx]) + == n_bins_non_missing[feature_idx] - 1 + ) + + assert mapper.missing_values_bin_idx_ == n_bins - 1 + + X_trans = mapper.transform(X) + assert_array_equal(X_trans, X_trans_expected) + + +def test_infinite_values(): + # Make sure infinite values are properly handled. + bin_mapper = _BinMapper() + + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + + bin_mapper.fit(X) + assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, 0.5, ALMOST_INF]) + assert bin_mapper.n_bins_non_missing_ == [4] + + expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1) + assert_array_equal(bin_mapper.transform(X), expected_binned_X) + + +@pytest.mark.parametrize("n_bins", [15, 256]) +def test_categorical_feature(n_bins): + # Basic test for categorical features + # we make sure that categories are mapped into [0, n_categories - 1] and + # that nans are mapped to the last bin + X = np.array( + [[4] * 500 + [1] * 3 + [10] * 4 + [0] * 4 + [13] + [7] * 5 + [np.nan] * 2], + dtype=X_DTYPE, + ).T + known_categories = [np.unique(X[~np.isnan(X)])] + + bin_mapper = _BinMapper( + n_bins=n_bins, + is_categorical=np.array([True]), + known_categories=known_categories, + ).fit(X) + assert bin_mapper.n_bins_non_missing_ == [6] + assert_array_equal(bin_mapper.bin_thresholds_[0], [0, 1, 4, 7, 10, 13]) + + X = np.array([[0, 1, 4, np.nan, 7, 10, 13]], dtype=X_DTYPE).T + expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + # Negative categories are mapped to the missing values' bin + # (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1). + # Unknown positive categories does not happen in practice and tested + # for illustration purpose. 
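+    # In the example below, -4 and -1 therefore map to bin n_bins - 1, while
+    # the unknown positive value 100 maps to bin 6, one past the bin of the
+    # largest known category (13, which is bin 5).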
+ X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T + expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + +def test_categorical_feature_negative_missing(): + """Make sure bin mapper treats negative categories as missing values.""" + X = np.array( + [[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE + ).T + bin_mapper = _BinMapper( + n_bins=4, + is_categorical=np.array([True]), + known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)], + ).fit(X) + + assert bin_mapper.n_bins_non_missing_ == [3] + + X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T + + # Negative values for categorical features are considered as missing values. + # They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`, + # which is 3 here. + assert bin_mapper.missing_values_bin_idx_ == 3 + expected_trans = np.array([[3, 0, 1, 2, 3]]).T + assert_array_equal(bin_mapper.transform(X), expected_trans) + + +@pytest.mark.parametrize("n_bins", (128, 256)) +def test_categorical_with_numerical_features(n_bins): + # basic check for binmapper with mixed data + X1 = np.arange(10, 20).reshape(-1, 1) # numerical + X2 = np.arange(10, 15).reshape(-1, 1) # categorical + X2 = np.r_[X2, X2] + X = np.c_[X1, X2] + known_categories = [None, np.unique(X2).astype(X_DTYPE)] + + bin_mapper = _BinMapper( + n_bins=n_bins, + is_categorical=np.array([False, True]), + known_categories=known_categories, + ).fit(X) + + assert_array_equal(bin_mapper.n_bins_non_missing_, [10, 5]) + + bin_thresholds = bin_mapper.bin_thresholds_ + assert len(bin_thresholds) == 2 + assert_array_equal(bin_thresholds[1], np.arange(10, 15)) + + expected_X_trans = [ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + [4, 4], + [5, 0], + [6, 1], + [7, 2], + [8, 3], + [9, 4], + ] + assert_array_equal(bin_mapper.transform(X), expected_X_trans) + + +def test_make_known_categories_bitsets(): + # Check the output of make_known_categories_bitsets + X = np.array( + [[14, 2, 30], [30, 4, 70], [40, 10, 180], [40, 240, 180]], dtype=X_DTYPE + ) + + bin_mapper = _BinMapper( + n_bins=256, + is_categorical=np.array([False, True, True]), + known_categories=[None, X[:, 1], X[:, 2]], + ) + bin_mapper.fit(X) + + known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets() + + # Note that for non-categorical features, values are left to 0 + expected_f_idx_map = np.array([0, 0, 1], dtype=np.uint8) + assert_allclose(expected_f_idx_map, f_idx_map) + + expected_cat_bitset = np.zeros((2, 8), dtype=np.uint32) + + # first categorical feature: [2, 4, 10, 240] + f_idx = 1 + mapped_f_idx = f_idx_map[f_idx] + expected_cat_bitset[mapped_f_idx, 0] = 2**2 + 2**4 + 2**10 + # 240 = 32**7 + 16, therefore the 16th bit of the 7th array is 1. 
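Editor's note: the expected bitset built above follows the packing rule that a raw category value v occupies bit v % 32 of 32-bit word v // 32, so 240 = 32 * 7 + 16 lands on bit 16 of word 7. A pure-Python sketch of that rule (not the library's Cython implementation):

    import numpy as np

    def set_raw_category(bitset, value):
        # value v occupies bit (v % 32) of 32-bit word (v // 32)
        word_idx, bit_idx = divmod(int(value), 32)
        bitset[word_idx] |= np.uint32(1 << bit_idx)

    bitset = np.zeros(8, dtype=np.uint32)
    for v in (2, 4, 10, 240):
        set_raw_category(bitset, v)
    assert bitset[0] == 2**2 + 2**4 + 2**10
    assert bitset[7] == 2**16   # 240 == 32 * 7 + 16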
+ expected_cat_bitset[mapped_f_idx, 7] = 2**16 + + # second categorical feature [30, 70, 180] + f_idx = 2 + mapped_f_idx = f_idx_map[f_idx] + expected_cat_bitset[mapped_f_idx, 0] = 2**30 + expected_cat_bitset[mapped_f_idx, 2] = 2**6 + expected_cat_bitset[mapped_f_idx, 5] = 2**20 + + assert_allclose(expected_cat_bitset, known_cat_bitsets) + + +@pytest.mark.parametrize( + "is_categorical, known_categories, match", + [ + (np.array([True]), [None], "Known categories for feature 0 must be provided"), + ( + np.array([False]), + np.array([1, 2, 3]), + "isn't marked as a categorical feature, but categories were passed", + ), + ], +) +def test_categorical_parameters(is_categorical, known_categories, match): + # test the validation of the is_categorical and known_categories parameters + + X = np.array([[1, 2, 3]], dtype=X_DTYPE) + + bin_mapper = _BinMapper( + is_categorical=is_categorical, known_categories=known_categories + ) + with pytest.raises(ValueError, match=match): + bin_mapper.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py new file mode 100644 index 0000000000000000000000000000000000000000..c02d66b666f80216088c691db39a55c055aa8d83 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_bitset.py @@ -0,0 +1,64 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn.ensemble._hist_gradient_boosting._bitset import ( + in_bitset_memoryview, + set_bitset_memoryview, + set_raw_bitset_from_binned_bitset, +) +from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE + + +@pytest.mark.parametrize( + "values_to_insert, expected_bitset", + [ + ([0, 4, 33], np.array([2**0 + 2**4, 2**1, 0], dtype=np.uint32)), + ( + [31, 32, 33, 79], + np.array([2**31, 2**0 + 2**1, 2**15], dtype=np.uint32), + ), + ], +) +def test_set_get_bitset(values_to_insert, expected_bitset): + n_32bits_ints = 3 + bitset = np.zeros(n_32bits_ints, dtype=np.uint32) + for value in values_to_insert: + set_bitset_memoryview(bitset, value) + assert_allclose(expected_bitset, bitset) + for value in range(32 * n_32bits_ints): + if value in values_to_insert: + assert in_bitset_memoryview(bitset, value) + else: + assert not in_bitset_memoryview(bitset, value) + + +@pytest.mark.parametrize( + "raw_categories, binned_cat_to_insert, expected_raw_bitset", + [ + ( + [3, 4, 5, 10, 31, 32, 43], + [0, 2, 4, 5, 6], + [2**3 + 2**5 + 2**31, 2**0 + 2**11], + ), + ([3, 33, 50, 52], [1, 3], [0, 2**1 + 2**20]), + ], +) +def test_raw_bitset_from_binned_bitset( + raw_categories, binned_cat_to_insert, expected_raw_bitset +): + binned_bitset = np.zeros(2, dtype=np.uint32) + raw_bitset = np.zeros(2, dtype=np.uint32) + raw_categories = np.asarray(raw_categories, dtype=X_DTYPE) + + for val in binned_cat_to_insert: + set_bitset_memoryview(binned_bitset, val) + + set_raw_bitset_from_binned_bitset(raw_bitset, binned_bitset, raw_categories) + + assert_allclose(expected_raw_bitset, raw_bitset) + for binned_cat_val, raw_cat_val in enumerate(raw_categories): + if binned_cat_val in binned_cat_to_insert: + assert in_bitset_memoryview(raw_bitset, raw_cat_val) + else: + assert not in_bitset_memoryview(raw_bitset, raw_cat_val) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py 
b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py new file mode 100644 index 0000000000000000000000000000000000000000..24b5b02aa0696c5cad5701ea23acba601b307f09 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_compare_lightgbm.py @@ -0,0 +1,291 @@ +import numpy as np +import pytest + +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + + +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize( + "loss", + [ + "squared_error", + "poisson", + pytest.param( + "gamma", + marks=pytest.mark.skip("LightGBM with gamma loss has larger deviation."), + ), + ], +) +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize( + "n_samples, max_leaf_nodes", + [ + (255, 4096), + (1000, 8), + ], +) +def test_same_predictions_regression( + seed, loss, min_samples_leaf, n_samples, max_leaf_nodes +): + # Make sure sklearn has the same predictions as lightgbm for easy targets. + # + # In particular when the size of the trees are bound and the number of + # samples is large enough, the structure of the prediction trees found by + # LightGBM and sklearn should be exactly identical. + # + # Notes: + # - Several candidate splits may have equal gains when the number of + # samples in a node is low (and because of float errors). Therefore the + # predictions on the test set might differ if the structure of the tree + # is not exactly the same. To avoid this issue we only compare the + # predictions on the test set when the number of samples is large enough + # and max_leaf_nodes is low enough. + # - To ignore discrepancies caused by small differences in the binning + # strategy, data is pre-binned if n_samples > 255. + # - We don't check the absolute_error loss here. This is because + # LightGBM's computation of the median (used for the initial value of + # raw_prediction) is a bit off (they'll e.g. return midpoints when there + # is no need to.). Since these tests only run 1 iteration, the + # discrepancy between the initial values leads to biggish differences in + # the predictions. These differences are much smaller with more + # iterations. 
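Editor's note: the pre-binning trick mentioned in the notes above amounts to quantising X once with _BinMapper and casting the integer codes back to float32, so that both libraries re-bin already-quantised data and recover identical bin edges. A short sketch of the idea, with illustrative sizes:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

    X, y = make_regression(n_samples=1000, n_features=5, random_state=0)
    max_bins = 255
    X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)
    # Each feature now takes at most max_bins distinct values, so any binning
    # strategy rebuilds the same bins and cannot cause prediction differences.
    assert all(len(np.unique(X[:, j])) <= max_bins for j in range(X.shape[1]))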
+ pytest.importorskip("lightgbm") + + rng = np.random.RandomState(seed=seed) + max_iter = 1 + max_bins = 255 + + X, y = make_regression( + n_samples=n_samples, n_features=5, n_informative=5, random_state=0 + ) + + if loss in ("gamma", "poisson"): + # make the target positive + y = np.abs(y) + np.mean(np.abs(y)) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_sklearn = HistGradientBoostingRegressor( + loss=loss, + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + early_stopping=False, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + ) + est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm") + est_lightgbm.set_params(min_sum_hessian_in_leaf=0) + + est_lightgbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. + X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_sklearn = est_sklearn.predict(X_train) + if loss in ("gamma", "poisson"): + # More than 65% of the predictions must be close up to the 2nd decimal. + # TODO: We are not entirely satisfied with this lax comparison, but the root + # cause is not clear, maybe algorithmic differences. One such example is the + # poisson_max_delta_step parameter of LightGBM which does not exist in HGBT. + assert ( + np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-2, atol=1e-2)) + > 0.65 + ) + else: + # Less than 1% of the predictions may deviate more than 1e-3 in relative terms. + assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-3)) > 1 - 0.01 + + if max_leaf_nodes < 10 and n_samples >= 1000 and loss in ("squared_error",): + pred_lightgbm = est_lightgbm.predict(X_test) + pred_sklearn = est_sklearn.predict(X_test) + # Less than 1% of the predictions may deviate more than 1e-4 in relative terms. 
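Editor's note: these comparisons tolerate a small fraction of diverging predictions rather than demanding exact agreement; the pattern is to assert on the fraction of element-wise close values. A generic sketch with made-up arrays:

    import numpy as np

    pred_a = np.array([1.0, 2.0, 3.0, 4.0])
    pred_b = np.array([1.0001, 2.0001, 3.5, 4.0001])   # one clear outlier

    frac_close = np.mean(np.isclose(pred_a, pred_b, rtol=1e-3))
    assert frac_close == 0.75   # 3 of the 4 predictions agree within rtol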
+ assert np.mean(np.isclose(pred_lightgbm, pred_sklearn, rtol=1e-4)) > 1 - 0.01 + + +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize( + "n_samples, max_leaf_nodes", + [ + (255, 4096), + (1000, 8), + ], +) +def test_same_predictions_classification( + seed, min_samples_leaf, n_samples, max_leaf_nodes +): + # Same as test_same_predictions_regression but for classification + pytest.importorskip("lightgbm") + + rng = np.random.RandomState(seed=seed) + max_iter = 1 + n_classes = 2 + max_bins = 255 + + X, y = make_classification( + n_samples=n_samples, + n_classes=n_classes, + n_features=5, + n_informative=5, + n_redundant=0, + random_state=0, + ) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_sklearn = HistGradientBoostingClassifier( + loss="log_loss", + max_iter=max_iter, + max_bins=max_bins, + learning_rate=1, + early_stopping=False, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + ) + est_lightgbm = get_equivalent_estimator( + est_sklearn, lib="lightgbm", n_classes=n_classes + ) + + est_lightgbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. + X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_sklearn = est_sklearn.predict(X_train) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + acc_lightgbm = accuracy_score(y_train, pred_lightgbm) + acc_sklearn = accuracy_score(y_train, pred_sklearn) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn) + + if max_leaf_nodes < 10 and n_samples >= 1000: + pred_lightgbm = est_lightgbm.predict(X_test) + pred_sklearn = est_sklearn.predict(X_test) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + acc_lightgbm = accuracy_score(y_test, pred_lightgbm) + acc_sklearn = accuracy_score(y_test, pred_sklearn) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) + + +# TODO(1.8) remove the filterwarnings decorator +@pytest.mark.filterwarnings( + "ignore:'force_all_finite' was renamed to 'ensure_all_finite':FutureWarning" +) +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize( + "n_samples, max_leaf_nodes", + [ + (255, 4096), + (10000, 8), + ], +) +def test_same_predictions_multiclass_classification( + seed, min_samples_leaf, n_samples, max_leaf_nodes +): + # Same as test_same_predictions_regression but for classification + pytest.importorskip("lightgbm") + + rng = np.random.RandomState(seed=seed) + n_classes = 3 + max_iter = 1 + max_bins = 255 + lr = 1 + + X, y = make_classification( + n_samples=n_samples, + n_classes=n_classes, + n_features=5, + n_informative=5, + n_redundant=0, + n_clusters_per_class=1, + random_state=0, + ) + + if n_samples > 255: + # bin data and convert it to float32 so that the estimator doesn't + # treat it as pre-binned + X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) + + est_sklearn = 
HistGradientBoostingClassifier( + loss="log_loss", + max_iter=max_iter, + max_bins=max_bins, + learning_rate=lr, + early_stopping=False, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + ) + est_lightgbm = get_equivalent_estimator( + est_sklearn, lib="lightgbm", n_classes=n_classes + ) + + est_lightgbm.fit(X_train, y_train) + est_sklearn.fit(X_train, y_train) + + # We need X to be treated an numerical data, not pre-binned data. + X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32) + + pred_lightgbm = est_lightgbm.predict(X_train) + pred_sklearn = est_sklearn.predict(X_train) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + proba_lightgbm = est_lightgbm.predict_proba(X_train) + proba_sklearn = est_sklearn.predict_proba(X_train) + # assert more than 75% of the predicted probabilities are the same up to + # the second decimal + assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75 + + acc_lightgbm = accuracy_score(y_train, pred_lightgbm) + acc_sklearn = accuracy_score(y_train, pred_sklearn) + + np.testing.assert_allclose(acc_lightgbm, acc_sklearn, rtol=0, atol=5e-2) + + if max_leaf_nodes < 10 and n_samples >= 1000: + pred_lightgbm = est_lightgbm.predict(X_test) + pred_sklearn = est_sklearn.predict(X_test) + assert np.mean(pred_sklearn == pred_lightgbm) > 0.89 + + proba_lightgbm = est_lightgbm.predict_proba(X_train) + proba_sklearn = est_sklearn.predict_proba(X_train) + # assert more than 75% of the predicted probabilities are the same up + # to the second decimal + assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75 + + acc_lightgbm = accuracy_score(y_test, pred_lightgbm) + acc_sklearn = accuracy_score(y_test, pred_sklearn) + np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..7dde25f3d22dfc8e3a037e1d284d6e924a03986c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -0,0 +1,1760 @@ +import copyreg +import io +import pickle +import re +import warnings +from unittest.mock import Mock + +import joblib +import numpy as np +import pytest +from joblib.numpy_pickle import NumpyPickler +from numpy.testing import assert_allclose, assert_array_equal + +import sklearn +from sklearn._loss.loss import ( + AbsoluteError, + HalfBinomialLoss, + HalfSquaredError, + PinballLoss, +) +from sklearn.base import BaseEstimator, TransformerMixin, clone, is_regressor +from sklearn.compose import make_column_transformer +from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor +from sklearn.exceptions import NotFittedError +from sklearn.metrics import get_scorer, mean_gamma_deviance, mean_poisson_deviance +from sklearn.model_selection import cross_val_score, train_test_split +from 
sklearn.pipeline import make_pipeline +from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder +from sklearn.utils import check_random_state, shuffle +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._testing import _convert_container +from sklearn.utils.fixes import _IS_32BIT + +n_threads = _openmp_effective_n_threads() + +X_classification, y_classification = make_classification(random_state=0) +X_regression, y_regression = make_regression(random_state=0) +X_multi_classification, y_multi_classification = make_classification( + n_classes=3, n_informative=3, random_state=0 +) + + +def _make_dumb_dataset(n_samples): + """Make a dumb dataset to test early stopping.""" + rng = np.random.RandomState(42) + X_dumb = rng.randn(n_samples, 1) + y_dumb = (X_dumb[:, 0] > 0).astype("int64") + return X_dumb, y_dumb + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + {"interaction_cst": [0, 1]}, + "Interaction constraints must be a sequence of tuples or lists", + ), + ( + {"interaction_cst": [{0, 9999}]}, + r"Interaction constraints must consist of integer indices in \[0," + r" n_features - 1\] = \[.*\], specifying the position of features,", + ), + ( + {"interaction_cst": [{-1, 0}]}, + r"Interaction constraints must consist of integer indices in \[0," + r" n_features - 1\] = \[.*\], specifying the position of features,", + ), + ( + {"interaction_cst": [{0.5}]}, + r"Interaction constraints must consist of integer indices in \[0," + r" n_features - 1\] = \[.*\], specifying the position of features,", + ), + ], +) +def test_init_parameters_validation(GradientBoosting, X, y, params, err_msg): + with pytest.raises(ValueError, match=err_msg): + GradientBoosting(**params).fit(X, y) + + +@pytest.mark.parametrize( + "scoring, validation_fraction, early_stopping, n_iter_no_change, tol", + [ + ("neg_mean_squared_error", 0.1, True, 5, 1e-7), # use scorer + ("neg_mean_squared_error", None, True, 5, 1e-1), # use scorer on train + (None, 0.1, True, 5, 1e-7), # same with default scorer + (None, None, True, 5, 1e-1), + ("loss", 0.1, True, 5, 1e-7), # use loss + ("loss", None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, 0.0), # no early stopping + ], +) +def test_early_stopping_regression( + scoring, validation_fraction, early_stopping, n_iter_no_change, tol +): + max_iter = 200 + + X, y = make_regression(n_samples=50, random_state=0) + + gb = HistGradientBoostingRegressor( + verbose=1, # just for coverage + min_samples_leaf=5, # easier to overfit fast + scoring=scoring, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0, + ) + gb.fit(X, y) + + if early_stopping: + assert n_iter_no_change <= gb.n_iter_ < max_iter + else: + assert gb.n_iter_ == max_iter + + +@pytest.mark.parametrize( + "data", + ( + make_classification(n_samples=30, random_state=0), + make_classification( + n_samples=30, n_classes=3, n_clusters_per_class=1, random_state=0 + ), + ), +) +@pytest.mark.parametrize( + "scoring, validation_fraction, early_stopping, n_iter_no_change, tol", + [ + ("accuracy", 0.1, True, 5, 1e-7), # use scorer + ("accuracy", None, True, 5, 1e-1), # use scorer on training data + (None, 0.1, True, 5, 1e-7), # same with default 
scorer + (None, None, True, 5, 1e-1), + ("loss", 0.1, True, 5, 1e-7), # use loss + ("loss", None, True, 5, 1e-1), # use loss on training data + (None, None, False, 5, 0.0), # no early stopping + ], +) +def test_early_stopping_classification( + data, scoring, validation_fraction, early_stopping, n_iter_no_change, tol +): + max_iter = 50 + + X, y = data + + gb = HistGradientBoostingClassifier( + verbose=2, # just for coverage + min_samples_leaf=5, # easier to overfit fast + scoring=scoring, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + max_iter=max_iter, + n_iter_no_change=n_iter_no_change, + random_state=0, + ) + gb.fit(X, y) + + if early_stopping is True: + assert n_iter_no_change <= gb.n_iter_ < max_iter + else: + assert gb.n_iter_ == max_iter + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, *_make_dumb_dataset(10000)), + (HistGradientBoostingClassifier, *_make_dumb_dataset(10001)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10000)), + (HistGradientBoostingRegressor, *_make_dumb_dataset(10001)), + ], +) +def test_early_stopping_default(GradientBoosting, X, y): + # Test that early stopping is enabled by default if and only if there + # are more than 10000 samples + gb = GradientBoosting(max_iter=10, n_iter_no_change=2, tol=1e-1) + gb.fit(X, y) + if X.shape[0] > 10000: + assert gb.n_iter_ < gb.max_iter + else: + assert gb.n_iter_ == gb.max_iter + + +@pytest.mark.parametrize( + "scores, n_iter_no_change, tol, stopping", + [ + ([], 1, 0.001, False), # not enough iterations + ([1, 1, 1], 5, 0.001, False), # not enough iterations + ([1, 1, 1, 1, 1], 5, 0.001, False), # not enough iterations + ([1, 2, 3, 4, 5, 6], 5, 0.001, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 0.0, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 0.999, False), # significant improvement + ([1, 2, 3, 4, 5, 6], 5, 5 - 1e-5, False), # significant improvement + ([1] * 6, 5, 0.0, True), # no significant improvement + ([1] * 6, 5, 0.001, True), # no significant improvement + ([1] * 6, 5, 5, True), # no significant improvement + ], +) +def test_should_stop(scores, n_iter_no_change, tol, stopping): + gbdt = HistGradientBoostingClassifier(n_iter_no_change=n_iter_no_change, tol=tol) + assert gbdt._should_stop(scores) == stopping + + +def test_absolute_error(): + # For coverage only. + X, y = make_regression(n_samples=500, random_state=0) + gbdt = HistGradientBoostingRegressor(loss="absolute_error", random_state=0) + gbdt.fit(X, y) + assert gbdt.score(X, y) > 0.9 + + +def test_absolute_error_sample_weight(): + # non regression test for issue #19400 + # make sure no error is thrown during fit of + # HistGradientBoostingRegressor with absolute_error loss function + # and passing sample_weight + rng = np.random.RandomState(0) + n_samples = 100 + X = rng.uniform(-1, 1, size=(n_samples, 2)) + y = rng.uniform(-1, 1, size=n_samples) + sample_weight = rng.uniform(0, 1, size=n_samples) + gbdt = HistGradientBoostingRegressor(loss="absolute_error") + gbdt.fit(X, y, sample_weight=sample_weight) + + +@pytest.mark.parametrize("y", [([1.0, -2.0, 0.0]), ([0.0, 1.0, 2.0])]) +def test_gamma_y_positive(y): + # Test that ValueError is raised if any y_i <= 0. + err_msg = r"loss='gamma' requires strictly positive y." 
+ gbdt = HistGradientBoostingRegressor(loss="gamma", random_state=0) + with pytest.raises(ValueError, match=err_msg): + gbdt.fit(np.zeros(shape=(len(y), 1)), y) + + +def test_gamma(): + # For a Gamma distributed target, we expect an HGBT trained with the Gamma deviance + # (loss) to give better results than an HGBT with any other loss function, measured + # in out-of-sample Gamma deviance as metric/score. + # Note that squared error could potentially predict negative values which is + # invalid (np.inf) for the Gamma deviance. A Poisson HGBT (having a log link) + # does not have that defect. + # Important note: It seems that a Poisson HGBT almost always has better + # out-of-sample performance than the Gamma HGBT, measured in Gamma deviance. + # LightGBM shows the same behaviour. Hence, we only compare to a squared error + # HGBT, but not to a Poisson deviance HGBT. + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 100, 20 + X = make_low_rank_matrix( + n_samples=n_train + n_test, + n_features=n_features, + random_state=rng, + ) + # We create a log-linear Gamma model. This gives y.min ~ 1e-2, y.max ~ 1e2 + coef = rng.uniform(low=-10, high=20, size=n_features) + # Numpy parametrizes gamma(shape=k, scale=theta) with mean = k * theta and + # variance = k * theta^2. We parametrize it instead with mean = exp(X @ coef) + # and variance = dispersion * mean^2 by setting k = 1 / dispersion, + # theta = dispersion * mean. + dispersion = 0.5 + y = rng.gamma(shape=1 / dispersion, scale=dispersion * np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=rng + ) + gbdt_gamma = HistGradientBoostingRegressor(loss="gamma", random_state=123) + gbdt_mse = HistGradientBoostingRegressor(loss="squared_error", random_state=123) + dummy = DummyRegressor(strategy="mean") + for model in (gbdt_gamma, gbdt_mse, dummy): + model.fit(X_train, y_train) + + for X, y in [(X_train, y_train), (X_test, y_test)]: + loss_gbdt_gamma = mean_gamma_deviance(y, gbdt_gamma.predict(X)) + # We restrict the squared error HGBT to predict at least the minimum seen y at + # train time to make it strictly positive. + loss_gbdt_mse = mean_gamma_deviance( + y, np.maximum(np.min(y_train), gbdt_mse.predict(X)) + ) + loss_dummy = mean_gamma_deviance(y, dummy.predict(X)) + assert loss_gbdt_gamma < loss_dummy + assert loss_gbdt_gamma < loss_gbdt_mse + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_quantile_asymmetric_error(quantile): + """Test quantile regression for asymmetric distributed targets.""" + n_samples = 10_000 + rng = np.random.RandomState(42) + # take care that X @ coef + intercept > 0 + X = np.concatenate( + ( + np.abs(rng.randn(n_samples)[:, None]), + -rng.randint(2, size=(n_samples, 1)), + ), + axis=1, + ) + intercept = 1.23 + coef = np.array([0.5, -2]) + # For an exponential distribution with rate lambda, e.g. 
exp(-lambda * x), + # the quantile at level q is: + # quantile(q) = - log(1 - q) / lambda + # scale = 1/lambda = -quantile(q) / log(1-q) + y = rng.exponential( + scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples + ) + model = HistGradientBoostingRegressor( + loss="quantile", + quantile=quantile, + max_iter=25, + random_state=0, + max_leaf_nodes=10, + ).fit(X, y) + assert_allclose(np.mean(model.predict(X) > y), quantile, rtol=1e-2) + + pinball_loss = PinballLoss(quantile=quantile) + loss_true_quantile = pinball_loss(y, X @ coef + intercept) + loss_pred_quantile = pinball_loss(y, model.predict(X)) + # we are overfitting + assert loss_pred_quantile <= loss_true_quantile + + +@pytest.mark.parametrize("y", [([1.0, -2.0, 0.0]), ([0.0, 0.0, 0.0])]) +def test_poisson_y_positive(y): + # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0. + err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0." + gbdt = HistGradientBoostingRegressor(loss="poisson", random_state=0) + with pytest.raises(ValueError, match=err_msg): + gbdt.fit(np.zeros(shape=(len(y), 1)), y) + + +def test_poisson(): + # For Poisson distributed target, Poisson loss should give better results + # than least squares measured in Poisson deviance as metric. + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 100, 100 + X = make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) + # We create a log-linear Poisson model and downscale coef as it will get + # exponentiated. + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=rng + ) + gbdt_pois = HistGradientBoostingRegressor(loss="poisson", random_state=rng) + gbdt_ls = HistGradientBoostingRegressor(loss="squared_error", random_state=rng) + gbdt_pois.fit(X_train, y_train) + gbdt_ls.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y in [(X_train, y_train), (X_test, y_test)]: + metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X)) + # squared_error might produce non-positive predictions => clip + metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15, None)) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + assert metric_pois < metric_ls + assert metric_pois < metric_dummy + + +def test_binning_train_validation_are_separated(): + # Make sure training and validation data are binned separately. + # See issue 13926 + + rng = np.random.RandomState(0) + validation_fraction = 0.2 + gb = HistGradientBoostingClassifier( + early_stopping=True, validation_fraction=validation_fraction, random_state=rng + ) + gb.fit(X_classification, y_classification) + mapper_training_data = gb._bin_mapper + + # Note that since the data is small there is no subsampling and the + # random_state doesn't matter + mapper_whole_data = _BinMapper(random_state=0) + mapper_whole_data.fit(X_classification) + + n_samples = X_classification.shape[0] + assert np.all( + mapper_training_data.n_bins_non_missing_ + == int((1 - validation_fraction) * n_samples) + ) + assert np.all( + mapper_training_data.n_bins_non_missing_ + != mapper_whole_data.n_bins_non_missing_ + ) + + +def test_missing_values_trivial(): + # sanity check for missing values support. With only one feature and + # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the + # training set. 
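Editor's note: the clipping applied to the squared-error model above is needed because the Poisson deviance is only defined for strictly positive predictions, while a squared-error model can predict negative values. A small illustration with made-up numbers:

    import numpy as np
    from sklearn.metrics import mean_poisson_deviance

    y_true = np.array([0.0, 1.0, 3.0])
    y_pred = np.array([-0.2, 1.1, 2.5])       # a squared-error model may go negative
    y_pred = np.clip(y_pred, 1e-15, None)     # make the predictions valid
    print(mean_poisson_deviance(y_true, y_pred))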
+ + n_samples = 100 + n_features = 1 + rng = np.random.RandomState(0) + + X = rng.normal(size=(n_samples, n_features)) + mask = rng.binomial(1, 0.5, size=X.shape).astype(bool) + X[mask] = np.nan + y = mask.ravel() + gb = HistGradientBoostingClassifier() + gb.fit(X, y) + + assert gb.score(X, y) == pytest.approx(1) + + +@pytest.mark.parametrize("problem", ("classification", "regression")) +@pytest.mark.parametrize( + ( + "missing_proportion, expected_min_score_classification, " + "expected_min_score_regression" + ), + [(0.1, 0.97, 0.89), (0.2, 0.93, 0.81), (0.5, 0.79, 0.52)], +) +def test_missing_values_resilience( + problem, + missing_proportion, + expected_min_score_classification, + expected_min_score_regression, +): + # Make sure the estimators can deal with missing values and still yield + # decent predictions + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + if problem == "regression": + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + random_state=rng, + ) + gb = HistGradientBoostingRegressor() + expected_min_score = expected_min_score_regression + else: + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_repeated=0, + random_state=rng, + ) + gb = HistGradientBoostingClassifier() + expected_min_score = expected_min_score_classification + + mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool) + X[mask] = np.nan + + gb.fit(X, y) + + assert gb.score(X, y) > expected_min_score + + +@pytest.mark.parametrize( + "data", + [ + make_classification(random_state=0, n_classes=2), + make_classification(random_state=0, n_classes=3, n_informative=3), + ], + ids=["binary_log_loss", "multiclass_log_loss"], +) +def test_zero_division_hessians(data): + # non regression test for issue #14018 + # make sure we avoid zero division errors when computing the leaves values. + + # If the learning rate is too high, the raw predictions are bad and will + # saturate the softmax (or sigmoid in binary classif). This leads to + # probabilities being exactly 0 or 1, gradients being constant, and + # hessians being zero. 
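Editor's note: the saturation effect described just above can be made concrete: for the binomial log loss the per-sample hessian is p * (1 - p) with p = sigmoid(raw_prediction), so once raw predictions are pushed far from zero the hessians underflow to zero. A small numeric illustration:

    import numpy as np

    def sigmoid(raw):
        return 1.0 / (1.0 + np.exp(-raw))

    raw_predictions = np.array([0.0, 5.0, 50.0])   # increasingly saturated
    p = sigmoid(raw_predictions)
    print(p * (1.0 - p))   # ~[2.5e-01, 6.6e-03, 1.9e-22]: vanishing hessians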
+ X, y = data + gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10) + gb.fit(X, y) + + +def test_small_trainset(): + # Make sure that the small trainset is stratified and has the expected + # length (10k samples) + n_samples = 20000 + original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4} + rng = np.random.RandomState(42) + X = rng.randn(n_samples).reshape(n_samples, 1) + y = [ + [class_] * int(prop * n_samples) for (class_, prop) in original_distrib.items() + ] + y = shuffle(np.concatenate(y)) + gb = HistGradientBoostingClassifier() + + # Compute the small training set + X_small, y_small, *_ = gb._get_small_trainset( + X, y, seed=42, sample_weight_train=None + ) + + # Compute the class distribution in the small training set + unique, counts = np.unique(y_small, return_counts=True) + small_distrib = {class_: count / 10000 for (class_, count) in zip(unique, counts)} + + # Test that the small training set has the expected length + assert X_small.shape[0] == 10000 + assert y_small.shape[0] == 10000 + + # Test that the class distributions in the whole dataset and in the small + # training set are identical + assert small_distrib == pytest.approx(original_distrib) + + +def test_missing_values_minmax_imputation(): + # Compare the buit-in missing value handling of Histogram GBC with an + # a-priori missing value imputation strategy that should yield the same + # results in terms of decision function. + # + # Each feature (containing NaNs) is replaced by 2 features: + # - one where the nans are replaced by min(feature) - 1 + # - one where the nans are replaced by max(feature) + 1 + # A split where nans go to the left has an equivalent split in the + # first (min) feature, and a split where nans go to the right has an + # equivalent split in the second (max) feature. + # + # Assuming the data is such that there is never a tie to select the best + # feature to split on during training, the learned decision trees should be + # strictly equivalent (learn a sequence of splits that encode the same + # decision function). 
+ # + # The MinMaxImputer transformer is meant to be a toy implementation of the + # "Missing In Attributes" (MIA) missing value handling for decision trees + # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305 + # The implementation of MIA as an imputation transformer was suggested by + # "Remark 3" in :arxiv:'<1902.06931>` + + class MinMaxImputer(TransformerMixin, BaseEstimator): + def fit(self, X, y=None): + mm = MinMaxScaler().fit(X) + self.data_min_ = mm.data_min_ + self.data_max_ = mm.data_max_ + return self + + def transform(self, X): + X_min, X_max = X.copy(), X.copy() + + for feature_idx in range(X.shape[1]): + nan_mask = np.isnan(X[:, feature_idx]) + X_min[nan_mask, feature_idx] = self.data_min_[feature_idx] - 1 + X_max[nan_mask, feature_idx] = self.data_max_[feature_idx] + 1 + + return np.concatenate([X_min, X_max], axis=1) + + def make_missing_value_data(n_samples=int(1e4), seed=0): + rng = np.random.RandomState(seed) + X, y = make_regression(n_samples=n_samples, n_features=4, random_state=rng) + + # Pre-bin the data to ensure a deterministic handling by the 2 + # strategies and also make it easier to insert np.nan in a structured + # way: + X = KBinsDiscretizer( + n_bins=42, encode="ordinal", quantile_method="averaged_inverted_cdf" + ).fit_transform(X) + + # First feature has missing values completely at random: + rnd_mask = rng.rand(X.shape[0]) > 0.9 + X[rnd_mask, 0] = np.nan + + # Second and third features have missing values for extreme values + # (censoring missingness): + low_mask = X[:, 1] == 0 + X[low_mask, 1] = np.nan + + high_mask = X[:, 2] == X[:, 2].max() + X[high_mask, 2] = np.nan + + # Make the last feature nan pattern very informative: + y_max = np.percentile(y, 70) + y_max_mask = y >= y_max + y[y_max_mask] = y_max + X[y_max_mask, 3] = np.nan + + # Check that there is at least one missing value in each feature: + for feature_idx in range(X.shape[1]): + assert any(np.isnan(X[:, feature_idx])) + + # Let's use a test set to check that the learned decision function is + # the same as evaluated on unseen data. Otherwise it could just be the + # case that we find two independent ways to overfit the training set. + return train_test_split(X, y, random_state=rng) + + # n_samples need to be large enough to minimize the likelihood of having + # several candidate splits with the same gain value in a given tree. + X_train, X_test, y_train, y_test = make_missing_value_data( + n_samples=int(1e4), seed=0 + ) + + # Use a small number of leaf nodes and iterations so as to keep + # under-fitting models to minimize the likelihood of ties when training the + # model. + gbm1 = HistGradientBoostingRegressor(max_iter=100, max_leaf_nodes=5, random_state=0) + gbm1.fit(X_train, y_train) + + gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1)) + gbm2.fit(X_train, y_train) + + # Check that the model reach the same score: + assert gbm1.score(X_train, y_train) == pytest.approx(gbm2.score(X_train, y_train)) + + assert gbm1.score(X_test, y_test) == pytest.approx(gbm2.score(X_test, y_test)) + + # Check the individual prediction match as a finer grained + # decision function check. 
+ assert_allclose(gbm1.predict(X_train), gbm2.predict(X_train)) + assert_allclose(gbm1.predict(X_test), gbm2.predict(X_test)) + + +def test_infinite_values(): + # Basic test for infinite values + + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + + gbdt = HistGradientBoostingRegressor(min_samples_leaf=1) + gbdt.fit(X, y) + np.testing.assert_allclose(gbdt.predict(X), y, atol=1e-4) + + +def test_consistent_lengths(): + X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1) + y = np.array([0, 0, 1, 1]) + sample_weight = np.array([0.1, 0.3, 0.1]) + gbdt = HistGradientBoostingRegressor() + with pytest.raises(ValueError, match=r"sample_weight.shape == \(3,\), expected"): + gbdt.fit(X, y, sample_weight) + + with pytest.raises( + ValueError, match="Found input variables with inconsistent number" + ): + gbdt.fit(X, y[1:]) + + +def test_infinite_values_missing_values(): + # High level test making sure that inf and nan values are properly handled + # when both are present. This is similar to + # test_split_on_nan_with_infinite_values() in test_grower.py, though we + # cannot check the predictions for binned values here. + + X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1) + y_isnan = np.isnan(X.ravel()) + y_isinf = X.ravel() == np.inf + + stump_clf = HistGradientBoostingClassifier( + min_samples_leaf=1, max_iter=1, learning_rate=1, max_depth=2 + ) + + assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1 + assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1 + + +@pytest.mark.parametrize("scoring", [None, "loss"]) +def test_string_target_early_stopping(scoring): + # Regression tests for #14709 where the targets need to be encoded before + # to compute the score + rng = np.random.RandomState(42) + X = rng.randn(100, 10) + y = np.array(["x"] * 50 + ["y"] * 50, dtype=object) + gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring) + gbrt.fit(X, y) + + +def test_zero_sample_weights_regression(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingRegressor(min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] > 0.5 + + +def test_zero_sample_weights_classification(): + # Make sure setting a SW to zero amounts to ignoring the corresponding + # sample + + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = HistGradientBoostingClassifier(loss="log_loss", min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + X = [[1, 0], [1, 0], [1, 0], [0, 1], [1, 1]] + y = [0, 0, 1, 0, 2] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1, 1] + gb = HistGradientBoostingClassifier(loss="log_loss", min_samples_leaf=1) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@pytest.mark.parametrize( + "problem", ("regression", "binary_classification", "multiclass_classification") +) +@pytest.mark.parametrize("duplication", ("half", "all")) +def test_sample_weight_effect(problem, duplication): + # High level test to make sure that duplicating a sample is equivalent to + # giving it weight of 2. 
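Editor's note: the equivalence exercised here, a sample with weight 2 behaving like the same sample duplicated, also holds on the public API as long as the dataset is small enough that binning is unaffected. A toy sketch, with data and parameters chosen purely for illustration:

    import numpy as np
    from sklearn.ensemble import HistGradientBoostingRegressor

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0.0, 0.0, 1.0, 1.0])

    est_weighted = HistGradientBoostingRegressor(min_samples_leaf=1, random_state=0)
    est_weighted.fit(X, y, sample_weight=[2, 1, 1, 1])

    est_duplicated = HistGradientBoostingRegressor(min_samples_leaf=1, random_state=0)
    est_duplicated.fit(np.r_[X, X[:1]], np.r_[y, y[:1]])

    assert np.allclose(est_weighted.predict(X), est_duplicated.predict(X))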
+ + # fails for n_samples > 255 because binning does not take sample weights + # into account. Keeping n_samples <= 255 makes + # sure only unique values are used so SW have no effect on binning. + n_samples = 255 + n_features = 2 + if problem == "regression": + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + random_state=0, + ) + Klass = HistGradientBoostingRegressor + else: + n_classes = 2 if problem == "binary_classification" else 3 + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_clusters_per_class=1, + n_classes=n_classes, + random_state=0, + ) + Klass = HistGradientBoostingClassifier + + # This test can't pass if min_samples_leaf > 1 because that would force 2 + # samples to be in the same node in est_sw, while these samples would be + # free to be separate in est_dup: est_dup would just group together the + # duplicated samples. + est = Klass(min_samples_leaf=1) + + # Create dataset with duplicate and corresponding sample weights + if duplication == "half": + lim = n_samples // 2 + else: + lim = n_samples + X_dup = np.r_[X, X[:lim]] + y_dup = np.r_[y, y[:lim]] + sample_weight = np.ones(shape=(n_samples)) + sample_weight[:lim] = 2 + + est_sw = clone(est).fit(X, y, sample_weight=sample_weight) + est_dup = clone(est).fit(X_dup, y_dup) + + # checking raw_predict is stricter than just predict for classification + assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) + + +@pytest.mark.parametrize("Loss", (HalfSquaredError, AbsoluteError)) +def test_sum_hessians_are_sample_weight(Loss): + # For losses with constant hessians, the sum_hessians field of the + # histograms must be equal to the sum of the sample weight of samples at + # the corresponding bin. + + rng = np.random.RandomState(0) + n_samples = 1000 + n_features = 2 + X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=rng) + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + # While sample weights are supposed to be positive, this still works. + sample_weight = rng.normal(size=n_samples) + + loss = Loss(sample_weight=sample_weight) + gradients, hessians = loss.init_gradient_and_hessian( + n_samples=n_samples, dtype=G_H_DTYPE + ) + gradients, hessians = gradients.reshape((-1, 1)), hessians.reshape((-1, 1)) + raw_predictions = rng.normal(size=(n_samples, 1)) + loss.gradient_hessian( + y_true=y, + raw_prediction=raw_predictions, + sample_weight=sample_weight, + gradient_out=gradients, + hessian_out=hessians, + n_threads=n_threads, + ) + + # build sum_sample_weight which contains the sum of the sample weights at + # each bin (for each feature). 
This must be equal to the sum_hessians + # field of the corresponding histogram + sum_sw = np.zeros(shape=(n_features, bin_mapper.n_bins)) + for feature_idx in range(n_features): + for sample_idx in range(n_samples): + sum_sw[feature_idx, X_binned[sample_idx, feature_idx]] += sample_weight[ + sample_idx + ] + + # Build histogram + grower = TreeGrower( + X_binned, gradients[:, 0], hessians[:, 0], n_bins=bin_mapper.n_bins + ) + histograms = grower.histogram_builder.compute_histograms_brute( + grower.root.sample_indices + ) + + for feature_idx in range(n_features): + for bin_idx in range(bin_mapper.n_bins): + assert histograms[feature_idx, bin_idx]["sum_hessians"] == ( + pytest.approx(sum_sw[feature_idx, bin_idx], rel=1e-5) + ) + + +def test_max_depth_max_leaf_nodes(): + # Non regression test for + # https://github.com/scikit-learn/scikit-learn/issues/16179 + # there was a bug when the max_depth and the max_leaf_nodes criteria were + # met at the same time, which would lead to max_leaf_nodes not being + # respected. + X, y = make_classification(random_state=0) + est = HistGradientBoostingClassifier(max_depth=2, max_leaf_nodes=3, max_iter=1).fit( + X, y + ) + tree = est._predictors[0][0] + assert tree.get_max_depth() == 2 + assert tree.get_n_leaf_nodes() == 3 # would be 4 prior to bug fix + + +def test_early_stopping_on_test_set_with_warm_start(): + # Non regression test for #16661 where second fit fails with + # warm_start=True, early_stopping is on, and no validation set + X, y = make_classification(random_state=0) + gb = HistGradientBoostingClassifier( + max_iter=1, + scoring="loss", + warm_start=True, + early_stopping=True, + n_iter_no_change=1, + validation_fraction=None, + ) + + gb.fit(X, y) + # does not raise on second call + gb.set_params(max_iter=2) + gb.fit(X, y) + + +def test_early_stopping_with_sample_weights(monkeypatch): + """Check that sample weights is passed in to the scorer and _raw_predict is not + called.""" + + mock_scorer = Mock(side_effect=get_scorer("neg_median_absolute_error")) + + def mock_check_scoring(estimator, scoring): + assert scoring == "neg_median_absolute_error" + return mock_scorer + + monkeypatch.setattr( + sklearn.ensemble._hist_gradient_boosting.gradient_boosting, + "check_scoring", + mock_check_scoring, + ) + + X, y = make_regression(random_state=0) + sample_weight = np.ones_like(y) + hist = HistGradientBoostingRegressor( + max_iter=2, + early_stopping=True, + random_state=0, + scoring="neg_median_absolute_error", + ) + mock_raw_predict = Mock(side_effect=hist._raw_predict) + hist._raw_predict = mock_raw_predict + hist.fit(X, y, sample_weight=sample_weight) + + # _raw_predict should never be called with scoring as a string + assert mock_raw_predict.call_count == 0 + + # For scorer is called twice (train and val) for the baseline score, and twice + # per iteration (train and val) after that. So 6 times in total for `max_iter=2`. 
+ assert mock_scorer.call_count == 6 + for arg_list in mock_scorer.call_args_list: + assert "sample_weight" in arg_list[1] + + +def test_raw_predict_is_called_with_custom_scorer(): + """Custom scorer will still call _raw_predict.""" + + mock_scorer = Mock(side_effect=get_scorer("neg_median_absolute_error")) + + X, y = make_regression(random_state=0) + hist = HistGradientBoostingRegressor( + max_iter=2, + early_stopping=True, + random_state=0, + scoring=mock_scorer, + ) + mock_raw_predict = Mock(side_effect=hist._raw_predict) + hist._raw_predict = mock_raw_predict + hist.fit(X, y) + + # `_raw_predict` and scorer is called twice (train and val) for the baseline score, + # and twice per iteration (train and val) after that. So 6 times in total for + # `max_iter=2`. + assert mock_raw_predict.call_count == 6 + assert mock_scorer.call_count == 6 + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +def test_single_node_trees(Est): + # Make sure it's still possible to build single-node trees. In that case + # the value of the root is set to 0. That's a correct value: if the tree is + # single-node that's because min_gain_to_split is not respected right from + # the root, so we don't want the tree to have any impact on the + # predictions. + + X, y = make_classification(random_state=0) + y[:] = 1 # constant target will lead to a single root node + + est = Est(max_iter=20) + est.fit(X, y) + + assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors) + assert all(predictor[0].nodes[0]["value"] == 0 for predictor in est._predictors) + # Still gives correct predictions thanks to the baseline prediction + assert_allclose(est.predict(X), y) + + +@pytest.mark.parametrize( + "Est, loss, X, y", + [ + ( + HistGradientBoostingClassifier, + HalfBinomialLoss(sample_weight=None), + X_classification, + y_classification, + ), + ( + HistGradientBoostingRegressor, + HalfSquaredError(sample_weight=None), + X_regression, + y_regression, + ), + ], +) +def test_custom_loss(Est, loss, X, y): + est = Est(loss=loss, max_iter=20) + est.fit(X, y) + + +@pytest.mark.parametrize( + "HistGradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ( + HistGradientBoostingClassifier, + X_multi_classification, + y_multi_classification, + ), + ], +) +def test_staged_predict(HistGradientBoosting, X, y): + # Test whether staged predictor eventually gives + # the same prediction. + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, random_state=0 + ) + gb = HistGradientBoosting(max_iter=10) + + # test raise NotFittedError if not fitted + with pytest.raises(NotFittedError): + next(gb.staged_predict(X_test)) + + gb.fit(X_train, y_train) + + # test if the staged predictions of each iteration + # are equal to the corresponding predictions of the same estimator + # trained from scratch. 
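Editor's note: the property checked in the loop that follows is that staged_predict yields one prediction array per completed boosting iteration, the last of which matches the final predict. A minimal usage sketch on the public API:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor

    X, y = make_regression(n_samples=100, random_state=0)
    gb = HistGradientBoostingRegressor(max_iter=5, random_state=0).fit(X, y)

    staged = list(gb.staged_predict(X))
    assert len(staged) == gb.n_iter_                 # one array per iteration
    assert np.allclose(staged[-1], gb.predict(X))    # last stage == final model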
+ # this also test limit case when max_iter = 1 + method_names = ( + ["predict"] + if is_regressor(gb) + else ["predict", "predict_proba", "decision_function"] + ) + for method_name in method_names: + staged_method = getattr(gb, "staged_" + method_name) + staged_predictions = list(staged_method(X_test)) + assert len(staged_predictions) == gb.n_iter_ + for n_iter, staged_predictions in enumerate(staged_method(X_test), 1): + aux = HistGradientBoosting(max_iter=n_iter) + aux.fit(X_train, y_train) + pred_aux = getattr(aux, method_name)(X_test) + + assert_allclose(staged_predictions, pred_aux) + assert staged_predictions.shape == pred_aux.shape + + +@pytest.mark.parametrize("insert_missing", [False, True]) +@pytest.mark.parametrize( + "Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier) +) +@pytest.mark.parametrize("bool_categorical_parameter", [True, False]) +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_unknown_categories_nan( + insert_missing, Est, bool_categorical_parameter, missing_value +): + # Make sure no error is raised at predict if a category wasn't seen during + # fit. We also make sure they're treated as nans. + + rng = np.random.RandomState(0) + n_samples = 1000 + f1 = rng.rand(n_samples) + f2 = rng.randint(4, size=n_samples) + X = np.c_[f1, f2] + y = np.zeros(shape=n_samples) + y[X[:, 1] % 2 == 0] = 1 + + if bool_categorical_parameter: + categorical_features = [False, True] + else: + categorical_features = [1] + + if insert_missing: + mask = rng.binomial(1, 0.01, size=X.shape).astype(bool) + assert mask.sum() > 0 + X[mask] = missing_value + + est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y) + assert_array_equal(est.is_categorical_, [False, True]) + + # Make sure no error is raised on unknown categories and nans + # unknown categories will be treated as nans + X_test = np.zeros((10, X.shape[1]), dtype=float) + X_test[:5, 1] = 30 + X_test[5:, 1] = missing_value + assert len(np.unique(est.predict(X_test))) == 1 + + +def test_categorical_encoding_strategies(): + # Check native categorical handling vs different encoding strategies. We + # make sure that native encoding needs only 1 split to achieve a perfect + # prediction on a simple dataset. In contrast, OneHotEncoded data needs + # more depth / splits, and treating categories as ordered (just using + # OrdinalEncoder) requires even more depth. + + # dataset with one random continuous feature, and one categorical feature + # with values in [0, 5], e.g. from an OrdinalEncoder. 
+ # class == 1 iff categorical value in {0, 2, 4} + rng = np.random.RandomState(0) + n_samples = 10_000 + f1 = rng.rand(n_samples) + f2 = rng.randint(6, size=n_samples) + X = np.c_[f1, f2] + y = np.zeros(shape=n_samples) + y[X[:, 1] % 2 == 0] = 1 + + # make sure dataset is balanced so that the baseline_prediction doesn't + # influence predictions too much with max_iter = 1 + assert 0.49 < y.mean() < 0.51 + + native_cat_specs = [ + [False, True], + [1], + ] + try: + import pandas as pd + + X = pd.DataFrame(X, columns=["f_0", "f_1"]) + native_cat_specs.append(["f_1"]) + except ImportError: + pass + + for native_cat_spec in native_cat_specs: + clf_cat = HistGradientBoostingClassifier( + max_iter=1, max_depth=1, categorical_features=native_cat_spec + ) + clf_cat.fit(X, y) + + # Using native categorical encoding, we get perfect predictions with just + # one split + assert cross_val_score(clf_cat, X, y).mean() == 1 + + # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21 + expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0] + left_bitset = clf_cat.fit(X, y)._predictors[0][0].raw_left_cat_bitsets[0] + assert_array_equal(left_bitset, expected_left_bitset) + + # Treating categories as ordered, we need more depth / more splits to get + # the same predictions + clf_no_cat = HistGradientBoostingClassifier( + max_iter=1, max_depth=4, categorical_features=None + ) + assert cross_val_score(clf_no_cat, X, y).mean() < 0.9 + + clf_no_cat.set_params(max_depth=5) + assert cross_val_score(clf_no_cat, X, y).mean() == 1 + + # Using OHEd data, we need less splits than with pure OEd data, but we + # still need more splits than with the native categorical splits + ct = make_column_transformer( + (OneHotEncoder(sparse_output=False), [1]), remainder="passthrough" + ) + X_ohe = ct.fit_transform(X) + clf_no_cat.set_params(max_depth=2) + assert cross_val_score(clf_no_cat, X_ohe, y).mean() < 0.9 + + clf_no_cat.set_params(max_depth=3) + assert cross_val_score(clf_no_cat, X_ohe, y).mean() == 1 + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +@pytest.mark.parametrize( + "categorical_features, monotonic_cst, expected_msg", + [ + ( + [b"hello", b"world"], + None, + re.escape( + "categorical_features must be an array-like of bool, int or str, " + "got: bytes40." + ), + ), + ( + np.array([b"hello", 1.3], dtype=object), + None, + re.escape( + "categorical_features must be an array-like of bool, int or str, " + "got: bytes, float." 
+ ), + ), + ( + [0, -1], + None, + re.escape( + "categorical_features set as integer indices must be in " + "[0, n_features - 1]" + ), + ), + ( + [True, True, False, False, True], + None, + re.escape( + "categorical_features set as a boolean mask must have shape " + "(n_features,)" + ), + ), + ( + [True, True, False, False], + [0, -1, 0, 1], + "Categorical features cannot have monotonic constraints", + ), + ], +) +def test_categorical_spec_errors( + Est, categorical_features, monotonic_cst, expected_msg +): + # Test errors when categories are specified incorrectly + n_samples = 100 + X, y = make_classification(random_state=0, n_features=4, n_samples=n_samples) + rng = np.random.RandomState(0) + X[:, 0] = rng.randint(0, 10, size=n_samples) + X[:, 1] = rng.randint(0, 10, size=n_samples) + est = Est(categorical_features=categorical_features, monotonic_cst=monotonic_cst) + + with pytest.raises(ValueError, match=expected_msg): + est.fit(X, y) + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +def test_categorical_spec_errors_with_feature_names(Est): + pd = pytest.importorskip("pandas") + n_samples = 10 + X = pd.DataFrame( + { + "f0": range(n_samples), + "f1": range(n_samples), + "f2": [1.0] * n_samples, + } + ) + y = [0, 1] * (n_samples // 2) + + est = Est(categorical_features=["f0", "f1", "f3"]) + expected_msg = re.escape( + "categorical_features has a item value 'f3' which is not a valid " + "feature name of the training data." + ) + with pytest.raises(ValueError, match=expected_msg): + est.fit(X, y) + + est = Est(categorical_features=["f0", "f1"]) + expected_msg = re.escape( + "categorical_features should be passed as an array of integers or " + "as a boolean mask when the model is fitted on data without feature " + "names." + ) + with pytest.raises(ValueError, match=expected_msg): + est.fit(X.to_numpy(), y) + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +@pytest.mark.parametrize("categorical_features", ([False, False], [])) +@pytest.mark.parametrize("as_array", (True, False)) +def test_categorical_spec_no_categories(Est, categorical_features, as_array): + # Make sure we can properly detect that no categorical features are present + # even if the categorical_features parameter is not None + X = np.arange(10).reshape(5, 2) + y = np.arange(5) + if as_array: + categorical_features = np.asarray(categorical_features) + est = Est(categorical_features=categorical_features).fit(X, y) + assert est.is_categorical_ is None + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +@pytest.mark.parametrize( + "use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")] +) +def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name): + # Test errors when categories are encoded incorrectly + + gb = Est(categorical_features=[True], max_bins=2) + + if use_pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"f0": [0, 1, 2]}) + else: + X = np.array([[0, 1, 2]]).T + y = np.arange(3) + msg = ( + f"Categorical feature {feature_name} is expected to have a " + "cardinality <= 2 but actually has a cardinality of 3." 
+ ) + with pytest.raises(ValueError, match=msg): + gb.fit(X, y) + + # nans are ignored in the counts + X = np.array([[0, 1, np.nan]]).T + y = np.arange(3) + gb.fit(X, y) + + +@pytest.mark.parametrize( + "Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor) +) +def test_uint8_predict(Est): + # Non regression test for + # https://github.com/scikit-learn/scikit-learn/issues/18408 + # Make sure X can be of dtype uint8 (i.e. X_BINNED_DTYPE) in predict. It + # will be converted to X_DTYPE. + + rng = np.random.RandomState(0) + + X = rng.randint(0, 100, size=(10, 2)).astype(np.uint8) + y = rng.randint(0, 2, size=10).astype(np.uint8) + est = Est() + est.fit(X, y) + est.predict(X) + + +@pytest.mark.parametrize( + "interaction_cst, n_features, result", + [ + (None, 931, None), + ([{0, 1}], 2, [{0, 1}]), + ("pairwise", 2, [{0, 1}]), + ("pairwise", 4, [{0, 1}, {0, 2}, {0, 3}, {1, 2}, {1, 3}, {2, 3}]), + ("no_interactions", 2, [{0}, {1}]), + ("no_interactions", 4, [{0}, {1}, {2}, {3}]), + ([(1, 0), [5, 1]], 6, [{0, 1}, {1, 5}, {2, 3, 4}]), + ], +) +def test_check_interaction_cst(interaction_cst, n_features, result): + """Check that _check_interaction_cst returns the expected list of sets""" + est = HistGradientBoostingRegressor() + est.set_params(interaction_cst=interaction_cst) + assert est._check_interaction_cst(n_features) == result + + +def test_interaction_cst_numerically(): + """Check that interaction constraints have no forbidden interactions.""" + rng = np.random.RandomState(42) + n_samples = 1000 + X = rng.uniform(size=(n_samples, 2)) + # Construct y with a strong interaction term + # y = x0 + x1 + 5 * x0 * x1 + y = np.hstack((X, 5 * X[:, [0]] * X[:, [1]])).sum(axis=1) + + est = HistGradientBoostingRegressor(random_state=42) + est.fit(X, y) + est_no_interactions = HistGradientBoostingRegressor( + interaction_cst=[{0}, {1}], random_state=42 + ) + est_no_interactions.fit(X, y) + + delta = 0.25 + # Make sure we do not extrapolate out of the training set as tree-based estimators + # are very bad in doing so. + X_test = X[(X[:, 0] < 1 - delta) & (X[:, 1] < 1 - delta)] + X_delta_d_0 = X_test + [delta, 0] + X_delta_0_d = X_test + [0, delta] + X_delta_d_d = X_test + [delta, delta] + + # Note: For the y from above as a function of x0 and x1, we have + # y(x0+d, x1+d) = y(x0, x1) + 5 * d * (2/5 + x0 + x1) + 5 * d**2 + # y(x0+d, x1) = y(x0, x1) + 5 * d * (1/5 + x1) + # y(x0, x1+d) = y(x0, x1) + 5 * d * (1/5 + x0) + # Without interaction constraints, we would expect a result of 5 * d**2 for the + # following expression, but zero with constraints in place. + assert_allclose( + est_no_interactions.predict(X_delta_d_d) + + est_no_interactions.predict(X_test) + - est_no_interactions.predict(X_delta_d_0) + - est_no_interactions.predict(X_delta_0_d), + 0, + atol=1e-12, + ) + + # Correct result of the expressions is 5 * delta**2. But this is hard to achieve by + # a fitted tree-based model. However, with 100 iterations the expression should + # at least be positive! + assert np.all( + est.predict(X_delta_d_d) + + est.predict(X_test) + - est.predict(X_delta_d_0) + - est.predict(X_delta_0_d) + > 0.01 + ) + + +def test_no_user_warning_with_scoring(): + """Check that no UserWarning is raised when scoring is set. + + Non-regression test for #22907. 
+ """ + pd = pytest.importorskip("pandas") + X, y = make_regression(n_samples=50, random_state=0) + X_df = pd.DataFrame(X, columns=[f"col{i}" for i in range(X.shape[1])]) + + est = HistGradientBoostingRegressor( + random_state=0, scoring="neg_mean_absolute_error", early_stopping=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + est.fit(X_df, y) + + +def test_class_weights(): + """High level test to check class_weights.""" + n_samples = 255 + n_features = 2 + + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + n_redundant=0, + n_clusters_per_class=1, + n_classes=2, + random_state=0, + ) + y_is_1 = y == 1 + + # class_weight is the same as sample weights with the corresponding class + clf = HistGradientBoostingClassifier( + min_samples_leaf=2, random_state=0, max_depth=2 + ) + sample_weight = np.ones(shape=(n_samples)) + sample_weight[y_is_1] = 3.0 + clf.fit(X, y, sample_weight=sample_weight) + + class_weight = {0: 1.0, 1: 3.0} + clf_class_weighted = clone(clf).set_params(class_weight=class_weight) + clf_class_weighted.fit(X, y) + + assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X)) + + # Check that sample_weight and class_weight are multiplicative + clf.fit(X, y, sample_weight=sample_weight**2) + clf_class_weighted.fit(X, y, sample_weight=sample_weight) + assert_allclose(clf.decision_function(X), clf_class_weighted.decision_function(X)) + + # Make imbalanced dataset + X_imb = np.concatenate((X[~y_is_1], X[y_is_1][:10])) + y_imb = np.concatenate((y[~y_is_1], y[y_is_1][:10])) + + # class_weight="balanced" is the same as sample_weights to be + # inversely proportional to n_samples / (n_classes * np.bincount(y)) + clf_balanced = clone(clf).set_params(class_weight="balanced") + clf_balanced.fit(X_imb, y_imb) + + class_weight = y_imb.shape[0] / (2 * np.bincount(y_imb)) + sample_weight = class_weight[y_imb] + clf_sample_weight = clone(clf).set_params(class_weight=None) + clf_sample_weight.fit(X_imb, y_imb, sample_weight=sample_weight) + + assert_allclose( + clf_balanced.decision_function(X_imb), + clf_sample_weight.decision_function(X_imb), + ) + + +def test_unknown_category_that_are_negative(): + """Check that unknown categories that are negative does not error. + + Non-regression test for #24274. 
+ """ + rng = np.random.RandomState(42) + n_samples = 1000 + X = np.c_[rng.rand(n_samples), rng.randint(4, size=n_samples)] + y = np.zeros(shape=n_samples) + y[X[:, 1] % 2 == 0] = 1 + + hist = HistGradientBoostingRegressor( + random_state=0, + categorical_features=[False, True], + max_iter=10, + ).fit(X, y) + + # Check that negative values from the second column are treated like a + # missing category + X_test_neg = np.asarray([[1, -2], [3, -4]]) + X_test_nan = np.asarray([[1, np.nan], [3, np.nan]]) + + assert_allclose(hist.predict(X_test_neg), hist.predict(X_test_nan)) + + +@pytest.mark.parametrize( + ("GradientBoosting", "make_X_y"), + [ + (HistGradientBoostingClassifier, make_classification), + (HistGradientBoostingRegressor, make_regression), + ], +) +@pytest.mark.parametrize("sample_weight", [False, True]) +def test_X_val_in_fit(GradientBoosting, make_X_y, sample_weight, global_random_seed): + """Test that passing X_val, y_val in fit is same as validation fraction.""" + rng = np.random.RandomState(42) + n_samples = 100 + X, y = make_X_y(n_samples=n_samples, random_state=rng) + if sample_weight: + sample_weight = np.abs(rng.normal(size=n_samples)) + data = (X, y, sample_weight) + else: + sample_weight = None + data = (X, y) + rng_seed = global_random_seed + + # Fit with validation fraction and early stopping. + m1 = GradientBoosting( + early_stopping=True, + validation_fraction=0.5, + random_state=rng_seed, + ) + m1.fit(X, y, sample_weight) + + # Do train-test split ourselves. + rng = check_random_state(rng_seed) + # We do the same as in the fit method. + stratify = y if isinstance(m1, HistGradientBoostingClassifier) else None + random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8") + X_train, X_val, y_train, y_val, *sw = train_test_split( + *data, + test_size=0.5, + stratify=stratify, + random_state=random_seed, + ) + if sample_weight is not None: + sample_weight_train = sw[0] + sample_weight_val = sw[1] + else: + sample_weight_train = None + sample_weight_val = None + m2 = GradientBoosting( + early_stopping=True, + random_state=rng_seed, + ) + m2.fit( + X_train, + y_train, + sample_weight=sample_weight_train, + X_val=X_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + ) + + assert_allclose(m2.n_iter_, m1.n_iter_) + assert_allclose(m2.predict(X), m1.predict(X)) + + +def test_X_val_raises_missing_y_val(): + """Test that an error is raised if X_val given but y_val None.""" + X, y = make_classification(n_samples=4) + X, X_val = X[:2], X[2:] + y, y_val = y[:2], y[2:] + with pytest.raises( + ValueError, + match="X_val is provided, but y_val was not provided", + ): + HistGradientBoostingClassifier().fit(X, y, X_val=X_val) + with pytest.raises( + ValueError, + match="y_val is provided, but X_val was not provided", + ): + HistGradientBoostingClassifier().fit(X, y, y_val=y_val) + + +def test_X_val_raises_with_early_stopping_false(): + """Test that an error is raised if X_val given but early_stopping is False.""" + X, y = make_regression(n_samples=4) + X, X_val = X[:2], X[2:] + y, y_val = y[:2], y[2:] + with pytest.raises( + ValueError, + match="X_val and y_val are passed to fit while at the same time", + ): + HistGradientBoostingRegressor(early_stopping=False).fit( + X, y, X_val=X_val, y_val=y_val + ) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize( + "HistGradientBoosting", + [HistGradientBoostingClassifier, HistGradientBoostingRegressor], +) +def test_dataframe_categorical_results_same_as_ndarray( + dataframe_lib, 
HistGradientBoosting +): + """Check that pandas categorical give the same results as ndarray.""" + pytest.importorskip(dataframe_lib) + + rng = np.random.RandomState(42) + n_samples = 5_000 + n_cardinality = 50 + max_bins = 100 + f_num = rng.rand(n_samples) + f_cat = rng.randint(n_cardinality, size=n_samples) + + # Make f_cat an informative feature + y = (f_cat % 3 == 0) & (f_num > 0.2) + + X = np.c_[f_num, f_cat] + f_cat = [f"cat{c:0>3}" for c in f_cat] + X_df = _convert_container( + np.asarray([f_num, f_cat]).T, + dataframe_lib, + ["f_num", "f_cat"], + categorical_feature_names=["f_cat"], + ) + + X_train, X_test, X_train_df, X_test_df, y_train, y_test = train_test_split( + X, X_df, y, random_state=0 + ) + + hist_kwargs = dict(max_iter=10, max_bins=max_bins, random_state=0) + hist_np = HistGradientBoosting(categorical_features=[False, True], **hist_kwargs) + hist_np.fit(X_train, y_train) + + hist_pd = HistGradientBoosting(categorical_features="from_dtype", **hist_kwargs) + hist_pd.fit(X_train_df, y_train) + + # Check categories are correct and sorted + categories = hist_pd._preprocessor.named_transformers_["encoder"].categories_[0] + assert_array_equal(categories, np.unique(f_cat)) + + assert len(hist_np._predictors) == len(hist_pd._predictors) + for predictor_1, predictor_2 in zip(hist_np._predictors, hist_pd._predictors): + assert len(predictor_1[0].nodes) == len(predictor_2[0].nodes) + + score_np = hist_np.score(X_test, y_test) + score_pd = hist_pd.score(X_test_df, y_test) + assert score_np == pytest.approx(score_pd) + assert_allclose(hist_np.predict(X_test), hist_pd.predict(X_test_df)) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize( + "HistGradientBoosting", + [HistGradientBoostingClassifier, HistGradientBoostingRegressor], +) +def test_dataframe_categorical_errors(dataframe_lib, HistGradientBoosting): + """Check error cases for pandas categorical feature.""" + pytest.importorskip(dataframe_lib) + msg = "Categorical feature 'f_cat' is expected to have a cardinality <= 16" + hist = HistGradientBoosting(categorical_features="from_dtype", max_bins=16) + + rng = np.random.RandomState(42) + f_cat = rng.randint(0, high=100, size=100).astype(str) + X_df = _convert_container( + f_cat[:, None], dataframe_lib, ["f_cat"], categorical_feature_names=["f_cat"] + ) + y = rng.randint(0, high=2, size=100) + + with pytest.raises(ValueError, match=msg): + hist.fit(X_df, y) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +def test_categorical_different_order_same_model(dataframe_lib): + """Check that the order of the categorical gives same model.""" + pytest.importorskip(dataframe_lib) + rng = np.random.RandomState(42) + n_samples = 1_000 + f_ints = rng.randint(low=0, high=2, size=n_samples) + + # Construct a target with some noise + y = f_ints.copy() + flipped = rng.choice([True, False], size=n_samples, p=[0.1, 0.9]) + y[flipped] = 1 - y[flipped] + + # Construct categorical where 0 -> A and 1 -> B and 1 -> A and 0 -> B + f_cat_a_b = np.asarray(["A", "B"])[f_ints] + f_cat_b_a = np.asarray(["B", "A"])[f_ints] + df_a_b = _convert_container( + f_cat_a_b[:, None], + dataframe_lib, + ["f_cat"], + categorical_feature_names=["f_cat"], + ) + df_b_a = _convert_container( + f_cat_b_a[:, None], + dataframe_lib, + ["f_cat"], + categorical_feature_names=["f_cat"], + ) + + hist_a_b = HistGradientBoostingClassifier( + categorical_features="from_dtype", random_state=0 + ) + hist_b_a = HistGradientBoostingClassifier( + categorical_features="from_dtype", 
random_state=0 + ) + + hist_a_b.fit(df_a_b, y) + hist_b_a.fit(df_b_a, y) + + assert len(hist_a_b._predictors) == len(hist_b_a._predictors) + for predictor_1, predictor_2 in zip(hist_a_b._predictors, hist_b_a._predictors): + assert len(predictor_1[0].nodes) == len(predictor_2[0].nodes) + + +def get_different_bitness_node_ndarray(node_ndarray): + new_dtype_for_indexing_fields = np.int64 if _IS_32BIT else np.int32 + + # field names in Node struct with np.intp types (see + # sklearn/ensemble/_hist_gradient_boosting/common.pyx) + indexing_field_names = ["feature_idx"] + + new_dtype_dict = { + name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items() + } + for name in indexing_field_names: + new_dtype_dict[name] = new_dtype_for_indexing_fields + + new_dtype = np.dtype( + {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} + ) + return node_ndarray.astype(new_dtype, casting="same_kind") + + +def reduce_predictor_with_different_bitness(predictor): + cls, args, state = predictor.__reduce__() + + new_state = state.copy() + new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"]) + + return (cls, args, new_state) + + +def test_different_bitness_pickle(): + X, y = make_classification(random_state=0) + + clf = HistGradientBoostingClassifier(random_state=0, max_depth=3) + clf.fit(X, y) + score = clf.score(X, y) + + def pickle_dump_with_different_bitness(): + f = io.BytesIO() + p = pickle.Pickler(f) + p.dispatch_table = copyreg.dispatch_table.copy() + p.dispatch_table[TreePredictor] = reduce_predictor_with_different_bitness + + p.dump(clf) + f.seek(0) + return f + + # Simulate loading a pickle of the same model trained on a platform with different + # bitness that than the platform it will be used to make predictions on: + new_clf = pickle.load(pickle_dump_with_different_bitness()) + new_score = new_clf.score(X, y) + assert score == pytest.approx(new_score) + + +def test_different_bitness_joblib_pickle(): + # Make sure that a platform specific pickle generated on a 64 bit + # platform can be converted at pickle load time into an estimator + # with Cython code that works with the host's native integer precision + # to index nodes in the tree data structure when the host is a 32 bit + # platform (and vice versa). + # + # This is in particular useful to be able to train a model on a 64 bit Linux + # server and deploy the model as part of a (32 bit) WASM in-browser + # application using pyodide. 
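+    # (The helpers above rewrite the dtype of the 'feature_idx' field of the
+    # predictor's nodes array to the other bitness before dumping, emulating a
+    # pickle produced on a platform with a different native intp size.)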
+ X, y = make_classification(random_state=0) + + clf = HistGradientBoostingClassifier(random_state=0, max_depth=3) + clf.fit(X, y) + score = clf.score(X, y) + + def joblib_dump_with_different_bitness(): + f = io.BytesIO() + p = NumpyPickler(f) + p.dispatch_table = copyreg.dispatch_table.copy() + p.dispatch_table[TreePredictor] = reduce_predictor_with_different_bitness + + p.dump(clf) + f.seek(0) + return f + + new_clf = joblib.load(joblib_dump_with_different_bitness()) + new_score = new_clf.score(X, y) + assert score == pytest.approx(new_score) + + +def test_pandas_nullable_dtype(): + # Non regression test for https://github.com/scikit-learn/scikit-learn/issues/28317 + pd = pytest.importorskip("pandas") + + rng = np.random.default_rng(0) + X = pd.DataFrame({"a": rng.integers(10, size=100)}).astype(pd.Int64Dtype()) + y = rng.integers(2, size=100) + + clf = HistGradientBoostingClassifier() + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py new file mode 100644 index 0000000000000000000000000000000000000000..a55cb871e3c72ea04325b0b72f7aabc419285921 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -0,0 +1,650 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal +from pytest import approx + +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + X_BINNED_DTYPE, + X_BITSET_INNER_DTYPE, + X_DTYPE, + Y_DTYPE, +) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() + + +def _make_training_data(n_bins=256, constant_hessian=True): + rng = np.random.RandomState(42) + n_samples = 10000 + + # Generate some test data directly binned so as to test the grower code + # independently of the binning logic. + X_binned = rng.randint(0, n_bins - 1, size=(n_samples, 2), dtype=X_BINNED_DTYPE) + X_binned = np.asfortranarray(X_binned) + + def true_decision_function(input_features): + """Ground truth decision function + + This is a very simple yet asymmetric decision tree. Therefore the + grower code should have no trouble recovering the decision function + from 10000 training samples. 
+ """ + if input_features[0] <= n_bins // 2: + return -1 + else: + return -1 if input_features[1] <= n_bins // 3 else 1 + + target = np.array([true_decision_function(x) for x in X_binned], dtype=Y_DTYPE) + + # Assume a square loss applied to an initial model that always predicts 0 + # (hardcoded for this test): + all_gradients = target.astype(G_H_DTYPE) + shape_hessians = 1 if constant_hessian else all_gradients.shape + all_hessians = np.ones(shape=shape_hessians, dtype=G_H_DTYPE) + + return X_binned, all_gradients, all_hessians + + +def _check_children_consistency(parent, left, right): + # Make sure the samples are correctly dispatched from a parent to its + # children + assert parent.left_child is left + assert parent.right_child is right + + # each sample from the parent is propagated to one of the two children + assert len(left.sample_indices) + len(right.sample_indices) == len( + parent.sample_indices + ) + + assert set(left.sample_indices).union(set(right.sample_indices)) == set( + parent.sample_indices + ) + + # samples are sent either to the left or the right node, never to both + assert set(left.sample_indices).intersection(set(right.sample_indices)) == set() + + +@pytest.mark.parametrize( + "n_bins, constant_hessian, stopping_param, shrinkage", + [ + (11, True, "min_gain_to_split", 0.5), + (11, False, "min_gain_to_split", 1.0), + (11, True, "max_leaf_nodes", 1.0), + (11, False, "max_leaf_nodes", 0.1), + (42, True, "max_leaf_nodes", 0.01), + (42, False, "max_leaf_nodes", 1.0), + (256, True, "min_gain_to_split", 1.0), + (256, True, "max_leaf_nodes", 0.1), + ], +) +def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): + X_binned, all_gradients, all_hessians = _make_training_data( + n_bins=n_bins, constant_hessian=constant_hessian + ) + n_samples = X_binned.shape[0] + + if stopping_param == "max_leaf_nodes": + stopping_param = {"max_leaf_nodes": 3} + else: + stopping_param = {"min_gain_to_split": 0.01} + + grower = TreeGrower( + X_binned, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=shrinkage, + min_samples_leaf=1, + **stopping_param, + ) + + # The root node is not yet split, but the best possible split has + # already been evaluated: + assert grower.root.left_child is None + assert grower.root.right_child is None + + root_split = grower.root.split_info + assert root_split.feature_idx == 0 + assert root_split.bin_idx == n_bins // 2 + assert len(grower.splittable_nodes) == 1 + + # Calling split next applies the next split and computes the best split + # for each of the two newly introduced children nodes. + left_node, right_node = grower.split_next() + + # All training samples have ben split in the two nodes, approximately + # 50%/50% + _check_children_consistency(grower.root, left_node, right_node) + assert len(left_node.sample_indices) > 0.4 * n_samples + assert len(left_node.sample_indices) < 0.6 * n_samples + + if grower.min_gain_to_split > 0: + # The left node is too pure: there is no gain to split it further. + assert left_node.split_info.gain < grower.min_gain_to_split + assert left_node in grower.finalized_leaves + + # The right node can still be split further, this time on feature #1 + split_info = right_node.split_info + assert split_info.gain > 1.0 + assert split_info.feature_idx == 1 + assert split_info.bin_idx == n_bins // 3 + assert right_node.left_child is None + assert right_node.right_child is None + + # The right split has not been applied yet. 
Let's do it now: + assert len(grower.splittable_nodes) == 1 + right_left_node, right_right_node = grower.split_next() + _check_children_consistency(right_node, right_left_node, right_right_node) + assert len(right_left_node.sample_indices) > 0.1 * n_samples + assert len(right_left_node.sample_indices) < 0.2 * n_samples + + assert len(right_right_node.sample_indices) > 0.2 * n_samples + assert len(right_right_node.sample_indices) < 0.4 * n_samples + + # All the leafs are pure, it is not possible to split any further: + assert not grower.splittable_nodes + + grower._apply_shrinkage() + + # Check the values of the leaves: + assert grower.root.left_child.value == approx(shrinkage) + assert grower.root.right_child.left_child.value == approx(shrinkage) + assert grower.root.right_child.right_child.value == approx(-shrinkage, rel=1e-3) + + +def test_predictor_from_grower(): + # Build a tree on the toy 3-leaf dataset to extract the predictor. + n_bins = 256 + X_binned, all_gradients, all_hessians = _make_training_data(n_bins=n_bins) + grower = TreeGrower( + X_binned, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=1.0, + max_leaf_nodes=3, + min_samples_leaf=5, + ) + grower.grow() + assert grower.n_nodes == 5 # (2 decision nodes + 3 leaves) + + # Check that the node structure can be converted into a predictor + # object to perform predictions at scale + # We pass undefined binning_thresholds because we won't use predict anyway + predictor = grower.make_predictor( + binning_thresholds=np.zeros((X_binned.shape[1], n_bins)) + ) + assert predictor.nodes.shape[0] == 5 + assert predictor.nodes["is_leaf"].sum() == 3 + + # Probe some predictions for each leaf of the tree + # each group of 3 samples corresponds to a condition in _make_training_data + input_data = np.array( + [ + [0, 0], + [42, 99], + [128, 254], + [129, 0], + [129, 85], + [254, 85], + [129, 86], + [129, 254], + [242, 100], + ], + dtype=np.uint8, + ) + missing_values_bin_idx = n_bins - 1 + predictions = predictor.predict_binned( + input_data, missing_values_bin_idx, n_threads + ) + expected_targets = [1, 1, 1, 1, 1, 1, -1, -1, -1] + assert np.allclose(predictions, expected_targets) + + # Check that training set can be recovered exactly: + predictions = predictor.predict_binned(X_binned, missing_values_bin_idx, n_threads) + assert np.allclose(predictions, -all_gradients) + + +@pytest.mark.parametrize( + "n_samples, min_samples_leaf, n_bins, constant_hessian, noise", + [ + (11, 10, 7, True, 0), + (13, 10, 42, False, 0), + (56, 10, 255, True, 0.1), + (101, 3, 7, True, 0), + (200, 42, 42, False, 0), + (300, 55, 255, True, 0.1), + (300, 301, 255, True, 0.1), + ], +) +def test_min_samples_leaf(n_samples, min_samples_leaf, n_bins, constant_hessian, noise): + rng = np.random.RandomState(seed=0) + # data = linear target, 3 features, 1 irrelevant. 
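+    # (y depends only on the first two features; the third one is the
+    # irrelevant feature.)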
+ X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + if noise: + y_scale = y.std() + y += rng.normal(scale=noise, size=n_samples) * y_scale + mapper = _BinMapper(n_bins=n_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(G_H_DTYPE) + shape_hessian = 1 if constant_hessian else all_gradients.shape + all_hessians = np.ones(shape=shape_hessian, dtype=G_H_DTYPE) + grower = TreeGrower( + X, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=1.0, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=n_samples, + ) + grower.grow() + predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_) + + if n_samples >= min_samples_leaf: + for node in predictor.nodes: + if node["is_leaf"]: + assert node["count"] >= min_samples_leaf + else: + assert predictor.nodes.shape[0] == 1 + assert predictor.nodes[0]["is_leaf"] + assert predictor.nodes[0]["count"] == n_samples + + +@pytest.mark.parametrize("n_samples, min_samples_leaf", [(99, 50), (100, 50)]) +def test_min_samples_leaf_root(n_samples, min_samples_leaf): + # Make sure root node isn't split if n_samples is not at least twice + # min_samples_leaf + rng = np.random.RandomState(seed=0) + + n_bins = 256 + + # data = linear target, 3 features, 1 irrelevant. + X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + mapper = _BinMapper(n_bins=n_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(G_H_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) + grower = TreeGrower( + X, + all_gradients, + all_hessians, + n_bins=n_bins, + shrinkage=1.0, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=n_samples, + ) + grower.grow() + if n_samples >= min_samples_leaf * 2: + assert len(grower.finalized_leaves) >= 2 + else: + assert len(grower.finalized_leaves) == 1 + + +def assert_is_stump(grower): + # To assert that stumps are created when max_depth=1 + for leaf in (grower.root.left_child, grower.root.right_child): + assert leaf.left_child is None + assert leaf.right_child is None + + +@pytest.mark.parametrize("max_depth", [1, 2, 3]) +def test_max_depth(max_depth): + # Make sure max_depth parameter works as expected + rng = np.random.RandomState(seed=0) + + n_bins = 256 + n_samples = 1000 + + # data = linear target, 3 features, 1 irrelevant. 
+ X = rng.normal(size=(n_samples, 3)) + y = X[:, 0] - X[:, 1] + mapper = _BinMapper(n_bins=n_bins) + X = mapper.fit_transform(X) + + all_gradients = y.astype(G_H_DTYPE) + all_hessians = np.ones(shape=1, dtype=G_H_DTYPE) + grower = TreeGrower(X, all_gradients, all_hessians, max_depth=max_depth) + grower.grow() + + depth = max(leaf.depth for leaf in grower.finalized_leaves) + assert depth == max_depth + + if max_depth == 1: + assert_is_stump(grower) + + +def test_input_validation(): + X_binned, all_gradients, all_hessians = _make_training_data() + + X_binned_float = X_binned.astype(np.float32) + with pytest.raises(NotImplementedError, match="X_binned must be of type uint8"): + TreeGrower(X_binned_float, all_gradients, all_hessians) + + X_binned_C_array = np.ascontiguousarray(X_binned) + with pytest.raises( + ValueError, match="X_binned should be passed as Fortran contiguous array" + ): + TreeGrower(X_binned_C_array, all_gradients, all_hessians) + + +def test_init_parameters_validation(): + X_binned, all_gradients, all_hessians = _make_training_data() + with pytest.raises(ValueError, match="min_gain_to_split=-1 must be positive"): + TreeGrower(X_binned, all_gradients, all_hessians, min_gain_to_split=-1) + + with pytest.raises(ValueError, match="min_hessian_to_split=-1 must be positive"): + TreeGrower(X_binned, all_gradients, all_hessians, min_hessian_to_split=-1) + + +def test_missing_value_predict_only(): + # Make sure that missing values are supported at predict time even if they + # were not encountered in the training data: the missing values are + # assigned to whichever child has the most samples. + + rng = np.random.RandomState(0) + n_samples = 100 + X_binned = rng.randint(0, 256, size=(n_samples, 1), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, gradients, hessians, min_samples_leaf=5, has_missing_values=False + ) + grower.grow() + + # We pass undefined binning_thresholds because we won't use predict anyway + predictor = grower.make_predictor( + binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1)) + ) + + # go from root to a leaf, always following node with the most samples. + # That's the path nans are supposed to take + node = predictor.nodes[0] + while not node["is_leaf"]: + left = predictor.nodes[node["left"]] + right = predictor.nodes[node["right"]] + node = left if left["count"] > right["count"] else right + + prediction_main_path = node["value"] + + # now build X_test with only nans, and make sure all predictions are equal + # to prediction_main_path + all_nans = np.full(shape=(n_samples, 1), fill_value=np.nan) + known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + f_idx_map = np.zeros(0, dtype=np.uint32) + + y_pred = predictor.predict(all_nans, known_cat_bitsets, f_idx_map, n_threads) + assert np.all(y_pred == prediction_main_path) + + +def test_split_on_nan_with_infinite_values(): + # Make sure the split on nan situations are respected even when there are + # samples with +inf values (we set the threshold to +inf when we have a + # split on nan so this test makes sure this does not introduce edge-case + # bugs). We need to use the private API so that we can also test + # predict_binned(). 
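+    # ("Split on nan" means the best split separates the non-missing values
+    # from the missing ones; it is encoded with a numerical threshold of +inf,
+    # which is checked on the root node below.)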
+ + X = np.array([0, 1, np.inf, np.nan, np.nan]).reshape(-1, 1) + # the gradient values will force a split on nan situation + gradients = np.array([0, 0, 0, 100, 100], dtype=G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + bin_mapper = _BinMapper() + X_binned = bin_mapper.fit_transform(X) + + n_bins_non_missing = 3 + has_missing_values = True + grower = TreeGrower( + X_binned, + gradients, + hessians, + n_bins_non_missing=n_bins_non_missing, + has_missing_values=has_missing_values, + min_samples_leaf=1, + n_threads=n_threads, + ) + + grower.grow() + + predictor = grower.make_predictor(binning_thresholds=bin_mapper.bin_thresholds_) + + # sanity check: this was a split on nan + assert predictor.nodes[0]["num_threshold"] == np.inf + assert predictor.nodes[0]["bin_threshold"] == n_bins_non_missing - 1 + + known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets() + + # Make sure in particular that the +inf sample is mapped to the left child + # Note that lightgbm "fails" here and will assign the inf sample to the + # right child, even though it's a "split on nan" situation. + predictions = predictor.predict(X, known_cat_bitsets, f_idx_map, n_threads) + predictions_binned = predictor.predict_binned( + X_binned, + missing_values_bin_idx=bin_mapper.missing_values_bin_idx_, + n_threads=n_threads, + ) + np.testing.assert_allclose(predictions, -gradients) + np.testing.assert_allclose(predictions_binned, -gradients) + + +def test_grow_tree_categories(): + # Check that the grower produces the right predictor tree when a split is + # categorical + X_binned = np.array([[0, 1] * 11 + [1]], dtype=X_BINNED_DTYPE).T + X_binned = np.asfortranarray(X_binned) + + all_gradients = np.array([10, 1] * 11 + [1], dtype=G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + is_categorical = np.ones(1, dtype=np.uint8) + + grower = TreeGrower( + X_binned, + all_gradients, + all_hessians, + n_bins=4, + shrinkage=1.0, + min_samples_leaf=1, + is_categorical=is_categorical, + n_threads=n_threads, + ) + grower.grow() + assert grower.n_nodes == 3 + + categories = [np.array([4, 9], dtype=X_DTYPE)] + predictor = grower.make_predictor(binning_thresholds=categories) + root = predictor.nodes[0] + assert root["count"] == 23 + assert root["depth"] == 0 + assert root["is_categorical"] + + left, right = predictor.nodes[root["left"]], predictor.nodes[root["right"]] + + # arbitrary validation, but this means ones go to the left. + assert left["count"] >= right["count"] + + # check binned category value (1) + expected_binned_cat_bitset = [2**1] + [0] * 7 + binned_cat_bitset = predictor.binned_left_cat_bitsets + assert_array_equal(binned_cat_bitset[0], expected_binned_cat_bitset) + + # check raw category value (9) + expected_raw_cat_bitsets = [2**9] + [0] * 7 + raw_cat_bitsets = predictor.raw_left_cat_bitsets + assert_array_equal(raw_cat_bitsets[0], expected_raw_cat_bitsets) + + # Note that since there was no missing values during training, the missing + # values aren't part of the bitsets. However, we expect the missing values + # to go to the biggest child (i.e. the left one). + # The left child has a value of -1 = negative gradient. 
+ assert root["missing_go_to_left"] + + # make sure binned missing values are mapped to the left child during + # prediction + prediction_binned = predictor.predict_binned( + np.asarray([[6]]).astype(X_BINNED_DTYPE), + missing_values_bin_idx=6, + n_threads=n_threads, + ) + assert_allclose(prediction_binned, [-1]) # negative gradient + + # make sure raw missing values are mapped to the left child during + # prediction + known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32) # ignored anyway + f_idx_map = np.array([0], dtype=np.uint32) + prediction = predictor.predict( + np.array([[np.nan]]), known_cat_bitsets, f_idx_map, n_threads + ) + assert_allclose(prediction, [-1]) + + +@pytest.mark.parametrize("min_samples_leaf", (1, 20)) +@pytest.mark.parametrize("n_unique_categories", (2, 10, 100)) +@pytest.mark.parametrize("target", ("binary", "random", "equal")) +def test_ohe_equivalence(min_samples_leaf, n_unique_categories, target): + # Make sure that native categorical splits are equivalent to using a OHE, + # when given enough depth + + rng = np.random.RandomState(0) + n_samples = 10_000 + X_binned = rng.randint(0, n_unique_categories, size=(n_samples, 1), dtype=np.uint8) + + X_ohe = OneHotEncoder(sparse_output=False).fit_transform(X_binned) + X_ohe = np.asfortranarray(X_ohe).astype(np.uint8) + + if target == "equal": + gradients = X_binned.reshape(-1) + elif target == "binary": + gradients = (X_binned % 2).reshape(-1) + else: + gradients = rng.randn(n_samples) + gradients = gradients.astype(G_H_DTYPE) + + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower_params = { + "min_samples_leaf": min_samples_leaf, + "max_depth": None, + "max_leaf_nodes": None, + } + + grower = TreeGrower( + X_binned, gradients, hessians, is_categorical=[True], **grower_params + ) + grower.grow() + # we pass undefined bin_thresholds because we won't use predict() + predictor = grower.make_predictor( + binning_thresholds=np.zeros((1, n_unique_categories)) + ) + preds = predictor.predict_binned( + X_binned, missing_values_bin_idx=255, n_threads=n_threads + ) + + grower_ohe = TreeGrower(X_ohe, gradients, hessians, **grower_params) + grower_ohe.grow() + predictor_ohe = grower_ohe.make_predictor( + binning_thresholds=np.zeros((X_ohe.shape[1], n_unique_categories)) + ) + preds_ohe = predictor_ohe.predict_binned( + X_ohe, missing_values_bin_idx=255, n_threads=n_threads + ) + + assert predictor.get_max_depth() <= predictor_ohe.get_max_depth() + if target == "binary" and n_unique_categories > 2: + # OHE needs more splits to achieve the same predictions + assert predictor.get_max_depth() < predictor_ohe.get_max_depth() + + np.testing.assert_allclose(preds, preds_ohe) + + +def test_grower_interaction_constraints(): + """Check that grower respects interaction constraints.""" + n_features = 6 + interaction_cst = [{0, 1}, {1, 2}, {3, 4, 5}] + n_samples = 10 + n_bins = 6 + root_feature_splits = [] + + def get_all_children(node): + res = [] + if node.is_leaf: + return res + for n in [node.left_child, node.right_child]: + res.append(n) + res.extend(get_all_children(n)) + return res + + for seed in range(20): + rng = np.random.RandomState(seed) + + X_binned = rng.randint( + 0, n_bins - 1, size=(n_samples, n_features), dtype=X_BINNED_DTYPE + ) + X_binned = np.asfortranarray(X_binned) + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, + gradients, + hessians, + n_bins=n_bins, + min_samples_leaf=1, + interaction_cst=interaction_cst, + 
n_threads=n_threads, + ) + grower.grow() + + root_feature_idx = grower.root.split_info.feature_idx + root_feature_splits.append(root_feature_idx) + + feature_idx_to_constraint_set = { + 0: {0, 1}, + 1: {0, 1, 2}, + 2: {1, 2}, + 3: {3, 4, 5}, + 4: {3, 4, 5}, + 5: {3, 4, 5}, + } + + root_constraint_set = feature_idx_to_constraint_set[root_feature_idx] + for node in (grower.root.left_child, grower.root.right_child): + # Root's children's allowed_features must be the root's constraints set. + assert_array_equal(node.allowed_features, list(root_constraint_set)) + for node in get_all_children(grower.root): + if node.is_leaf: + continue + # Ensure that each node uses a subset of features of its parent node. + parent_interaction_cst_indices = set(node.interaction_cst_indices) + right_interactions_cst_indices = set( + node.right_child.interaction_cst_indices + ) + left_interactions_cst_indices = set(node.left_child.interaction_cst_indices) + + assert right_interactions_cst_indices.issubset( + parent_interaction_cst_indices + ) + assert left_interactions_cst_indices.issubset( + parent_interaction_cst_indices + ) + # The features used for split must have been present in the root's + # constraint set. + assert node.split_info.feature_idx in root_constraint_set + + # Make sure that every feature is used at least once as split for the root node. + assert ( + len(set(root_feature_splits)) + == len(set().union(*interaction_cst)) + == n_features + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py new file mode 100644 index 0000000000000000000000000000000000000000..22375c7d4ea2c378bf7a45ad619f92c187d40984 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_histogram.py @@ -0,0 +1,239 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + HISTOGRAM_DTYPE, + X_BINNED_DTYPE, +) +from sklearn.ensemble._hist_gradient_boosting.histogram import ( + _build_histogram, + _build_histogram_naive, + _build_histogram_no_hessian, + _build_histogram_root, + _build_histogram_root_no_hessian, + _subtract_histograms, +) + + +@pytest.mark.parametrize("build_func", [_build_histogram_naive, _build_histogram]) +def test_build_histogram(build_func): + binned_feature = np.array([0, 2, 0, 1, 2, 0, 2, 1], dtype=X_BINNED_DTYPE) + + # Small sample_indices (below unrolling threshold) + ordered_gradients = np.array([0, 1, 3], dtype=G_H_DTYPE) + ordered_hessians = np.array([1, 1, 2], dtype=G_H_DTYPE) + + sample_indices = np.array([0, 2, 3], dtype=np.uint32) + hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) + build_func( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist + ) + hist = hist[0] + assert_array_equal(hist["count"], [2, 1, 0]) + assert_allclose(hist["sum_gradients"], [1, 3, 0]) + assert_allclose(hist["sum_hessians"], [2, 2, 0]) + + # Larger sample_indices (above unrolling threshold) + sample_indices = np.array([0, 2, 3, 6, 7], dtype=np.uint32) + ordered_gradients = np.array([0, 1, 3, 0, 1], dtype=G_H_DTYPE) + ordered_hessians = np.array([1, 1, 2, 1, 0], dtype=G_H_DTYPE) + + hist = np.zeros((1, 3), dtype=HISTOGRAM_DTYPE) + build_func( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist + ) + hist = hist[0] + 
assert_array_equal(hist["count"], [2, 2, 1]) + assert_allclose(hist["sum_gradients"], [1, 4, 0]) + assert_allclose(hist["sum_hessians"], [2, 2, 1]) + + +def test_histogram_sample_order_independence(): + # Make sure the order of the samples has no impact on the histogram + # computations + rng = np.random.RandomState(42) + n_sub_samples = 100 + n_samples = 1000 + n_bins = 256 + + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=X_BINNED_DTYPE) + sample_indices = rng.choice( + np.arange(n_samples, dtype=np.uint32), n_sub_samples, replace=False + ) + ordered_gradients = rng.randn(n_sub_samples).astype(G_H_DTYPE) + hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian( + 0, sample_indices, binned_feature, ordered_gradients, hist_gc + ) + + ordered_hessians = rng.exponential(size=n_sub_samples).astype(G_H_DTYPE) + hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc + ) + + permutation = rng.permutation(n_sub_samples) + hist_gc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram_no_hessian( + 0, + sample_indices[permutation], + binned_feature, + ordered_gradients[permutation], + hist_gc_perm, + ) + + hist_ghc_perm = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + _build_histogram( + 0, + sample_indices[permutation], + binned_feature, + ordered_gradients[permutation], + ordered_hessians[permutation], + hist_ghc_perm, + ) + + hist_gc = hist_gc[0] + hist_ghc = hist_ghc[0] + hist_gc_perm = hist_gc_perm[0] + hist_ghc_perm = hist_ghc_perm[0] + + assert_allclose(hist_gc["sum_gradients"], hist_gc_perm["sum_gradients"]) + assert_array_equal(hist_gc["count"], hist_gc_perm["count"]) + + assert_allclose(hist_ghc["sum_gradients"], hist_ghc_perm["sum_gradients"]) + assert_allclose(hist_ghc["sum_hessians"], hist_ghc_perm["sum_hessians"]) + assert_array_equal(hist_ghc["count"], hist_ghc_perm["count"]) + + +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_unrolled_equivalent_to_naive(constant_hessian): + # Make sure the different unrolled histogram computations give the same + # results as the naive one. 
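+    # Four variants are compared against _build_histogram_naive: the root
+    # builders (which iterate over all samples instead of sample_indices) and
+    # the regular builders, each with and without per-sample hessians.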
+ rng = np.random.RandomState(42) + n_samples = 10 + n_bins = 5 + sample_indices = np.arange(n_samples).astype(np.uint32) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + if constant_hessian: + ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE) + else: + ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) + + hist_gc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_ghc_root = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_gc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_ghc = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + hist_naive = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + + _build_histogram_root_no_hessian(0, binned_feature, ordered_gradients, hist_gc_root) + _build_histogram_root( + 0, binned_feature, ordered_gradients, ordered_hessians, hist_ghc_root + ) + _build_histogram_no_hessian( + 0, sample_indices, binned_feature, ordered_gradients, hist_gc + ) + _build_histogram( + 0, sample_indices, binned_feature, ordered_gradients, ordered_hessians, hist_ghc + ) + _build_histogram_naive( + 0, + sample_indices, + binned_feature, + ordered_gradients, + ordered_hessians, + hist_naive, + ) + + hist_naive = hist_naive[0] + hist_gc_root = hist_gc_root[0] + hist_ghc_root = hist_ghc_root[0] + hist_gc = hist_gc[0] + hist_ghc = hist_ghc[0] + for hist in (hist_gc_root, hist_ghc_root, hist_gc, hist_ghc): + assert_array_equal(hist["count"], hist_naive["count"]) + assert_allclose(hist["sum_gradients"], hist_naive["sum_gradients"]) + for hist in (hist_ghc_root, hist_ghc): + assert_allclose(hist["sum_hessians"], hist_naive["sum_hessians"]) + for hist in (hist_gc_root, hist_gc): + assert_array_equal(hist["sum_hessians"], np.zeros(n_bins)) + + +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_hist_subtraction(constant_hessian): + # Make sure the histogram subtraction trick gives the same result as the + # classical method. 
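+    # The trick: each child histogram can be obtained as parent minus sibling,
+    # bin by bin, for "count", "sum_gradients" and "sum_hessians", instead of
+    # rebuilding it from the child's samples.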
+ rng = np.random.RandomState(42) + n_samples = 10 + n_bins = 5 + sample_indices = np.arange(n_samples).astype(np.uint32) + binned_feature = rng.randint(0, n_bins - 1, size=n_samples, dtype=np.uint8) + ordered_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + if constant_hessian: + ordered_hessians = np.ones(n_samples, dtype=G_H_DTYPE) + else: + ordered_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) + + hist_parent = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + if constant_hessian: + _build_histogram_no_hessian( + 0, sample_indices, binned_feature, ordered_gradients, hist_parent + ) + else: + _build_histogram( + 0, + sample_indices, + binned_feature, + ordered_gradients, + ordered_hessians, + hist_parent, + ) + + mask = rng.randint(0, 2, n_samples).astype(bool) + + sample_indices_left = sample_indices[mask] + ordered_gradients_left = ordered_gradients[mask] + ordered_hessians_left = ordered_hessians[mask] + hist_left = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + if constant_hessian: + _build_histogram_no_hessian( + 0, sample_indices_left, binned_feature, ordered_gradients_left, hist_left + ) + else: + _build_histogram( + 0, + sample_indices_left, + binned_feature, + ordered_gradients_left, + ordered_hessians_left, + hist_left, + ) + + sample_indices_right = sample_indices[~mask] + ordered_gradients_right = ordered_gradients[~mask] + ordered_hessians_right = ordered_hessians[~mask] + hist_right = np.zeros((1, n_bins), dtype=HISTOGRAM_DTYPE) + if constant_hessian: + _build_histogram_no_hessian( + 0, sample_indices_right, binned_feature, ordered_gradients_right, hist_right + ) + else: + _build_histogram( + 0, + sample_indices_right, + binned_feature, + ordered_gradients_right, + ordered_hessians_right, + hist_right, + ) + + hist_left_sub = np.copy(hist_parent) + hist_right_sub = np.copy(hist_parent) + _subtract_histograms(0, n_bins, hist_left_sub, hist_right) + _subtract_histograms(0, n_bins, hist_right_sub, hist_left) + + for key in ("count", "sum_hessians", "sum_gradients"): + assert_allclose(hist_left[key], hist_left_sub[key], rtol=1e-6) + assert_allclose(hist_right[key], hist_right_sub[key], rtol=1e-6) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_constraints.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_constraints.py new file mode 100644 index 0000000000000000000000000000000000000000..56b6068d794e8c96c24ee0ef18dbad3f66ad64b0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_constraints.py @@ -0,0 +1,446 @@ +import re + +import numpy as np +import pytest + +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + X_BINNED_DTYPE, + MonotonicConstraint, +) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value, +) +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._testing import _convert_container + +n_threads = _openmp_effective_n_threads() + + +def is_increasing(a): + return (np.diff(a) >= 0.0).all() + + +def is_decreasing(a): + return (np.diff(a) <= 0.0).all() + + +def assert_leaves_values_monotonic(predictor, monotonic_cst): + # 
make sure leaves values (from left to right) are either all increasing + # or all decreasing (or neither) depending on the monotonic constraint. + nodes = predictor.nodes + + def get_leaves_values(): + """get leaves values from left to right""" + values = [] + + def depth_first_collect_leaf_values(node_idx): + node = nodes[node_idx] + if node["is_leaf"]: + values.append(node["value"]) + return + depth_first_collect_leaf_values(node["left"]) + depth_first_collect_leaf_values(node["right"]) + + depth_first_collect_leaf_values(0) # start at root (0) + return values + + values = get_leaves_values() + + if monotonic_cst == MonotonicConstraint.NO_CST: + # some increasing, some decreasing + assert not is_increasing(values) and not is_decreasing(values) + elif monotonic_cst == MonotonicConstraint.POS: + # all increasing + assert is_increasing(values) + else: # NEG + # all decreasing + assert is_decreasing(values) + + +def assert_children_values_monotonic(predictor, monotonic_cst): + # Make sure siblings values respect the monotonic constraints. Left should + # be lower (resp greater) than right child if constraint is POS (resp. + # NEG). + # Note that this property alone isn't enough to ensure full monotonicity, + # since we also need to guanrantee that all the descendents of the left + # child won't be greater (resp. lower) than the right child, or its + # descendents. That's why we need to bound the predicted values (this is + # tested in assert_children_values_bounded) + nodes = predictor.nodes + left_lower = [] + left_greater = [] + for node in nodes: + if node["is_leaf"]: + continue + + left_idx = node["left"] + right_idx = node["right"] + + if nodes[left_idx]["value"] < nodes[right_idx]["value"]: + left_lower.append(node) + elif nodes[left_idx]["value"] > nodes[right_idx]["value"]: + left_greater.append(node) + + if monotonic_cst == MonotonicConstraint.NO_CST: + assert left_lower and left_greater + elif monotonic_cst == MonotonicConstraint.POS: + assert left_lower and not left_greater + else: # NEG + assert not left_lower and left_greater + + +def assert_children_values_bounded(grower, monotonic_cst): + # Make sure that the values of the children of a node are bounded by the + # middle value between that node and its sibling (if there is a monotonic + # constraint). 
+ # As a bonus, we also check that the siblings values are properly ordered + # which is slightly redundant with assert_children_values_monotonic (but + # this check is done on the grower nodes whereas + # assert_children_values_monotonic is done on the predictor nodes) + + if monotonic_cst == MonotonicConstraint.NO_CST: + return + + def recursively_check_children_node_values(node, right_sibling=None): + if node.is_leaf: + return + if right_sibling is not None: + middle = (node.value + right_sibling.value) / 2 + if monotonic_cst == MonotonicConstraint.POS: + assert node.left_child.value <= node.right_child.value <= middle + if not right_sibling.is_leaf: + assert ( + middle + <= right_sibling.left_child.value + <= right_sibling.right_child.value + ) + else: # NEG + assert node.left_child.value >= node.right_child.value >= middle + if not right_sibling.is_leaf: + assert ( + middle + >= right_sibling.left_child.value + >= right_sibling.right_child.value + ) + + recursively_check_children_node_values( + node.left_child, right_sibling=node.right_child + ) + recursively_check_children_node_values(node.right_child) + + recursively_check_children_node_values(grower.root) + + +@pytest.mark.parametrize("seed", range(3)) +@pytest.mark.parametrize( + "monotonic_cst", + ( + MonotonicConstraint.NO_CST, + MonotonicConstraint.POS, + MonotonicConstraint.NEG, + ), +) +def test_nodes_values(monotonic_cst, seed): + # Build a single tree with only one feature, and make sure the nodes + # values respect the monotonic constraints. + + # Considering the following tree with a monotonic POS constraint, we + # should have: + # + # root + # / \ + # 5 10 # middle = 7.5 + # / \ / \ + # a b c d + # + # a <= b and c <= d (assert_children_values_monotonic) + # a, b <= middle <= c, d (assert_children_values_bounded) + # a <= b <= c <= d (assert_leaves_values_monotonic) + # + # The last one is a consequence of the others, but can't hurt to check + + rng = np.random.RandomState(seed) + n_samples = 1000 + n_features = 1 + X_binned = rng.randint(0, 255, size=(n_samples, n_features), dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower( + X_binned, gradients, hessians, monotonic_cst=[monotonic_cst], shrinkage=0.1 + ) + grower.grow() + + # grow() will shrink the leaves values at the very end. For our comparison + # tests, we need to revert the shrinkage of the leaves, else we would + # compare the value of a leaf (shrunk) with a node (not shrunk) and the + # test would not be correct. + for leave in grower.finalized_leaves: + leave.value /= grower.shrinkage + + # We pass undefined binning_thresholds because we won't use predict anyway + predictor = grower.make_predictor( + binning_thresholds=np.zeros((X_binned.shape[1], X_binned.max() + 1)) + ) + + # The consistency of the bounds can only be checked on the tree grower + # as the node bounds are not copied into the predictor tree. The + # consistency checks on the values of node children and leaves can be + # done either on the grower tree or on the predictor tree. We only + # do those checks on the predictor tree as the latter is derived from + # the former. 
+ assert_children_values_monotonic(predictor, monotonic_cst) + assert_children_values_bounded(grower, monotonic_cst) + assert_leaves_values_monotonic(predictor, monotonic_cst) + + +@pytest.mark.parametrize("use_feature_names", (True, False)) +def test_predictions(global_random_seed, use_feature_names): + # Train a model with a POS constraint on the first non-categorical feature + # and a NEG constraint on the second non-categorical feature, and make sure + # the constraints are respected by checking the predictions. + # test adapted from lightgbm's test_monotone_constraint(), itself inspired + # by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html + + rng = np.random.RandomState(global_random_seed) + + n_samples = 1000 + f_0 = rng.rand(n_samples) # positive correlation with y + f_1 = rng.rand(n_samples) # negative correlation with y + + # extra categorical features, no correlation with y, + # to check the correctness of monotonicity constraint remapping, see issue #28898 + f_a = rng.randint(low=0, high=9, size=n_samples) + f_b = rng.randint(low=0, high=9, size=n_samples) + f_c = rng.randint(low=0, high=9, size=n_samples) + + X = np.c_[f_a, f_0, f_b, f_1, f_c] + columns_name = ["f_a", "f_0", "f_b", "f_1", "f_c"] + constructor_name = "dataframe" if use_feature_names else "array" + X = _convert_container(X, constructor_name, columns_name=columns_name) + + noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) + y = 5 * f_0 + np.sin(10 * np.pi * f_0) - 5 * f_1 - np.cos(10 * np.pi * f_1) + noise + + if use_feature_names: + monotonic_cst = {"f_0": +1, "f_1": -1} + categorical_features = ["f_a", "f_b", "f_c"] + else: + monotonic_cst = [0, +1, 0, -1, 0] + categorical_features = [0, 2, 4] + + gbdt = HistGradientBoostingRegressor( + monotonic_cst=monotonic_cst, categorical_features=categorical_features + ) + gbdt.fit(X, y) + + linspace = np.linspace(0, 1, 100) + sin = np.sin(linspace) + constant = np.full_like(linspace, fill_value=0.5) + + # We now assert the predictions properly respect the constraints, on each + # feature. When testing for a feature we need to set the other one to a + # constant, because the monotonic constraints are only a "all else being + # equal" type of constraints: + # a constraint on the first feature only means that + # x0 < x0' => f(x0, x1) < f(x0', x1) + # while x1 stays constant. 
+    # The constraint does not guarantee that
+    # x0 < x0' => f(x0, x1) < f(x0', x1')
+
+    # First non-categorical feature (POS)
+    # assert pred is all increasing when f_0 is all increasing
+    X = np.c_[constant, linspace, constant, constant, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert is_increasing(pred)
+    # assert pred actually follows the variations of f_0
+    X = np.c_[constant, sin, constant, constant, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))
+
+    # Second non-categorical feature (NEG)
+    # assert pred is all decreasing when f_1 is all increasing
+    X = np.c_[constant, constant, constant, linspace, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert is_decreasing(pred)
+    # assert pred actually follows the inverse variations of f_1
+    X = np.c_[constant, constant, constant, sin, constant]
+    X = _convert_container(X, constructor_name, columns_name=columns_name)
+    pred = gbdt.predict(X)
+    assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
+
+
+def test_input_error():
+    X = [[1, 2], [2, 3], [3, 4]]
+    y = [0, 1, 2]
+
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1])
+    with pytest.raises(
+        ValueError, match=re.escape("monotonic_cst has shape (3,) but the input data")
+    ):
+        gbdt.fit(X, y)
+
+    for monotonic_cst in ([1, 3], [1, -3], [0.3, -0.7]):
+        gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+        expected_msg = re.escape(
+            "must be an array-like of -1, 0 or 1. Observed values:"
+        )
+        with pytest.raises(ValueError, match=expected_msg):
+            gbdt.fit(X, y)
+
+    gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1])
+    with pytest.raises(
+        ValueError,
+        match="monotonic constraints are not supported for multiclass classification",
+    ):
+        gbdt.fit(X, y)
+
+
+def test_input_error_related_to_feature_names():
+    pd = pytest.importorskip("pandas")
+    X = pd.DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]})
+    y = np.array([0, 1, 0])
+
+    monotonic_cst = {"d": 1, "a": 1, "c": -1}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape(
+        "monotonic_cst contains 2 unexpected feature names: ['c', 'd']."
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X, y)
+
+    monotonic_cst = {k: 1 for k in "abcdefghijklmnopqrstuvwxyz"}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape(
+        "monotonic_cst contains 24 unexpected feature names: "
+        "['c', 'd', 'e', 'f', 'g', '...']."
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X, y)
+
+    monotonic_cst = {"a": 1}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape(
+        "HistGradientBoostingRegressor was not fitted on data with feature "
+        "names. Pass monotonic_cst as an integer array instead."
+    )
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X.values, y)
+
+    monotonic_cst = {"b": -1, "a": "+"}
+    gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
+    expected_msg = re.escape("monotonic_cst['a'] must be either -1, 0 or 1. Got '+'.")
+    with pytest.raises(ValueError, match=expected_msg):
+        gbdt.fit(X, y)
+
+
+def test_bounded_value_min_gain_to_split():
+    # The purpose of this test is to show that when computing the gain at a
+    # given split, the value of the current node should be properly bounded to
+    # respect the monotonic constraints, because it strongly interacts with
+    # min_gain_to_split. We build a simple example where gradients are
+    # [1, 1, 100, 1, 1] (hessians are all ones). The best split happens on the
+    # 3rd bin, and depending on whether the value of the node is bounded or
+    # not, the min_gain_to_split constraint is or isn't satisfied.
+    l2_regularization = 0
+    min_hessian_to_split = 0
+    min_samples_leaf = 1
+    n_bins = n_samples = 5
+    X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE)
+    sample_indices = np.arange(n_samples, dtype=np.uint32)
+    all_hessians = np.ones(n_samples, dtype=G_H_DTYPE)
+    all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE)
+    sum_gradients = all_gradients.sum()
+    sum_hessians = all_hessians.sum()
+    hessians_are_constant = False
+
+    builder = HistogramBuilder(
+        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
+    )
+    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
+    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
+    monotonic_cst = np.array(
+        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
+    )
+    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
+    missing_values_bin_idx = n_bins - 1
+    children_lower_bound, children_upper_bound = -np.inf, np.inf
+
+    min_gain_to_split = 2000
+    splitter = Splitter(
+        X_binned,
+        n_bins_non_missing,
+        missing_values_bin_idx,
+        has_missing_values,
+        is_categorical,
+        monotonic_cst,
+        l2_regularization,
+        min_hessian_to_split,
+        min_samples_leaf,
+        min_gain_to_split,
+        hessians_are_constant,
+    )
+
+    histograms = builder.compute_histograms_brute(sample_indices)
+
+    # Since the gradient array is [1, 1, 100, 1, 1],
+    # the max possible gain happens on the 3rd bin (or equivalently on the 2nd)
+    # and is equal to about 1307, which is less than min_gain_to_split = 2000,
+    # so the node is considered unsplittable (gain = -1)
+    current_lower_bound, current_upper_bound = -np.inf, np.inf
+    value = compute_node_value(
+        sum_gradients,
+        sum_hessians,
+        current_lower_bound,
+        current_upper_bound,
+        l2_regularization,
+    )
+    # the unbounded value is equal to -sum_gradients / sum_hessians
+    assert value == pytest.approx(-104 / 5)
+    split_info = splitter.find_node_split(
+        n_samples,
+        histograms,
+        sum_gradients,
+        sum_hessians,
+        value,
+        lower_bound=children_lower_bound,
+        upper_bound=children_upper_bound,
+    )
+    assert split_info.gain == -1  # min_gain_to_split not respected
+
+    # here again the max possible gain is on the 3rd bin but we now cap the
+    # value of the node into [-10, inf].
+    # This means the gain is now about 2430, which is more than the
+    # min_gain_to_split constraint.
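# A rough sanity check of where the "about 1307" and "about 2430" figures come
# from, assuming the splitter scores a candidate split as
#   gain = loss(parent) - loss(left) - loss(right)
# with loss(node) = sum_gradients * value and value = clip(-G / (H + l2), bounds).
# This is only a sketch of the bookkeeping, not the exact Cython implementation:
import numpy as np

g = np.array([1.0, 1.0, 100.0, 1.0, 1.0])  # per-sample gradients
h = np.ones_like(g)                        # unit hessians, l2 = 0

def node_loss(G, H, lo=-np.inf, hi=np.inf):
    value = np.clip(-G / H, lo, hi)  # (possibly clipped) Newton step
    return G * value

# best candidate split: bins {0, 1, 2} go left, bins {3, 4} go right
loss_left = node_loss(g[:3].sum(), h[:3].sum())
loss_right = node_loss(g[3:].sum(), h[3:].sum())

gain_unbounded = node_loss(g.sum(), h.sum()) - loss_left - loss_right       # ~1306.8 < 2000
gain_bounded = node_loss(g.sum(), h.sum(), lo=-10) - loss_left - loss_right  # ~2430.0 > 2000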
+ current_lower_bound, current_upper_bound = -10, np.inf + value = compute_node_value( + sum_gradients, + sum_hessians, + current_lower_bound, + current_upper_bound, + l2_regularization, + ) + assert value == -10 + split_info = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + lower_bound=children_lower_bound, + upper_bound=children_upper_bound, + ) + assert split_info.gain > min_gain_to_split diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3c9ae81bac2d498c460bfb5f2173f8c48693d1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -0,0 +1,187 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn.datasets import make_regression +from sklearn.ensemble._hist_gradient_boosting._bitset import ( + set_bitset_memoryview, + set_raw_bitset_from_binned_bitset, +) +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.common import ( + ALMOST_INF, + G_H_DTYPE, + PREDICTOR_RECORD_DTYPE, + X_BINNED_DTYPE, + X_BITSET_INNER_DTYPE, + X_DTYPE, +) +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.predictor import TreePredictor +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + +n_threads = _openmp_effective_n_threads() + + +@pytest.mark.parametrize("n_bins", [200, 256]) +def test_regression_dataset(n_bins): + X, y = make_regression( + n_samples=500, n_features=10, n_informative=5, random_state=42 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + + mapper = _BinMapper(n_bins=n_bins, random_state=42) + X_train_binned = mapper.fit_transform(X_train) + + # Init gradients and hessians to that of least squares loss + gradients = -y_train.astype(G_H_DTYPE) + hessians = np.ones(1, dtype=G_H_DTYPE) + + min_samples_leaf = 10 + max_leaf_nodes = 30 + grower = TreeGrower( + X_train_binned, + gradients, + hessians, + min_samples_leaf=min_samples_leaf, + max_leaf_nodes=max_leaf_nodes, + n_bins=n_bins, + n_bins_non_missing=mapper.n_bins_non_missing_, + ) + grower.grow() + + predictor = grower.make_predictor(binning_thresholds=mapper.bin_thresholds_) + + known_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + f_idx_map = np.zeros(0, dtype=np.uint32) + + y_pred_train = predictor.predict(X_train, known_cat_bitsets, f_idx_map, n_threads) + assert r2_score(y_train, y_pred_train) > 0.82 + + y_pred_test = predictor.predict(X_test, known_cat_bitsets, f_idx_map, n_threads) + assert r2_score(y_test, y_pred_test) > 0.67 + + +@pytest.mark.parametrize( + "num_threshold, expected_predictions", + [ + (-np.inf, [0, 1, 1, 1]), + (10, [0, 0, 1, 1]), + (20, [0, 0, 0, 1]), + (ALMOST_INF, [0, 0, 0, 1]), + (np.inf, [0, 0, 0, 0]), + ], +) +def test_infinite_values_and_thresholds(num_threshold, expected_predictions): + # Make sure infinite values and infinite thresholds are handled properly. + # In particular, if a value is +inf and the threshold is ALMOST_INF the + # sample should go to the right child. 
If the threshold is inf (split on + # nan), the +inf sample will go to the left child. + + X = np.array([-np.inf, 10, 20, np.inf]).reshape(-1, 1) + nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) + + # We just construct a simple tree with 1 root and 2 children + # parent node + nodes[0]["left"] = 1 + nodes[0]["right"] = 2 + nodes[0]["feature_idx"] = 0 + nodes[0]["num_threshold"] = num_threshold + + # left child + nodes[1]["is_leaf"] = True + nodes[1]["value"] = 0 + + # right child + nodes[2]["is_leaf"] = True + nodes[2]["value"] = 1 + + binned_cat_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + raw_categorical_bitsets = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + known_cat_bitset = np.zeros((0, 8), dtype=X_BITSET_INNER_DTYPE) + f_idx_map = np.zeros(0, dtype=np.uint32) + + predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets) + predictions = predictor.predict(X, known_cat_bitset, f_idx_map, n_threads) + + assert np.all(predictions == expected_predictions) + + +@pytest.mark.parametrize( + "bins_go_left, expected_predictions", + [ + ([0, 3, 4, 6], [1, 0, 0, 1, 1, 0]), + ([0, 1, 2, 6], [1, 1, 1, 0, 0, 0]), + ([3, 5, 6], [0, 0, 0, 1, 0, 1]), + ], +) +def test_categorical_predictor(bins_go_left, expected_predictions): + # Test predictor outputs are correct with categorical features + + X_binned = np.array([[0, 1, 2, 3, 4, 5]], dtype=X_BINNED_DTYPE).T + categories = np.array([2, 5, 6, 8, 10, 15], dtype=X_DTYPE) + + bins_go_left = np.array(bins_go_left, dtype=X_BINNED_DTYPE) + + # We just construct a simple tree with 1 root and 2 children + # parent node + nodes = np.zeros(3, dtype=PREDICTOR_RECORD_DTYPE) + nodes[0]["left"] = 1 + nodes[0]["right"] = 2 + nodes[0]["feature_idx"] = 0 + nodes[0]["is_categorical"] = True + nodes[0]["missing_go_to_left"] = True + + # left child + nodes[1]["is_leaf"] = True + nodes[1]["value"] = 1 + + # right child + nodes[2]["is_leaf"] = True + nodes[2]["value"] = 0 + + binned_cat_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE) + raw_categorical_bitsets = np.zeros((1, 8), dtype=X_BITSET_INNER_DTYPE) + for go_left in bins_go_left: + set_bitset_memoryview(binned_cat_bitsets[0], go_left) + + set_raw_bitset_from_binned_bitset( + raw_categorical_bitsets[0], binned_cat_bitsets[0], categories + ) + + predictor = TreePredictor(nodes, binned_cat_bitsets, raw_categorical_bitsets) + + # Check binned data gives correct predictions + prediction_binned = predictor.predict_binned( + X_binned, missing_values_bin_idx=6, n_threads=n_threads + ) + assert_allclose(prediction_binned, expected_predictions) + + # manually construct bitset + known_cat_bitsets = np.zeros((1, 8), dtype=np.uint32) + known_cat_bitsets[0, 0] = np.sum(2**categories, dtype=np.uint32) + f_idx_map = np.array([0], dtype=np.uint32) + + # Check with un-binned data + predictions = predictor.predict( + categories.reshape(-1, 1), known_cat_bitsets, f_idx_map, n_threads + ) + assert_allclose(predictions, expected_predictions) + + # Check missing goes left because missing_values_bin_idx=6 + X_binned_missing = np.array([[6]], dtype=X_BINNED_DTYPE).T + predictions = predictor.predict_binned( + X_binned_missing, missing_values_bin_idx=6, n_threads=n_threads + ) + assert_allclose(predictions, [1]) + + # missing and unknown go left + predictions = predictor.predict( + np.array([[np.nan, 17]], dtype=X_DTYPE).T, + known_cat_bitsets, + f_idx_map, + n_threads, + ) + assert_allclose(predictions, [1, 1]) diff --git 
a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py new file mode 100644 index 0000000000000000000000000000000000000000..388697340e08b545be766c6d46cf7362371bc258 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -0,0 +1,1070 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.ensemble._hist_gradient_boosting.common import ( + G_H_DTYPE, + HISTOGRAM_DTYPE, + X_BINNED_DTYPE, + MonotonicConstraint, +) +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value, +) +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._testing import skip_if_32bit + +n_threads = _openmp_effective_n_threads() + + +@pytest.mark.parametrize("n_bins", [3, 32, 256]) +def test_histogram_split(n_bins): + rng = np.random.RandomState(42) + feature_idx = 0 + l2_regularization = 0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + X_binned = np.asfortranarray( + rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE + ) + binned_feature = X_binned.T[feature_idx] + sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32) + ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) + all_hessians = ordered_hessians + sum_hessians = all_hessians.sum() + hessians_are_constant = False + + for true_bin in range(1, n_bins - 2): + for sign in [-1, 1]: + ordered_gradients = np.full_like(binned_feature, sign, dtype=G_H_DTYPE) + ordered_gradients[binned_feature <= true_bin] *= -1 + all_gradients = ordered_gradients + sum_gradients = all_gradients.sum() + + builder = HistogramBuilder( + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, + ) + n_bins_non_missing = np.array( + [n_bins - 1] * X_binned.shape[1], dtype=np.uint32 + ) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + sample_indices.shape[0], histograms, sum_gradients, sum_hessians, value + ) + + assert split_info.bin_idx == true_bin + assert split_info.gain >= 0 + assert split_info.feature_idx == feature_idx + assert ( + split_info.n_samples_left + split_info.n_samples_right + == sample_indices.shape[0] + ) + # Constant hessian: 1. per sample. 
+ assert split_info.n_samples_left == split_info.sum_hessian_left + + +@skip_if_32bit +@pytest.mark.parametrize("constant_hessian", [True, False]) +def test_gradient_and_hessian_sanity(constant_hessian): + # This test checks that the values of gradients and hessians are + # consistent in different places: + # - in split_info: si.sum_gradient_left + si.sum_gradient_right must be + # equal to the gradient at the node. Same for hessians. + # - in the histograms: summing 'sum_gradients' over the bins must be + # constant across all features, and those sums must be equal to the + # node's gradient. Same for hessians. + + rng = np.random.RandomState(42) + + n_bins = 10 + n_features = 20 + n_samples = 500 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + X_binned = rng.randint( + 0, n_bins, size=(n_samples, n_features), dtype=X_BINNED_DTYPE + ) + X_binned = np.asfortranarray(X_binned) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() + if constant_hessian: + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_hessians = 1 * n_samples + else: + all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE) + sum_hessians = all_hessians.sum() + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, constant_hessian, n_threads + ) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + constant_hessian, + ) + + hists_parent = builder.compute_histograms_brute(sample_indices) + value_parent = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + si_parent = splitter.find_node_split( + n_samples, hists_parent, sum_gradients, sum_hessians, value_parent + ) + sample_indices_left, sample_indices_right, _ = splitter.split_indices( + si_parent, sample_indices + ) + + hists_left = builder.compute_histograms_brute(sample_indices_left) + value_left = compute_node_value( + si_parent.sum_gradient_left, + si_parent.sum_hessian_left, + -np.inf, + np.inf, + l2_regularization, + ) + hists_right = builder.compute_histograms_brute(sample_indices_right) + value_right = compute_node_value( + si_parent.sum_gradient_right, + si_parent.sum_hessian_right, + -np.inf, + np.inf, + l2_regularization, + ) + si_left = splitter.find_node_split( + n_samples, + hists_left, + si_parent.sum_gradient_left, + si_parent.sum_hessian_left, + value_left, + ) + si_right = splitter.find_node_split( + n_samples, + hists_right, + si_parent.sum_gradient_right, + si_parent.sum_hessian_right, + value_right, + ) + + # make sure that si.sum_gradient_left + si.sum_gradient_right have their + # expected value, same for hessians + for si, indices in ( + (si_parent, sample_indices), + (si_left, sample_indices_left), + (si_right, sample_indices_right), + ): + gradient = si.sum_gradient_right + si.sum_gradient_left + expected_gradient = all_gradients[indices].sum() + hessian = si.sum_hessian_right + 
si.sum_hessian_left + if constant_hessian: + expected_hessian = indices.shape[0] * all_hessians[0] + else: + expected_hessian = all_hessians[indices].sum() + + assert np.isclose(gradient, expected_gradient) + assert np.isclose(hessian, expected_hessian) + + # make sure sum of gradients in histograms are the same for all features, + # and make sure they're equal to their expected value + hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE) + hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE) + hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE) + for hists, indices in ( + (hists_parent, sample_indices), + (hists_left, sample_indices_left), + (hists_right, sample_indices_right), + ): + # note: gradients and hessians have shape (n_features,), + # we're comparing them to *scalars*. This has the benefit of also + # making sure that all the entries are equal across features. + gradients = hists["sum_gradients"].sum(axis=1) # shape = (n_features,) + expected_gradient = all_gradients[indices].sum() # scalar + hessians = hists["sum_hessians"].sum(axis=1) + if constant_hessian: + # 0 is not the actual hessian, but it's not computed in this case + expected_hessian = 0.0 + else: + expected_hessian = all_hessians[indices].sum() + + assert np.allclose(gradients, expected_gradient) + assert np.allclose(hessians, expected_hessian) + + +def test_split_indices(): + # Check that split_indices returns the correct splits and that + # splitter.partition is consistent with what is returned. + rng = np.random.RandomState(421) + + n_bins = 5 + n_samples = 10 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + # split will happen on feature 1 and on bin 3 + X_binned = [ + [0, 0], + [0, 3], + [0, 4], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 4], + [0, 0], + [0, 4], + ] + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.randn(n_samples).astype(G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + assert np.all(sample_indices == splitter.partition) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + si_root = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + # sanity checks for best split + assert si_root.feature_idx == 1 + assert si_root.bin_idx == 3 + + samples_left, samples_right, position_right = splitter.split_indices( + si_root, splitter.partition + ) + assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8]) + assert set(samples_right) == set([2, 7, 9]) + + 
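# For reference, a plain-numpy picture of what split_indices is expected to do
# here, assuming it simply reorders splitter.partition so that the samples
# going to the left child (feature 1 binned value <= 3) occupy the first
# positions; samples_left and samples_right are then just the two halves of
# that partition, which the next assertions check:
import numpy as np

feature_1 = np.array([0, 3, 4, 0, 0, 0, 0, 4, 0, 4])
goes_left = feature_1 <= 3
partition_after = np.concatenate(
    [np.flatnonzero(goes_left), np.flatnonzero(~goes_left)]
)
# partition_after[:7] -> {0, 1, 3, 4, 5, 6, 8}, partition_after[7:] -> {2, 7, 9}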
assert list(samples_left) == list(splitter.partition[:position_right]) + assert list(samples_right) == list(splitter.partition[position_right:]) + + # Check that the resulting split indices sizes are consistent with the + # count statistics anticipated when looking for the best split. + assert samples_left.shape[0] == si_root.n_samples_left + assert samples_right.shape[0] == si_root.n_samples_right + + +def test_min_gain_to_split(): + # Try to split a pure node (all gradients are equal, same for hessians) + # with min_gain_to_split = 0 and make sure that the node is not split (best + # possible gain = -1). Note: before the strict inequality comparison, this + # test would fail because the node would be split with a gain of 0. + rng = np.random.RandomState(42) + l2_regularization = 0 + min_hessian_to_split = 0 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + n_bins = 255 + n_samples = 100 + X_binned = np.asfortranarray( + rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE + ) + binned_feature = X_binned[:, 0] + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE) + all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = all_hessians.sum() + hessians_are_constant = False + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + assert split_info.gain == -1 + + +@pytest.mark.parametrize( + ( + "X_binned, all_gradients, has_missing_values, n_bins_non_missing, " + " expected_split_on_nan, expected_bin_idx, expected_go_to_left" + ), + [ + # basic sanity check with no missing values: given the gradient + # values, the split must occur on bin_idx=3 + ( + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], # X_binned + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], # gradients + False, # no missing values + 10, # n_bins_non_missing + False, # don't split on nans + 3, # expected_bin_idx + "not_applicable", + ), + # We replace 2 samples by NaNs (bin_idx=8) + # These 2 samples were mapped to the left node before, so they should + # be mapped to left node again + # Notice how the bin_idx threshold changes from 3 to 1. 
+ ( + [8, 0, 1, 8, 2, 3, 4, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True, + ), # missing values go to left + # same as above, but with non-consecutive missing_values_bin + ( + [9, 0, 1, 9, 2, 3, 4, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 1, # cut on bin_idx=1 + True, + ), # missing values go to left + # this time replacing 2 samples that were on the right. + ( + [0, 1, 2, 3, 8, 4, 8, 5, 6, 7], # 8 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False, + ), # missing values go to right + # same as above, but with non-consecutive missing_values_bin + ( + [0, 1, 2, 3, 9, 4, 9, 5, 6, 7], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 8, # n_bins_non_missing + False, # don't split on nans + 3, # cut on bin_idx=3 (like in first case) + False, + ), # missing values go to right + # For the following cases, split_on_nans is True (we replace all of + # the samples with nans, instead of just 2). + ( + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4], # 4 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 4, # n_bins_non_missing + True, # split on nans + 3, # cut on bin_idx=3 + False, + ), # missing values go to right + # same as above, but with non-consecutive missing_values_bin + ( + [0, 1, 2, 3, 9, 9, 9, 9, 9, 9], # 9 <=> missing + [1, 1, 1, 1, 1, 1, 5, 5, 5, 5], + True, # missing values + 4, # n_bins_non_missing + True, # split on nans + 3, # cut on bin_idx=3 + False, + ), # missing values go to right + ( + [6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 6, # n_bins_non_missing + True, # split on nans + 5, # cut on bin_idx=5 + False, + ), # missing values go to right + # same as above, but with non-consecutive missing_values_bin + ( + [9, 9, 9, 9, 0, 1, 2, 3, 4, 5], # 9 <=> missing + [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], + True, # missing values + 6, # n_bins_non_missing + True, # split on nans + 5, # cut on bin_idx=5 + False, + ), # missing values go to right + ], +) +def test_splitting_missing_values( + X_binned, + all_gradients, + has_missing_values, + n_bins_non_missing, + expected_split_on_nan, + expected_bin_idx, + expected_go_to_left, +): + # Make sure missing values are properly supported. + # we build an artificial example with gradients such that the best split + # is on bin_idx=3, when there are no missing values. 
+ # Then we introduce missing values and: + # - make sure the chosen bin is correct (find_best_bin()): it's + # still the same split, even though the index of the bin may change + # - make sure the missing values are mapped to the correct child + # (split_indices()) + + n_bins = max(X_binned) + 1 + n_samples = len(X_binned) + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + X_binned = np.asfortranarray(X_binned) + all_gradients = np.array(all_gradients, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = 1 * n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + assert split_info.bin_idx == expected_bin_idx + if has_missing_values: + assert split_info.missing_go_to_left == expected_go_to_left + + split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1 + assert split_on_nan == expected_split_on_nan + + # Make sure the split is properly computed. + # This also make sure missing values are properly assigned to the correct + # child in split_indices() + samples_left, samples_right, _ = splitter.split_indices( + split_info, splitter.partition + ) + + if not expected_split_on_nan: + # When we don't split on nans, the split should always be the same. + assert set(samples_left) == set([0, 1, 2, 3]) + assert set(samples_right) == set([4, 5, 6, 7, 8, 9]) + else: + # When we split on nans, samples with missing values are always mapped + # to the right child. 
+ missing_samples_indices = np.flatnonzero( + np.array(X_binned) == missing_values_bin_idx + ) + non_missing_samples_indices = np.flatnonzero( + np.array(X_binned) != missing_values_bin_idx + ) + + assert set(samples_right) == set(missing_samples_indices) + assert set(samples_left) == set(non_missing_samples_indices) + + +@pytest.mark.parametrize( + "X_binned, has_missing_values, n_bins_non_missing, ", + [ + # one category + ([0] * 20, False, 1), + # all categories appear less than MIN_CAT_SUPPORT (hardcoded to 10) + ([0] * 9 + [1] * 8, False, 2), + # only one category appears more than MIN_CAT_SUPPORT + ([0] * 12 + [1] * 8, False, 2), + # missing values + category appear less than MIN_CAT_SUPPORT + # 9 is missing + ([0] * 9 + [1] * 8 + [9] * 4, True, 2), + # no non-missing category + ([9] * 11, True, 0), + ], +) +def test_splitting_categorical_cat_smooth( + X_binned, has_missing_values, n_bins_non_missing +): + # Checks categorical splits are correct when the MIN_CAT_SUPPORT constraint + # isn't respected: there are no splits + + n_bins = max(X_binned) + 1 + n_samples = len(X_binned) + X_binned = np.array([X_binned], dtype=X_BINNED_DTYPE).T + X_binned = np.asfortranarray(X_binned) + + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = np.ones(n_samples, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + # no split found + assert split_info.gain == -1 + + +def _assert_categories_equals_bitset(categories, bitset): + # assert that the bitset exactly corresponds to the categories + # bitset is assumed to be an array of 8 uint32 elements + + # form bitset from threshold + expected_bitset = np.zeros(8, dtype=np.uint32) + for cat in categories: + idx = cat // 32 + shift = cat % 32 + expected_bitset[idx] |= 1 << shift + + # check for equality + assert_array_equal(expected_bitset, bitset) + + +@pytest.mark.parametrize( + ( + "X_binned, all_gradients, expected_categories_left, n_bins_non_missing," + "missing_values_bin_idx, has_missing_values, expected_missing_go_to_left" + ), + [ + # 4 categories + ( + [0, 1, 2, 3] * 11, # X_binned + [10, 1, 10, 10] * 11, # all_gradients + [1], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # Make sure that the categories that are on the 
right (second half) of + # the sorted categories array can still go in the left child. In this + # case, the best split was found when scanning from right to left. + ( + [0, 1, 2, 3] * 11, # X_binned + [10, 10, 10, 1] * 11, # all_gradients + [3], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # categories that don't respect MIN_CAT_SUPPORT (cat 4) are always + # mapped to the right child + ( + [0, 1, 2, 3] * 11 + [4] * 5, # X_binned + [10, 10, 10, 1] * 11 + [10] * 5, # all_gradients + [3], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # categories that don't respect MIN_CAT_SUPPORT are always mapped to + # the right child: in this case a more sensible split could have been + # 3, 4 - 0, 1, 2 + # But the split is still 3 - 0, 1, 2, 4. this is because we only scan + # up to the middle of the sorted category array (0, 1, 2, 3), and + # because we exclude cat 4 in this array. + ( + [0, 1, 2, 3] * 11 + [4] * 5, # X_binned + [10, 10, 10, 1] * 11 + [1] * 5, # all_gradients + [3], # expected_categories_left + 4, # n_bins_non_missing + 4, # missing_values_bin_idx + False, # has_missing_values + None, + ), # expected_missing_go_to_left, unchecked + # 4 categories with missing values that go to the right + ( + [0, 1, 2] * 11 + [9] * 11, # X_binned + [10, 1, 10] * 11 + [10] * 11, # all_gradients + [1], # expected_categories_left + 3, # n_bins_non_missing + 9, # missing_values_bin_idx + True, # has_missing_values + False, + ), # expected_missing_go_to_left + # 4 categories with missing values that go to the left + ( + [0, 1, 2] * 11 + [9] * 11, # X_binned + [10, 1, 10] * 11 + [1] * 11, # all_gradients + [1, 9], # expected_categories_left + 3, # n_bins_non_missing + 9, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + # split is on the missing value + ( + [0, 1, 2, 3, 4] * 11 + [255] * 12, # X_binned + [10, 10, 10, 10, 10] * 11 + [1] * 12, # all_gradients + [255], # expected_categories_left + 5, # n_bins_non_missing + 255, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + # split on even categories + ( + list(range(60)) * 12, # X_binned + [10, 1] * 360, # all_gradients + list(range(1, 60, 2)), # expected_categories_left + 59, # n_bins_non_missing + 59, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + # split on every 8 categories + ( + list(range(256)) * 12, # X_binned + [10, 10, 10, 10, 10, 10, 10, 1] * 384, # all_gradients + list(range(7, 256, 8)), # expected_categories_left + 255, # n_bins_non_missing + 255, # missing_values_bin_idx + True, # has_missing_values + True, + ), # expected_missing_go_to_left + ], +) +def test_splitting_categorical_sanity( + X_binned, + all_gradients, + expected_categories_left, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + expected_missing_go_to_left, +): + # Tests various combinations of categorical splits + + n_samples = len(X_binned) + n_bins = max(X_binned) + 1 + + X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1) + X_binned = np.asfortranarray(X_binned) + + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = 
np.array(all_gradients, dtype=G_H_DTYPE) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + has_missing_values = np.array([has_missing_values], dtype=np.uint8) + sum_gradients = all_gradients.sum() + sum_hessians = n_samples + hessians_are_constant = True + + builder = HistogramBuilder( + X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads + ) + + n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8) + + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + histograms = builder.compute_histograms_brute(sample_indices) + + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + split_info = splitter.find_node_split( + n_samples, histograms, sum_gradients, sum_hessians, value + ) + + assert split_info.is_categorical + assert split_info.gain > 0 + _assert_categories_equals_bitset( + expected_categories_left, split_info.left_cat_bitset + ) + if has_missing_values: + assert split_info.missing_go_to_left == expected_missing_go_to_left + # If there is no missing value during training, the flag missing_go_to_left + # is set later in the grower. + + # make sure samples are split correctly + samples_left, samples_right, _ = splitter.split_indices( + split_info, splitter.partition + ) + + left_mask = np.isin(X_binned.ravel(), expected_categories_left) + assert_array_equal(sample_indices[left_mask], samples_left) + assert_array_equal(sample_indices[~left_mask], samples_right) + + +def test_split_interaction_constraints(): + """Check that allowed_features are respected.""" + n_features = 4 + # features 1 and 2 are not allowed to be split on + allowed_features = np.array([0, 3], dtype=np.uint32) + n_bins = 5 + n_samples = 10 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_hessians = n_samples + hessians_are_constant = True + + split_features = [] + + # The loop is to ensure that we split at least once on each allowed feature (0, 3). + # This is tracked by split_features and checked at the end. 
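# The allowed_features plumbing exercised in this test is what backs the
# estimator-level interaction-constraint option; a minimal end-to-end sketch,
# assuming the public parameter is `interaction_cst` and takes groups of
# feature indices that are allowed to interact in child-node splits:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

demo_rng = np.random.RandomState(0)
X_demo = demo_rng.randn(200, 4)
y_demo = 10 * X_demo[:, 1] + demo_rng.randn(200)

est = HistGradientBoostingRegressor(interaction_cst=[{0, 3}, {1, 2}], random_state=0)
est.fit(X_demo, y_demo)  # within any branch, only features from one group interact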
+ for i in range(10): + rng = np.random.RandomState(919 + i) + X_binned = np.asfortranarray( + rng.randint(0, n_bins - 1, size=(n_samples, n_features)), + dtype=X_BINNED_DTYPE, + ) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + + # Make feature 1 very important + all_gradients = (10 * X_binned[:, 1] + rng.randn(n_samples)).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() + + builder = HistogramBuilder( + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + splitter = Splitter( + X_binned, + n_bins_non_missing, + missing_values_bin_idx, + has_missing_values, + is_categorical, + monotonic_cst, + l2_regularization, + min_hessian_to_split, + min_samples_leaf, + min_gain_to_split, + hessians_are_constant, + ) + + assert np.all(sample_indices == splitter.partition) + + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + + # with all features allowed, feature 1 should be split on as it is the most + # important one by construction of the gradients + si_root = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=None, + ) + assert si_root.feature_idx == 1 + + # only features 0 and 3 are allowed to be split on + si_root = splitter.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + split_features.append(si_root.feature_idx) + assert si_root.feature_idx in allowed_features + + # make sure feature 0 and feature 3 are split on in the constraint setting + assert set(allowed_features) == set(split_features) + + +@pytest.mark.parametrize("forbidden_features", [set(), {1, 3}]) +def test_split_feature_fraction_per_split(forbidden_features): + """Check that feature_fraction_per_split is respected. + + Because we set `n_features = 4` and `feature_fraction_per_split = 0.25`, it means + that calling `splitter.find_node_split` will be allowed to select a split for a + single completely random feature at each call. So if we iterate enough, we should + cover all the allowed features, irrespective of the values of the gradients and + Hessians of the objective. 
+ """ + n_features = 4 + allowed_features = np.array( + list(set(range(n_features)) - forbidden_features), dtype=np.uint32 + ) + n_bins = 5 + n_samples = 40 + l2_regularization = 0.0 + min_hessian_to_split = 1e-3 + min_samples_leaf = 1 + min_gain_to_split = 0.0 + rng = np.random.default_rng(42) + + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_gradients = rng.uniform(low=0.5, high=1, size=n_samples).astype(G_H_DTYPE) + sum_gradients = all_gradients.sum() + all_hessians = np.ones(1, dtype=G_H_DTYPE) + sum_hessians = n_samples + hessians_are_constant = True + + X_binned = np.asfortranarray( + rng.integers(low=0, high=n_bins - 1, size=(n_samples, n_features)), + dtype=X_BINNED_DTYPE, + ) + X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE) + builder = HistogramBuilder( + X_binned, + n_bins, + all_gradients, + all_hessians, + hessians_are_constant, + n_threads, + ) + histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value( + sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization + ) + n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8 + ) + is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8) + missing_values_bin_idx = n_bins - 1 + + params = dict( + X_binned=X_binned, + n_bins_non_missing=n_bins_non_missing, + missing_values_bin_idx=missing_values_bin_idx, + has_missing_values=has_missing_values, + is_categorical=is_categorical, + monotonic_cst=monotonic_cst, + l2_regularization=l2_regularization, + min_hessian_to_split=min_hessian_to_split, + min_samples_leaf=min_samples_leaf, + min_gain_to_split=min_gain_to_split, + hessians_are_constant=hessians_are_constant, + rng=rng, + ) + splitter_subsample = Splitter( + feature_fraction_per_split=0.25, # THIS is the important setting here. + **params, + ) + splitter_all_features = Splitter(feature_fraction_per_split=1.0, **params) + + assert np.all(sample_indices == splitter_subsample.partition) + + split_features_subsample = [] + split_features_all = [] + # The loop is to ensure that we split at least once on each feature. + # This is tracked by split_features and checked at the end. + for i in range(20): + si_root = splitter_subsample.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + split_features_subsample.append(si_root.feature_idx) + + # This second splitter is our "counterfactual". + si_root = splitter_all_features.find_node_split( + n_samples, + histograms, + sum_gradients, + sum_hessians, + value, + allowed_features=allowed_features, + ) + split_features_all.append(si_root.feature_idx) + + # Make sure all features are split on. + assert set(split_features_subsample) == set(allowed_features) + + # Make sure, our counterfactual always splits on same feature. 
+ assert len(set(split_features_all)) == 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py new file mode 100644 index 0000000000000000000000000000000000000000..03a2720b36127108e06537a3f4a85c5b9d4e7701 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_warm_start.py @@ -0,0 +1,231 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.base import clone +from sklearn.datasets import make_classification, make_regression +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, +) +from sklearn.metrics import check_scoring + +X_classification, y_classification = make_classification(random_state=0) +X_regression, y_regression = make_regression(random_state=0) + + +def _assert_predictor_equal(gb_1, gb_2, X): + """Assert that two HistGBM instances are identical.""" + # Check identical nodes for each tree + for pred_ith_1, pred_ith_2 in zip(gb_1._predictors, gb_2._predictors): + for predictor_1, predictor_2 in zip(pred_ith_1, pred_ith_2): + assert_array_equal(predictor_1.nodes, predictor_2.nodes) + + # Check identical predictions + assert_allclose(gb_1.predict(X), gb_2.predict(X)) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_max_iter_with_warm_start_validation(GradientBoosting, X, y): + # Check that a ValueError is raised when the maximum number of iterations + # is smaller than the number of iterations from the previous fit when warm + # start is True. + + estimator = GradientBoosting(max_iter=10, early_stopping=False, warm_start=True) + estimator.fit(X, y) + estimator.set_params(max_iter=5) + err_msg = ( + "max_iter=5 must be larger than or equal to n_iter_=10 when warm_start==True" + ) + with pytest.raises(ValueError, match=err_msg): + estimator.fit(X, y) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_yields_identical_results(GradientBoosting, X, y): + # Make sure that fitting 50 iterations and then 25 with warm start is + # equivalent to fitting 75 iterations. + + rng = 42 + gb_warm_start = GradientBoosting( + n_iter_no_change=100, max_iter=50, random_state=rng, warm_start=True + ) + gb_warm_start.fit(X, y).set_params(max_iter=75).fit(X, y) + + gb_no_warm_start = GradientBoosting( + n_iter_no_change=100, max_iter=75, random_state=rng, warm_start=False + ) + gb_no_warm_start.fit(X, y) + + # Check that both predictors are equal + _assert_predictor_equal(gb_warm_start, gb_no_warm_start, X) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_max_depth(GradientBoosting, X, y): + # Test if possible to fit trees of different depth in ensemble. 
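# Context for the checks below: with warm_start=True, a second call to fit()
# keeps the previously grown predictors and only adds the missing
# (max_iter - n_iter_) iterations, so parameters changed between the two fits
# (here max_depth) are only expected to affect the newly grown trees.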
+ gb = GradientBoosting( + max_iter=20, + min_samples_leaf=1, + warm_start=True, + max_depth=2, + early_stopping=False, + ) + gb.fit(X, y) + gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110) + gb.fit(X, y) + + # First 20 trees have max_depth == 2 + for i in range(20): + assert gb._predictors[i][0].get_max_depth() == 2 + # Last 10 trees have max_depth == 3 + for i in range(1, 11): + assert gb._predictors[-i][0].get_max_depth() == 3 + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +@pytest.mark.parametrize("scoring", (None, "loss")) +def test_warm_start_early_stopping(GradientBoosting, X, y, scoring): + # Make sure that early stopping occurs after a small number of iterations + # when fitting a second time with warm starting. + + n_iter_no_change = 5 + gb = GradientBoosting( + n_iter_no_change=n_iter_no_change, + max_iter=10000, + early_stopping=True, + random_state=42, + warm_start=True, + tol=1e-3, + scoring=scoring, + ) + gb.fit(X, y) + n_iter_first_fit = gb.n_iter_ + gb.fit(X, y) + n_iter_second_fit = gb.n_iter_ + assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_equal_n_estimators(GradientBoosting, X, y): + # Test if warm start with equal n_estimators does nothing + gb_1 = GradientBoosting(max_depth=2, early_stopping=False) + gb_1.fit(X, y) + + gb_2 = clone(gb_1) + gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True, n_iter_no_change=5) + gb_2.fit(X, y) + + # Check that both predictors are equal + _assert_predictor_equal(gb_1, gb_2, X) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +def test_warm_start_clear(GradientBoosting, X, y): + # Test if fit clears state. + gb_1 = GradientBoosting(n_iter_no_change=5, random_state=42) + gb_1.fit(X, y) + + gb_2 = GradientBoosting(n_iter_no_change=5, random_state=42, warm_start=True) + gb_2.fit(X, y) # inits state + gb_2.set_params(warm_start=False) + gb_2.fit(X, y) # clears old state and equals est + + # Check that both predictors have the same train_score_ and + # validation_score_ attributes + assert_allclose(gb_1.train_score_, gb_2.train_score_) + assert_allclose(gb_1.validation_score_, gb_2.validation_score_) + + # Check that both predictors are equal + _assert_predictor_equal(gb_1, gb_2, X) + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (HistGradientBoostingClassifier, X_classification, y_classification), + (HistGradientBoostingRegressor, X_regression, y_regression), + ], +) +@pytest.mark.parametrize("rng_type", ("none", "int", "instance")) +def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type): + # Make sure the seeds for train/val split and small trainset subsampling + # are correctly set in a warm start context. 
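# The assertions below rely on how the estimator derives its internal seed.
# Roughly, and only as a sketch (assuming a single integer seed is drawn from
# random_state at the start of every non-warm-started fit and reused on
# warm-started refits), a hypothetical helper capturing that policy:
import numpy as np
from sklearn.utils import check_random_state

def fit_seed(random_state, warm_start, previous_seed=None):
    if warm_start and previous_seed is not None:
        return previous_seed  # warm start: reuse the seed drawn by the first fit
    rng = check_random_state(random_state)
    # None -> global RNG (new draw each fit); int -> same draw each cold fit;
    # RandomState instance -> its state advances, so repeated cold fits differ
    return rng.randint(np.iinfo(np.uint32).max)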
+ def _get_rng(rng_type): + # Helper to avoid consuming rngs + if rng_type == "none": + return None + elif rng_type == "int": + return 42 + else: + return np.random.RandomState(0) + + random_state = _get_rng(rng_type) + gb_1 = GradientBoosting(early_stopping=True, max_iter=2, random_state=random_state) + gb_1.set_params(scoring=check_scoring(gb_1)) + gb_1.fit(X, y) + random_seed_1_1 = gb_1._random_seed + + gb_1.fit(X, y) + random_seed_1_2 = gb_1._random_seed # clear the old state, different seed + + random_state = _get_rng(rng_type) + gb_2 = GradientBoosting( + early_stopping=True, max_iter=2, random_state=random_state, warm_start=True + ) + gb_2.set_params(scoring=check_scoring(gb_2)) + gb_2.fit(X, y) # inits state + random_seed_2_1 = gb_2._random_seed + gb_2.fit(X, y) # clears old state and equals est + random_seed_2_2 = gb_2._random_seed + + # Without warm starting, the seeds should be + # * all different if random state is None + # * all equal if random state is an integer + # * different when refitting and equal with a new estimator (because + # the random state is mutated) + if rng_type == "none": + assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1 + elif rng_type == "int": + assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1 + else: + assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2 + + # With warm starting, the seeds must be equal + assert random_seed_2_1 == random_seed_2_2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/utils.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..429fbed611c22952154d1083152a3af69ba1ca07 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_hist_gradient_boosting/utils.py @@ -0,0 +1,149 @@ +"""This module contains utility routines.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ...base import is_classifier +from .binning import _BinMapper + + +def get_equivalent_estimator(estimator, lib="lightgbm", n_classes=None): + """Return an unfitted estimator from another lib with matching hyperparams. + + This utility function takes care of renaming the sklearn parameters into + their LightGBM, XGBoost or CatBoost equivalent parameters. + + # unmapped XGB parameters: + # - min_samples_leaf + # - min_data_in_bin + # - min_split_gain (there is min_split_loss though?) + + # unmapped Catboost parameters: + # max_leaves + # min_* + """ + + if lib not in ("lightgbm", "xgboost", "catboost"): + raise ValueError( + "accepted libs are lightgbm, xgboost, and catboost. got {}".format(lib) + ) + + sklearn_params = estimator.get_params() + + if sklearn_params["loss"] == "auto": + raise ValueError( + "auto loss is not accepted. We need to know if " + "the problem is binary or multiclass classification." 
+ ) + if sklearn_params["early_stopping"]: + raise NotImplementedError("Early stopping should be deactivated.") + + lightgbm_loss_mapping = { + "squared_error": "regression_l2", + "absolute_error": "regression_l1", + "log_loss": "binary" if n_classes == 2 else "multiclass", + "gamma": "gamma", + "poisson": "poisson", + } + + lightgbm_params = { + "objective": lightgbm_loss_mapping[sklearn_params["loss"]], + "learning_rate": sklearn_params["learning_rate"], + "n_estimators": sklearn_params["max_iter"], + "num_leaves": sklearn_params["max_leaf_nodes"], + "max_depth": sklearn_params["max_depth"], + "min_data_in_leaf": sklearn_params["min_samples_leaf"], + "reg_lambda": sklearn_params["l2_regularization"], + "max_bin": sklearn_params["max_bins"], + "min_data_in_bin": 1, + "min_sum_hessian_in_leaf": 1e-3, + "min_split_gain": 0, + "verbosity": 10 if sklearn_params["verbose"] else -10, + "boost_from_average": True, + "enable_bundle": False, # also makes feature order consistent + "subsample_for_bin": _BinMapper().subsample, + "poisson_max_delta_step": 1e-12, + "feature_fraction_bynode": sklearn_params["max_features"], + } + + if sklearn_params["loss"] == "log_loss" and n_classes > 2: + # LightGBM multiplies hessians by 2 in multiclass loss. + lightgbm_params["min_sum_hessian_in_leaf"] *= 2 + # LightGBM 3.0 introduced a different scaling of the hessian for the multiclass + # case. + # It is equivalent of scaling the learning rate. + # See https://github.com/microsoft/LightGBM/pull/3256. + if n_classes is not None: + lightgbm_params["learning_rate"] *= n_classes / (n_classes - 1) + + # XGB + xgboost_loss_mapping = { + "squared_error": "reg:linear", + "absolute_error": "LEAST_ABSOLUTE_DEV_NOT_SUPPORTED", + "log_loss": "reg:logistic" if n_classes == 2 else "multi:softmax", + "gamma": "reg:gamma", + "poisson": "count:poisson", + } + + xgboost_params = { + "tree_method": "hist", + "grow_policy": "lossguide", # so that we can set max_leaves + "objective": xgboost_loss_mapping[sklearn_params["loss"]], + "learning_rate": sklearn_params["learning_rate"], + "n_estimators": sklearn_params["max_iter"], + "max_leaves": sklearn_params["max_leaf_nodes"], + "max_depth": sklearn_params["max_depth"] or 0, + "lambda": sklearn_params["l2_regularization"], + "max_bin": sklearn_params["max_bins"], + "min_child_weight": 1e-3, + "verbosity": 2 if sklearn_params["verbose"] else 0, + "silent": sklearn_params["verbose"] == 0, + "n_jobs": -1, + "colsample_bynode": sklearn_params["max_features"], + } + + # Catboost + catboost_loss_mapping = { + "squared_error": "RMSE", + # catboost does not support MAE when leaf_estimation_method is Newton + "absolute_error": "LEAST_ASBOLUTE_DEV_NOT_SUPPORTED", + "log_loss": "Logloss" if n_classes == 2 else "MultiClass", + "gamma": None, + "poisson": "Poisson", + } + + catboost_params = { + "loss_function": catboost_loss_mapping[sklearn_params["loss"]], + "learning_rate": sklearn_params["learning_rate"], + "iterations": sklearn_params["max_iter"], + "depth": sklearn_params["max_depth"], + "reg_lambda": sklearn_params["l2_regularization"], + "max_bin": sklearn_params["max_bins"], + "feature_border_type": "Median", + "leaf_estimation_method": "Newton", + "verbose": bool(sklearn_params["verbose"]), + } + + if lib == "lightgbm": + from lightgbm import LGBMClassifier, LGBMRegressor + + if is_classifier(estimator): + return LGBMClassifier(**lightgbm_params) + else: + return LGBMRegressor(**lightgbm_params) + + elif lib == "xgboost": + from xgboost import XGBClassifier, XGBRegressor + + if 
is_classifier(estimator): + return XGBClassifier(**xgboost_params) + else: + return XGBRegressor(**xgboost_params) + + else: + from catboost import CatBoostClassifier, CatBoostRegressor + + if is_classifier(estimator): + return CatBoostClassifier(**catboost_params) + else: + return CatBoostRegressor(**catboost_params) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_iforest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_iforest.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5287af7f699992d21c1881beb039dfb0f6dcc0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_iforest.py @@ -0,0 +1,673 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import threading +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.sparse import issparse + +from ..base import OutlierMixin, _fit_context +from ..tree import ExtraTreeRegressor +from ..tree._tree import DTYPE as tree_dtype +from ..utils import ( + check_array, + check_random_state, + gen_batches, +) +from ..utils._chunking import get_chunk_n_rows +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _num_samples, check_is_fitted, validate_data +from ._bagging import BaseBagging + +__all__ = ["IsolationForest"] + + +def _parallel_compute_tree_depths( + tree, + X, + features, + tree_decision_path_lengths, + tree_avg_path_lengths, + depths, + lock, +): + """Parallel computation of isolation tree depth.""" + if features is None: + X_subset = X + else: + X_subset = X[:, features] + + leaves_index = tree.apply(X_subset, check_input=False) + + with lock: + depths += ( + tree_decision_path_lengths[leaves_index] + + tree_avg_path_lengths[leaves_index] + - 1.0 + ) + + +class IsolationForest(OutlierMixin, BaseBagging): + """ + Isolation Forest Algorithm. + + Return the anomaly score of each sample using the IsolationForest algorithm + + The IsolationForest 'isolates' observations by randomly selecting a feature + and then randomly selecting a split value between the maximum and minimum + values of the selected feature. + + Since recursive partitioning can be represented by a tree structure, the + number of splittings required to isolate a sample is equivalent to the path + length from the root node to the terminating node. + + This path length, averaged over a forest of such random trees, is a + measure of normality and our decision function. + + Random partitioning produces noticeably shorter paths for anomalies. + Hence, when a forest of random trees collectively produce shorter path + lengths for particular samples, they are highly likely to be anomalies. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + n_estimators : int, default=100 + The number of base estimators in the ensemble. + + max_samples : "auto", int or float, default="auto" + The number of samples to draw from X to train each base estimator. + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If "auto", then `max_samples=min(256, n_samples)`. + + If max_samples is larger than the number of samples provided, + all samples will be used for all trees (no sampling). + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. 
Used when fitting to define the threshold + on the scores of the samples. + + - If 'auto', the threshold is determined as in the + original paper. + - If float, the contamination should be in the range (0, 0.5]. + + .. versionchanged:: 0.22 + The default value of ``contamination`` changed from 0.1 + to ``'auto'``. + + max_features : int or float, default=1.0 + The number of features to draw from X to train each base estimator. + + - If int, then draw `max_features` features. + - If float, then draw `max(1, int(max_features * n_features_in_))` features. + + Note: using a float number less than 1.0 or integer less than number of + features will enable feature subsampling and leads to a longer runtime. + + bootstrap : bool, default=False + If True, individual trees are fit on random subsets of the training + data sampled with replacement. If False, sampling without replacement + is performed. + + n_jobs : int, default=None + The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 + unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using + all processors. See :term:`Glossary ` for more details. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo-randomness of the selection of the feature + and split values for each branching step and each tree in the forest. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + verbose : int, default=0 + Controls the verbosity of the tree building process. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary `. + + .. versionadded:: 0.21 + + Attributes + ---------- + estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance + The child estimator template used to create the collection of + fitted sub-estimators. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of ExtraTreeRegressor instances + The collection of fitted sub-estimators. + + estimators_features_ : list of ndarray + The subset of drawn features for each base estimator. + + estimators_samples_ : list of ndarray + The subset of drawn samples (i.e., the in-bag samples) for each base + estimator. + + max_samples_ : int + The actual number of samples. + + offset_ : float + Offset used to define the decision function from the raw scores. We + have the relation: ``decision_function = score_samples - offset_``. + ``offset_`` is defined as follows. When the contamination parameter is + set to "auto", the offset is equal to -0.5 as the scores of inliers are + close to 0 and the scores of outliers are close to -1. When a + contamination parameter different than "auto" is provided, the offset + is defined in such a way we obtain the expected number of outliers + (samples with decision function < 0) in training. + + .. versionadded:: 0.20 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a + Gaussian distributed dataset. + sklearn.svm.OneClassSVM : Unsupervised Outlier Detection. + Estimate the support of a high-dimensional distribution. 
+ The implementation is based on libsvm. + sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection + using Local Outlier Factor (LOF). + + Notes + ----- + The implementation is based on an ensemble of ExtraTreeRegressor. The + maximum depth of each tree is set to ``ceil(log_2(n))`` where + :math:`n` is the number of samples used to build the tree + (see (Liu et al., 2008) for more details). + + References + ---------- + .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest." + Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on. + .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based + anomaly detection." ACM Transactions on Knowledge Discovery from + Data (TKDD) 6.1 (2012): 3. + + Examples + -------- + >>> from sklearn.ensemble import IsolationForest + >>> X = [[-1.1], [0.3], [0.5], [100]] + >>> clf = IsolationForest(random_state=0).fit(X) + >>> clf.predict([[0.1], [0], [90]]) + array([ 1, 1, -1]) + + For an example of using isolation forest for anomaly detection see + :ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py`. + """ + + _parameter_constraints: dict = { + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "max_samples": [ + StrOptions({"auto"}), + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + "contamination": [ + StrOptions({"auto"}), + Interval(Real, 0, 0.5, closed="right"), + ], + "max_features": [ + Integral, + Interval(Real, 0, 1, closed="right"), + ], + "bootstrap": ["boolean"], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "verbose": ["verbose"], + "warm_start": ["boolean"], + } + + def __init__( + self, + *, + n_estimators=100, + max_samples="auto", + contamination="auto", + max_features=1.0, + bootstrap=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ): + super().__init__( + estimator=None, + # here above max_features has no links with self.max_features + bootstrap=bootstrap, + bootstrap_features=False, + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + warm_start=warm_start, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + self.contamination = contamination + + def _get_estimator(self): + return ExtraTreeRegressor( + # here max_features has no links with self.max_features + max_features=1, + splitter="random", + random_state=self.random_state, + ) + + def _set_oob_score(self, X, y): + raise NotImplementedError("OOB score not supported by iforest") + + def _parallel_args(self): + # ExtraTreeRegressor releases the GIL, so it's more efficient to use + # a thread-based backend rather than a process-based backend so as + # to avoid suffering from communication overhead and extra memory + # copies. This is only used in the fit method. + return {"prefer": "threads"} + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """ + Fit estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Use ``dtype=np.float32`` for maximum + efficiency. Sparse matrices are also supported, use sparse + ``csc_matrix`` for maximum efficiency. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + + Returns + ------- + self : object + Fitted estimator. 
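# A worked instance of the sampling logic implemented in this method (illustrative,
# with an assumed training-set size): for the default ``max_samples="auto"`` and
# 10_000 training rows, ``max_samples_`` becomes min(256, 10_000) == 256 and each
# isolation tree is grown with ``max_depth = ceil(log2(256)) == 8``, matching the
# ``ceil(log_2(n))`` bound quoted in the Notes section of the class docstring.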
+ """ + X = validate_data( + self, X, accept_sparse=["csc"], dtype=tree_dtype, ensure_all_finite=False + ) + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + rnd = check_random_state(self.random_state) + y = rnd.uniform(size=X.shape[0]) + + # ensure that max_sample is in [1, n_samples]: + n_samples = X.shape[0] + + if isinstance(self.max_samples, str) and self.max_samples == "auto": + max_samples = min(256, n_samples) + + elif isinstance(self.max_samples, numbers.Integral): + if self.max_samples > n_samples: + warn( + "max_samples (%s) is greater than the " + "total number of samples (%s). max_samples " + "will be set to n_samples for estimation." + % (self.max_samples, n_samples) + ) + max_samples = n_samples + else: + max_samples = self.max_samples + else: # max_samples is float + max_samples = int(self.max_samples * X.shape[0]) + + self.max_samples_ = max_samples + max_depth = int(np.ceil(np.log2(max(max_samples, 2)))) + super()._fit( + X, + y, + max_samples, + max_depth=max_depth, + sample_weight=sample_weight, + check_input=False, + ) + + self._average_path_length_per_tree, self._decision_path_lengths = zip( + *[ + ( + _average_path_length(tree.tree_.n_node_samples), + tree.tree_.compute_node_depths(), + ) + for tree in self.estimators_ + ] + ) + + if self.contamination == "auto": + # 0.5 plays a special role as described in the original paper. + # we take the opposite as we consider the opposite of their score. + self.offset_ = -0.5 + return self + + # Else, define offset_ wrt contamination parameter + # To avoid performing input validation a second time we call + # _score_samples rather than score_samples. + # _score_samples expects a CSR matrix, so we convert if necessary. + if issparse(X): + X = X.tocsr() + self.offset_ = np.percentile(self._score_samples(X), 100.0 * self.contamination) + + return self + + def predict(self, X): + """ + Predict if a particular sample is an outlier or not. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + For each observation, tells whether or not (+1 or -1) it should + be considered as an inlier according to the fitted model. + + Notes + ----- + The predict method can be parallelized by setting a joblib context. This + inherently does NOT use the ``n_jobs`` parameter initialized in the class, + which is used during ``fit``. This is because, predict may actually be faster + without parallelization for a small number of samples, + such as for 1000 samples or less. The user can set the + number of jobs in the joblib context to control the number of parallel jobs. + + .. code-block:: python + + from joblib import parallel_backend + + # Note, we use threading here as the predict method is not CPU bound. + with parallel_backend("threading", n_jobs=4): + model.predict(X) + """ + check_is_fitted(self) + decision_func = self.decision_function(X) + is_inlier = np.ones_like(decision_func, dtype=int) + is_inlier[decision_func < 0] = -1 + return is_inlier + + def decision_function(self, X): + """ + Average anomaly score of X of the base classifiers. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. 
+ + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. Negative scores represent outliers, + positive scores represent inliers. + + Notes + ----- + The decision_function method can be parallelized by setting a joblib context. + This inherently does NOT use the ``n_jobs`` parameter initialized in the class, + which is used during ``fit``. This is because, calculating the score may + actually be faster without parallelization for a small number of samples, + such as for 1000 samples or less. + The user can set the number of jobs in the joblib context to control the + number of parallel jobs. + + .. code-block:: python + + from joblib import parallel_backend + + # Note, we use threading here as the decision_function method is + # not CPU bound. + with parallel_backend("threading", n_jobs=4): + model.decision_function(X) + """ + # We subtract self.offset_ to make 0 be the threshold value for being + # an outlier: + + return self.score_samples(X) - self.offset_ + + def score_samples(self, X): + """ + Opposite of the anomaly score defined in the original paper. + + The anomaly score of an input sample is computed as + the mean anomaly score of the trees in the forest. + + The measure of normality of an observation given a tree is the depth + of the leaf containing this observation, which is equivalent to + the number of splittings required to isolate this point. In case of + several observations n_left in the leaf, the average path length of + a n_left samples isolation tree is added. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The anomaly score of the input samples. + The lower, the more abnormal. + + Notes + ----- + The score function method can be parallelized by setting a joblib context. This + inherently does NOT use the ``n_jobs`` parameter initialized in the class, + which is used during ``fit``. This is because, calculating the score may + actually be faster without parallelization for a small number of samples, + such as for 1000 samples or less. + The user can set the number of jobs in the joblib context to control the + number of parallel jobs. + + .. code-block:: python + + from joblib import parallel_backend + + # Note, we use threading here as the score_samples method is not CPU bound. + with parallel_backend("threading", n_jobs=4): + model.score(X) + """ + # Check data + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=tree_dtype, + reset=False, + ensure_all_finite=False, + ) + + return self._score_samples(X) + + def _score_samples(self, X): + """Private version of score_samples without input validation. + + Input validation would remove feature names, so we disable it. 
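# Minimal sketch of how the scoring methods above relate (assumes ``X_train`` and
# ``X_test`` arrays are available from the surrounding context):
#
#     import numpy as np
#     from sklearn.ensemble import IsolationForest
#     iso = IsolationForest(random_state=0).fit(X_train)
#     np.allclose(iso.decision_function(X_test),
#                 iso.score_samples(X_test) - iso.offset_)   # True by construction
#
# With ``contamination="auto"`` the offset is -0.5; with a float contamination it is
# the corresponding percentile of the training scores, so that the expected share of
# training points receives a negative decision value.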
+ """ + # Code structure from ForestClassifier/predict_proba + + check_is_fitted(self) + + # Take the opposite of the scores as bigger is better (here less abnormal) + return -self._compute_chunked_score_samples(X) + + def _compute_chunked_score_samples(self, X): + n_samples = _num_samples(X) + + if self._max_features == X.shape[1]: + subsample_features = False + else: + subsample_features = True + + # We get as many rows as possible within our working_memory budget + # (defined by sklearn.get_config()['working_memory']) to store + # self._max_features in each row during computation. + # + # Note: + # - this will get at least 1 row, even if 1 row of score will + # exceed working_memory. + # - this does only account for temporary memory usage while loading + # the data needed to compute the scores -- the returned scores + # themselves are 1D. + + chunk_n_rows = get_chunk_n_rows( + row_bytes=16 * self._max_features, max_n_rows=n_samples + ) + slices = gen_batches(n_samples, chunk_n_rows) + + scores = np.zeros(n_samples, order="f") + + for sl in slices: + # compute score on the slices of test samples: + scores[sl] = self._compute_score_samples(X[sl], subsample_features) + + return scores + + def _compute_score_samples(self, X, subsample_features): + """ + Compute the score of each samples in X going through the extra trees. + + Parameters + ---------- + X : array-like or sparse matrix + Data matrix. + + subsample_features : bool + Whether features should be subsampled. + + Returns + ------- + scores : ndarray of shape (n_samples,) + The score of each sample in X. + """ + n_samples = X.shape[0] + + depths = np.zeros(n_samples, order="f") + + average_path_length_max_samples = _average_path_length([self._max_samples]) + + # Note: we use default n_jobs value, i.e. sequential computation, which + # we expect to be more performant that parallelizing for small number + # of samples, e.g. < 1k samples. Default n_jobs value can be overridden + # by using joblib.parallel_backend context manager around + # ._compute_score_samples. Using a higher n_jobs may speed up the + # computation of the scores, e.g. for > 1k samples. See + # https://github.com/scikit-learn/scikit-learn/pull/28622 for more + # details. + lock = threading.Lock() + Parallel( + verbose=self.verbose, + require="sharedmem", + )( + delayed(_parallel_compute_tree_depths)( + tree, + X, + features if subsample_features else None, + self._decision_path_lengths[tree_idx], + self._average_path_length_per_tree[tree_idx], + depths, + lock, + ) + for tree_idx, (tree, features) in enumerate( + zip(self.estimators_, self.estimators_features_) + ) + ) + + denominator = len(self.estimators_) * average_path_length_max_samples + scores = 2 ** ( + # For a single training sample, denominator and depth are 0. + # Therefore, we set the score manually to 1. + -np.divide( + depths, denominator, out=np.ones_like(depths), where=denominator != 0 + ) + ) + return scores + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +def _average_path_length(n_samples_leaf): + """ + The average path length in a n_samples iTree, which is equal to + the average path length of an unsuccessful BST search since the + latter has the same structure as an isolation tree. + Parameters + ---------- + n_samples_leaf : array-like of shape (n_samples,) + The number of training samples in each test sample leaf, for + each estimators. 
+ + Returns + ------- + average_path_length : ndarray of shape (n_samples,) + """ + + n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False) + + n_samples_leaf_shape = n_samples_leaf.shape + n_samples_leaf = n_samples_leaf.reshape((1, -1)) + average_path_length = np.zeros(n_samples_leaf.shape) + + mask_1 = n_samples_leaf <= 1 + mask_2 = n_samples_leaf == 2 + not_mask = ~np.logical_or(mask_1, mask_2) + + average_path_length[mask_1] = 0.0 + average_path_length[mask_2] = 1.0 + average_path_length[not_mask] = ( + 2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma) + - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask] + ) + + return average_path_length.reshape(n_samples_leaf_shape) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py new file mode 100644 index 0000000000000000000000000000000000000000..2894d8f174c13a7f54607e4b56717381464cf94f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_stacking.py @@ -0,0 +1,1145 @@ +"""Stacking classifier and regressor.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import ABCMeta, abstractmethod +from copy import deepcopy +from numbers import Integral + +import numpy as np +import scipy.sparse as sparse + +from ..base import ( + ClassifierMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + clone, + is_classifier, + is_regressor, +) +from ..exceptions import NotFittedError +from ..linear_model import LogisticRegression, RidgeCV +from ..model_selection import check_cv, cross_val_predict +from ..preprocessing import LabelEncoder +from ..utils import Bunch +from ..utils._param_validation import HasMethods, StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import check_classification_targets, type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names_in, + _check_response_method, + _estimator_has, + check_is_fitted, + column_or_1d, +) +from ._base import _BaseHeterogeneousEnsemble, _fit_single_estimator + + +class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, metaclass=ABCMeta): + """Base class for stacking method.""" + + _parameter_constraints: dict = { + "estimators": [list], + "final_estimator": [None, HasMethods("fit")], + "cv": ["cv_object", StrOptions({"prefit"})], + "n_jobs": [None, Integral], + "passthrough": ["boolean"], + "verbose": ["verbose"], + } + + @abstractmethod + def __init__( + self, + estimators, + final_estimator=None, + *, + cv=None, + stack_method="auto", + n_jobs=None, + verbose=0, + passthrough=False, + ): + super().__init__(estimators=estimators) + self.final_estimator = final_estimator + self.cv = cv + self.stack_method = stack_method + self.n_jobs = n_jobs + self.verbose = verbose + self.passthrough = passthrough + + def _clone_final_estimator(self, default): + if self.final_estimator is not None: + self.final_estimator_ = clone(self.final_estimator) + else: + self.final_estimator_ = clone(default) + + def _concatenate_predictions(self, X, predictions): + """Concatenate the predictions of each first layer learner and + possibly the input dataset `X`. 
+ + If `X` is sparse and `self.passthrough` is False, the output of + `transform` will be dense (the predictions). If `X` is sparse + and `self.passthrough` is True, the output of `transform` will + be sparse. + + This helper is in charge of ensuring the predictions are 2D arrays and + it will drop one of the probability column when using probabilities + in the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1) + + When `y` type is `"multilabel-indicator"`` and the method used is + `predict_proba`, `preds` can be either a `ndarray` of shape + `(n_samples, n_class)` or for some estimators a list of `ndarray`. + This function will drop one of the probability column in this situation as well. + """ + X_meta = [] + for est_idx, preds in enumerate(predictions): + if isinstance(preds, list): + # `preds` is here a list of `n_targets` 2D ndarrays of + # `n_classes` columns. The k-th column contains the + # probabilities of the samples belonging the k-th class. + # + # Since those probabilities must sum to one for each sample, + # we can work with probabilities of `n_classes - 1` classes. + # Hence we drop the first column. + for pred in preds: + X_meta.append(pred[:, 1:]) + elif preds.ndim == 1: + # Some estimator return a 1D array for predictions + # which must be 2-dimensional arrays. + X_meta.append(preds.reshape(-1, 1)) + elif ( + self.stack_method_[est_idx] == "predict_proba" + and len(self.classes_) == 2 + ): + # Remove the first column when using probabilities in + # binary classification because both features `preds` are perfectly + # collinear. + X_meta.append(preds[:, 1:]) + else: + X_meta.append(preds) + + self._n_feature_outs = [pred.shape[1] for pred in X_meta] + if self.passthrough: + X_meta.append(X) + if sparse.issparse(X): + return sparse.hstack(X_meta, format=X.format) + + return np.hstack(X_meta) + + @staticmethod + def _method_name(name, estimator, method): + if estimator == "drop": + return None + if method == "auto": + method = ["predict_proba", "decision_function", "predict"] + try: + method_name = _check_response_method(estimator, method).__name__ + except AttributeError as e: + raise ValueError( + f"Underlying estimator {name} does not implement the method {method}." + ) from e + + return method_name + + @_fit_context( + # estimators in Stacking*.estimators are not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Dict of metadata, potentially containing sample_weight as a + key-value pair. If sample_weight is not present, then samples are + equally weighted. Note that sample_weight is supported only if all + underlying estimators support sample weights. + + .. versionadded:: 1.6 + + Returns + ------- + self : object + """ + # all_estimators contains all estimators, the one to be fitted and the + # 'drop' string. 
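# Shape sketch for the meta-features built by ``_concatenate_predictions`` above
# (hypothetical arrays ``proba_rf`` / ``proba_svc`` of shape (n_samples, 2)):
#
#     X_meta = np.hstack([proba_rf[:, 1:], proba_svc[:, 1:]])   # (n_samples, 2)
#
# i.e. for a binary problem only p(y = classes_[1]) is kept per estimator, and with
# ``passthrough=True`` the original ``X`` columns are appended after these.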
+ names, all_estimators = self._validate_estimators() + self._validate_final_estimator() + + stack_method = [self.stack_method] * len(all_estimators) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + for name in names: + routed_params[name] = Bunch(fit={}) + if "sample_weight" in fit_params: + routed_params[name].fit["sample_weight"] = fit_params[ + "sample_weight" + ] + + if self.cv == "prefit": + self.estimators_ = [] + for estimator in all_estimators: + if estimator != "drop": + check_is_fitted(estimator) + self.estimators_.append(estimator) + else: + # Fit the base estimators on the whole training data. Those + # base estimators will be used in transform, predict, and + # predict_proba. They are exposed publicly. + self.estimators_ = Parallel(n_jobs=self.n_jobs)( + delayed(_fit_single_estimator)( + clone(est), X, y, routed_params[name]["fit"] + ) + for name, est in zip(names, all_estimators) + if est != "drop" + ) + + self.named_estimators_ = Bunch() + est_fitted_idx = 0 + for name_est, org_est in zip(names, all_estimators): + if org_est != "drop": + current_estimator = self.estimators_[est_fitted_idx] + self.named_estimators_[name_est] = current_estimator + est_fitted_idx += 1 + if hasattr(current_estimator, "feature_names_in_"): + self.feature_names_in_ = current_estimator.feature_names_in_ + else: + self.named_estimators_[name_est] = "drop" + + self.stack_method_ = [ + self._method_name(name, est, meth) + for name, est, meth in zip(names, all_estimators, stack_method) + ] + + if self.cv == "prefit": + # Generate predictions from prefit models + predictions = [ + getattr(estimator, predict_method)(X) + for estimator, predict_method in zip(all_estimators, self.stack_method_) + if estimator != "drop" + ] + else: + # To train the meta-classifier using the most data as possible, we use + # a cross-validation to obtain the output of the stacked estimators. + # To ensure that the data provided to each estimator are the same, + # we need to set the random state of the cv if there is one and we + # need to take a copy. + cv = check_cv(self.cv, y=y, classifier=is_classifier(self)) + if hasattr(cv, "random_state") and cv.random_state is None: + cv.random_state = np.random.RandomState() + + predictions = Parallel(n_jobs=self.n_jobs)( + delayed(cross_val_predict)( + clone(est), + X, + y, + cv=deepcopy(cv), + method=meth, + n_jobs=self.n_jobs, + params=routed_params[name]["fit"], + verbose=self.verbose, + ) + for name, est, meth in zip(names, all_estimators, self.stack_method_) + if est != "drop" + ) + + # Only not None or not 'drop' estimators will be used in transform. + # Remove the None from the method as well. 
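# Conceptual equivalent of the cross-validated branch above, written out with the
# public API (hypothetical ``base_estimators`` list of probabilistic classifiers;
# the real code additionally copies the CV splitter, skips "drop" entries, trims
# redundant probability columns and routes fit parameters):
#
#     oof = [cross_val_predict(clone(est), X, y, cv=5, method="predict_proba")
#            for est in base_estimators]
#     X_meta = np.hstack(oof)
#     final_estimator.fit(X_meta, y)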
+ self.stack_method_ = [ + meth + for (meth, est) in zip(self.stack_method_, all_estimators) + if est != "drop" + ] + + X_meta = self._concatenate_predictions(X, predictions) + _fit_single_estimator(self.final_estimator_, X_meta, y, fit_params=fit_params) + + return self + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`.""" + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + f"{self.__class__.__name__} object has no attribute n_features_in_" + ) from nfe + return self.estimators_[0].n_features_in_ + + def _transform(self, X): + """Concatenate and return the predictions of the estimators.""" + check_is_fitted(self) + predictions = [ + getattr(est, meth)(X) + for est, meth in zip(self.estimators_, self.stack_method_) + if est != "drop" + ] + return self._concatenate_predictions(X, predictions) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. The input feature names are only used when `passthrough` is + `True`. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_ - 1)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + If `passthrough` is `False`, then only the names of `estimators` are used + to generate the output feature names. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in( + self, input_features, generate_names=self.passthrough + ) + + class_name = self.__class__.__name__.lower() + non_dropped_estimators = ( + name for name, est in self.estimators if est != "drop" + ) + meta_names = [] + for est, n_features_out in zip(non_dropped_estimators, self._n_feature_outs): + if n_features_out == 1: + meta_names.append(f"{class_name}_{est}") + else: + meta_names.extend( + f"{class_name}_{est}{i}" for i in range(n_features_out) + ) + + if self.passthrough: + return np.concatenate((meta_names, input_features)) + + return np.asarray(meta_names, dtype=object) + + @available_if( + _estimator_has("predict", delegates=("final_estimator_", "final_estimator")) + ) + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + account for uncertainty in the final estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. 
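# Example of the naming scheme produced by ``get_feature_names_out`` above, for a
# hypothetical ``StackingClassifier`` with estimators named "rf" and "svc":
#
#     binary problem  -> ["stackingclassifier_rf", "stackingclassifier_svc"]
#     3-class problem -> ["stackingclassifier_rf0", ..., "stackingclassifier_svc2"]
#
# With ``passthrough=True`` the input feature names are appended after these.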
+ """ + + check_is_fitted(self) + return self.final_estimator_.predict(self.transform(X), **predict_params) + + def _sk_visual_block_with_final_estimator(self, final_estimator): + names, estimators = zip(*self.estimators) + parallel = _VisualBlock("parallel", estimators, names=names, dash_wrapped=False) + + # final estimator is wrapped in a parallel block to show the label: + # 'final_estimator' in the html repr + final_block = _VisualBlock( + "parallel", [final_estimator], names=["final_estimator"], dash_wrapped=False + ) + return _VisualBlock("serial", (parallel, final_block), dash_wrapped=False) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + + # `self.estimators` is a list of (name, est) tuples + for name, estimator in self.estimators: + router.add( + **{name: estimator}, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + + try: + final_estimator_ = self.final_estimator_ + except AttributeError: + final_estimator_ = self.final_estimator + + router.add( + final_estimator_=final_estimator_, + method_mapping=MethodMapping().add(caller="predict", callee="predict"), + ) + + return router + + +class StackingClassifier(ClassifierMixin, _BaseStacking): + """Stack of estimators with a final classifier. + + Stacked generalization consists in stacking the output of individual + estimator and use a classifier to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + The type of estimator is generally expected to be a classifier. + However, one can pass a regressor for some use case (e.g. ordinal + regression). + + final_estimator : estimator, default=None + A classifier which will be used to combine the base estimators. + The default classifier is a + :class:`~sklearn.linear_model.LogisticRegression`. + + cv : int, cross-validation generator, iterable, or "prefit", default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits, + * `"prefit"`, to assume the `estimators` are prefit. In this case, the + estimators will not be refitted. + + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. + In all other cases, :class:`~sklearn.model_selection.KFold` is used. 
+ These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + If "prefit" is passed, it is assumed that all `estimators` have + been fitted already. The `final_estimator_` is trained on the `estimators` + predictions on the full training set and are **not** cross validated + predictions. Please note that if the models have been trained on the same + data to train the stacking model, there is a very high risk of overfitting. + + .. versionadded:: 1.1 + The 'prefit' option was added in 1.1 + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, \ + default='auto' + Methods called for each base estimator. It can be: + + * if 'auto', it will try to invoke, for each estimator, + `'predict_proba'`, `'decision_function'` or `'predict'` in that + order. + * otherwise, one of `'predict_proba'`, `'decision_function'` or + `'predict'`. If the method is not implemented by the estimator, it + will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel for `fit` of all `estimators`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See :term:`Glossary ` for more details. + + passthrough : bool, default=False + When False, only the predictions of estimators will be used as + training data for `final_estimator`. When True, the + `final_estimator` is trained on the predictions as well as the + original training data. + + verbose : int, default=0 + Verbosity level. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) or list of ndarray if `y` \ + is of type `"multilabel-indicator"`. + Class labels. + + estimators_ : list of estimators + The elements of the `estimators` parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. When `cv="prefit"`, `estimators_` + is set to `estimators` and is not fitted again. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + final_estimator_ : estimator + The classifier fit on the output of `estimators_` and responsible for + final predictions. + + stack_method_ : list of str + The method used by each base estimator. + + See Also + -------- + StackingRegressor : Stack of estimators with a final regressor. + + Notes + ----- + When `predict_proba` is used by each estimator (i.e. most of the time for + `stack_method='auto'` or specifically for `stack_method='predict_proba'`), + the first column predicted by each estimator will be dropped in the case + of a binary classification problem. Indeed, both feature will be perfectly + collinear. + + In some cases (e.g. ordinal regression), one can pass regressors as the + first layer of the :class:`StackingClassifier`. 
However, note that `y` will + be internally encoded in a numerically increasing order or lexicographic + order. If this ordering is not adequate, one should manually numerically + encode the classes in the desired order. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.svm import LinearSVC + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.ensemble import StackingClassifier + >>> X, y = load_iris(return_X_y=True) + >>> estimators = [ + ... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), + ... ('svr', make_pipeline(StandardScaler(), + ... LinearSVC(random_state=42))) + ... ] + >>> clf = StackingClassifier( + ... estimators=estimators, final_estimator=LogisticRegression() + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> clf.fit(X_train, y_train).score(X_test, y_test) + 0.9... + """ + + _parameter_constraints: dict = { + **_BaseStacking._parameter_constraints, + "stack_method": [ + StrOptions({"auto", "predict_proba", "decision_function", "predict"}) + ], + } + + def __init__( + self, + estimators, + final_estimator=None, + *, + cv=None, + stack_method="auto", + n_jobs=None, + passthrough=False, + verbose=0, + ): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method=stack_method, + n_jobs=n_jobs, + passthrough=passthrough, + verbose=verbose, + ) + + def _validate_final_estimator(self): + self._clone_final_estimator(default=LogisticRegression()) + if not is_classifier(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a classifier. Got {}".format( + self.final_estimator_ + ) + ) + + def _validate_estimators(self): + """Overload the method of `_BaseHeterogeneousEnsemble` to be more + lenient towards the type of `estimators`. + + Regressors can be accepted for some cases such as ordinal regression. + """ + if len(self.estimators) == 0: + raise ValueError( + "Invalid 'estimators' attribute, 'estimators' should be a " + "non-empty list of (string, estimator) tuples." + ) + names, estimators = zip(*self.estimators) + self._validate_names(names) + + has_estimator = any(est != "drop" for est in estimators) + if not has_estimator: + raise ValueError( + "All estimators are dropped. At least one is required " + "to be an estimator." + ) + + return names, estimators + + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. Note that `y` will be internally encoded in + numerically increasing order or lexicographic order. If the order + matter (e.g. for ordinal regression), one should numerically encode + the target `y` before calling :term:`fit`. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. 
versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, which can be + set by using ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns a fitted instance of estimator. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + check_classification_targets(y) + if type_of_target(y) == "multilabel-indicator": + self._label_encoder = [LabelEncoder().fit(yk) for yk in y.T] + self.classes_ = [le.classes_ for le in self._label_encoder] + y_encoded = np.array( + [ + self._label_encoder[target_idx].transform(target) + for target_idx, target in enumerate(y.T) + ] + ).T + else: + self._label_encoder = LabelEncoder().fit(y) + self.classes_ = self._label_encoder.classes_ + y_encoded = self._label_encoder.transform(y) + + return super().fit(X, y_encoded, **fit_params) + + @available_if( + _estimator_has("predict", delegates=("final_estimator_", "final_estimator")) + ) + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. Be aware that it will only + account for uncertainty in the final estimator. + + - If `enable_metadata_routing=False` (default): + Parameters directly passed to the `predict` method of the + `final_estimator`. + + - If `enable_metadata_routing=True`: Parameters safely routed to + the `predict` method of the `final_estimator`. See :ref:`Metadata + Routing User Guide ` for more details. + + .. versionchanged:: 1.6 + `**predict_params` can be routed via metadata routing API. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + routed_params = Bunch() + routed_params.final_estimator_ = Bunch(predict={}) + routed_params.final_estimator_.predict = predict_params + + y_pred = super().predict(X, **routed_params.final_estimator_["predict"]) + if isinstance(self._label_encoder, list): + # Handle the multilabel-indicator case + y_pred = np.array( + [ + self._label_encoder[target_idx].inverse_transform(target) + for target_idx, target in enumerate(y_pred.T) + ] + ).T + else: + y_pred = self._label_encoder.inverse_transform(y_pred) + return y_pred + + @available_if( + _estimator_has( + "predict_proba", delegates=("final_estimator_", "final_estimator") + ) + ) + def predict_proba(self, X): + """Predict class probabilities for `X` using the final estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) or \ + list of ndarray of shape (n_output,) + The class probabilities of the input samples. 
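# Sketch of the label round-trip performed by ``fit`` and ``predict`` above
# (assumes a hypothetical unfitted ``clf`` and feature matrix ``X``):
#
#     clf.fit(X, np.array(["ham", "spam", "ham"]))   # targets encoded to [0, 1, 0]
#     clf.classes_                                   # array(['ham', 'spam'])
#     clf.predict(X)                                 # string labels restored via
#                                                    # LabelEncoder.inverse_transform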
+ """ + check_is_fitted(self) + y_pred = self.final_estimator_.predict_proba(self.transform(X)) + + if isinstance(self._label_encoder, list): + # Handle the multilabel-indicator cases + y_pred = np.array([preds[:, 0] for preds in y_pred]).T + return y_pred + + @available_if( + _estimator_has( + "decision_function", delegates=("final_estimator_", "final_estimator") + ) + ) + def decision_function(self, X): + """Decision function for samples in `X` using the final estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,), (n_samples, n_classes), \ + or (n_samples, n_classes * (n_classes-1) / 2) + The decision function computed the final estimator. + """ + check_is_fitted(self) + return self.final_estimator_.decision_function(self.transform(X)) + + def transform(self, X): + """Return class labels or probabilities for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) or \ + (n_samples, n_classes * n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) + + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = LogisticRegression() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_with_final_estimator(final_estimator) + + +class StackingRegressor(RegressorMixin, _BaseStacking): + """Stack of estimators with a final regressor. + + Stacked generalization consists in stacking the output of individual + estimator and use a regressor to compute the final prediction. Stacking + allows to use the strength of each individual estimator by using their + output as input of a final estimator. + + Note that `estimators_` are fitted on the full `X` while `final_estimator_` + is trained using cross-validated predictions of the base estimators using + `cross_val_predict`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + estimators : list of (str, estimator) + Base estimators which will be stacked together. Each element of the + list is defined as a tuple of string (i.e. name) and an estimator + instance. An estimator can be set to 'drop' using `set_params`. + + final_estimator : estimator, default=None + A regressor which will be used to combine the base estimators. + The default regressor is a :class:`~sklearn.linear_model.RidgeCV`. + + cv : int, cross-validation generator, iterable, or "prefit", default=None + Determines the cross-validation splitting strategy used in + `cross_val_predict` to train `final_estimator`. Possible inputs for + cv are: + + * None, to use the default 5-fold cross validation, + * integer, to specify the number of folds in a (Stratified) KFold, + * An object to be used as a cross-validation generator, + * An iterable yielding train, test splits, + * `"prefit"`, to assume the `estimators` are prefit. In this case, the + estimators will not be refitted. 
+ + For integer/None inputs, if the estimator is a classifier and y is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. + In all other cases, :class:`~sklearn.model_selection.KFold` is used. + These splitters are instantiated with `shuffle=False` so the splits + will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + If "prefit" is passed, it is assumed that all `estimators` have + been fitted already. The `final_estimator_` is trained on the `estimators` + predictions on the full training set and are **not** cross validated + predictions. Please note that if the models have been trained on the same + data to train the stacking model, there is a very high risk of overfitting. + + .. versionadded:: 1.1 + The 'prefit' option was added in 1.1 + + .. note:: + A larger number of split will provide no benefits if the number + of training samples is large enough. Indeed, the training time + will increase. ``cv`` is not used for model evaluation but for + prediction. + + n_jobs : int, default=None + The number of jobs to run in parallel for `fit` of all `estimators`. + `None` means 1 unless in a `joblib.parallel_backend` context. -1 means + using all processors. See :term:`Glossary ` for more details. + + passthrough : bool, default=False + When False, only the predictions of estimators will be used as + training data for `final_estimator`. When True, the + `final_estimator` is trained on the predictions as well as the + original training data. + + verbose : int, default=0 + Verbosity level. + + Attributes + ---------- + estimators_ : list of estimators + The elements of the `estimators` parameter, having been fitted on the + training data. If an estimator has been set to `'drop'`, it + will not appear in `estimators_`. When `cv="prefit"`, `estimators_` + is set to `estimators` and is not fitted again. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + final_estimator_ : estimator + The regressor fit on the output of `estimators_` and responsible for + final predictions. + + stack_method_ : list of str + The method used by each base estimator. + + See Also + -------- + StackingClassifier : Stack of estimators with a final classifier. + + References + ---------- + .. [1] Wolpert, David H. "Stacked generalization." Neural networks 5.2 + (1992): 241-259. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import RidgeCV + >>> from sklearn.svm import LinearSVR + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.ensemble import StackingRegressor + >>> X, y = load_diabetes(return_X_y=True) + >>> estimators = [ + ... ('lr', RidgeCV()), + ... ('svr', LinearSVR(random_state=42)) + ... ] + >>> reg = StackingRegressor( + ... estimators=estimators, + ... final_estimator=RandomForestRegressor(n_estimators=10, + ... random_state=42) + ... ) + >>> from sklearn.model_selection import train_test_split + >>> X_train, X_test, y_train, y_test = train_test_split( + ... 
X, y, random_state=42 + ... ) + >>> reg.fit(X_train, y_train).score(X_test, y_test) + 0.3... + """ + + def __init__( + self, + estimators, + final_estimator=None, + *, + cv=None, + n_jobs=None, + passthrough=False, + verbose=0, + ): + super().__init__( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + stack_method="predict", + n_jobs=n_jobs, + passthrough=passthrough, + verbose=verbose, + ) + + def _validate_final_estimator(self): + self._clone_final_estimator(default=RidgeCV()) + if not is_regressor(self.final_estimator_): + raise ValueError( + "'final_estimator' parameter should be a regressor. Got {}".format( + self.final_estimator_ + ) + ) + + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, which can be + set by using ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns a fitted instance. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + y = column_or_1d(y, warn=True) + + return super().fit(X, y, **fit_params) + + def transform(self, X): + """Return the predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) + Prediction outputs for each estimator. + """ + return self._transform(X) + + def fit_transform(self, X, y, **fit_params): + """Fit the estimators and return the predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, which can be + set by using ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y_preds : ndarray of shape (n_samples, n_estimators) + Prediction outputs for each estimator. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + return super().fit_transform(X, y, **fit_params) + + @available_if( + _estimator_has("predict", delegates=("final_estimator_", "final_estimator")) + ) + def predict(self, X, **predict_params): + """Predict target for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + **predict_params : dict of str -> obj + Parameters to the `predict` called by the `final_estimator`. Note + that this may be used to return uncertainties from some estimators + with `return_std` or `return_cov`. 
Be aware that it will only + account for uncertainty in the final estimator. + + - If `enable_metadata_routing=False` (default): + Parameters directly passed to the `predict` method of the + `final_estimator`. + + - If `enable_metadata_routing=True`: Parameters safely routed to + the `predict` method of the `final_estimator`. See :ref:`Metadata + Routing User Guide ` for more details. + + .. versionchanged:: 1.6 + `**predict_params` can be routed via metadata routing API. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_output) + Predicted targets. + """ + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + routed_params = Bunch() + routed_params.final_estimator_ = Bunch(predict={}) + routed_params.final_estimator_.predict = predict_params + + y_pred = super().predict(X, **routed_params.final_estimator_["predict"]) + + return y_pred + + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = RidgeCV() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_with_final_estimator(final_estimator) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_voting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_voting.py new file mode 100644 index 0000000000000000000000000000000000000000..369d3f0f5553ee2c22b930d7f6a43e132dbe2596 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_voting.py @@ -0,0 +1,734 @@ +""" +Soft Voting/Majority Rule classifier and Voting regressor. + +This module contains: + - A Soft Voting/Majority Rule classifier for classification estimators. + - A Voting regressor for regression estimators. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from numbers import Integral + +import numpy as np + +from ..base import ( + ClassifierMixin, + RegressorMixin, + TransformerMixin, + _fit_context, + clone, +) +from ..exceptions import NotFittedError +from ..preprocessing import LabelEncoder +from ..utils import Bunch +from ..utils._param_validation import StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_feature_names_in, + check_is_fitted, + column_or_1d, +) +from ._base import _BaseHeterogeneousEnsemble, _fit_single_estimator + + +class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): + """Base class for voting. + + Warning: This class should not be used directly. Use derived classes + instead. 
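# Editor's note: a minimal usage sketch (not part of the library source) of the
# `cv="prefit"` behaviour documented for StackingRegressor above: the base estimators
# are assumed to be fitted already, and only the final estimator is trained on their
# (non cross-validated) predictions. Dataset and estimator choices are illustrative.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
# Fit the base estimators on one split to limit the overfitting risk mentioned above.
X_base, X_stack, y_base, y_stack = train_test_split(X, y, random_state=0)
ridge = RidgeCV().fit(X_base, y_base)
tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X_base, y_base)

stack = StackingRegressor(
    estimators=[("ridge", ridge), ("tree", tree)],
    final_estimator=RidgeCV(),
    cv="prefit",  # base estimators are reused as-is; only the final estimator is fitted
)
stack.fit(X_stack, y_stack)
print(stack.score(X_stack, y_stack))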
+ """ + + _parameter_constraints: dict = { + "estimators": [list], + "weights": ["array-like", None], + "n_jobs": [None, Integral], + "verbose": ["verbose"], + } + + def _log_message(self, name, idx, total): + if not self.verbose: + return None + return f"({idx} of {total}) Processing {name}" + + @property + def _weights_not_none(self): + """Get the weights of not `None` estimators.""" + if self.weights is None: + return None + return [w for est, w in zip(self.estimators, self.weights) if est[1] != "drop"] + + def _predict(self, X): + """Collect results from clf.predict calls.""" + return np.asarray([est.predict(X) for est in self.estimators_]).T + + @abstractmethod + def fit(self, X, y, **fit_params): + """Get common fit operations.""" + names, clfs = self._validate_estimators() + + if self.weights is not None and len(self.weights) != len(self.estimators): + raise ValueError( + "Number of `estimators` and weights must be equal; got" + f" {len(self.weights)} weights, {len(self.estimators)} estimators" + ) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + for name in names: + routed_params[name] = Bunch(fit={}) + if "sample_weight" in fit_params: + routed_params[name].fit["sample_weight"] = fit_params[ + "sample_weight" + ] + + self.estimators_ = Parallel(n_jobs=self.n_jobs)( + delayed(_fit_single_estimator)( + clone(clf), + X, + y, + fit_params=routed_params[name]["fit"], + message_clsname="Voting", + message=self._log_message(name, idx + 1, len(clfs)), + ) + for idx, (name, clf) in enumerate(zip(names, clfs)) + if clf != "drop" + ) + + self.named_estimators_ = Bunch() + + # Uses 'drop' as placeholder for dropped estimators + est_iter = iter(self.estimators_) + for name, est in self.estimators: + current_est = est if est == "drop" else next(est_iter) + self.named_estimators_[name] = current_est + + if hasattr(current_est, "feature_names_in_"): + self.feature_names_in_ = current_est.feature_names_in_ + + return self + + def fit_transform(self, X, y=None, **fit_params): + """Return class labels or probabilities for each estimator. + + Return predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + Input samples. + + y : ndarray of shape (n_samples,), default=None + Target values (None for unsupervised transformations). + + **fit_params : dict + Additional fit parameters. + + Returns + ------- + X_new : ndarray array of shape (n_samples, n_features_new) + Transformed array. + """ + return super().fit_transform(X, y, **fit_params) + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`.""" + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.estimators_[0].n_features_in_ + + def _sk_visual_block_(self): + names, estimators = zip(*self.estimators) + return _VisualBlock("parallel", estimators, names=names) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + + # `self.estimators` is a list of (name, est) tuples + for name, estimator in self.estimators: + router.add( + **{name: estimator}, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + + +class VotingClassifier(ClassifierMixin, _BaseVoting): + """Soft Voting/Majority Rule classifier for unfitted estimators. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + + Parameters + ---------- + estimators : list of (str, estimator) tuples + Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones + of those original estimators that will be stored in the class attribute + ``self.estimators_``. An estimator can be set to ``'drop'`` using + :meth:`set_params`. + + .. versionchanged:: 0.21 + ``'drop'`` is accepted. Using None was deprecated in 0.22 and + support was removed in 0.24. + + voting : {'hard', 'soft'}, default='hard' + If 'hard', uses predicted class labels for majority rule voting. + Else if 'soft', predicts the class label based on the argmax of + the sums of the predicted probabilities, which is recommended for + an ensemble of well-calibrated classifiers. + + weights : array-like of shape (n_classifiers,), default=None + Sequence of weights (`float` or `int`) to weight the occurrences of + predicted class labels (`hard` voting) or class probabilities + before averaging (`soft` voting). Uses uniform weights if `None`. + + n_jobs : int, default=None + The number of jobs to run in parallel for ``fit``. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.18 + + flatten_transform : bool, default=True + Affects shape of transform output only when voting='soft' + If voting='soft' and flatten_transform=True, transform method returns + matrix with shape (n_samples, n_classifiers * n_classes). If + flatten_transform=False, it returns + (n_classifiers, n_samples, n_classes). + + verbose : bool, default=False + If True, the time elapsed while fitting will be printed as it + is completed. + + .. versionadded:: 0.23 + + Attributes + ---------- + estimators_ : list of classifiers + The collection of fitted sub-estimators as defined in ``estimators`` + that are not 'drop'. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + .. versionadded:: 0.20 + + le_ : :class:`~sklearn.preprocessing.LabelEncoder` + Transformer used to encode the labels during fit and decode during + prediction. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying classifier exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + See Also + -------- + VotingRegressor : Prediction voting regressor. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.naive_bayes import GaussianNB + >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier + >>> clf1 = LogisticRegression(random_state=1) + >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1) + >>> clf3 = GaussianNB() + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> y = np.array([1, 1, 1, 2, 2, 2]) + >>> eclf1 = VotingClassifier(estimators=[ + ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') + >>> eclf1 = eclf1.fit(X, y) + >>> print(eclf1.predict(X)) + [1 1 1 2 2 2] + >>> np.array_equal(eclf1.named_estimators_.lr.predict(X), + ... eclf1.named_estimators_['lr'].predict(X)) + True + >>> eclf2 = VotingClassifier(estimators=[ + ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft') + >>> eclf2 = eclf2.fit(X, y) + >>> print(eclf2.predict(X)) + [1 1 1 2 2 2] + + To drop an estimator, :meth:`set_params` can be used to remove it. Here we + dropped one of the estimators, resulting in 2 fitted estimators: + + >>> eclf2 = eclf2.set_params(lr='drop') + >>> eclf2 = eclf2.fit(X, y) + >>> len(eclf2.estimators_) + 2 + + Setting `flatten_transform=True` with `voting='soft'` flattens output shape of + `transform`: + + >>> eclf3 = VotingClassifier(estimators=[ + ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], + ... voting='soft', weights=[2,1,1], + ... flatten_transform=True) + >>> eclf3 = eclf3.fit(X, y) + >>> print(eclf3.predict(X)) + [1 1 1 2 2 2] + >>> print(eclf3.transform(X).shape) + (6, 6) + """ + + _parameter_constraints: dict = { + **_BaseVoting._parameter_constraints, + "voting": [StrOptions({"hard", "soft"})], + "flatten_transform": ["boolean"], + } + + def __init__( + self, + estimators, + *, + voting="hard", + weights=None, + n_jobs=None, + flatten_transform=True, + verbose=False, + ): + super().__init__(estimators=estimators) + self.voting = voting + self.weights = weights + self.n_jobs = n_jobs + self.flatten_transform = flatten_transform + self.verbose = verbose + + @_fit_context( + # estimators in VotingClassifier.estimators are not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.5 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns the instance itself. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + y_type = type_of_target(y, input_name="y") + if y_type in ("unknown", "continuous"): + # raise a specific ValueError for non-classification tasks + raise ValueError( + f"Unknown label type: {y_type}. Maybe you are trying to fit a " + "classifier, which expects discrete classes on a " + "regression target with continuous values." 
+ ) + elif y_type not in ("binary", "multiclass"): + # raise a NotImplementedError for backward compatibility for non-supported + # classification tasks + raise NotImplementedError( + f"{self.__class__.__name__} only supports binary or multiclass " + "classification. Multilabel and multi-output classification are not " + "supported." + ) + + self.le_ = LabelEncoder().fit(y) + self.classes_ = self.le_.classes_ + transformed_y = self.le_.transform(y) + + return super().fit(X, transformed_y, **fit_params) + + def predict(self, X): + """Predict class labels for X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + maj : array-like of shape (n_samples,) + Predicted class labels. + """ + check_is_fitted(self) + if self.voting == "soft": + maj = np.argmax(self.predict_proba(X), axis=1) + + else: # 'hard' voting + predictions = self._predict(X) + maj = np.apply_along_axis( + lambda x: np.argmax(np.bincount(x, weights=self._weights_not_none)), + axis=1, + arr=predictions, + ) + + maj = self.le_.inverse_transform(maj) + + return maj + + def _collect_probas(self, X): + """Collect results from clf.predict calls.""" + return np.asarray([clf.predict_proba(X) for clf in self.estimators_]) + + def _check_voting(self): + if self.voting == "hard": + raise AttributeError( + f"predict_proba is not available when voting={self.voting!r}" + ) + return True + + @available_if(_check_voting) + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + avg : array-like of shape (n_samples, n_classes) + Weighted average probability for each class per sample. + """ + check_is_fitted(self) + avg = np.average( + self._collect_probas(X), axis=0, weights=self._weights_not_none + ) + return avg + + def transform(self, X): + """Return class labels or probabilities for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities_or_labels + If `voting='soft'` and `flatten_transform=True`: + returns ndarray of shape (n_samples, n_classifiers * n_classes), + being class probabilities calculated by each classifier. + If `voting='soft' and `flatten_transform=False`: + ndarray of shape (n_classifiers, n_samples, n_classes) + If `voting='hard'`: + ndarray of shape (n_samples, n_classifiers), being + class labels predicted by each classifier. + """ + check_is_fitted(self) + + if self.voting == "soft": + probas = self._collect_probas(X) + if not self.flatten_transform: + return probas + return np.hstack(probas) + + else: + return self._predict(X) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
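# Editor's note: an illustrative sketch (not part of the library source) of the
# `transform` output shapes documented above. With `voting="soft"` and the default
# `flatten_transform=True` the output is (n_samples, n_classifiers * n_classes);
# with `voting="hard"` it is one predicted-label column per classifier.
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X = np.array([[-2.0, -1.0], [-1.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
y = np.array([0, 0, 1, 1])
estimators = [("lr", LogisticRegression()), ("gnb", GaussianNB())]

soft = VotingClassifier(estimators, voting="soft").fit(X, y)
hard = VotingClassifier(estimators, voting="hard").fit(X, y)

print(soft.transform(X).shape)  # (4, 4): 2 classifiers * 2 classes, flattened
print(hard.transform(X).shape)  # (4, 2): one label column per classifier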
+ """ + check_is_fitted(self, "n_features_in_") + if self.voting == "soft" and not self.flatten_transform: + raise ValueError( + "get_feature_names_out is not supported when `voting='soft'` and " + "`flatten_transform=False`" + ) + + _check_feature_names_in(self, input_features, generate_names=False) + class_name = self.__class__.__name__.lower() + + active_names = [name for name, est in self.estimators if est != "drop"] + + if self.voting == "hard": + return np.asarray( + [f"{class_name}_{name}" for name in active_names], dtype=object + ) + + # voting == "soft" + n_classes = len(self.classes_) + names_out = [ + f"{class_name}_{name}{i}" for name in active_names for i in range(n_classes) + ] + return np.asarray(names_out, dtype=object) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = [] + return tags + + +class VotingRegressor(RegressorMixin, _BaseVoting): + """Prediction voting regressor for unfitted estimators. + + A voting regressor is an ensemble meta-estimator that fits several base + regressors, each on the whole dataset. Then it averages the individual + predictions to form a final prediction. + + For a detailed example, refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_voting_regressor.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + Parameters + ---------- + estimators : list of (str, estimator) tuples + Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones + of those original estimators that will be stored in the class attribute + ``self.estimators_``. An estimator can be set to ``'drop'`` using + :meth:`set_params`. + + .. versionchanged:: 0.21 + ``'drop'`` is accepted. Using None was deprecated in 0.22 and + support was removed in 0.24. + + weights : array-like of shape (n_regressors,), default=None + Sequence of weights (`float` or `int`) to weight the occurrences of + predicted values before averaging. Uses uniform weights if `None`. + + n_jobs : int, default=None + The number of jobs to run in parallel for ``fit``. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + If True, the time elapsed while fitting will be printed as it + is completed. + + .. versionadded:: 0.23 + + Attributes + ---------- + estimators_ : list of regressors + The collection of fitted sub-estimators as defined in ``estimators`` + that are not 'drop'. + + named_estimators_ : :class:`~sklearn.utils.Bunch` + Attribute to access any fitted sub-estimators by name. + + .. versionadded:: 0.20 + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying regressor exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimators expose such an attribute when fit. + + .. versionadded:: 1.0 + + See Also + -------- + VotingClassifier : Soft Voting/Majority Rule classifier. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.ensemble import VotingRegressor + >>> from sklearn.neighbors import KNeighborsRegressor + >>> r1 = LinearRegression() + >>> r2 = RandomForestRegressor(n_estimators=10, random_state=1) + >>> r3 = KNeighborsRegressor() + >>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]]) + >>> y = np.array([2, 6, 12, 20, 30, 42]) + >>> er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)]) + >>> print(er.fit(X, y).predict(X)) + [ 6.8 8.4 12.5 17.8 26 34] + + In the following example, we drop the `'lr'` estimator with + :meth:`~VotingRegressor.set_params` and fit the remaining two estimators: + + >>> er = er.set_params(lr='drop') + >>> er = er.fit(X, y) + >>> len(er.estimators_) + 2 + """ + + def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): + super().__init__(estimators=estimators) + self.weights = weights + self.n_jobs = n_jobs + self.verbose = verbose + + @_fit_context( + # estimators in VotingRegressor.estimators are not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the estimators. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target values. + + **fit_params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.5 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(fit_params, self, "fit", allow=["sample_weight"]) + + y = column_or_1d(y, warn=True) + + return super().fit(X, y, **fit_params) + + def predict(self, X): + """Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the estimators in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted values. + """ + check_is_fitted(self) + return np.average(self._predict(X), axis=1, weights=self._weights_not_none) + + def transform(self, X): + """Return predictions for X for each estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. + + Returns + ------- + predictions : ndarray of shape (n_samples, n_classifiers) + Values predicted by each regressor. + """ + check_is_fitted(self) + return self._predict(X) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + _check_feature_names_in(self, input_features, generate_names=False) + class_name = self.__class__.__name__.lower() + return np.asarray( + [f"{class_name}_{name}" for name, est in self.estimators if est != "drop"], + dtype=object, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..37c6468a5ebf6d8e22d927b3528604f57b9e0676 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py @@ -0,0 +1,1173 @@ +"""Weight Boosting. + +This module contains weight boosting estimators for both classification and +regression. + +The module structure is the following: + +- The `BaseWeightBoosting` base class implements a common ``fit`` method + for all the estimators in the module. Regression and classification + only differ from each other in the loss function that is optimized. + +- :class:`~sklearn.ensemble.AdaBoostClassifier` implements adaptive boosting + (AdaBoost-SAMME) for classification problems. + +- :class:`~sklearn.ensemble.AdaBoostRegressor` implements adaptive boosting + (AdaBoost.R2) for regression problems. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, + is_regressor, +) +from ..metrics import accuracy_score, r2_score +from ..tree import DecisionTreeClassifier, DecisionTreeRegressor +from ..utils import _safe_indexing, check_random_state +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ..utils.extmath import softmax, stable_cumsum +from ..utils.metadata_routing import ( + _raise_for_unsupported_routing, + _RoutingNotSupportedMixin, +) +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_is_fitted, + has_fit_parameter, + validate_data, +) +from ._base import BaseEnsemble + +__all__ = [ + "AdaBoostClassifier", + "AdaBoostRegressor", +] + + +class BaseWeightBoosting(BaseEnsemble, metaclass=ABCMeta): + """Base class for AdaBoost estimators. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit", "predict"]), None], + "n_estimators": [Interval(Integral, 1, None, closed="left")], + "learning_rate": [Interval(Real, 0, None, closed="neither")], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + estimator=None, + *, + n_estimators=50, + estimator_params=tuple(), + learning_rate=1.0, + random_state=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + ) + + self.learning_rate = learning_rate + self.random_state = random_state + + def _check_X(self, X): + # Only called to validate X in non-fit methods, therefore reset=False + return validate_data( + self, + X, + accept_sparse=["csr", "csc"], + ensure_2d=True, + allow_nd=True, + dtype=None, + reset=False, + ) + + @_fit_context( + # AdaBoost*.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None): + """Build a boosted classifier/regressor from the training set (X, y). 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + y : array-like of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, the sample weights are initialized to + 1 / n_samples. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_unsupported_routing(self, "fit", sample_weight=sample_weight) + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + ensure_2d=True, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self), + ) + + sample_weight = _check_sample_weight( + sample_weight, X, dtype=np.float64, copy=True, ensure_non_negative=True + ) + sample_weight /= sample_weight.sum() + + # Check parameters + self._validate_estimator() + + # Clear any previous fit results + self.estimators_ = [] + self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64) + self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64) + + # Initialization of the random number instance that will be used to + # generate a seed at each iteration + random_state = check_random_state(self.random_state) + epsilon = np.finfo(sample_weight.dtype).eps + + zero_weight_mask = sample_weight == 0.0 + for iboost in range(self.n_estimators): + # avoid extremely small sample weight, for details see issue #20320 + sample_weight = np.clip(sample_weight, a_min=epsilon, a_max=None) + # do not clip sample weights that were exactly zero originally + sample_weight[zero_weight_mask] = 0.0 + + # Boosting step + sample_weight, estimator_weight, estimator_error = self._boost( + iboost, X, y, sample_weight, random_state + ) + + # Early termination + if sample_weight is None: + break + self.estimator_weights_[iboost] = estimator_weight + self.estimator_errors_[iboost] = estimator_error + + # Stop if error is zero + if estimator_error == 0: + break + + sample_weight_sum = np.sum(sample_weight) + + if not np.isfinite(sample_weight_sum): + warnings.warn( + ( + "Sample weights have reached infinite values," + f" at iteration {iboost}, causing overflow. " + "Iterations stopped. Try lowering the learning rate." + ), + stacklevel=2, + ) + break + + # Stop if the sum of sample weights has become non-positive + if sample_weight_sum <= 0: + break + + if iboost < self.n_estimators - 1: + # Normalize + sample_weight /= sample_weight_sum + + return self + + @abstractmethod + def _boost(self, iboost, X, y, sample_weight, random_state): + """Implement a single boost. + + Warning: This method needs to be overridden by subclasses. + + Parameters + ---------- + iboost : int + The index of the current boost iteration. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + y : array-like of shape (n_samples,) + The target values (class labels). + + sample_weight : array-like of shape (n_samples,) + The current sample weights. + + random_state : RandomState + The current random number generator + + Returns + ------- + sample_weight : array-like of shape (n_samples,) or None + The reweighted sample weights. + If None then boosting has terminated early. + + estimator_weight : float + The weight for the current boost. + If None then boosting has terminated early. 
+ + error : float + The classification error for the current boost. + If None then boosting has terminated early. + """ + pass + + def staged_score(self, X, y, sample_weight=None): + """Return staged scores for X, y. + + This generator method yields the ensemble score after each iteration of + boosting and therefore allows monitoring, such as to determine the + score on a test set after each boost. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + y : array-like of shape (n_samples,) + Labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Yields + ------ + z : float + """ + X = self._check_X(X) + + for y_pred in self.staged_predict(X): + if is_classifier(self): + yield accuracy_score(y, y_pred, sample_weight=sample_weight) + else: + yield r2_score(y, y_pred, sample_weight=sample_weight) + + @property + def feature_importances_(self): + """The impurity-based feature importances. + + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + The feature importances. + """ + if self.estimators_ is None or len(self.estimators_) == 0: + raise ValueError( + "Estimator not fitted, call `fit` before `feature_importances_`." + ) + + try: + norm = self.estimator_weights_.sum() + return ( + sum( + weight * clf.feature_importances_ + for weight, clf in zip(self.estimator_weights_, self.estimators_) + ) + / norm + ) + + except AttributeError as e: + raise AttributeError( + "Unable to compute feature importances " + "since estimator does not have a " + "feature_importances_ attribute" + ) from e + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +def _samme_proba(estimator, n_classes, X): + """Calculate algorithm 4, step 2, equation c) of Zhu et al [1]. + + References + ---------- + .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009. + + """ + proba = estimator.predict_proba(X) + + # Displace zero probabilities so the log is defined. + # Also fix negative elements which may occur with + # negative sample weights. + np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba) + log_proba = np.log(proba) + + return (n_classes - 1) * ( + log_proba - (1.0 / n_classes) * log_proba.sum(axis=1)[:, np.newaxis] + ) + + +class AdaBoostClassifier( + _RoutingNotSupportedMixin, ClassifierMixin, BaseWeightBoosting +): + """An AdaBoost classifier. + + An AdaBoost [1]_ classifier is a meta-estimator that begins by fitting a + classifier on the original dataset and then fits additional copies of the + classifier on the same dataset but where the weights of incorrectly + classified instances are adjusted such that subsequent classifiers focus + more on difficult cases. + + This class implements the algorithm based on [2]_. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 0.14 + + Parameters + ---------- + estimator : object, default=None + The base estimator from which the boosted ensemble is built. + Support for sample weighting is required, as well as proper + ``classes_`` and ``n_classes_`` attributes. If ``None``, then + the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier` + initialized with `max_depth=1`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=50 + The maximum number of estimators at which boosting is terminated. + In case of perfect fit, the learning procedure is stopped early. + Values must be in the range `[1, inf)`. + + learning_rate : float, default=1.0 + Weight applied to each classifier at each boosting iteration. A higher + learning rate increases the contribution of each classifier. There is + a trade-off between the `learning_rate` and `n_estimators` parameters. + Values must be in the range `(0.0, inf)`. + + algorithm : {'SAMME'}, default='SAMME' + Use the SAMME discrete boosting algorithm. + + .. deprecated:: 1.6 + `algorithm` is deprecated and will be removed in version 1.8. This + estimator only implements the 'SAMME' algorithm. + + random_state : int, RandomState instance or None, default=None + Controls the random seed given at each `estimator` at each + boosting iteration. + Thus, it is only used when `estimator` exposes a `random_state`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of classifiers + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int + The number of classes. + + estimator_weights_ : ndarray of floats + Weights for each estimator in the boosted ensemble. + + estimator_errors_ : ndarray of floats + Classification error for each estimator in the boosted + ensemble. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances if supported by the + ``estimator`` (when based on decision trees). + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + AdaBoostRegressor : An AdaBoost regressor that begins by fitting a + regressor on the original dataset and then fits additional copies of + the regressor on the same dataset but where the weights of instances + are adjusted according to the error of the current prediction. + + GradientBoostingClassifier : GB builds an additive model in a forward + stage-wise fashion. Regression trees are fit on the negative gradient + of the binomial or multinomial deviance loss function. Binary + classification is a special case where only a single regression tree is + induced. + + sklearn.tree.DecisionTreeClassifier : A non-parametric supervised learning + method used for classification. 
+ Creates a model that predicts the value of a target variable by + learning simple decision rules inferred from the data features. + + References + ---------- + .. [1] Y. Freund, R. Schapire, "A Decision-Theoretic Generalization of + on-Line Learning and an Application to Boosting", 1995. + + .. [2] :doi:`J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class adaboost." + Statistics and its Interface 2.3 (2009): 349-360. + <10.4310/SII.2009.v2.n3.a8>` + + Examples + -------- + >>> from sklearn.ensemble import AdaBoostClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=1000, n_features=4, + ... n_informative=2, n_redundant=0, + ... random_state=0, shuffle=False) + >>> clf = AdaBoostClassifier(n_estimators=100, random_state=0) + >>> clf.fit(X, y) + AdaBoostClassifier(n_estimators=100, random_state=0) + >>> clf.predict([[0, 0, 0, 0]]) + array([1]) + >>> clf.score(X, y) + 0.96 + + For a detailed example of using AdaBoost to fit a sequence of DecisionTrees + as weaklearners, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_multiclass.py`. + + For a detailed example of using AdaBoost to fit a non-linearly separable + classification dataset composed of two Gaussian quantiles clusters, please + refer to :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_twoclass.py`. + """ + + # TODO(1.8): remove "algorithm" entry + _parameter_constraints: dict = { + **BaseWeightBoosting._parameter_constraints, + "algorithm": [StrOptions({"SAMME"}), Hidden(StrOptions({"deprecated"}))], + } + + def __init__( + self, + estimator=None, + *, + n_estimators=50, + learning_rate=1.0, + algorithm="deprecated", + random_state=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + learning_rate=learning_rate, + random_state=random_state, + ) + + self.algorithm = algorithm + + def _validate_estimator(self): + """Check the estimator and set the estimator_ attribute.""" + super()._validate_estimator(default=DecisionTreeClassifier(max_depth=1)) + + if self.algorithm != "deprecated": + warnings.warn( + "The parameter 'algorithm' is deprecated in 1.6 and has no effect. " + "It will be removed in version 1.8.", + FutureWarning, + ) + + if not has_fit_parameter(self.estimator_, "sample_weight"): + raise ValueError( + f"{self.estimator.__class__.__name__} doesn't support sample_weight." + ) + + def _boost(self, iboost, X, y, sample_weight, random_state): + """Implement a single boost. + + Perform a single boost according to the discrete SAMME algorithm and return the + updated sample weights. + + Parameters + ---------- + iboost : int + The index of the current boost iteration. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) + The target values (class labels). + + sample_weight : array-like of shape (n_samples,) + The current sample weights. + + random_state : RandomState instance + The RandomState instance used if the base estimator accepts a + `random_state` attribute. + + Returns + ------- + sample_weight : array-like of shape (n_samples,) or None + The reweighted sample weights. + If None then boosting has terminated early. + + estimator_weight : float + The weight for the current boost. + If None then boosting has terminated early. + + estimator_error : float + The classification error for the current boost. + If None then boosting has terminated early. 
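# Editor's note: an illustrative sketch (not part of the library source) of the staged
# monitoring described above: `staged_score` yields the ensemble score after each
# boosting iteration, which can be used to pick a good number of estimators on a
# held-out set. The dataset and sizes are illustrative.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)
test_scores = list(clf.staged_score(X_test, y_test))
best_n = max(range(len(test_scores)), key=test_scores.__getitem__) + 1
print(f"best number of boosting iterations on the held-out set: {best_n}")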
+ """ + estimator = self._make_estimator(random_state=random_state) + + estimator.fit(X, y, sample_weight=sample_weight) + + y_predict = estimator.predict(X) + + if iboost == 0: + self.classes_ = getattr(estimator, "classes_", None) + self.n_classes_ = len(self.classes_) + + # Instances incorrectly classified + incorrect = y_predict != y + + # Error fraction + estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0)) + + # Stop if classification is perfect + if estimator_error <= 0: + return sample_weight, 1.0, 0.0 + + n_classes = self.n_classes_ + + # Stop if the error is at least as bad as random guessing + if estimator_error >= 1.0 - (1.0 / n_classes): + self.estimators_.pop(-1) + if len(self.estimators_) == 0: + raise ValueError( + "BaseClassifier in AdaBoostClassifier " + "ensemble is worse than random, ensemble " + "can not be fit." + ) + return None, None, None + + # Boost weight using multi-class AdaBoost SAMME alg + estimator_weight = self.learning_rate * ( + np.log((1.0 - estimator_error) / estimator_error) + np.log(n_classes - 1.0) + ) + + # Only boost the weights if it will fit again + if not iboost == self.n_estimators - 1: + # Only boost positive weights + sample_weight = np.exp( + np.log(sample_weight) + + estimator_weight * incorrect * (sample_weight > 0) + ) + + return sample_weight, estimator_weight, estimator_error + + def predict(self, X): + """Predict classes for X. + + The predicted class of an input sample is computed as the weighted mean + prediction of the classifiers in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted classes. + """ + pred = self.decision_function(X) + + if self.n_classes_ == 2: + return self.classes_.take(pred > 0, axis=0) + + return self.classes_.take(np.argmax(pred, axis=1), axis=0) + + def staged_predict(self, X): + """Return staged predictions for X. + + The predicted class of an input sample is computed as the weighted mean + prediction of the classifiers in the ensemble. + + This generator method yields the ensemble prediction after each + iteration of boosting and therefore allows monitoring, such as to + determine the prediction on a test set after each boost. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted classes. + """ + X = self._check_X(X) + + n_classes = self.n_classes_ + classes = self.classes_ + + if n_classes == 2: + for pred in self.staged_decision_function(X): + yield np.array(classes.take(pred > 0, axis=0)) + + else: + for pred in self.staged_decision_function(X): + yield np.array(classes.take(np.argmax(pred, axis=1), axis=0)) + + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + score : ndarray of shape of (n_samples, k) + The decision function of the input samples. The order of + outputs is the same as that of the :term:`classes_` attribute. 
+ Binary classification is a special cases with ``k == 1``, + otherwise ``k==n_classes``. For binary classification, + values closer to -1 or 1 mean more like the first or second + class in ``classes_``, respectively. + """ + check_is_fitted(self) + X = self._check_X(X) + + n_classes = self.n_classes_ + classes = self.classes_[:, np.newaxis] + + if n_classes == 1: + return np.zeros_like(X, shape=(X.shape[0], 1)) + + pred = sum( + np.where( + (estimator.predict(X) == classes).T, + w, + -1 / (n_classes - 1) * w, + ) + for estimator, w in zip(self.estimators_, self.estimator_weights_) + ) + + pred /= self.estimator_weights_.sum() + if n_classes == 2: + pred[:, 0] *= -1 + return pred.sum(axis=1) + return pred + + def staged_decision_function(self, X): + """Compute decision function of ``X`` for each boosting iteration. + + This method allows monitoring (i.e. determine error on testing set) + after each boosting iteration. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Yields + ------ + score : generator of ndarray of shape (n_samples, k) + The decision function of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. + Binary classification is a special cases with ``k == 1``, + otherwise ``k==n_classes``. For binary classification, + values closer to -1 or 1 mean more like the first or second + class in ``classes_``, respectively. + """ + check_is_fitted(self) + X = self._check_X(X) + + n_classes = self.n_classes_ + classes = self.classes_[:, np.newaxis] + pred = None + norm = 0.0 + + for weight, estimator in zip(self.estimator_weights_, self.estimators_): + norm += weight + + current_pred = np.where( + (estimator.predict(X) == classes).T, + weight, + -1 / (n_classes - 1) * weight, + ) + + if pred is None: + pred = current_pred + else: + pred += current_pred + + if n_classes == 2: + tmp_pred = np.copy(pred) + tmp_pred[:, 0] *= -1 + yield (tmp_pred / norm).sum(axis=1) + else: + yield pred / norm + + @staticmethod + def _compute_proba_from_decision(decision, n_classes): + """Compute probabilities from the decision function. + + This is based eq. (15) of [1] where: + p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X))) + = softmax((1 / K-1) * f(X)) + + References + ---------- + .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", + 2009. + """ + if n_classes == 2: + decision = np.vstack([-decision, decision]).T / 2 + else: + decision /= n_classes - 1 + return softmax(decision, copy=False) + + def predict_proba(self, X): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the weighted mean predicted class probabilities of the classifiers + in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. 
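# Editor's note: a numerical check (not part of the library source) of the relation
# documented above for the multiclass case: predict_proba is the softmax of the
# decision function scaled by 1 / (n_classes - 1) (eq. 15 of Zhu et al.). The dataset
# and number of estimators are illustrative.
import numpy as np
from scipy.special import softmax
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

X, y = load_iris(return_X_y=True)
clf = AdaBoostClassifier(n_estimators=10, random_state=0).fit(X, y)

decision = clf.decision_function(X)                      # shape (n_samples, 3)
proba = softmax(decision / (clf.n_classes_ - 1), axis=1)
assert np.allclose(proba, clf.predict_proba(X))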
+ """ + check_is_fitted(self) + n_classes = self.n_classes_ + + if n_classes == 1: + return np.ones((_num_samples(X), 1)) + + decision = self.decision_function(X) + return self._compute_proba_from_decision(decision, n_classes) + + def staged_predict_proba(self, X): + """Predict class probabilities for X. + + The predicted class probabilities of an input sample is computed as + the weighted mean predicted class probabilities of the classifiers + in the ensemble. + + This generator method yields the ensemble predicted class probabilities + after each iteration of boosting and therefore allows monitoring, such + as to determine the predicted class probabilities on a test set after + each boost. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Yields + ------ + p : generator of ndarray of shape (n_samples,) + The class probabilities of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. + """ + + n_classes = self.n_classes_ + + for decision in self.staged_decision_function(X): + yield self._compute_proba_from_decision(decision, n_classes) + + def predict_log_proba(self, X): + """Predict class log-probabilities for X. + + The predicted class log-probabilities of an input sample is computed as + the weighted mean predicted class log-probabilities of the classifiers + in the ensemble. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + p : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of + outputs is the same of that of the :term:`classes_` attribute. + """ + return np.log(self.predict_proba(X)) + + +class AdaBoostRegressor(_RoutingNotSupportedMixin, RegressorMixin, BaseWeightBoosting): + """An AdaBoost regressor. + + An AdaBoost [1] regressor is a meta-estimator that begins by fitting a + regressor on the original dataset and then fits additional copies of the + regressor on the same dataset but where the weights of instances are + adjusted according to the error of the current prediction. As such, + subsequent regressors focus more on difficult cases. + + This class implements the algorithm known as AdaBoost.R2 [2]. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.14 + + Parameters + ---------- + estimator : object, default=None + The base estimator from which the boosted ensemble is built. + If ``None``, then the base estimator is + :class:`~sklearn.tree.DecisionTreeRegressor` initialized with + `max_depth=3`. + + .. versionadded:: 1.2 + `base_estimator` was renamed to `estimator`. + + n_estimators : int, default=50 + The maximum number of estimators at which boosting is terminated. + In case of perfect fit, the learning procedure is stopped early. + Values must be in the range `[1, inf)`. + + learning_rate : float, default=1.0 + Weight applied to each regressor at each boosting iteration. A higher + learning rate increases the contribution of each regressor. There is + a trade-off between the `learning_rate` and `n_estimators` parameters. + Values must be in the range `(0.0, inf)`. + + loss : {'linear', 'square', 'exponential'}, default='linear' + The loss function to use when updating the weights after each + boosting iteration. 
+ + random_state : int, RandomState instance or None, default=None + Controls the random seed given at each `estimator` at each + boosting iteration. + Thus, it is only used when `estimator` exposes a `random_state`. + In addition, it controls the bootstrap of the weights used to train the + `estimator` at each boosting iteration. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the ensemble is grown. + + .. versionadded:: 1.2 + `base_estimator_` was renamed to `estimator_`. + + estimators_ : list of regressors + The collection of fitted sub-estimators. + + estimator_weights_ : ndarray of floats + Weights for each estimator in the boosted ensemble. + + estimator_errors_ : ndarray of floats + Regression error for each estimator in the boosted ensemble. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances if supported by the + ``estimator`` (when based on decision trees). + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + AdaBoostClassifier : An AdaBoost classifier. + GradientBoostingRegressor : Gradient Boosting Classification Tree. + sklearn.tree.DecisionTreeRegressor : A decision tree regressor. + + References + ---------- + .. [1] Y. Freund, R. Schapire, "A Decision-Theoretic Generalization of + on-Line Learning and an Application to Boosting", 1995. + + .. [2] H. Drucker, "Improving Regressors using Boosting Techniques", 1997. + + Examples + -------- + >>> from sklearn.ensemble import AdaBoostRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, n_informative=2, + ... random_state=0, shuffle=False) + >>> regr = AdaBoostRegressor(random_state=0, n_estimators=100) + >>> regr.fit(X, y) + AdaBoostRegressor(n_estimators=100, random_state=0) + >>> regr.predict([[0, 0, 0, 0]]) + array([4.7972]) + >>> regr.score(X, y) + 0.9771 + + For a detailed example of utilizing :class:`~sklearn.ensemble.AdaBoostRegressor` + to fit a sequence of decision trees as weak learners, please refer to + :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py`. + """ + + _parameter_constraints: dict = { + **BaseWeightBoosting._parameter_constraints, + "loss": [StrOptions({"linear", "square", "exponential"})], + } + + def __init__( + self, + estimator=None, + *, + n_estimators=50, + learning_rate=1.0, + loss="linear", + random_state=None, + ): + super().__init__( + estimator=estimator, + n_estimators=n_estimators, + learning_rate=learning_rate, + random_state=random_state, + ) + + self.loss = loss + self.random_state = random_state + + def _validate_estimator(self): + """Check the estimator and set the estimator_ attribute.""" + super()._validate_estimator(default=DecisionTreeRegressor(max_depth=3)) + + def _boost(self, iboost, X, y, sample_weight, random_state): + """Implement a single boost for regression + + Perform a single boost according to the AdaBoost.R2 algorithm and + return the updated sample weights. 
+ + Parameters + ---------- + iboost : int + The index of the current boost iteration. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,) + The current sample weights. + + random_state : RandomState + The RandomState instance used if the base estimator accepts a + `random_state` attribute. + Controls also the bootstrap of the weights used to train the weak + learner. + + Returns + ------- + sample_weight : array-like of shape (n_samples,) or None + The reweighted sample weights. + If None then boosting has terminated early. + + estimator_weight : float + The weight for the current boost. + If None then boosting has terminated early. + + estimator_error : float + The regression error for the current boost. + If None then boosting has terminated early. + """ + estimator = self._make_estimator(random_state=random_state) + + # Weighted sampling of the training set with replacement + bootstrap_idx = random_state.choice( + np.arange(_num_samples(X)), + size=_num_samples(X), + replace=True, + p=sample_weight, + ) + + # Fit on the bootstrapped sample and obtain a prediction + # for all samples in the training set + X_ = _safe_indexing(X, bootstrap_idx) + y_ = _safe_indexing(y, bootstrap_idx) + estimator.fit(X_, y_) + y_predict = estimator.predict(X) + + error_vect = np.abs(y_predict - y) + sample_mask = sample_weight > 0 + masked_sample_weight = sample_weight[sample_mask] + masked_error_vector = error_vect[sample_mask] + + error_max = masked_error_vector.max() + if error_max != 0: + masked_error_vector /= error_max + + if self.loss == "square": + masked_error_vector **= 2 + elif self.loss == "exponential": + masked_error_vector = 1.0 - np.exp(-masked_error_vector) + + # Calculate the average loss + estimator_error = (masked_sample_weight * masked_error_vector).sum() + + if estimator_error <= 0: + # Stop if fit is perfect + return sample_weight, 1.0, 0.0 + + elif estimator_error >= 0.5: + # Discard current estimator only if it isn't the only one + if len(self.estimators_) > 1: + self.estimators_.pop(-1) + return None, None, None + + beta = estimator_error / (1.0 - estimator_error) + + # Boost weight using AdaBoost.R2 alg + estimator_weight = self.learning_rate * np.log(1.0 / beta) + + if not iboost == self.n_estimators - 1: + sample_weight[sample_mask] *= np.power( + beta, (1.0 - masked_error_vector) * self.learning_rate + ) + + return sample_weight, estimator_weight, estimator_error + + def _get_median_predict(self, X, limit): + # Evaluate predictions of all estimators + predictions = np.array([est.predict(X) for est in self.estimators_[:limit]]).T + + # Sort the predictions + sorted_idx = np.argsort(predictions, axis=1) + + # Find index of median prediction for each sample + weight_cdf = stable_cumsum(self.estimator_weights_[sorted_idx], axis=1) + median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis] + median_idx = median_or_above.argmax(axis=1) + + median_estimators = sorted_idx[np.arange(_num_samples(X)), median_idx] + + # Return median predictions + return predictions[np.arange(_num_samples(X)), median_estimators] + + def predict(self, X): + """Predict regression value for X. + + The predicted regression value of an input sample is computed + as the weighted median prediction of the regressors in the ensemble. 
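# Editor's note: a standalone sketch (not part of the library source) of the weighted
# median rule implemented in `_get_median_predict` above: sort each sample's base
# predictions, take the cumulative sum of the corresponding estimator weights, and
# pick the first prediction whose cumulative weight reaches half of the total.
import numpy as np

def weighted_median(predictions, weights):
    """predictions: (n_samples, n_estimators); weights: (n_estimators,)."""
    order = np.argsort(predictions, axis=1)
    weight_cdf = np.cumsum(weights[order], axis=1)
    above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
    median_idx = above.argmax(axis=1)
    median_estimators = order[np.arange(predictions.shape[0]), median_idx]
    return predictions[np.arange(predictions.shape[0]), median_estimators]

preds = np.array([[1.0, 2.0, 10.0], [0.0, 5.0, 6.0]])
w = np.array([1.0, 1.0, 1.0])
print(weighted_median(preds, w))  # [2., 5.] -- the per-sample (weighted) medians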
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Sparse matrix can be CSC, CSR, COO, + DOK, or LIL. COO, DOK, and LIL are converted to CSR. + + Returns + ------- + y : ndarray of shape (n_samples,) + The predicted regression values. + """ + check_is_fitted(self) + X = self._check_X(X) + + return self._get_median_predict(X, len(self.estimators_)) + + def staged_predict(self, X): + """Return staged predictions for X. + + The predicted regression value of an input sample is computed + as the weighted median prediction of the regressors in the ensemble. + + This generator method yields the ensemble prediction after each + iteration of boosting and therefore allows monitoring, such as to + determine the prediction on a test set after each boost. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + Yields + ------ + y : generator of ndarray of shape (n_samples,) + The predicted regression values. + """ + check_is_fitted(self) + X = self._check_X(X) + + for i, _ in enumerate(self.estimators_, 1): + yield self._get_median_predict(X, limit=i) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/meson.build b/.venv/lib/python3.12/site-packages/sklearn/ensemble/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..893a4eb1a510aeea0eecb26b38087eb35bbe93d8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/meson.build @@ -0,0 +1,9 @@ +py.extension_module( + '_gradient_boosting', + [cython_gen.process('_gradient_boosting.pyx')] + utils_cython_tree, + dependencies: [np_dep], + subdir: 'sklearn/ensemble', + install: true +) + +subdir('_hist_gradient_boosting') diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_bagging.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_bagging.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb9336bfd759321da57745beecacf3d46154551 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_bagging.py @@ -0,0 +1,1043 @@ +""" +Testing for the bagging ensemble module (sklearn.ensemble.bagging). 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import cycle, product + +import joblib +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import BaseEstimator +from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import ( + AdaBoostClassifier, + AdaBoostRegressor, + BaggingClassifier, + BaggingRegressor, + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, + RandomForestClassifier, + RandomForestRegressor, +) +from sklearn.feature_selection import SelectKBest +from sklearn.linear_model import LogisticRegression, Perceptron +from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, scale +from sklearn.random_projection import SparseRandomProjection +from sklearn.svm import SVC, SVR +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifierWithOnlyPredict, + ConsumingClassifierWithoutPredictLogProba, + ConsumingClassifierWithoutPredictProba, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +rng = check_random_state(0) + +# also load the iris dataset +# and randomly permute it +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +# also load the diabetes dataset +# and randomly permute it +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] + + +def test_classification(): + # Check classification for various parameter settings. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + grid = ParameterGrid( + { + "max_samples": [0.5, 1.0], + "max_features": [1, 4], + "bootstrap": [True, False], + "bootstrap_features": [True, False], + } + ) + estimators = [ + None, + DummyClassifier(), + Perceptron(max_iter=20), + DecisionTreeClassifier(max_depth=2), + KNeighborsClassifier(), + SVC(), + ] + # Try different parameter settings with different base classifiers without + # doing the full cartesian product to keep the test durations low. 
+ for params, estimator in zip(grid, cycle(estimators)): + BaggingClassifier( + estimator=estimator, + random_state=rng, + n_estimators=2, + **params, + ).fit(X_train, y_train).predict(X_test) + + +@pytest.mark.parametrize( + "sparse_container, params, method", + product( + CSR_CONTAINERS + CSC_CONTAINERS, + [ + { + "max_samples": 0.5, + "max_features": 2, + "bootstrap": True, + "bootstrap_features": True, + }, + { + "max_samples": 1.0, + "max_features": 4, + "bootstrap": True, + "bootstrap_features": True, + }, + {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, + {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, + ], + ["predict", "predict_proba", "predict_log_proba", "decision_function"], + ), +) +def test_sparse_classification(sparse_container, params, method): + # Check classification for various parameter settings on sparse input. + + class CustomSVC(SVC): + """SVC variant that records the nature of the training set""" + + def fit(self, X, y): + super().fit(X, y) + self.data_type_ = type(X) + return self + + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + scale(iris.data), iris.target, random_state=rng + ) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + # Trained on sparse format + sparse_classifier = BaggingClassifier( + estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"), + random_state=1, + **params, + ).fit(X_train_sparse, y_train) + sparse_results = getattr(sparse_classifier, method)(X_test_sparse) + + # Trained on dense format + dense_classifier = BaggingClassifier( + estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"), + random_state=1, + **params, + ).fit(X_train, y_train) + dense_results = getattr(dense_classifier, method)(X_test) + assert_array_almost_equal(sparse_results, dense_results) + + sparse_type = type(X_train_sparse) + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert all([t == sparse_type for t in types]) + + +def test_regression(): + # Check regression for various parameter settings. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + grid = ParameterGrid( + { + "max_samples": [0.5, 1.0], + "max_features": [0.5, 1.0], + "bootstrap": [True, False], + "bootstrap_features": [True, False], + } + ) + + for estimator in [ + None, + DummyRegressor(), + DecisionTreeRegressor(), + KNeighborsRegressor(), + SVR(), + ]: + for params in grid: + BaggingRegressor(estimator=estimator, random_state=rng, **params).fit( + X_train, y_train + ).predict(X_test) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_sparse_regression(sparse_container): + # Check regression for various parameter settings on sparse input. 
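+    # For each parameter setting below, an identical BaggingRegressor is
+    # fitted on a sparse copy and on the dense original of the same data;
+    # their predictions must match and every base estimator must have
+    # received sparse input.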
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + + class CustomSVR(SVR): + """SVC variant that records the nature of the training set""" + + def fit(self, X, y): + super().fit(X, y) + self.data_type_ = type(X) + return self + + parameter_sets = [ + { + "max_samples": 0.5, + "max_features": 2, + "bootstrap": True, + "bootstrap_features": True, + }, + { + "max_samples": 1.0, + "max_features": 4, + "bootstrap": True, + "bootstrap_features": True, + }, + {"max_features": 2, "bootstrap": False, "bootstrap_features": True}, + {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False}, + ] + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + for params in parameter_sets: + # Trained on sparse format + sparse_classifier = BaggingRegressor( + estimator=CustomSVR(), random_state=1, **params + ).fit(X_train_sparse, y_train) + sparse_results = sparse_classifier.predict(X_test_sparse) + + # Trained on dense format + dense_results = ( + BaggingRegressor(estimator=CustomSVR(), random_state=1, **params) + .fit(X_train, y_train) + .predict(X_test) + ) + + sparse_type = type(X_train_sparse) + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert_array_almost_equal(sparse_results, dense_results) + assert all([t == sparse_type for t in types]) + assert_array_almost_equal(sparse_results, dense_results) + + +class DummySizeEstimator(BaseEstimator): + def fit(self, X, y): + self.training_size_ = X.shape[0] + self.training_hash_ = joblib.hash(X) + + def predict(self, X): + return np.ones(X.shape[0]) + + +def test_bootstrap_samples(): + # Test that bootstrapping samples generate non-perfect base estimators. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + estimator = DecisionTreeRegressor().fit(X_train, y_train) + + # without bootstrap, all trees are perfect on the training set + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_samples=1.0, + bootstrap=False, + random_state=rng, + ).fit(X_train, y_train) + + assert estimator.score(X_train, y_train) == ensemble.score(X_train, y_train) + + # with bootstrap, trees are no longer perfect on the training set + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_samples=1.0, + bootstrap=True, + random_state=rng, + ).fit(X_train, y_train) + + assert estimator.score(X_train, y_train) > ensemble.score(X_train, y_train) + + # check that each sampling correspond to a complete bootstrap resample. + # the size of each bootstrap should be the same as the input data but + # the data should be different (checked using the hash of the data). + ensemble = BaggingRegressor(estimator=DummySizeEstimator(), bootstrap=True).fit( + X_train, y_train + ) + training_hash = [] + for estimator in ensemble.estimators_: + assert estimator.training_size_ == X_train.shape[0] + training_hash.append(estimator.training_hash_) + assert len(set(training_hash)) == len(training_hash) + + +def test_bootstrap_features(): + # Test that bootstrapping features may generate duplicate features. 
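+    # With `bootstrap_features=False` each estimator draws every feature
+    # exactly once, so the number of unique sampled features equals
+    # n_features; with `bootstrap_features=True` features are drawn with
+    # replacement, so duplicates occur and fewer unique features appear.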
+ rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_features=1.0, + bootstrap_features=False, + random_state=rng, + ).fit(X_train, y_train) + + for features in ensemble.estimators_features_: + assert diabetes.data.shape[1] == np.unique(features).shape[0] + + ensemble = BaggingRegressor( + estimator=DecisionTreeRegressor(), + max_features=1.0, + bootstrap_features=True, + random_state=rng, + ).fit(X_train, y_train) + + for features in ensemble.estimators_features_: + assert diabetes.data.shape[1] > np.unique(features).shape[0] + + +def test_probability(): + # Predict probabilities. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + with np.errstate(divide="ignore", invalid="ignore"): + # Normal case + ensemble = BaggingClassifier( + estimator=DecisionTreeClassifier(), random_state=rng + ).fit(X_train, y_train) + + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)) + ) + + assert_array_almost_equal( + ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)) + ) + + # Degenerate case, where some classes are missing + ensemble = BaggingClassifier( + estimator=LogisticRegression(), random_state=rng, max_samples=5 + ).fit(X_train, y_train) + + assert_array_almost_equal( + np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test)) + ) + + assert_array_almost_equal( + ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test)) + ) + + +def test_oob_score_classification(): + # Check that oob prediction is a good estimation of the generalization + # error. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + for estimator in [DecisionTreeClassifier(), SVC()]: + clf = BaggingClassifier( + estimator=estimator, + n_estimators=100, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit(X_train, y_train) + + test_score = clf.score(X_test, y_test) + + assert abs(test_score - clf.oob_score_) < 0.1 + + # Test with few estimators + warn_msg = ( + "Some inputs do not have OOB scores. This probably means too few " + "estimators were used to compute any reliable oob estimates." + ) + with pytest.warns(UserWarning, match=warn_msg): + clf = BaggingClassifier( + estimator=estimator, + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=rng, + ) + clf.fit(X_train, y_train) + + +def test_oob_score_regression(): + # Check that oob prediction is a good estimation of the generalization + # error. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + clf = BaggingRegressor( + estimator=DecisionTreeRegressor(), + n_estimators=50, + bootstrap=True, + oob_score=True, + random_state=rng, + ).fit(X_train, y_train) + + test_score = clf.score(X_test, y_test) + + assert abs(test_score - clf.oob_score_) < 0.1 + + # Test with few estimators + warn_msg = ( + "Some inputs do not have OOB scores. This probably means too few " + "estimators were used to compute any reliable oob estimates." 
+ ) + with pytest.warns(UserWarning, match=warn_msg): + regr = BaggingRegressor( + estimator=DecisionTreeRegressor(), + n_estimators=1, + bootstrap=True, + oob_score=True, + random_state=rng, + ) + regr.fit(X_train, y_train) + + +def test_single_estimator(): + # Check singleton ensembles. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + clf1 = BaggingRegressor( + estimator=KNeighborsRegressor(), + n_estimators=1, + bootstrap=False, + bootstrap_features=False, + random_state=rng, + ).fit(X_train, y_train) + + clf2 = KNeighborsRegressor().fit(X_train, y_train) + + assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test)) + + +def test_error(): + # Test support of decision_function + X, y = iris.data, iris.target + base = DecisionTreeClassifier() + assert not hasattr(BaggingClassifier(base).fit(X, y), "decision_function") + + +def test_parallel_classification(): + # Check parallel classification. + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=0 + ) + + ensemble = BaggingClassifier( + DecisionTreeClassifier(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + # predict_proba + y1 = ensemble.predict_proba(X_test) + ensemble.set_params(n_jobs=1) + y2 = ensemble.predict_proba(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = BaggingClassifier( + DecisionTreeClassifier(), n_jobs=1, random_state=0 + ).fit(X_train, y_train) + + y3 = ensemble.predict_proba(X_test) + assert_array_almost_equal(y1, y3) + + # decision_function + ensemble = BaggingClassifier( + SVC(decision_function_shape="ovr"), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + decisions1 = ensemble.decision_function(X_test) + ensemble.set_params(n_jobs=1) + decisions2 = ensemble.decision_function(X_test) + assert_array_almost_equal(decisions1, decisions2) + + ensemble = BaggingClassifier( + SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0 + ).fit(X_train, y_train) + + decisions3 = ensemble.decision_function(X_test) + assert_array_almost_equal(decisions1, decisions3) + + +def test_parallel_regression(): + # Check parallel regression. + rng = check_random_state(0) + + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + ensemble.set_params(n_jobs=1) + y1 = ensemble.predict(X_test) + ensemble.set_params(n_jobs=2) + y2 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit( + X_train, y_train + ) + + y3 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y3) + + +def test_gridsearch(): + # Check that bagging ensembles can be grid-searched. + # Transform iris into a binary classification task + X, y = iris.data, iris.target + y[y == 2] = 1 + + # Grid search with scoring based on decision_function + parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)} + + GridSearchCV(BaggingClassifier(SVC()), parameters, scoring="roc_auc").fit(X, y) + + +def test_estimator(): + # Check estimator and its default values. 
+ rng = check_random_state(0) + + # Classification + X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=rng + ) + + ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train) + + assert isinstance(ensemble.estimator_, DecisionTreeClassifier) + + ensemble = BaggingClassifier( + DecisionTreeClassifier(), n_jobs=3, random_state=0 + ).fit(X_train, y_train) + + assert isinstance(ensemble.estimator_, DecisionTreeClassifier) + + ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + assert isinstance(ensemble.estimator_, Perceptron) + + # Regression + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data, diabetes.target, random_state=rng + ) + + ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train) + + assert isinstance(ensemble.estimator_, DecisionTreeRegressor) + + ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit( + X_train, y_train + ) + + assert isinstance(ensemble.estimator_, DecisionTreeRegressor) + + ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train) + assert isinstance(ensemble.estimator_, SVR) + + +def test_bagging_with_pipeline(): + estimator = BaggingClassifier( + make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2 + ) + estimator.fit(iris.data, iris.target) + assert isinstance(estimator[0].steps[-1][1].random_state, int) + + +class DummyZeroEstimator(BaseEstimator): + def fit(self, X, y): + self.classes_ = np.unique(y) + return self + + def predict(self, X): + return self.classes_[np.zeros(X.shape[0], dtype=int)] + + +def test_bagging_sample_weight_unsupported_but_passed(): + estimator = BaggingClassifier(DummyZeroEstimator()) + rng = check_random_state(0) + + estimator.fit(iris.data, iris.target).predict(iris.data) + with pytest.raises(ValueError): + estimator.fit( + iris.data, + iris.target, + sample_weight=rng.randint(10, size=(iris.data.shape[0])), + ) + + +def test_warm_start(random_state=42): + # Test if fitting incrementally with warm start gives a forest of the + # right size and the same results as a normal fit. + X, y = make_hastie_10_2(n_samples=20, random_state=1) + + clf_ws = None + for n_estimators in [5, 10]: + if clf_ws is None: + clf_ws = BaggingClassifier( + n_estimators=n_estimators, random_state=random_state, warm_start=True + ) + else: + clf_ws.set_params(n_estimators=n_estimators) + clf_ws.fit(X, y) + assert len(clf_ws) == n_estimators + + clf_no_ws = BaggingClassifier( + n_estimators=10, random_state=random_state, warm_start=False + ) + clf_no_ws.fit(X, y) + + assert set([tree.random_state for tree in clf_ws]) == set( + [tree.random_state for tree in clf_no_ws] + ) + + +def test_warm_start_smaller_n_estimators(): + # Test if warm start'ed second fit with smaller n_estimators raises error. 
+ X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = BaggingClassifier(n_estimators=5, warm_start=True) + clf.fit(X, y) + clf.set_params(n_estimators=4) + with pytest.raises(ValueError): + clf.fit(X, y) + + +def test_warm_start_equal_n_estimators(): + # Test that nothing happens when fitting without increasing n_estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83) + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + # modify X to nonsense values, this should not change anything + X_train += 1.0 + + warn_msg = "Warm-start fitting without increasing n_estimators does not" + with pytest.warns(UserWarning, match=warn_msg): + clf.fit(X_train, y_train) + assert_array_equal(y_pred, clf.predict(X_test)) + + +def test_warm_start_equivalence(): + # warm started classifier with 5+5 estimators should be equivalent to + # one classifier with 10 estimators + X, y = make_hastie_10_2(n_samples=20, random_state=1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43) + + clf_ws = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141) + clf_ws.fit(X_train, y_train) + clf_ws.set_params(n_estimators=10) + clf_ws.fit(X_train, y_train) + y1 = clf_ws.predict(X_test) + + clf = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141) + clf.fit(X_train, y_train) + y2 = clf.predict(X_test) + + assert_array_almost_equal(y1, y2) + + +def test_warm_start_with_oob_score_fails(): + # Check using oob_score and warm_start simultaneously fails + X, y = make_hastie_10_2(n_samples=20, random_state=1) + clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True) + with pytest.raises(ValueError): + clf.fit(X, y) + + +def test_oob_score_removed_on_warm_start(): + X, y = make_hastie_10_2(n_samples=100, random_state=1) + + clf = BaggingClassifier(n_estimators=5, oob_score=True) + clf.fit(X, y) + + clf.set_params(warm_start=True, oob_score=False, n_estimators=10) + clf.fit(X, y) + + with pytest.raises(AttributeError): + getattr(clf, "oob_score_") + + +def test_oob_score_consistency(): + # Make sure OOB scores are identical when random_state, estimator, and + # training data are fixed and fitting is done twice + X, y = make_hastie_10_2(n_samples=200, random_state=1) + bagging = BaggingClassifier( + KNeighborsClassifier(), + max_samples=0.5, + max_features=0.5, + oob_score=True, + random_state=1, + ) + assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_ + + +def test_estimators_samples(): + # Check that format of estimators_samples_ is correct and that results + # generated at fit time can be identically reproduced at a later time + # using data saved in object attributes. 
+ X, y = make_hastie_10_2(n_samples=200, random_state=1) + bagging = BaggingClassifier( + LogisticRegression(), + max_samples=0.5, + max_features=0.5, + random_state=1, + bootstrap=False, + ) + bagging.fit(X, y) + + # Get relevant attributes + estimators_samples = bagging.estimators_samples_ + estimators_features = bagging.estimators_features_ + estimators = bagging.estimators_ + + # Test for correct formatting + assert len(estimators_samples) == len(estimators) + assert len(estimators_samples[0]) == len(X) // 2 + assert estimators_samples[0].dtype.kind == "i" + + # Re-fit single estimator to test for consistent sampling + estimator_index = 0 + estimator_samples = estimators_samples[estimator_index] + estimator_features = estimators_features[estimator_index] + estimator = estimators[estimator_index] + + X_train = (X[estimator_samples])[:, estimator_features] + y_train = y[estimator_samples] + + orig_coefs = estimator.coef_ + estimator.fit(X_train, y_train) + new_coefs = estimator.coef_ + + assert_array_almost_equal(orig_coefs, new_coefs) + + +def test_estimators_samples_deterministic(): + # This test is a regression test to check that with a random step + # (e.g. SparseRandomProjection) and a given random state, the results + # generated at fit time can be identically reproduced at a later time using + # data saved in object attributes. Check issue #9524 for full discussion. + + iris = load_iris() + X, y = iris.data, iris.target + + base_pipeline = make_pipeline( + SparseRandomProjection(n_components=2), LogisticRegression() + ) + clf = BaggingClassifier(estimator=base_pipeline, max_samples=0.5, random_state=0) + clf.fit(X, y) + pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy() + + estimator = clf.estimators_[0] + estimator_sample = clf.estimators_samples_[0] + estimator_feature = clf.estimators_features_[0] + + X_train = (X[estimator_sample])[:, estimator_feature] + y_train = y[estimator_sample] + + estimator.fit(X_train, y_train) + assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef) + + +def test_max_samples_consistency(): + # Make sure validated max_samples and original max_samples are identical + # when valid integer max_samples supplied by user + max_samples = 100 + X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1) + bagging = BaggingClassifier( + KNeighborsClassifier(), + max_samples=max_samples, + max_features=0.5, + random_state=1, + ) + bagging.fit(X, y) + assert bagging._max_samples == max_samples + + +def test_set_oob_score_label_encoding(): + # Make sure the oob_score doesn't change when the labels change + # See: https://github.com/scikit-learn/scikit-learn/issues/8933 + random_state = 5 + X = [[-1], [0], [1]] * 5 + Y1 = ["A", "B", "C"] * 5 + Y2 = [-1, 0, 1] * 5 + Y3 = [0, 1, 2] * 5 + x1 = ( + BaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y1) + .oob_score_ + ) + x2 = ( + BaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y2) + .oob_score_ + ) + x3 = ( + BaggingClassifier(oob_score=True, random_state=random_state) + .fit(X, Y3) + .oob_score_ + ) + assert [x1, x2] == [x3, x3] + + +def replace(X): + X = X.astype("float", copy=True) + X[~np.isfinite(X)] = 0 + return X + + +def test_bagging_regressor_with_missing_inputs(): + # Check that BaggingRegressor can accept X with missing/infinite data + X = np.array( + [ + [1, 3, 5], + [2, None, 6], + [2, np.nan, 6], + [2, np.inf, 6], + [2, -np.inf, 6], + ] + ) + y_values = [ + np.array([2, 3, 3, 3, 3]), + np.array( + [ + [2, 1, 9], + 
[3, 6, 8], + [3, 6, 8], + [3, 6, 8], + [3, 6, 8], + ] + ), + ] + for y in y_values: + regressor = DecisionTreeRegressor() + pipeline = make_pipeline(FunctionTransformer(replace), regressor) + pipeline.fit(X, y).predict(X) + bagging_regressor = BaggingRegressor(pipeline) + y_hat = bagging_regressor.fit(X, y).predict(X) + assert y.shape == y_hat.shape + + # Verify that exceptions can be raised by wrapper regressor + regressor = DecisionTreeRegressor() + pipeline = make_pipeline(regressor) + with pytest.raises(ValueError): + pipeline.fit(X, y) + bagging_regressor = BaggingRegressor(pipeline) + with pytest.raises(ValueError): + bagging_regressor.fit(X, y) + + +def test_bagging_classifier_with_missing_inputs(): + # Check that BaggingClassifier can accept X with missing/infinite data + X = np.array( + [ + [1, 3, 5], + [2, None, 6], + [2, np.nan, 6], + [2, np.inf, 6], + [2, -np.inf, 6], + ] + ) + y = np.array([3, 6, 6, 6, 6]) + classifier = DecisionTreeClassifier() + pipeline = make_pipeline(FunctionTransformer(replace), classifier) + pipeline.fit(X, y).predict(X) + bagging_classifier = BaggingClassifier(pipeline) + bagging_classifier.fit(X, y) + y_hat = bagging_classifier.predict(X) + assert y.shape == y_hat.shape + bagging_classifier.predict_log_proba(X) + bagging_classifier.predict_proba(X) + + # Verify that exceptions can be raised by wrapper classifier + classifier = DecisionTreeClassifier() + pipeline = make_pipeline(classifier) + with pytest.raises(ValueError): + pipeline.fit(X, y) + bagging_classifier = BaggingClassifier(pipeline) + with pytest.raises(ValueError): + bagging_classifier.fit(X, y) + + +def test_bagging_small_max_features(): + # Check that Bagging estimator can accept low fractional max_features + + X = np.array([[1, 2], [3, 4]]) + y = np.array([1, 0]) + + bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1) + bagging.fit(X, y) + + +def test_bagging_get_estimators_indices(global_random_seed): + # Check that Bagging estimator can generate sample indices properly + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16436 + + rng = np.random.RandomState(global_random_seed) + X = rng.randn(13, 4) + y = np.arange(13) + + class MyEstimator(DecisionTreeRegressor): + """An estimator which stores y indices information at fit.""" + + def fit(self, X, y): + self._sample_indices = y + + clf = BaggingRegressor(estimator=MyEstimator(), n_estimators=1, random_state=0) + clf.fit(X, y) + + assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0]) + + +@pytest.mark.parametrize( + "bagging, expected_allow_nan", + [ + (BaggingClassifier(HistGradientBoostingClassifier(max_iter=1)), True), + (BaggingRegressor(HistGradientBoostingRegressor(max_iter=1)), True), + (BaggingClassifier(LogisticRegression()), False), + (BaggingRegressor(SVR()), False), + ], +) +def test_bagging_allow_nan_tag(bagging, expected_allow_nan): + """Check that bagging inherits allow_nan tag.""" + assert bagging.__sklearn_tags__().input_tags.allow_nan == expected_allow_nan + + +# Metadata Routing Tests +# ====================== + + +@config_context(enable_metadata_routing=True) +@pytest.mark.parametrize( + "model", + [ + BaggingClassifier( + estimator=RandomForestClassifier(n_estimators=1), n_estimators=1 + ), + BaggingRegressor( + estimator=RandomForestRegressor(n_estimators=1), n_estimators=1 + ), + ], +) +def test_bagging_with_metadata_routing(model): + """Make sure that metadata routing works with non-default estimator.""" + 
model.fit(iris.data, iris.target) + + +@pytest.mark.parametrize( + "sub_estimator, caller, callee", + [ + (ConsumingClassifierWithoutPredictProba, "predict", "predict"), + ( + ConsumingClassifierWithoutPredictLogProba, + "predict_log_proba", + "predict_proba", + ), + (ConsumingClassifierWithOnlyPredict, "predict_log_proba", "predict"), + ], +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_with_dynamic_method_selection(sub_estimator, caller, callee): + """Test that metadata routing works in `BaggingClassifier` with dynamic selection of + the sub-estimator's methods. Here we test only specific test cases, where + sub-estimator methods are not present and are not tested with `ConsumingClassifier` + (which possesses all the methods) in + sklearn/tests/test_metaestimators_metadata_routing.py: `BaggingClassifier.predict()` + dynamically routes to `predict` if the sub-estimator doesn't have `predict_proba` + and `BaggingClassifier.predict_log_proba()` dynamically routes to `predict_proba` if + the sub-estimator doesn't have `predict_log_proba`, or to `predict`, if it doesn't + have it. + """ + X = np.array([[0, 2], [1, 4], [2, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1], "a" + registry = _Registry() + estimator = sub_estimator(registry=registry) + set_callee_request = "set_" + callee + "_request" + getattr(estimator, set_callee_request)(sample_weight=True, metadata=True) + + bagging = BaggingClassifier(estimator=estimator) + bagging.fit(X, y) + getattr(bagging, caller)( + X=np.array([[1, 1], [1, 3], [0, 2]]), + sample_weight=sample_weight, + metadata=metadata, + ) + + assert len(registry) + for estimator in registry: + check_recorded_metadata( + obj=estimator, + method=callee, + parent=caller, + sample_weight=sample_weight, + metadata=metadata, + ) + + +# End of Metadata Routing Tests +# ============================= + + +@pytest.mark.parametrize( + "model", + [ + BaggingClassifier( + estimator=AdaBoostClassifier(n_estimators=1), + n_estimators=1, + ), + BaggingRegressor(estimator=AdaBoostRegressor(n_estimators=1), n_estimators=1), + ], +) +def test_bagging_without_support_metadata_routing(model): + """Make sure that we still can use an estimator that does not implement the + metadata routing.""" + model.fit(iris.data, iris.target) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..95a852b8a7cc50e3b4440461e7ed5f5facde3e69 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_base.py @@ -0,0 +1,109 @@ +""" +Testing for the base module (sklearn.ensemble.base). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections import OrderedDict + +import numpy as np + +from sklearn.datasets import load_iris +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.ensemble import BaggingClassifier +from sklearn.ensemble._base import _set_random_states +from sklearn.feature_selection import SelectFromModel +from sklearn.linear_model import Perceptron +from sklearn.pipeline import Pipeline + + +def test_base(): + # Check BaseEnsemble methods. 
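+    # `_make_estimator` clones the base estimator and appends the clone to
+    # `estimators_` (unless `append=False`); when a RandomState instance is
+    # passed, each clone receives a freshly drawn integer `random_state`.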
+ ensemble = BaggingClassifier( + estimator=Perceptron(random_state=None), n_estimators=3 + ) + + iris = load_iris() + ensemble.fit(iris.data, iris.target) + ensemble.estimators_ = [] # empty the list and create estimators manually + + ensemble._make_estimator() + random_state = np.random.RandomState(3) + ensemble._make_estimator(random_state=random_state) + ensemble._make_estimator(random_state=random_state) + ensemble._make_estimator(append=False) + + assert 3 == len(ensemble) + assert 3 == len(ensemble.estimators_) + + assert isinstance(ensemble[0], Perceptron) + assert ensemble[0].random_state is None + assert isinstance(ensemble[1].random_state, int) + assert isinstance(ensemble[2].random_state, int) + assert ensemble[1].random_state != ensemble[2].random_state + + np_int_ensemble = BaggingClassifier( + estimator=Perceptron(), n_estimators=np.int32(3) + ) + np_int_ensemble.fit(iris.data, iris.target) + + +def test_set_random_states(): + # Linear Discriminant Analysis doesn't have random state: smoke test + _set_random_states(LinearDiscriminantAnalysis(), random_state=17) + + clf1 = Perceptron(random_state=None) + assert clf1.random_state is None + # check random_state is None still sets + _set_random_states(clf1, None) + assert isinstance(clf1.random_state, int) + + # check random_state fixes results in consistent initialisation + _set_random_states(clf1, 3) + assert isinstance(clf1.random_state, int) + clf2 = Perceptron(random_state=None) + _set_random_states(clf2, 3) + assert clf1.random_state == clf2.random_state + + # nested random_state + + def make_steps(): + return [ + ("sel", SelectFromModel(Perceptron(random_state=None))), + ("clf", Perceptron(random_state=None)), + ] + + est1 = Pipeline(make_steps()) + _set_random_states(est1, 3) + assert isinstance(est1.steps[0][1].estimator.random_state, int) + assert isinstance(est1.steps[1][1].random_state, int) + assert ( + est1.get_params()["sel__estimator__random_state"] + != est1.get_params()["clf__random_state"] + ) + + # ensure multiple random_state parameters are invariant to get_params() + # iteration order + + class AlphaParamPipeline(Pipeline): + def get_params(self, *args, **kwargs): + params = Pipeline.get_params(self, *args, **kwargs).items() + return OrderedDict(sorted(params)) + + class RevParamPipeline(Pipeline): + def get_params(self, *args, **kwargs): + params = Pipeline.get_params(self, *args, **kwargs).items() + return OrderedDict(sorted(params, reverse=True)) + + for cls in [AlphaParamPipeline, RevParamPipeline]: + est2 = cls(make_steps()) + _set_random_states(est2, 3) + assert ( + est1.get_params()["sel__estimator__random_state"] + == est2.get_params()["sel__estimator__random_state"] + ) + assert ( + est1.get_params()["clf__random_state"] + == est2.get_params()["clf__random_state"] + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..6e83512ccd1d673951655c4572ac294fdda52af2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_common.py @@ -0,0 +1,262 @@ +import numpy as np +import pytest + +from sklearn.base import ClassifierMixin, clone, is_classifier +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + StackingClassifier, + StackingRegressor, + VotingClassifier, + 
VotingRegressor, +) +from sklearn.impute import SimpleImputer +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.pipeline import make_pipeline +from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR + +X, y = load_iris(return_X_y=True) + +X_r, y_r = load_diabetes(return_X_y=True) + + +@pytest.mark.parametrize( + "X, y, estimator", + [ + ( + *make_classification(n_samples=10), + StackingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC()), + ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)), + ], + cv=2, + ), + ), + ( + *make_classification(n_samples=10), + VotingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC()), + ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)), + ] + ), + ), + ( + *make_regression(n_samples=10), + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR()), + ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)), + ], + cv=2, + ), + ), + ( + *make_regression(n_samples=10), + VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR()), + ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)), + ] + ), + ), + ], + ids=[ + "stacking-classifier", + "voting-classifier", + "stacking-regressor", + "voting-regressor", + ], +) +def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator): + # check that the behavior of `estimators`, `estimators_`, + # `named_estimators`, `named_estimators_` is consistent across all + # ensemble classes and when using `set_params()`. + + # before fit + assert "svm" in estimator.named_estimators + assert estimator.named_estimators.svm is estimator.estimators[1][1] + assert estimator.named_estimators.svm is estimator.named_estimators["svm"] + + # check fitted attributes + estimator.fit(X, y) + assert len(estimator.named_estimators) == 3 + assert len(estimator.named_estimators_) == 3 + assert sorted(list(estimator.named_estimators_.keys())) == sorted( + ["lr", "svm", "rf"] + ) + + # check that set_params() does not add a new attribute + estimator_new_params = clone(estimator) + svm_estimator = SVC() if is_classifier(estimator) else SVR() + estimator_new_params.set_params(svm=svm_estimator).fit(X, y) + assert not hasattr(estimator_new_params, "svm") + assert ( + estimator_new_params.named_estimators.lr.get_params() + == estimator.named_estimators.lr.get_params() + ) + assert ( + estimator_new_params.named_estimators.rf.get_params() + == estimator.named_estimators.rf.get_params() + ) + + # check the behavior when setting an dropping an estimator + estimator_dropped = clone(estimator) + estimator_dropped.set_params(svm="drop") + estimator_dropped.fit(X, y) + assert len(estimator_dropped.named_estimators) == 3 + assert estimator_dropped.named_estimators.svm == "drop" + assert len(estimator_dropped.named_estimators_) == 3 + assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted( + ["lr", "svm", "rf"] + ) + for sub_est in estimator_dropped.named_estimators_: + # check that the correspondence is correct + assert not isinstance(sub_est, type(estimator.named_estimators.svm)) + + # check that we can set the parameters of the underlying classifier + estimator.set_params(svm__C=10.0) + estimator.set_params(rf__max_depth=5) + assert ( + estimator.get_params()["svm__C"] + == estimator.get_params()["svm"].get_params()["C"] + ) + assert ( + estimator.get_params()["rf__max_depth"] + == estimator.get_params()["rf"].get_params()["max_depth"] + ) + + 
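+
+
+# A minimal usage sketch rather than an actual scikit-learn test (the helper
+# name below is made up for illustration): it restates the public behaviour
+# verified above -- estimator names double as `set_params` prefixes for
+# nested parameters, and assigning "drop" removes a member at fit time.
+def _example_heterogeneous_ensemble_params():
+    X, y = make_classification(n_samples=10, random_state=0)
+    clf = VotingClassifier(
+        estimators=[("lr", LogisticRegression()), ("svm", LinearSVC())]
+    )
+    clf.set_params(svm__C=10.0)  # reaches the nested LinearSVC's `C`
+    clf.set_params(svm="drop")  # the SVM member is skipped when fitting
+    clf.fit(X, y)
+    # dropped members stay listed, mapped to the string "drop"
+    assert clf.named_estimators_["svm"] == "drop"
+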
+@pytest.mark.parametrize( + "Ensemble", + [VotingClassifier, StackingRegressor, VotingRegressor], +) +def test_ensemble_heterogeneous_estimators_type(Ensemble): + # check that ensemble will fail during validation if the underlying + # estimators are not of the same type (i.e. classifier or regressor) + # StackingClassifier can have an underlying regresor so it's not checked + if issubclass(Ensemble, ClassifierMixin): + X, y = make_classification(n_samples=10) + estimators = [("lr", LinearRegression())] + ensemble_type = "classifier" + else: + X, y = make_regression(n_samples=10) + estimators = [("lr", LogisticRegression())] + ensemble_type = "regressor" + ensemble = Ensemble(estimators=estimators) + + err_msg = "should be a {}".format(ensemble_type) + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + +@pytest.mark.parametrize( + "X, y, Ensemble", + [ + (*make_classification(n_samples=10), StackingClassifier), + (*make_classification(n_samples=10), VotingClassifier), + (*make_regression(n_samples=10), StackingRegressor), + (*make_regression(n_samples=10), VotingRegressor), + ], +) +def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble): + # raise an error when the name contains dunder + if issubclass(Ensemble, ClassifierMixin): + estimators = [("lr__", LogisticRegression())] + else: + estimators = [("lr__", LinearRegression())] + ensemble = Ensemble(estimators=estimators) + + err_msg = r"Estimator names must not contain __: got \['lr__'\]" + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + # raise an error when the name is not unique + if issubclass(Ensemble, ClassifierMixin): + estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())] + else: + estimators = [("lr", LinearRegression()), ("lr", LinearRegression())] + ensemble = Ensemble(estimators=estimators) + + err_msg = r"Names provided are not unique: \['lr', 'lr'\]" + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + # raise an error when the name conflicts with the parameters + if issubclass(Ensemble, ClassifierMixin): + estimators = [("estimators", LogisticRegression())] + else: + estimators = [("estimators", LinearRegression())] + ensemble = Ensemble(estimators=estimators) + + err_msg = "Estimator names conflict with constructor arguments" + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + +@pytest.mark.parametrize( + "X, y, estimator", + [ + ( + *make_classification(n_samples=10), + StackingClassifier(estimators=[("lr", LogisticRegression())]), + ), + ( + *make_classification(n_samples=10), + VotingClassifier(estimators=[("lr", LogisticRegression())]), + ), + ( + *make_regression(n_samples=10), + StackingRegressor(estimators=[("lr", LinearRegression())]), + ), + ( + *make_regression(n_samples=10), + VotingRegressor(estimators=[("lr", LinearRegression())]), + ), + ], + ids=[ + "stacking-classifier", + "voting-classifier", + "stacking-regressor", + "voting-regressor", + ], +) +def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator): + # check that we raise a consistent error when all estimators are + # dropped + estimator.set_params(lr="drop") + with pytest.raises(ValueError, match="All estimators are dropped."): + estimator.fit(X, y) + + +@pytest.mark.parametrize( + "Ensemble, Estimator, X, y", + [ + (StackingClassifier, LogisticRegression, X, y), + (StackingRegressor, LinearRegression, X_r, y_r), + (VotingClassifier, LogisticRegression, X, y), + (VotingRegressor, LinearRegression, X_r, y_r), 
+ ], +) +# FIXME: we should move this test in `estimator_checks` once we are able +# to construct meta-estimator instances +def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y): + # check that Voting and Stacking predictor delegate the missing values + # validation to the underlying estimator. + X = X.copy() + mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool) + X[mask] = np.nan + pipe = make_pipeline(SimpleImputer(), Estimator()) + ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)]) + ensemble.fit(X, y).score(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_forest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_forest.py new file mode 100644 index 0000000000000000000000000000000000000000..5dec5c7ab90b2635aa58f0735859a6fdfea7d0ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_forest.py @@ -0,0 +1,1865 @@ +""" +Testing for the forest module (sklearn.ensemble.forest). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import math +import pickle +from collections import defaultdict +from functools import partial +from itertools import combinations, product +from typing import Any, Dict +from unittest.mock import patch + +import joblib +import numpy as np +import pytest +from scipy.special import comb + +import sklearn +from sklearn import clone, datasets +from sklearn.datasets import make_classification, make_hastie_10_2 +from sklearn.decomposition import TruncatedSVD +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, +) +from sklearn.ensemble._forest import ( + _generate_unsampled_indices, + _get_n_samples_bootstrap, +) +from sklearn.exceptions import NotFittedError +from sklearn.metrics import ( + explained_variance_score, + f1_score, + mean_poisson_deviance, + mean_squared_error, +) +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split +from sklearn.svm import LinearSVC +from sklearn.tree._classes import SPARSE_SPLITTERS +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, + skip_if_no_parallel, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.parallel import Parallel +from sklearn.utils.validation import check_random_state + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] + +# Larger classification sample used for testing feature importances +X_large, y_large = datasets.make_classification( + n_samples=500, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, +) + +# also load the iris dataset +# and randomly permute it +iris = datasets.load_iris() +rng = check_random_state(0) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +# Make regression dataset +X_reg, y_reg = datasets.make_regression(n_samples=500, n_features=10, random_state=1) + +# also make a hastie_10_2 dataset +hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1) +hastie_X = 
hastie_X.astype(np.float32) + +# Get the default backend in joblib to test parallelism and interaction with +# different backends +DEFAULT_JOBLIB_BACKEND = joblib.parallel.get_active_backend()[0].__class__ + +FOREST_CLASSIFIERS = { + "ExtraTreesClassifier": ExtraTreesClassifier, + "RandomForestClassifier": RandomForestClassifier, +} + +FOREST_REGRESSORS = { + "ExtraTreesRegressor": ExtraTreesRegressor, + "RandomForestRegressor": RandomForestRegressor, +} + +FOREST_TRANSFORMERS = { + "RandomTreesEmbedding": RandomTreesEmbedding, +} + +FOREST_ESTIMATORS: Dict[str, Any] = dict() +FOREST_ESTIMATORS.update(FOREST_CLASSIFIERS) +FOREST_ESTIMATORS.update(FOREST_REGRESSORS) +FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) + +FOREST_CLASSIFIERS_REGRESSORS: Dict[str, Any] = FOREST_CLASSIFIERS.copy() +FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier(n_estimators=10, max_features=1, random_state=1) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +@pytest.mark.parametrize("criterion", ("gini", "log_loss")) +def test_iris_criterion(name, criterion): + # Check consistency on dataset iris. + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, criterion=criterion, random_state=1) + clf.fit(iris.data, iris.target) + score = clf.score(iris.data, iris.target) + assert score > 0.9, "Failed with criterion %s and score = %f" % (criterion, score) + + clf = ForestClassifier( + n_estimators=10, criterion=criterion, max_features=2, random_state=1 + ) + clf.fit(iris.data, iris.target) + score = clf.score(iris.data, iris.target) + assert score > 0.5, "Failed with criterion %s and score = %f" % (criterion, score) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.93, ( + "Failed with max_features=None, criterion %s and score = %f" + % ( + criterion, + score, + ) + ) + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, max_features=6, random_state=1 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +def test_poisson_vs_mse(): + """Test that random forest with poisson criterion performs better than + mse for a poisson target. + + There is a similar test for DecisionTreeRegressor. + """ + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) + # We create a log-linear Poisson model and downscale coef as it will get + # exponentiated. 
+ coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=rng + ) + # We prevent some overfitting by setting min_samples_split=10. + forest_poi = RandomForestRegressor( + criterion="poisson", min_samples_leaf=10, max_features="sqrt", random_state=rng + ) + forest_mse = RandomForestRegressor( + criterion="squared_error", + min_samples_leaf=10, + max_features="sqrt", + random_state=rng, + ) + + forest_poi.fit(X_train, y_train) + forest_mse.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y, data_name in [(X_train, y_train, "train"), (X_test, y_test, "test")]: + metric_poi = mean_poisson_deviance(y, forest_poi.predict(X)) + # squared_error forest might produce non-positive predictions => clip + # If y = 0 for those, the poisson deviance gets too good. + # If we drew more samples, we would eventually get y > 0 and the + # poisson deviance would explode, i.e. be undefined. Therefore, we do + # not clip to a tiny value like 1e-15, but to 1e-6. This acts like a + # small penalty to the non-positive predictions. + metric_mse = mean_poisson_deviance( + y, np.clip(forest_mse.predict(X), 1e-6, None) + ) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + # As squared_error might correctly predict 0 in train set, its train + # score can be better than Poisson. This is no longer the case for the + # test set. But keep the above comment for clipping in mind. + if data_name == "test": + assert metric_poi < metric_mse + assert metric_poi < 0.8 * metric_dummy + + +@pytest.mark.parametrize("criterion", ("poisson", "squared_error")) +def test_balance_property_random_forest(criterion): + """ "Test that sum(y_pred)==sum(y_true) on the training set.""" + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 500, 10 + X = datasets.make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) + + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + + reg = RandomForestRegressor( + criterion=criterion, n_estimators=10, bootstrap=False, random_state=rng + ) + reg.fit(X, y) + + assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +def test_regressor_attributes(name): + # Regression models should not have a classes_ attribute. + r = FOREST_REGRESSORS[name](random_state=0) + assert not hasattr(r, "classes_") + assert not hasattr(r, "n_classes_") + + r.fit([[1, 2, 3], [4, 5, 6]], [1, 2]) + assert not hasattr(r, "classes_") + assert not hasattr(r, "n_classes_") + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_probability(name): + # Predict probabilities. 
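+    # The class probabilities must sum to one for every sample and must be
+    # consistent with the log-probabilities returned by predict_log_proba.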
+ ForestClassifier = FOREST_CLASSIFIERS[name] + with np.errstate(divide="ignore"): + clf = ForestClassifier( + n_estimators=10, random_state=1, max_features=1, max_depth=1 + ) + clf.fit(iris.data, iris.target) + assert_array_almost_equal( + np.sum(clf.predict_proba(iris.data), axis=1), np.ones(iris.data.shape[0]) + ) + assert_array_almost_equal( + clf.predict_proba(iris.data), np.exp(clf.predict_log_proba(iris.data)) + ) + + +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) +@pytest.mark.parametrize( + "name, criterion", + itertools.chain( + product(FOREST_CLASSIFIERS, ["gini", "log_loss"]), + product(FOREST_REGRESSORS, ["squared_error", "friedman_mse", "absolute_error"]), + ), +) +def test_importances(dtype, name, criterion): + tolerance = 0.01 + if name in FOREST_REGRESSORS and criterion == "absolute_error": + tolerance = 0.05 + + # cast as dtype + X = X_large.astype(dtype, copy=False) + y = y_large.astype(dtype, copy=False) + + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator(n_estimators=10, criterion=criterion, random_state=0) + est.fit(X, y) + importances = est.feature_importances_ + + # The forest estimator can detect that only the first 3 features of the + # dataset are informative: + n_important = np.sum(importances > 0.1) + assert importances.shape[0] == 10 + assert n_important == 3 + assert np.all(importances[:3] > 0.1) + + # Check with parallel + importances = est.feature_importances_ + est.set_params(n_jobs=2) + importances_parallel = est.feature_importances_ + assert_array_almost_equal(importances, importances_parallel) + + # Check with sample weights + sample_weight = check_random_state(0).randint(1, 10, len(X)) + est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) + est.fit(X, y, sample_weight=sample_weight) + importances = est.feature_importances_ + assert np.all(importances >= 0.0) + + for scale in [0.5, 100]: + est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) + est.fit(X, y, sample_weight=scale * sample_weight) + importances_bis = est.feature_importances_ + assert np.abs(importances - importances_bis).mean() < tolerance + + +def test_importances_asymptotic(): + # Check whether variable importances of totally randomized trees + # converge towards their theoretical values (See Louppe et al, + # Understanding variable importances in forests of randomized trees, 2013). 
+ + def binomial(k, n): + return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True) + + def entropy(samples): + n_samples = len(samples) + entropy = 0.0 + + for count in np.bincount(samples): + p = 1.0 * count / n_samples + if p > 0: + entropy -= p * np.log2(p) + + return entropy + + def mdi_importance(X_m, X, y): + n_samples, n_features = X.shape + + features = list(range(n_features)) + features.pop(X_m) + values = [np.unique(X[:, i]) for i in range(n_features)] + + imp = 0.0 + + for k in range(n_features): + # Weight of each B of size k + coef = 1.0 / (binomial(k, n_features) * (n_features - k)) + + # For all B of size k + for B in combinations(features, k): + # For all values B=b + for b in product(*[values[B[j]] for j in range(k)]): + mask_b = np.ones(n_samples, dtype=bool) + + for j in range(k): + mask_b &= X[:, B[j]] == b[j] + + X_, y_ = X[mask_b, :], y[mask_b] + n_samples_b = len(X_) + + if n_samples_b > 0: + children = [] + + for xi in values[X_m]: + mask_xi = X_[:, X_m] == xi + children.append(y_[mask_xi]) + + imp += ( + coef + * (1.0 * n_samples_b / n_samples) # P(B=b) + * ( + entropy(y_) + - sum( + [ + entropy(c) * len(c) / n_samples_b + for c in children + ] + ) + ) + ) + + return imp + + data = np.array( + [ + [0, 0, 1, 0, 0, 1, 0, 1], + [1, 0, 1, 1, 1, 0, 1, 2], + [1, 0, 1, 1, 0, 1, 1, 3], + [0, 1, 1, 1, 0, 1, 0, 4], + [1, 1, 0, 1, 0, 1, 1, 5], + [1, 1, 0, 1, 1, 1, 1, 6], + [1, 0, 1, 0, 0, 1, 0, 7], + [1, 1, 1, 1, 1, 1, 1, 8], + [1, 1, 1, 1, 0, 1, 1, 9], + [1, 1, 1, 0, 1, 1, 1, 0], + ] + ) + + X, y = np.array(data[:, :7], dtype=bool), data[:, 7] + n_features = X.shape[1] + + # Compute true importances + true_importances = np.zeros(n_features) + + for i in range(n_features): + true_importances[i] = mdi_importance(i, X, y) + + # Estimate importances with totally randomized trees + clf = ExtraTreesClassifier( + n_estimators=500, max_features=1, criterion="log_loss", random_state=0 + ).fit(X, y) + + importances = ( + sum( + tree.tree_.compute_feature_importances(normalize=False) + for tree in clf.estimators_ + ) + / clf.n_estimators + ) + + # Check correctness + assert_almost_equal(entropy(y), sum(importances)) + assert np.abs(true_importances - importances).mean() < 0.01 + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_unfitted_feature_importances(name): + err_msg = ( + "This {} instance is not fitted yet. 
Call 'fit' with " + "appropriate arguments before using this estimator.".format(name) + ) + with pytest.raises(NotFittedError, match=err_msg): + getattr(FOREST_ESTIMATORS[name](), "feature_importances_") + + +@pytest.mark.parametrize("ForestClassifier", FOREST_CLASSIFIERS.values()) +@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize( + "X, y, lower_bound_accuracy", + [ + ( + *datasets.make_classification(n_samples=300, n_classes=2, random_state=0), + 0.9, + ), + ( + *datasets.make_classification( + n_samples=1000, n_classes=3, n_informative=6, random_state=0 + ), + 0.65, + ), + ( + iris.data, + iris.target * 2 + 1, + 0.65, + ), + ( + *datasets.make_multilabel_classification(n_samples=300, random_state=0), + 0.18, + ), + ], +) +@pytest.mark.parametrize("oob_score", [True, partial(f1_score, average="micro")]) +def test_forest_classifier_oob( + ForestClassifier, X, y, X_type, lower_bound_accuracy, oob_score +): + """Check that OOB score is close to score on a test set.""" + X = _convert_container(X, constructor_name=X_type) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=0.5, + random_state=0, + ) + classifier = ForestClassifier( + n_estimators=40, + bootstrap=True, + oob_score=oob_score, + random_state=0, + ) + + assert not hasattr(classifier, "oob_score_") + assert not hasattr(classifier, "oob_decision_function_") + + classifier.fit(X_train, y_train) + if callable(oob_score): + test_score = oob_score(y_test, classifier.predict(X_test)) + else: + test_score = classifier.score(X_test, y_test) + assert classifier.oob_score_ >= lower_bound_accuracy + + abs_diff = abs(test_score - classifier.oob_score_) + assert abs_diff <= 0.11, f"{abs_diff=} is greater than 0.11" + + assert hasattr(classifier, "oob_score_") + assert not hasattr(classifier, "oob_prediction_") + assert hasattr(classifier, "oob_decision_function_") + + if y.ndim == 1: + expected_shape = (X_train.shape[0], len(set(y))) + else: + expected_shape = (X_train.shape[0], len(set(y[:, 0])), y.shape[1]) + assert classifier.oob_decision_function_.shape == expected_shape + + +@pytest.mark.parametrize("ForestRegressor", FOREST_REGRESSORS.values()) +@pytest.mark.parametrize("X_type", ["array", "sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize( + "X, y, lower_bound_r2", + [ + ( + *datasets.make_regression( + n_samples=500, n_features=10, n_targets=1, random_state=0 + ), + 0.7, + ), + ( + *datasets.make_regression( + n_samples=500, n_features=10, n_targets=2, random_state=0 + ), + 0.55, + ), + ], +) +@pytest.mark.parametrize("oob_score", [True, explained_variance_score]) +def test_forest_regressor_oob(ForestRegressor, X, y, X_type, lower_bound_r2, oob_score): + """Check that forest-based regressor provide an OOB score close to the + score on a test set.""" + X = _convert_container(X, constructor_name=X_type) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + test_size=0.5, + random_state=0, + ) + regressor = ForestRegressor( + n_estimators=50, + bootstrap=True, + oob_score=oob_score, + random_state=0, + ) + + assert not hasattr(regressor, "oob_score_") + assert not hasattr(regressor, "oob_prediction_") + + regressor.fit(X_train, y_train) + if callable(oob_score): + test_score = oob_score(y_test, regressor.predict(X_test)) + else: + test_score = regressor.score(X_test, y_test) + assert regressor.oob_score_ >= lower_bound_r2 + + assert abs(test_score - regressor.oob_score_) <= 0.1 + + assert hasattr(regressor, "oob_score_") + assert 
hasattr(regressor, "oob_prediction_")
+ assert not hasattr(regressor, "oob_decision_function_")
+
+ if y.ndim == 1:
+ expected_shape = (X_train.shape[0],)
+ else:
+ expected_shape = (X_train.shape[0], y.ndim)
+ assert regressor.oob_prediction_.shape == expected_shape
+
+
+@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values())
+def test_forest_oob_warning(ForestEstimator):
+ """Check that a warning is raised when there are not enough estimators and
+ the OOB estimates will be inaccurate."""
+ estimator = ForestEstimator(
+ n_estimators=1,
+ oob_score=True,
+ bootstrap=True,
+ random_state=0,
+ )
+ with pytest.warns(UserWarning, match="Some inputs do not have OOB scores"):
+ estimator.fit(iris.data, iris.target)
+
+
+@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values())
+def test_forest_oob_score_requires_bootstrap(ForestEstimator):
+ """Check that we raise an error if OOB score is requested without
+ activating bootstrapping.
+ """
+ X = iris.data
+ y = iris.target
+ err_msg = "Out of bag estimation only available if bootstrap=True"
+ estimator = ForestEstimator(oob_score=True, bootstrap=False)
+ with pytest.raises(ValueError, match=err_msg):
+ estimator.fit(X, y)
+
+
+@pytest.mark.parametrize("ForestClassifier", FOREST_CLASSIFIERS.values())
+def test_classifier_error_oob_score_multiclass_multioutput(ForestClassifier):
+ """Check that we raise an error when requesting an OOB score with a
+ multiclass-multioutput classification target.
+ """
+ rng = np.random.RandomState(42)
+ X = iris.data
+ y = rng.randint(low=0, high=5, size=(iris.data.shape[0], 2))
+ y_type = type_of_target(y)
+ assert y_type == "multiclass-multioutput"
+ estimator = ForestClassifier(oob_score=True, bootstrap=True)
+ err_msg = "The type of target cannot be used to compute OOB estimates"
+ with pytest.raises(ValueError, match=err_msg):
+ estimator.fit(X, y)
+
+
+@pytest.mark.parametrize("ForestRegressor", FOREST_REGRESSORS.values())
+def test_forest_multioutput_integral_regression_target(ForestRegressor):
+ """Check that multioutput regression with integral values is not interpreted
+ as a multiclass-multioutput target and OOB score can be computed.
+ """ + rng = np.random.RandomState(42) + X = iris.data + y = rng.randint(low=0, high=10, size=(iris.data.shape[0], 2)) + estimator = ForestRegressor( + n_estimators=30, oob_score=True, bootstrap=True, random_state=0 + ) + estimator.fit(X, y) + + n_samples_bootstrap = _get_n_samples_bootstrap(len(X), estimator.max_samples) + n_samples_test = X.shape[0] // 4 + oob_pred = np.zeros([n_samples_test, 2]) + for sample_idx, sample in enumerate(X[:n_samples_test]): + n_samples_oob = 0 + oob_pred_sample = np.zeros(2) + for tree in estimator.estimators_: + oob_unsampled_indices = _generate_unsampled_indices( + tree.random_state, len(X), n_samples_bootstrap + ) + if sample_idx in oob_unsampled_indices: + n_samples_oob += 1 + oob_pred_sample += tree.predict(sample.reshape(1, -1)).squeeze() + oob_pred[sample_idx] = oob_pred_sample / n_samples_oob + assert_allclose(oob_pred, estimator.oob_prediction_[:n_samples_test]) + + +@pytest.mark.parametrize("oob_score", [True, False]) +def test_random_trees_embedding_raise_error_oob(oob_score): + with pytest.raises(TypeError, match="got an unexpected keyword argument"): + RandomTreesEmbedding(oob_score=oob_score) + with pytest.raises(NotImplementedError, match="OOB score not supported"): + RandomTreesEmbedding()._set_oob_score_and_attributes(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_gridsearch(name): + # Check that base trees can be grid-searched. + forest = FOREST_CLASSIFIERS[name]() + clf = GridSearchCV(forest, {"n_estimators": (1, 2), "max_depth": (1, 2)}) + clf.fit(iris.data, iris.target) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_parallel(name): + """Check parallel computations in classification""" + if name in FOREST_CLASSIFIERS: + X = iris.data + y = iris.target + elif name in FOREST_REGRESSORS: + X = X_reg + y = y_reg + + ForestEstimator = FOREST_ESTIMATORS[name] + forest = ForestEstimator(n_estimators=10, n_jobs=3, random_state=0) + + forest.fit(X, y) + assert len(forest) == 10 + + forest.set_params(n_jobs=1) + y1 = forest.predict(X) + forest.set_params(n_jobs=2) + y2 = forest.predict(X) + assert_array_almost_equal(y1, y2, 3) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_pickle(name): + # Check pickability. + if name in FOREST_CLASSIFIERS: + X = iris.data[::2] + y = iris.target[::2] + elif name in FOREST_REGRESSORS: + X = X_reg[::2] + y = y_reg[::2] + + ForestEstimator = FOREST_ESTIMATORS[name] + obj = ForestEstimator(random_state=0) + obj.fit(X, y) + score = obj.score(X, y) + pickle_object = pickle.dumps(obj) + + obj2 = pickle.loads(pickle_object) + assert type(obj2) == obj.__class__ + score2 = obj2.score(X, y) + assert score == score2 + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput(name): + # Check estimators on multi-output problems. 
+ + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) + y_pred = est.fit(X_train, y_train).predict(X_test) + assert_array_almost_equal(y_pred, y_test) + + if name in FOREST_CLASSIFIERS: + with np.errstate(divide="ignore"): + proba = est.predict_proba(X_test) + assert len(proba) == 2 + assert proba[0].shape == (4, 2) + assert proba[1].shape == (4, 4) + + log_proba = est.predict_log_proba(X_test) + assert len(log_proba) == 2 + assert log_proba[0].shape == (4, 2) + assert log_proba[1].shape == (4, 4) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_multioutput_string(name): + # Check estimators on multi-output problems with string outputs. + + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + ["red", "blue"], + ["red", "blue"], + ["red", "blue"], + ["green", "green"], + ["green", "green"], + ["green", "green"], + ["red", "purple"], + ["red", "purple"], + ["red", "purple"], + ["green", "yellow"], + ["green", "yellow"], + ["green", "yellow"], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [ + ["red", "blue"], + ["green", "green"], + ["red", "purple"], + ["green", "yellow"], + ] + + est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) + y_pred = est.fit(X_train, y_train).predict(X_test) + assert_array_equal(y_pred, y_test) + + with np.errstate(divide="ignore"): + proba = est.predict_proba(X_test) + assert len(proba) == 2 + assert proba[0].shape == (4, 2) + assert proba[1].shape == (4, 4) + + log_proba = est.predict_log_proba(X_test) + assert len(log_proba) == 2 + assert log_proba[0].shape == (4, 2) + assert log_proba[1].shape == (4, 4) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classes_shape(name): + # Test that n_classes_ and classes_ have proper shape. + ForestClassifier = FOREST_CLASSIFIERS[name] + + # Classification, single output + clf = ForestClassifier(random_state=0).fit(X, y) + + assert clf.n_classes_ == 2 + assert_array_equal(clf.classes_, [-1, 1]) + + # Classification, multi-output + _y = np.vstack((y, np.array(y) * 2)).T + clf = ForestClassifier(random_state=0).fit(X, _y) + + assert_array_equal(clf.n_classes_, [2, 2]) + assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]]) + + +def test_random_trees_dense_type(): + # Test that the `sparse_output` parameter of RandomTreesEmbedding + # works by returning a dense array. + + # Create the RTE with sparse=False + hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False) + X, y = datasets.make_circles(factor=0.5) + X_transformed = hasher.fit_transform(X) + + # Assert that type is ndarray, not scipy.sparse.csr_matrix + assert isinstance(X_transformed, np.ndarray) + + +def test_random_trees_dense_equal(): + # Test that the `sparse_output` parameter of RandomTreesEmbedding + # works by returning the same array for both argument values. 
+ + # Create the RTEs + hasher_dense = RandomTreesEmbedding( + n_estimators=10, sparse_output=False, random_state=0 + ) + hasher_sparse = RandomTreesEmbedding( + n_estimators=10, sparse_output=True, random_state=0 + ) + X, y = datasets.make_circles(factor=0.5) + X_transformed_dense = hasher_dense.fit_transform(X) + X_transformed_sparse = hasher_sparse.fit_transform(X) + + # Assert that dense and sparse hashers have same array. + assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense) + + +def test_random_hasher(): + # test random forest hashing on circles dataset + # make sure that it is linearly separable. + # even after projected to two SVD dimensions + # Note: Not all random_states produce perfect results. + hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) + X, y = datasets.make_circles(factor=0.5) + X_transformed = hasher.fit_transform(X) + + # test fit and transform: + hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) + assert_array_equal(hasher.fit(X).transform(X).toarray(), X_transformed.toarray()) + + # one leaf active per data point per forest + assert X_transformed.shape[0] == X.shape[0] + assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators) + svd = TruncatedSVD(n_components=2) + X_reduced = svd.fit_transform(X_transformed) + linear_clf = LinearSVC() + linear_clf.fit(X_reduced, y) + assert linear_clf.score(X_reduced, y) == 1.0 + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_random_hasher_sparse_data(csc_container): + X, y = datasets.make_multilabel_classification(random_state=0) + hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) + X_transformed = hasher.fit_transform(X) + X_transformed_sparse = hasher.fit_transform(csc_container(X)) + assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray()) + + +def test_parallel_train(): + rng = check_random_state(12321) + n_samples, n_features = 80, 30 + X_train = rng.randn(n_samples, n_features) + y_train = rng.randint(0, 2, n_samples) + + clfs = [ + RandomForestClassifier(n_estimators=20, n_jobs=n_jobs, random_state=12345).fit( + X_train, y_train + ) + for n_jobs in [1, 2, 3, 8, 16, 32] + ] + + X_test = rng.randn(n_samples, n_features) + probas = [clf.predict_proba(X_test) for clf in clfs] + for proba1, proba2 in itertools.pairwise(probas): + assert_array_almost_equal(proba1, proba2) + + +def test_distribution(): + rng = check_random_state(12321) + + # Single variable with 4 values + X = rng.randint(0, 4, size=(1000, 1)) + y = rng.rand(1000) + n_trees = 500 + + reg = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) + + uniques = defaultdict(int) + for tree in reg.estimators_: + tree = "".join( + ("%d,%d/" % (f, int(t)) if f >= 0 else "-") + for f, t in zip(tree.tree_.feature, tree.tree_.threshold) + ) + + uniques[tree] += 1 + + uniques = sorted([(1.0 * count / n_trees, tree) for tree, count in uniques.items()]) + + # On a single variable problem where X_0 has 4 equiprobable values, there + # are 5 ways to build a random tree. The more compact (0,1/0,0/--0,2/--) of + # them has probability 1/3 while the 4 others have probability 1/6. + + assert len(uniques) == 5 + assert 0.20 > uniques[0][0] # Rough approximation of 1/6. 
+ assert 0.20 > uniques[1][0] + assert 0.20 > uniques[2][0] + assert 0.20 > uniques[3][0] + assert uniques[4][0] > 0.3 + assert uniques[4][1] == "0,1/0,0/--0,2/--" + + # Two variables, one with 2 values, one with 3 values + X = np.empty((1000, 2)) + X[:, 0] = np.random.randint(0, 2, 1000) + X[:, 1] = np.random.randint(0, 3, 1000) + y = rng.rand(1000) + + reg = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) + + uniques = defaultdict(int) + for tree in reg.estimators_: + tree = "".join( + ("%d,%d/" % (f, int(t)) if f >= 0 else "-") + for f, t in zip(tree.tree_.feature, tree.tree_.threshold) + ) + + uniques[tree] += 1 + + uniques = [(count, tree) for tree, count in uniques.items()] + assert len(uniques) == 8 + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_max_leaf_nodes_max_depth(name): + X, y = hastie_X, hastie_y + + # Test precedence of max_leaf_nodes over max_depth. + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator( + max_depth=1, max_leaf_nodes=4, n_estimators=1, random_state=0 + ).fit(X, y) + assert est.estimators_[0].get_depth() == 1 + + est = ForestEstimator(max_depth=1, n_estimators=1, random_state=0).fit(X, y) + assert est.estimators_[0].get_depth() == 1 + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_min_samples_split(name): + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator(min_samples_split=10, n_estimators=1, random_state=0) + est.fit(X, y) + node_idx = est.estimators_[0].tree_.children_left != -1 + node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] + + assert np.min(node_samples) > len(X) * 0.5 - 1, "Failed with {0}".format(name) + + est = ForestEstimator(min_samples_split=0.5, n_estimators=1, random_state=0) + est.fit(X, y) + node_idx = est.estimators_[0].tree_.children_left != -1 + node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] + + assert np.min(node_samples) > len(X) * 0.5 - 1, "Failed with {0}".format(name) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_min_samples_leaf(name): + X, y = hastie_X, hastie_y + + # Test if leaves contain more than leaf_count training examples + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0) + est.fit(X, y) + out = est.estimators_[0].tree_.apply(X) + node_counts = np.bincount(out) + # drop inner nodes + leaf_count = node_counts[node_counts != 0] + assert np.min(leaf_count) > 4, "Failed with {0}".format(name) + + est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0) + est.fit(X, y) + out = est.estimators_[0].tree_.apply(X) + node_counts = np.bincount(out) + # drop inner nodes + leaf_count = node_counts[node_counts != 0] + assert np.min(leaf_count) > len(X) * 0.25 - 1, "Failed with {0}".format(name) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_min_weight_fraction_leaf(name): + X, y = hastie_X, hastie_y + + # Test if leaves contain at least min_weight_fraction_leaf of the + # training set + ForestEstimator = FOREST_ESTIMATORS[name] + rng = np.random.RandomState(0) + weights = rng.rand(X.shape[0]) + total_weight = np.sum(weights) + + # test both DepthFirstTreeBuilder and BestFirstTreeBuilder + # by setting max_leaf_nodes + for frac in np.linspace(0, 0.5, 6): + est = ForestEstimator( + min_weight_fraction_leaf=frac, n_estimators=1, random_state=0 + ) + if "RandomForest" in name: + est.bootstrap = False + + est.fit(X, y, sample_weight=weights) + out = 
est.estimators_[0].tree_.apply(X) + node_weights = np.bincount(out, weights=weights) + # drop inner nodes + leaf_weights = node_weights[node_weights != 0] + assert np.min(leaf_weights) >= total_weight * est.min_weight_fraction_leaf, ( + "Failed with {0} min_weight_fraction_leaf={1}".format( + name, est.min_weight_fraction_leaf + ) + ) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_sparse_input(name, sparse_container): + X, y = datasets.make_multilabel_classification(random_state=0, n_samples=50) + + ForestEstimator = FOREST_ESTIMATORS[name] + + dense = ForestEstimator(random_state=0, max_depth=2).fit(X, y) + sparse = ForestEstimator(random_state=0, max_depth=2).fit(sparse_container(X), y) + + assert_array_almost_equal(sparse.apply(X), dense.apply(X)) + + if name in FOREST_CLASSIFIERS or name in FOREST_REGRESSORS: + assert_array_almost_equal(sparse.predict(X), dense.predict(X)) + assert_array_almost_equal( + sparse.feature_importances_, dense.feature_importances_ + ) + + if name in FOREST_CLASSIFIERS: + assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X)) + assert_array_almost_equal( + sparse.predict_log_proba(X), dense.predict_log_proba(X) + ) + + if name in FOREST_TRANSFORMERS: + assert_array_almost_equal( + sparse.transform(X).toarray(), dense.transform(X).toarray() + ) + assert_array_almost_equal( + sparse.fit_transform(X).toarray(), dense.fit_transform(X).toarray() + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) +def test_memory_layout(name, dtype): + # Test that it works no matter the memory layout + est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) + + # Dense + for container, kwargs in ( + (np.asarray, {}), # Nothing + (np.asarray, {"order": "C"}), # C-order + (np.asarray, {"order": "F"}), # F-order + (np.ascontiguousarray, {}), # Contiguous + ): + X = container(iris.data, dtype=dtype, **kwargs) + y = iris.target + assert_array_almost_equal(est.fit(X, y).predict(X), y) + + # Sparse (if applicable) + if est.estimator.splitter in SPARSE_SPLITTERS: + for sparse_container in COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS: + X = sparse_container(iris.data, dtype=dtype) + y = iris.target + assert_array_almost_equal(est.fit(X, y).predict(X), y) + + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_almost_equal(est.fit(X, y).predict(X), y) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_1d_input(name): + X = iris.data[:, 0] + X_2d = iris.data[:, 0].reshape((-1, 1)) + y = iris.target + + with ignore_warnings(): + ForestEstimator = FOREST_ESTIMATORS[name] + with pytest.raises(ValueError): + ForestEstimator(n_estimators=1, random_state=0).fit(X, y) + + est = ForestEstimator(random_state=0) + est.fit(X_2d, y) + + if name in FOREST_CLASSIFIERS or name in FOREST_REGRESSORS: + with pytest.raises(ValueError): + est.predict(X) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_class_weights(name): + # Check class_weights resemble sample_weights behavior. 
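+ # With class_weight="balanced", the class weights are
+ # n_samples / (n_classes * np.bincount(y)), so on a target with equally
+ # frequent classes (like iris) they reduce to uniform weights and should
+ # not change the fitted forest.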
+ ForestClassifier = FOREST_CLASSIFIERS[name] + + # Iris is balanced, so no effect expected for using 'balanced' weights + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target) + clf2 = ForestClassifier(class_weight="balanced", random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Make a multi-output problem with three copies of Iris + iris_multi = np.vstack((iris.target, iris.target, iris.target)).T + # Create user-defined weights that should balance over the outputs + clf3 = ForestClassifier( + class_weight=[ + {0: 2.0, 1: 2.0, 2: 1.0}, + {0: 2.0, 1: 1.0, 2: 2.0}, + {0: 1.0, 1: 2.0, 2: 2.0}, + ], + random_state=0, + ) + clf3.fit(iris.data, iris_multi) + assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) + # Check against multi-output "balanced" which should also have no effect + clf4 = ForestClassifier(class_weight="balanced", random_state=0) + clf4.fit(iris.data, iris_multi) + assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1.0, 1: 100.0, 2: 1.0} + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + # Check that sample_weight and class_weight are multiplicative + clf1 = ForestClassifier(random_state=0) + clf1.fit(iris.data, iris.target, sample_weight**2) + clf2 = ForestClassifier(class_weight=class_weight, random_state=0) + clf2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_class_weight_balanced_and_bootstrap_multi_output(name): + # Test class_weight works for multi-output""" + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + clf = ForestClassifier(class_weight="balanced", random_state=0) + clf.fit(X, _y) + clf = ForestClassifier( + class_weight=[{-1: 0.5, 1: 1.0}, {-2: 1.0, 2: 1.0}], random_state=0 + ) + clf.fit(X, _y) + # smoke test for balanced subsample + clf = ForestClassifier(class_weight="balanced_subsample", random_state=0) + clf.fit(X, _y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_class_weight_errors(name): + # Test if class_weight raises errors and warnings when expected. + ForestClassifier = FOREST_CLASSIFIERS[name] + _y = np.vstack((y, np.array(y) * 2)).T + + # Warning warm_start with preset + clf = ForestClassifier(class_weight="balanced", warm_start=True, random_state=0) + clf.fit(X, y) + + warn_msg = ( + "Warm-start fitting without increasing n_estimators does not fit new trees." + ) + with pytest.warns(UserWarning, match=warn_msg): + clf.fit(X, _y) + + # Incorrect length list for multi-output + clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.0}], random_state=0) + with pytest.raises(ValueError): + clf.fit(X, _y) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start(name): + # Test if fitting incrementally with warm start gives a forest of the + # right size and the same results as a normal fit. 
+ X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est_ws = None + for n_estimators in [5, 10]: + if est_ws is None: + est_ws = ForestEstimator( + n_estimators=n_estimators, random_state=42, warm_start=True + ) + else: + est_ws.set_params(n_estimators=n_estimators) + est_ws.fit(X, y) + assert len(est_ws) == n_estimators + + est_no_ws = ForestEstimator(n_estimators=10, random_state=42, warm_start=False) + est_no_ws.fit(X, y) + + assert set([tree.random_state for tree in est_ws]) == set( + [tree.random_state for tree in est_no_ws] + ) + + assert_array_equal( + est_ws.apply(X), est_no_ws.apply(X), err_msg="Failed with {0}".format(name) + ) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start_clear(name): + # Test if fit clears state and grows a new forest when warm_start==False. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) + est.fit(X, y) + + est_2 = ForestEstimator( + n_estimators=5, max_depth=1, warm_start=True, random_state=2 + ) + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False, random_state=1) + est_2.fit(X, y) # clears old state and equals est + + assert_array_almost_equal(est_2.apply(X), est.apply(X)) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start_smaller_n_estimators(name): + # Test if warm start second fit with smaller n_estimators raises error. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=4) + with pytest.raises(ValueError): + est.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_warm_start_equal_n_estimators(name): + # Test if warm start with equal n_estimators does nothing and returns the + # same forest and raises a warning. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) + est.fit(X, y) + + est_2 = ForestEstimator( + n_estimators=5, max_depth=3, warm_start=True, random_state=1 + ) + est_2.fit(X, y) + # Now est_2 equals est. + + est_2.set_params(random_state=2) + warn_msg = ( + "Warm-start fitting without increasing n_estimators does not fit new trees." + ) + with pytest.warns(UserWarning, match=warn_msg): + est_2.fit(X, y) + # If we had fit the trees again we would have got a different forest as we + # changed the random state. + assert_array_equal(est.apply(X), est_2.apply(X)) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_warm_start_oob(name): + # Test that the warm start computes oob score when asked. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. + est = ForestEstimator( + n_estimators=15, + max_depth=3, + warm_start=False, + random_state=1, + bootstrap=True, + oob_score=True, + ) + est.fit(X, y) + + est_2 = ForestEstimator( + n_estimators=5, + max_depth=3, + warm_start=False, + random_state=1, + bootstrap=True, + oob_score=False, + ) + est_2.fit(X, y) + + est_2.set_params(warm_start=True, oob_score=True, n_estimators=15) + est_2.fit(X, y) + + assert hasattr(est_2, "oob_score_") + assert est.oob_score_ == est_2.oob_score_ + + # Test that oob_score is computed even if we don't need to train + # additional trees. 
+ est_3 = ForestEstimator( + n_estimators=15, + max_depth=3, + warm_start=True, + random_state=1, + bootstrap=True, + oob_score=False, + ) + est_3.fit(X, y) + assert not hasattr(est_3, "oob_score_") + + est_3.set_params(oob_score=True) + ignore_warnings(est_3.fit)(X, y) + + assert est.oob_score_ == est_3.oob_score_ + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_oob_not_computed_twice(name): + # Check that oob_score is not computed twice when warm_start=True. + X, y = hastie_X, hastie_y + ForestEstimator = FOREST_ESTIMATORS[name] + + est = ForestEstimator( + n_estimators=10, warm_start=True, bootstrap=True, oob_score=True + ) + + with patch.object( + est, "_set_oob_score_and_attributes", wraps=est._set_oob_score_and_attributes + ) as mock_set_oob_score_and_attributes: + est.fit(X, y) + + with pytest.warns(UserWarning, match="Warm-start fitting without increasing"): + est.fit(X, y) + + mock_set_oob_score_and_attributes.assert_called_once() + + +def test_dtype_convert(n_classes=15): + classifier = RandomForestClassifier(random_state=0, bootstrap=False) + + X = np.eye(n_classes) + y = [ch for ch in "ABCDEFGHIJKLMNOPQRSTU"[:n_classes]] + + result = classifier.fit(X, y).predict(X) + assert_array_equal(classifier.classes_, y) + assert_array_equal(result, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_decision_path(name): + X, y = hastie_X, hastie_y + n_samples = X.shape[0] + ForestEstimator = FOREST_ESTIMATORS[name] + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) + est.fit(X, y) + indicator, n_nodes_ptr = est.decision_path(X) + + assert indicator.shape[1] == n_nodes_ptr[-1] + assert indicator.shape[0] == n_samples + assert_array_equal( + np.diff(n_nodes_ptr), [e.tree_.node_count for e in est.estimators_] + ) + + # Assert that leaves index are correct + leaves = est.apply(X) + for est_id in range(leaves.shape[1]): + leave_indicator = [ + indicator[i, n_nodes_ptr[est_id] + j] + for i, j in enumerate(leaves[:, est_id]) + ] + assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) + + +def test_min_impurity_decrease(): + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + all_estimators = [ + RandomForestClassifier, + RandomForestRegressor, + ExtraTreesClassifier, + ExtraTreesRegressor, + ] + + for Estimator in all_estimators: + est = Estimator(min_impurity_decrease=0.1) + est.fit(X, y) + for tree in est.estimators_: + # Simply check if the parameter is passed on correctly. Tree tests + # will suffice for the actual working of this param + assert tree.min_impurity_decrease == 0.1 + + +def test_poisson_y_positive_check(): + est = RandomForestRegressor(criterion="poisson") + X = np.zeros((3, 3)) + + y = [-1, 1, 3] + err_msg = ( + r"Some value\(s\) of y are negative which is " + r"not allowed for Poisson regression." + ) + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + y = [0, 0, 0] + err_msg = ( + r"Sum of y is not strictly positive which " + r"is necessary for Poisson regression." 
+ ) + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +# mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type +class MyBackend(DEFAULT_JOBLIB_BACKEND): # type: ignore[valid-type,misc] + def __init__(self, *args, **kwargs): + self.count = 0 + super().__init__(*args, **kwargs) + + def start_call(self): + self.count += 1 + return super().start_call() + + +joblib.register_parallel_backend("testing", MyBackend) + + +@skip_if_no_parallel +def test_backend_respected(): + clf = RandomForestClassifier(n_estimators=10, n_jobs=2) + + with joblib.parallel_backend("testing") as (ba, n_jobs): + clf.fit(X, y) + + assert ba.count > 0 + + # predict_proba requires shared memory. Ensure that's honored. + with joblib.parallel_backend("testing") as (ba, _): + clf.predict_proba(X) + + assert ba.count == 0 + + +def test_forest_feature_importances_sum(): + X, y = make_classification( + n_samples=15, n_informative=3, random_state=1, n_classes=3 + ) + clf = RandomForestClassifier( + min_samples_leaf=5, random_state=42, n_estimators=200 + ).fit(X, y) + assert math.isclose(1, clf.feature_importances_.sum(), abs_tol=1e-7) + + +def test_forest_degenerate_feature_importances(): + # build a forest of single node trees. See #13636 + X = np.zeros((10, 10)) + y = np.ones((10,)) + gbr = RandomForestRegressor(n_estimators=10).fit(X, y) + assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64)) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_max_samples_bootstrap(name): + # Check invalid `max_samples` values + est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=False, max_samples=0.5) + err_msg = ( + r"`max_sample` cannot be set if `bootstrap=False`. " + r"Either switch to `bootstrap=True` or set " + r"`max_sample=None`." 
+ ) + with pytest.raises(ValueError, match=err_msg): + est.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_large_max_samples_exception(name): + # Check invalid `max_samples` + est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=int(1e9)) + match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + with pytest.raises(ValueError, match=match): + est.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +def test_max_samples_boundary_regressors(name): + X_train, X_test, y_train, y_test = train_test_split( + X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0 + ) + + ms_1_model = FOREST_REGRESSORS[name]( + bootstrap=True, max_samples=1.0, random_state=0 + ) + ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test) + + ms_None_model = FOREST_REGRESSORS[name]( + bootstrap=True, max_samples=None, random_state=0 + ) + ms_None_predict = ms_None_model.fit(X_train, y_train).predict(X_test) + + ms_1_ms = mean_squared_error(ms_1_predict, y_test) + ms_None_ms = mean_squared_error(ms_None_predict, y_test) + + assert ms_1_ms == pytest.approx(ms_None_ms) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_max_samples_boundary_classifiers(name): + X_train, X_test, y_train, _ = train_test_split( + X_large, y_large, random_state=0, stratify=y_large + ) + + ms_1_model = FOREST_CLASSIFIERS[name]( + bootstrap=True, max_samples=1.0, random_state=0 + ) + ms_1_proba = ms_1_model.fit(X_train, y_train).predict_proba(X_test) + + ms_None_model = FOREST_CLASSIFIERS[name]( + bootstrap=True, max_samples=None, random_state=0 + ) + ms_None_proba = ms_None_model.fit(X_train, y_train).predict_proba(X_test) + + np.testing.assert_allclose(ms_1_proba, ms_None_proba) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_forest_y_sparse(csr_container): + X = [[1, 2, 3]] + y = csr_container([[4, 5, 6]]) + est = RandomForestClassifier() + msg = "sparse multilabel-indicator for y is not supported." + with pytest.raises(ValueError, match=msg): + est.fit(X, y) + + +@pytest.mark.parametrize("ForestClass", [RandomForestClassifier, RandomForestRegressor]) +def test_little_tree_with_small_max_samples(ForestClass): + rng = np.random.RandomState(1) + + X = rng.randn(10000, 2) + y = rng.randn(10000) > 0 + + # First fit with no restriction on max samples + est1 = ForestClass( + n_estimators=1, + random_state=rng, + max_samples=None, + ) + + # Second fit with max samples restricted to just 2 + est2 = ForestClass( + n_estimators=1, + random_state=rng, + max_samples=2, + ) + + est1.fit(X, y) + est2.fit(X, y) + + tree1 = est1.estimators_[0].tree_ + tree2 = est2.estimators_[0].tree_ + + msg = "Tree without `max_samples` restriction should have more nodes" + assert tree1.node_count > tree2.node_count, msg + + +@pytest.mark.parametrize("Forest", FOREST_REGRESSORS) +def test_mse_criterion_object_segfault_smoke_test(Forest): + # This is a smoke test to ensure that passing a mutable criterion + # does not cause a segfault when fitting with concurrent threads. 
+ # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/12623 + from sklearn.tree._criterion import MSE + + y = y_reg.reshape(-1, 1) + n_samples, n_outputs = y.shape + mse_criterion = MSE(n_outputs, n_samples) + est = FOREST_REGRESSORS[Forest](n_estimators=2, n_jobs=2, criterion=mse_criterion) + + est.fit(X_reg, y) + + +def test_random_trees_embedding_feature_names_out(): + """Check feature names out for Random Trees Embedding.""" + random_state = np.random.RandomState(0) + X = np.abs(random_state.randn(100, 4)) + hasher = RandomTreesEmbedding( + n_estimators=2, max_depth=2, sparse_output=False, random_state=0 + ).fit(X) + names = hasher.get_feature_names_out() + expected_names = [ + f"randomtreesembedding_{tree}_{leaf}" + # Note: nodes with indices 0, 1 and 4 are internal split nodes and + # therefore do not appear in the expected output feature names. + for tree, leaf in [ + (0, 2), + (0, 3), + (0, 5), + (0, 6), + (1, 2), + (1, 3), + (1, 5), + (1, 6), + ] + ] + assert_array_equal(expected_names, names) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_read_only_buffer(csr_container, monkeypatch): + """RandomForestClassifier must work on readonly sparse data. + + Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/25333 + """ + monkeypatch.setattr( + sklearn.ensemble._forest, + "Parallel", + partial(Parallel, max_nbytes=100), + ) + rng = np.random.RandomState(seed=0) + + X, y = make_classification(n_samples=100, n_features=200, random_state=rng) + X = csr_container(X, copy=True) + + clf = RandomForestClassifier(n_jobs=2, random_state=rng) + cross_val_score(clf, X, y, cv=2) + + +@pytest.mark.parametrize("class_weight", ["balanced_subsample", None]) +def test_round_samples_to_one_when_samples_too_low(class_weight): + """Check low max_samples works and is rounded to one. + + Non-regression test for gh-24037. + """ + X, y = datasets.load_wine(return_X_y=True) + forest = RandomForestClassifier( + n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 + ) + forest.fit(X, y) + + +@pytest.mark.parametrize("seed", [None, 1]) +@pytest.mark.parametrize("bootstrap", [True, False]) +@pytest.mark.parametrize("ForestClass", FOREST_CLASSIFIERS_REGRESSORS.values()) +def test_estimators_samples(ForestClass, bootstrap, seed): + """Estimators_samples_ property should be consistent. + + Tests consistency across fits and whether or not the seed for the random generator + is set. 
+ """ + X, y = make_hastie_10_2(n_samples=200, random_state=1) + + if bootstrap: + max_samples = 0.5 + else: + max_samples = None + est = ForestClass( + n_estimators=10, + max_samples=max_samples, + max_features=0.5, + random_state=seed, + bootstrap=bootstrap, + ) + est.fit(X, y) + + estimators_samples = est.estimators_samples_.copy() + + # Test repeated calls result in same set of indices + assert_array_equal(estimators_samples, est.estimators_samples_) + estimators = est.estimators_ + + assert isinstance(estimators_samples, list) + assert len(estimators_samples) == len(estimators) + assert estimators_samples[0].dtype == np.int32 + + for i in range(len(estimators)): + if bootstrap: + assert len(estimators_samples[i]) == len(X) // 2 + + # the bootstrap should be a resampling with replacement + assert len(np.unique(estimators_samples[i])) < len(estimators_samples[i]) + else: + assert len(set(estimators_samples[i])) == len(X) + + estimator_index = 0 + estimator_samples = estimators_samples[estimator_index] + estimator = estimators[estimator_index] + + X_train = X[estimator_samples] + y_train = y[estimator_samples] + + orig_tree_values = estimator.tree_.value + estimator = clone(estimator) + estimator.fit(X_train, y_train) + new_tree_values = estimator.tree_.value + assert_allclose(orig_tree_values, new_tree_values) + + +@pytest.mark.parametrize( + "make_data, Forest", + [ + (datasets.make_regression, RandomForestRegressor), + (datasets.make_classification, RandomForestClassifier), + (datasets.make_regression, ExtraTreesRegressor), + (datasets.make_classification, ExtraTreesClassifier), + ], +) +def test_missing_values_is_resilient(make_data, Forest): + """Check that forest can deal with missing values and has decent performance.""" + + rng = np.random.RandomState(0) + n_samples, n_features = 1000, 10 + X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng) + + # Create dataset with missing values + X_missing = X.copy() + X_missing[rng.choice([False, True], size=X.shape, p=[0.95, 0.05])] = np.nan + assert np.isnan(X_missing).any() + + X_missing_train, X_missing_test, y_train, y_test = train_test_split( + X_missing, y, random_state=0 + ) + + # Train forest with missing values + forest_with_missing = Forest(random_state=rng, n_estimators=50) + forest_with_missing.fit(X_missing_train, y_train) + score_with_missing = forest_with_missing.score(X_missing_test, y_test) + + # Train forest without missing values + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + forest = Forest(random_state=rng, n_estimators=50) + forest.fit(X_train, y_train) + score_without_missing = forest.score(X_test, y_test) + + # Score is still 80 percent of the forest's score that had no missing values + assert score_with_missing >= 0.80 * score_without_missing + + +@pytest.mark.parametrize( + "Forest", + [ + RandomForestClassifier, + RandomForestRegressor, + ExtraTreesRegressor, + ExtraTreesClassifier, + ], +) +def test_missing_value_is_predictive(Forest): + """Check that the forest learns when missing values are only present for + a predictive feature.""" + rng = np.random.RandomState(0) + n_samples = 300 + expected_score = 0.75 + + X_non_predictive = rng.standard_normal(size=(n_samples, 10)) + y = rng.randint(0, high=2, size=n_samples) + + # Create a predictive feature using `y` and with some noise + X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) + y_mask = y.astype(bool) + y_mask[X_random_mask] = ~y_mask[X_random_mask] + + predictive_feature = 
rng.standard_normal(size=n_samples) + predictive_feature[y_mask] = np.nan + assert np.isnan(predictive_feature).any() + + X_predictive = X_non_predictive.copy() + X_predictive[:, 5] = predictive_feature + + ( + X_predictive_train, + X_predictive_test, + X_non_predictive_train, + X_non_predictive_test, + y_train, + y_test, + ) = train_test_split(X_predictive, X_non_predictive, y, random_state=0) + forest_predictive = Forest(random_state=0).fit(X_predictive_train, y_train) + forest_non_predictive = Forest(random_state=0).fit(X_non_predictive_train, y_train) + + predictive_test_score = forest_predictive.score(X_predictive_test, y_test) + + assert predictive_test_score >= expected_score + assert predictive_test_score >= forest_non_predictive.score( + X_non_predictive_test, y_test + ) + + +@pytest.mark.parametrize("Forest", FOREST_REGRESSORS.values()) +def test_non_supported_criterion_raises_error_with_missing_values(Forest): + """Raise error for unsupported criterion when there are missing values.""" + X = np.array([[0, 1, 2], [np.nan, 0, 2.0]]) + y = [0.5, 1.0] + + forest = Forest(criterion="absolute_error") + + msg = ".*does not accept missing values" + with pytest.raises(ValueError, match=msg): + forest.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..f799d51eec25cd908b9dfcda3704a0ab8b8d381a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_gradient_boosting.py @@ -0,0 +1,1711 @@ +""" +Testing for the gradient boosting module (sklearn.ensemble.gradient_boosting). +""" + +import re +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn import datasets +from sklearn.base import clone +from sklearn.datasets import make_classification, make_regression +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor +from sklearn.ensemble._gb import _safe_divide +from sklearn.ensemble._gradient_boosting import predict_stages +from sklearn.exceptions import DataConversionWarning, NotFittedError +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import scale +from sklearn.svm import NuSVR +from sklearn.utils import check_random_state +from sklearn.utils._mocking import NoSampleWeightWrapper +from sklearn.utils._param_validation import InvalidParameterError +from sklearn.utils._testing import ( + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, GradientBoostingRegressor] + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] + +# also make regression dataset +X_reg, y_reg = make_regression( + n_samples=100, n_features=4, n_informative=8, noise=10, random_state=7 +) +y_reg = scale(y_reg) + +rng = np.random.RandomState(0) +# also load the iris dataset +# and randomly permute it +iris = datasets.load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = 
iris.target[perm] + + +def test_exponential_n_classes_gt_2(): + """Test exponential loss raises for n_classes > 2.""" + clf = GradientBoostingClassifier(loss="exponential") + msg = "loss='exponential' is only suitable for a binary classification" + with pytest.raises(ValueError, match=msg): + clf.fit(iris.data, iris.target) + + +def test_raise_if_init_has_no_predict_proba(): + """Test raise if init_ has no predict_proba method.""" + clf = GradientBoostingClassifier(init=GradientBoostingRegressor) + msg = ( + "The 'init' parameter of GradientBoostingClassifier must be a str among " + "{'zero'}, None or an object implementing 'fit' and 'predict_proba'." + ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +@pytest.mark.parametrize("loss", ("log_loss", "exponential")) +def test_classification_toy(loss, global_random_seed): + # Check classification on a toy dataset. + clf = GradientBoostingClassifier( + loss=loss, n_estimators=10, random_state=global_random_seed + ) + + with pytest.raises(ValueError): + clf.predict(T) + + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf.estimators_) + + log_loss_decrease = clf.train_score_[:-1] - clf.train_score_[1:] + assert np.any(log_loss_decrease >= 0.0) + + leaves = clf.apply(X) + assert leaves.shape == (6, 10, 1) + + +@pytest.mark.parametrize("loss", ("log_loss", "exponential")) +def test_classification_synthetic(loss, global_random_seed): + # Test GradientBoostingClassifier on synthetic dataset used by + # Hastie et al. in ESLII - Figure 10.9 + # Note that Figure 10.9 reuses the dataset generated for figure 10.2 + # and should have 2_000 train data points and 10_000 test data points. + # Here we intentionally use a smaller variant to make the test run faster, + # but the conclusions are still the same, despite the smaller datasets. + X, y = datasets.make_hastie_10_2(n_samples=2000, random_state=global_random_seed) + + split_idx = 500 + X_train, X_test = X[:split_idx], X[split_idx:] + y_train, y_test = y[:split_idx], y[split_idx:] + + # Increasing the number of trees should decrease the test error + common_params = { + "max_depth": 1, + "learning_rate": 1.0, + "loss": loss, + "random_state": global_random_seed, + } + gbrt_10_stumps = GradientBoostingClassifier(n_estimators=10, **common_params) + gbrt_10_stumps.fit(X_train, y_train) + + gbrt_50_stumps = GradientBoostingClassifier(n_estimators=50, **common_params) + gbrt_50_stumps.fit(X_train, y_train) + + assert gbrt_10_stumps.score(X_test, y_test) < gbrt_50_stumps.score(X_test, y_test) + + # Decision stumps are better suited for this dataset with a large number of + # estimators. + common_params = { + "n_estimators": 200, + "learning_rate": 1.0, + "loss": loss, + "random_state": global_random_seed, + } + gbrt_stumps = GradientBoostingClassifier(max_depth=1, **common_params) + gbrt_stumps.fit(X_train, y_train) + + gbrt_10_nodes = GradientBoostingClassifier(max_leaf_nodes=10, **common_params) + gbrt_10_nodes.fit(X_train, y_train) + + assert gbrt_stumps.score(X_test, y_test) > gbrt_10_nodes.score(X_test, y_test) + + +@pytest.mark.parametrize("loss", ("squared_error", "absolute_error", "huber")) +@pytest.mark.parametrize("subsample", (1.0, 0.5)) +def test_regression_dataset(loss, subsample, global_random_seed): + # Check consistency on regression dataset with least squares + # and least absolute deviation. 
+ ones = np.ones(len(y_reg))
+ last_y_pred = None
+ for sample_weight in [None, ones, 2 * ones]:
+ # learning_rate, max_depth and n_estimators were adjusted to get a model
+ # that is accurate enough to reach a low MSE on the training set while
+ # keeping the resources used to execute this test low enough.
+ reg = GradientBoostingRegressor(
+ n_estimators=30,
+ loss=loss,
+ max_depth=4,
+ subsample=subsample,
+ min_samples_split=2,
+ random_state=global_random_seed,
+ learning_rate=0.5,
+ )
+
+ reg.fit(X_reg, y_reg, sample_weight=sample_weight)
+ leaves = reg.apply(X_reg)
+ assert leaves.shape == (100, 30)
+
+ y_pred = reg.predict(X_reg)
+ mse = mean_squared_error(y_reg, y_pred)
+ assert mse < 0.05
+
+ if last_y_pred is not None:
+ # FIXME: We temporarily bypass this test. This is due to the fact
+ # that GBRT with and without `sample_weight` do not use the same
+ # implementation of the median during the initialization with the
+ # `DummyRegressor`. In the future, we should make sure that both
+ # implementations are the same. See PR #17377 for more.
+ # assert_allclose(last_y_pred, y_pred)
+ pass
+
+ last_y_pred = y_pred
+
+
+@pytest.mark.parametrize("subsample", (1.0, 0.5))
+@pytest.mark.parametrize("sample_weight", (None, 1))
+def test_iris(subsample, sample_weight, global_random_seed):
+ if sample_weight == 1:
+ sample_weight = np.ones(len(iris.target))
+ # Check consistency on dataset iris.
+ clf = GradientBoostingClassifier(
+ n_estimators=100,
+ loss="log_loss",
+ random_state=global_random_seed,
+ subsample=subsample,
+ )
+ clf.fit(iris.data, iris.target, sample_weight=sample_weight)
+ score = clf.score(iris.data, iris.target)
+ assert score > 0.9
+
+ leaves = clf.apply(iris.data)
+ assert leaves.shape == (150, 100, 3)
+
+
+def test_regression_synthetic(global_random_seed):
+ # Test on synthetic regression datasets used in Leo Breiman,
+ # `Bagging Predictors`. Machine Learning 24(2): 123-140 (1996).
+ random_state = check_random_state(global_random_seed) + regression_params = { + "n_estimators": 100, + "max_depth": 4, + "min_samples_split": 2, + "learning_rate": 0.1, + "loss": "squared_error", + "random_state": global_random_seed, + } + + # Friedman1 + X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state, noise=1.0) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + + clf = GradientBoostingRegressor(**regression_params) + clf.fit(X_train, y_train) + mse = mean_squared_error(y_test, clf.predict(X_test)) + assert mse < 6.5 + + # Friedman2 + X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + + clf = GradientBoostingRegressor(**regression_params) + clf.fit(X_train, y_train) + mse = mean_squared_error(y_test, clf.predict(X_test)) + assert mse < 2500.0 + + # Friedman3 + X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + + clf = GradientBoostingRegressor(**regression_params) + clf.fit(X_train, y_train) + mse = mean_squared_error(y_test, clf.predict(X_test)) + assert mse < 0.025 + + +@pytest.mark.parametrize( + "GradientBoosting, X, y", + [ + (GradientBoostingRegressor, X_reg, y_reg), + (GradientBoostingClassifier, iris.data, iris.target), + ], +) +def test_feature_importances(GradientBoosting, X, y): + # smoke test to check that the gradient boosting expose an attribute + # feature_importances_ + gbdt = GradientBoosting() + assert not hasattr(gbdt, "feature_importances_") + gbdt.fit(X, y) + assert hasattr(gbdt, "feature_importances_") + + +def test_probability_log(global_random_seed): + # Predict probabilities. + clf = GradientBoostingClassifier(n_estimators=100, random_state=global_random_seed) + + with pytest.raises(ValueError): + clf.predict_proba(T) + + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + + # check if probabilities are in [0, 1]. + y_proba = clf.predict_proba(T) + assert np.all(y_proba >= 0.0) + assert np.all(y_proba <= 1.0) + + # derive predictions from probabilities + y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) + assert_array_equal(y_pred, true_result) + + +def test_single_class_with_sample_weight(): + sample_weight = [0, 0, 0, 1, 1, 1] + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + msg = ( + "y contains 1 class after sample_weight trimmed classes with " + "zero weights, while a minimum of 2 classes are required." 
+ ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y, sample_weight=sample_weight) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_check_inputs_predict_stages(csc_container): + # check that predict_stages through an error if the type of X is not + # supported + x, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + x_sparse_csc = csc_container(x) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(x, y) + score = np.zeros((y.shape)).reshape(-1, 1) + err_msg = "When X is a sparse matrix, a CSR format is expected" + with pytest.raises(ValueError, match=err_msg): + predict_stages(clf.estimators_, x_sparse_csc, clf.learning_rate, score) + x_fortran = np.asfortranarray(x) + with pytest.raises(ValueError, match="X should be C-ordered np.ndarray"): + predict_stages(clf.estimators_, x_fortran, clf.learning_rate, score) + + +def test_max_feature_regression(global_random_seed): + # Test to make sure random state is set properly. + X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=global_random_seed) + + X_train, X_test = X[:2000], X[2000:] + y_train, y_test = y[:2000], y[2000:] + + gbrt = GradientBoostingClassifier( + n_estimators=100, + min_samples_split=5, + max_depth=2, + learning_rate=0.1, + max_features=2, + random_state=global_random_seed, + ) + gbrt.fit(X_train, y_train) + log_loss = gbrt._loss(y_test, gbrt.decision_function(X_test)) + assert log_loss < 0.5, "GB failed with deviance %.4f" % log_loss + + +def test_feature_importance_regression( + fetch_california_housing_fxt, global_random_seed +): + """Test that Gini importance is calculated correctly. + + This test follows the example from [1]_ (pg. 373). + + .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements + of statistical learning. New York: Springer series in statistics. + """ + california = fetch_california_housing_fxt() + X, y = california.data, california.target + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=global_random_seed + ) + + reg = GradientBoostingRegressor( + loss="huber", + learning_rate=0.1, + max_leaf_nodes=6, + n_estimators=100, + random_state=global_random_seed, + ) + reg.fit(X_train, y_train) + sorted_idx = np.argsort(reg.feature_importances_)[::-1] + sorted_features = [california.feature_names[s] for s in sorted_idx] + + # The most important feature is the median income by far. + assert sorted_features[0] == "MedInc" + + # The three subsequent features are the following. Their relative ordering + # might change a bit depending on the randomness of the trees and the + # train / test split. + assert set(sorted_features[1:4]) == {"Longitude", "AveOccup", "Latitude"} + + +def test_max_features(): + # Test if max features is set properly for floats and str. 
+ X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) + _, n_features = X.shape + + X_train = X[:2000] + y_train = y[:2000] + + gbrt = GradientBoostingClassifier(n_estimators=1, max_features=None) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == n_features + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features=None) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == n_features + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == int(n_features * 0.3) + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features="sqrt") + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == int(np.sqrt(n_features)) + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features="log2") + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == int(np.log2(n_features)) + + gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.01 / X.shape[1]) + gbrt.fit(X_train, y_train) + assert gbrt.max_features_ == 1 + + +def test_staged_predict(): + # Test whether staged decision function eventually gives + # the same prediction. + X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0) + X_train, y_train = X[:200], y[:200] + X_test = X[200:] + clf = GradientBoostingRegressor() + # test raise ValueError if not fitted + with pytest.raises(ValueError): + np.fromiter(clf.staged_predict(X_test), dtype=np.float64) + + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + + # test if prediction for last stage equals ``predict`` + for y in clf.staged_predict(X_test): + assert y.shape == y_pred.shape + + assert_array_almost_equal(y_pred, y) + + +def test_staged_predict_proba(): + # Test whether staged predict proba eventually gives + # the same prediction. + X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) + X_train, y_train = X[:200], y[:200] + X_test, y_test = X[200:], y[200:] + clf = GradientBoostingClassifier(n_estimators=20) + # test raise NotFittedError if not + with pytest.raises(NotFittedError): + np.fromiter(clf.staged_predict_proba(X_test), dtype=np.float64) + + clf.fit(X_train, y_train) + + # test if prediction for last stage equals ``predict`` + for y_pred in clf.staged_predict(X_test): + assert y_test.shape == y_pred.shape + + assert_array_equal(clf.predict(X_test), y_pred) + + # test if prediction for last stage equals ``predict_proba`` + for staged_proba in clf.staged_predict_proba(X_test): + assert y_test.shape[0] == staged_proba.shape[0] + assert 2 == staged_proba.shape[1] + + assert_array_almost_equal(clf.predict_proba(X_test), staged_proba) + + +@pytest.mark.parametrize("Estimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_staged_functions_defensive(Estimator, global_random_seed): + # test that staged_functions make defensive copies + rng = np.random.RandomState(global_random_seed) + X = rng.uniform(size=(10, 3)) + y = (4 * X[:, 0]).astype(int) + 1 # don't predict zeros + estimator = Estimator() + estimator.fit(X, y) + for func in ["predict", "decision_function", "predict_proba"]: + staged_func = getattr(estimator, "staged_" + func, None) + if staged_func is None: + # regressor has no staged_predict_proba + continue + with warnings.catch_warnings(record=True): + staged_result = list(staged_func(X)) + staged_result[1][:] = 0 + assert np.all(staged_result[0] != 0) + + +def test_serialization(): + # Check model serialization. 
+ clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + try: + import cPickle as pickle + except ImportError: + import pickle + + serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL) + clf = None + clf = pickle.loads(serialized_clf) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + +def test_degenerate_targets(): + # Check if we can fit even though all targets are equal. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + # classifier should raise exception + with pytest.raises(ValueError): + clf.fit(X, np.ones(len(X))) + + clf = GradientBoostingRegressor(n_estimators=100, random_state=1) + clf.fit(X, np.ones(len(X))) + clf.predict([rng.rand(2)]) + assert_array_equal(np.ones((1,), dtype=np.float64), clf.predict([rng.rand(2)])) + + +def test_quantile_loss(global_random_seed): + # Check if quantile loss with alpha=0.5 equals absolute_error. + clf_quantile = GradientBoostingRegressor( + n_estimators=100, + loss="quantile", + max_depth=4, + alpha=0.5, + random_state=global_random_seed, + ) + + clf_quantile.fit(X_reg, y_reg) + y_quantile = clf_quantile.predict(X_reg) + + clf_ae = GradientBoostingRegressor( + n_estimators=100, + loss="absolute_error", + max_depth=4, + random_state=global_random_seed, + ) + + clf_ae.fit(X_reg, y_reg) + y_ae = clf_ae.predict(X_reg) + assert_allclose(y_quantile, y_ae) + + +def test_symbol_labels(): + # Test with non-integer class labels. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + symbol_y = list(map(str, y)) + + clf.fit(X, symbol_y) + assert_array_equal(clf.predict(T), list(map(str, true_result))) + assert 100 == len(clf.estimators_) + + +def test_float_class_labels(): + # Test with float class labels. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + float_y = np.asarray(y, dtype=np.float32) + + clf.fit(X, float_y) + assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32)) + assert 100 == len(clf.estimators_) + + +def test_shape_y(): + # Test with float class labels. + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + + y_ = np.asarray(y, dtype=np.int32) + y_ = y_[:, np.newaxis] + + # This will raise a DataConversionWarning that we want to + # "always" raise, elsewhere the warnings gets ignored in the + # later tests, and the tests that check for this warning fail + warn_msg = ( + "A column-vector y was passed when a 1d array was expected. " + "Please change the shape of y to \\(n_samples, \\), for " + "example using ravel()." 
+ ) + with pytest.warns(DataConversionWarning, match=warn_msg): + clf.fit(X, y_) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + +def test_mem_layout(): + # Test with different memory layouts of X and y + X_ = np.asfortranarray(X) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X_, y) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + X_ = np.ascontiguousarray(X) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X_, y) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + y_ = np.asarray(y, dtype=np.int32) + y_ = np.ascontiguousarray(y_) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X, y_) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + y_ = np.asarray(y, dtype=np.int32) + y_ = np.asfortranarray(y_) + clf = GradientBoostingClassifier(n_estimators=100, random_state=1) + clf.fit(X, y_) + assert_array_equal(clf.predict(T), true_result) + assert 100 == len(clf.estimators_) + + +@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_oob_improvement(GradientBoostingEstimator): + # Test if oob improvement has correct shape and regression test. + estimator = GradientBoostingEstimator( + n_estimators=100, random_state=1, subsample=0.5 + ) + estimator.fit(X, y) + assert estimator.oob_improvement_.shape[0] == 100 + # hard-coded regression test - change if modification in OOB computation + assert_array_almost_equal( + estimator.oob_improvement_[:5], + np.array([0.19, 0.15, 0.12, -0.11, 0.11]), + decimal=2, + ) + + +@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_oob_scores(GradientBoostingEstimator): + # Test if oob scores has correct shape and regression test. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + estimator = GradientBoostingEstimator( + n_estimators=100, random_state=1, subsample=0.5 + ) + estimator.fit(X, y) + assert estimator.oob_scores_.shape[0] == 100 + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + estimator = GradientBoostingEstimator( + n_estimators=100, + random_state=1, + subsample=0.5, + n_iter_no_change=5, + ) + estimator.fit(X, y) + assert estimator.oob_scores_.shape[0] < 100 + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + +@pytest.mark.parametrize( + "GradientBoostingEstimator, oob_attribute", + [ + (GradientBoostingClassifier, "oob_improvement_"), + (GradientBoostingClassifier, "oob_scores_"), + (GradientBoostingClassifier, "oob_score_"), + (GradientBoostingRegressor, "oob_improvement_"), + (GradientBoostingRegressor, "oob_scores_"), + (GradientBoostingRegressor, "oob_score_"), + ], +) +def test_oob_attributes_error(GradientBoostingEstimator, oob_attribute): + """ + Check that we raise an AttributeError when the OOB statistics were not computed. + """ + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + estimator = GradientBoostingEstimator( + n_estimators=100, + random_state=1, + subsample=1.0, + ) + estimator.fit(X, y) + with pytest.raises(AttributeError): + estimator.oob_attribute + + +def test_oob_multilcass_iris(): + # Check OOB improvement on multi-class dataset. 
+ estimator = GradientBoostingClassifier( + n_estimators=100, loss="log_loss", random_state=1, subsample=0.5 + ) + estimator.fit(iris.data, iris.target) + score = estimator.score(iris.data, iris.target) + assert score > 0.9 + assert estimator.oob_improvement_.shape[0] == estimator.n_estimators + assert estimator.oob_scores_.shape[0] == estimator.n_estimators + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + estimator = GradientBoostingClassifier( + n_estimators=100, + loss="log_loss", + random_state=1, + subsample=0.5, + n_iter_no_change=5, + ) + estimator.fit(iris.data, iris.target) + score = estimator.score(iris.data, iris.target) + assert estimator.oob_improvement_.shape[0] < estimator.n_estimators + assert estimator.oob_scores_.shape[0] < estimator.n_estimators + assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_) + + # hard-coded regression test - change if modification in OOB computation + # FIXME: the following snippet does not yield the same results on 32 bits + # assert_array_almost_equal(estimator.oob_improvement_[:5], + # np.array([12.68, 10.45, 8.18, 6.43, 5.13]), + # decimal=2) + + +def test_verbose_output(): + # Check verbose=1 does not cause error. + import sys + from io import StringIO + + old_stdout = sys.stdout + sys.stdout = StringIO() + clf = GradientBoostingClassifier( + n_estimators=100, random_state=1, verbose=1, subsample=0.8 + ) + clf.fit(X, y) + verbose_output = sys.stdout + sys.stdout = old_stdout + + # check output + verbose_output.seek(0) + header = verbose_output.readline().rstrip() + # with OOB + true_header = " ".join(["%10s"] + ["%16s"] * 3) % ( + "Iter", + "Train Loss", + "OOB Improve", + "Remaining Time", + ) + assert true_header == header + + n_lines = sum(1 for l in verbose_output.readlines()) + # one for 1-10 and then 9 for 20-100 + assert 10 + 9 == n_lines + + +def test_more_verbose_output(): + # Check verbose=2 does not cause error. + import sys + from io import StringIO + + old_stdout = sys.stdout + sys.stdout = StringIO() + clf = GradientBoostingClassifier(n_estimators=100, random_state=1, verbose=2) + clf.fit(X, y) + verbose_output = sys.stdout + sys.stdout = old_stdout + + # check output + verbose_output.seek(0) + header = verbose_output.readline().rstrip() + # no OOB + true_header = " ".join(["%10s"] + ["%16s"] * 2) % ( + "Iter", + "Train Loss", + "Remaining Time", + ) + assert true_header == header + + n_lines = sum(1 for l in verbose_output.readlines()) + # 100 lines for n_estimators==100 + assert 100 == n_lines + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start(Cls, global_random_seed): + # Test if warm start equals fit. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed) + est = Cls(n_estimators=200, max_depth=1, random_state=global_random_seed) + est.fit(X, y) + + est_ws = Cls( + n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed + ) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) + + if Cls is GradientBoostingRegressor: + assert_allclose(est_ws.predict(X), est.predict(X)) + else: + # Random state is preserved and hence predict_proba must also be + # same + assert_array_equal(est_ws.predict(X), est.predict(X)) + assert_allclose(est_ws.predict_proba(X), est.predict_proba(X)) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_n_estimators(Cls, global_random_seed): + # Test if warm start equals fit - set n_estimators. 
+ X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed) + est = Cls(n_estimators=300, max_depth=1, random_state=global_random_seed) + est.fit(X, y) + + est_ws = Cls( + n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed + ) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=300) + est_ws.fit(X, y) + + assert_allclose(est_ws.predict(X), est.predict(X)) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_max_depth(Cls): + # Test if possible to fit trees of different depth in ensemble. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, max_depth=2) + est.fit(X, y) + + # last 10 trees have different depth + assert est.estimators_[0, 0].max_depth == 1 + for i in range(1, 11): + assert est.estimators_[-i, 0].max_depth == 2 + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_clear(Cls): + # Test if fit clears state. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) + + est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True) + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False) + est_2.fit(X, y) # clears old state and equals est + + assert_array_almost_equal(est_2.predict(X), est.predict(X)) + + +@pytest.mark.parametrize("GradientBoosting", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_state_oob_scores(GradientBoosting): + """ + Check that the states of the OOB scores are cleared when used with `warm_start`. + """ + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + n_estimators = 100 + estimator = GradientBoosting( + n_estimators=n_estimators, + max_depth=1, + subsample=0.5, + warm_start=True, + random_state=1, + ) + estimator.fit(X, y) + oob_scores, oob_score = estimator.oob_scores_, estimator.oob_score_ + assert len(oob_scores) == n_estimators + assert oob_scores[-1] == pytest.approx(oob_score) + + n_more_estimators = 200 + estimator.set_params(n_estimators=n_more_estimators).fit(X, y) + assert len(estimator.oob_scores_) == n_more_estimators + assert_allclose(estimator.oob_scores_[:n_estimators], oob_scores) + + estimator.set_params(n_estimators=n_estimators, warm_start=False).fit(X, y) + assert estimator.oob_scores_ is not oob_scores + assert estimator.oob_score_ is not oob_score + assert_allclose(estimator.oob_scores_, oob_scores) + assert estimator.oob_score_ == pytest.approx(oob_score) + assert oob_scores[-1] == pytest.approx(oob_score) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_smaller_n_estimators(Cls): + # Test if warm start with smaller n_estimators raises error + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=99) + with pytest.raises(ValueError): + est.fit(X, y) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_equal_n_estimators(Cls): + # Test if warm start with equal n_estimators does nothing + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1) + est.fit(X, y) + + est2 = clone(est) + est2.set_params(n_estimators=est.n_estimators, warm_start=True) + est2.fit(X, y) + + assert_array_almost_equal(est2.predict(X), est.predict(X)) + + +@pytest.mark.parametrize("Cls", 
GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob_switch(Cls): + # Test if oob can be turned on during warm start. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=100, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=110, subsample=0.5) + est.fit(X, y) + + assert_array_equal(est.oob_improvement_[:100], np.zeros(100)) + assert_array_equal(est.oob_scores_[:100], np.zeros(100)) + + # the last 10 are not zeros + assert (est.oob_improvement_[-10:] != 0.0).all() + assert (est.oob_scores_[-10:] != 0.0).all() + + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_oob(Cls): + # Test if warm start OOB equals fit. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est = Cls(n_estimators=200, max_depth=1, subsample=0.5, random_state=1) + est.fit(X, y) + + est_ws = Cls( + n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True + ) + est_ws.fit(X, y) + est_ws.set_params(n_estimators=200) + est_ws.fit(X, y) + + assert_array_almost_equal(est_ws.oob_improvement_[:100], est.oob_improvement_[:100]) + assert_array_almost_equal(est_ws.oob_scores_[:100], est.oob_scores_[:100]) + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + assert est_ws.oob_scores_[-1] == pytest.approx(est_ws.oob_score_) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_warm_start_sparse(Cls, sparse_container): + # Test that all sparse matrix types are supported + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + est_dense = Cls( + n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True + ) + est_dense.fit(X, y) + est_dense.predict(X) + est_dense.set_params(n_estimators=200) + est_dense.fit(X, y) + y_pred_dense = est_dense.predict(X) + + X_sparse = sparse_container(X) + + est_sparse = Cls( + n_estimators=100, + max_depth=1, + subsample=0.5, + random_state=1, + warm_start=True, + ) + est_sparse.fit(X_sparse, y) + est_sparse.predict(X) + est_sparse.set_params(n_estimators=200) + est_sparse.fit(X_sparse, y) + y_pred_sparse = est_sparse.predict(X) + + assert_array_almost_equal( + est_dense.oob_improvement_[:100], est_sparse.oob_improvement_[:100] + ) + assert est_dense.oob_scores_[-1] == pytest.approx(est_dense.oob_score_) + assert_array_almost_equal(est_dense.oob_scores_[:100], est_sparse.oob_scores_[:100]) + assert est_sparse.oob_scores_[-1] == pytest.approx(est_sparse.oob_score_) + assert_array_almost_equal(y_pred_dense, y_pred_sparse) + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_warm_start_fortran(Cls, global_random_seed): + # Test that feeding a X in Fortran-ordered is giving the same results as + # in C-ordered + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed) + est_c = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True) + est_fortran = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True) + + est_c.fit(X, y) + est_c.set_params(n_estimators=11) + est_c.fit(X, y) + + X_fortran = np.asfortranarray(X) + est_fortran.fit(X_fortran, y) + est_fortran.set_params(n_estimators=11) + est_fortran.fit(X_fortran, y) + + assert_allclose(est_c.predict(X), est_fortran.predict(X)) + + +def early_stopping_monitor(i, est, locals): + """Returns True on the 10th iteration.""" + if 
i == 9: + return True + else: + return False + + +@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS) +def test_monitor_early_stopping(Cls): + # Test if monitor return value works. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + + est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5) + est.fit(X, y, monitor=early_stopping_monitor) + assert est.n_estimators == 20 # this is not altered + assert est.estimators_.shape[0] == 10 + assert est.train_score_.shape[0] == 10 + assert est.oob_improvement_.shape[0] == 10 + assert est.oob_scores_.shape[0] == 10 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + # try refit + est.set_params(n_estimators=30) + est.fit(X, y) + assert est.n_estimators == 30 + assert est.estimators_.shape[0] == 30 + assert est.train_score_.shape[0] == 30 + assert est.oob_improvement_.shape[0] == 30 + assert est.oob_scores_.shape[0] == 30 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + est = Cls( + n_estimators=20, max_depth=1, random_state=1, subsample=0.5, warm_start=True + ) + est.fit(X, y, monitor=early_stopping_monitor) + assert est.n_estimators == 20 + assert est.estimators_.shape[0] == 10 + assert est.train_score_.shape[0] == 10 + assert est.oob_improvement_.shape[0] == 10 + assert est.oob_scores_.shape[0] == 10 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + # try refit + est.set_params(n_estimators=30, warm_start=False) + est.fit(X, y) + assert est.n_estimators == 30 + assert est.train_score_.shape[0] == 30 + assert est.estimators_.shape[0] == 30 + assert est.oob_improvement_.shape[0] == 30 + assert est.oob_scores_.shape[0] == 30 + assert est.oob_scores_[-1] == pytest.approx(est.oob_score_) + + +def test_complete_classification(): + # Test greedy trees with max_depth + 1 leafs. + from sklearn.tree._tree import TREE_LEAF + + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + k = 4 + + est = GradientBoostingClassifier( + n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1 + ) + est.fit(X, y) + + tree = est.estimators_[0, 0].tree_ + assert tree.max_depth == k + assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1 + + +def test_complete_regression(): + # Test greedy trees with max_depth + 1 leafs. + from sklearn.tree._tree import TREE_LEAF + + k = 4 + + est = GradientBoostingRegressor( + n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1 + ) + est.fit(X_reg, y_reg) + + tree = est.estimators_[-1, 0].tree_ + assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1 + + +def test_zero_estimator_reg(global_random_seed): + # Test if init='zero' works for regression by checking that it is better + # than a simple baseline. + + baseline = DummyRegressor(strategy="mean").fit(X_reg, y_reg) + mse_baseline = mean_squared_error(baseline.predict(X_reg), y_reg) + est = GradientBoostingRegressor( + n_estimators=5, + max_depth=1, + random_state=global_random_seed, + init="zero", + learning_rate=0.5, + ) + est.fit(X_reg, y_reg) + y_pred = est.predict(X_reg) + mse_gbdt = mean_squared_error(y_reg, y_pred) + assert mse_gbdt < mse_baseline + + +def test_zero_estimator_clf(global_random_seed): + # Test if init='zero' works for classification. 
+ X = iris.data + y = np.array(iris.target) + + est = GradientBoostingClassifier( + n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero" + ) + est.fit(X, y) + + assert est.score(X, y) > 0.96 + + # binary clf + mask = y != 0 + y[mask] = 1 + y[~mask] = 0 + est = GradientBoostingClassifier( + n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero" + ) + est.fit(X, y) + assert est.score(X, y) > 0.96 + + +@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_max_leaf_nodes_max_depth(GBEstimator): + # Test precedence of max_leaf_nodes over max_depth. + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + + k = 4 + + est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert tree.max_depth == 1 + + est = GBEstimator(max_depth=1).fit(X, y) + tree = est.estimators_[0, 0].tree_ + assert tree.max_depth == 1 + + +@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS) +def test_min_impurity_decrease(GBEstimator): + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) + + est = GBEstimator(min_impurity_decrease=0.1) + est.fit(X, y) + for tree in est.estimators_.flat: + # Simply check if the parameter is passed on correctly. Tree tests + # will suffice for the actual working of this param + assert tree.min_impurity_decrease == 0.1 + + +def test_warm_start_wo_nestimators_change(): + # Test if warm_start does nothing if n_estimators is not changed. + # Regression test for #3513. + clf = GradientBoostingClassifier(n_estimators=10, warm_start=True) + clf.fit([[0, 1], [2, 3]], [0, 1]) + assert clf.estimators_.shape[0] == 10 + clf.fit([[0, 1], [2, 3]], [0, 1]) + assert clf.estimators_.shape[0] == 10 + + +@pytest.mark.parametrize( + ("loss", "value"), + [ + ("squared_error", 0.5), + ("absolute_error", 0.0), + ("huber", 0.5), + ("quantile", 0.5), + ], +) +def test_non_uniform_weights_toy_edge_case_reg(loss, value): + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss) + gb.fit(X, y, sample_weight=sample_weight) + assert gb.predict([[1, 0]])[0] >= value + + +def test_non_uniform_weights_toy_edge_case_clf(): + X = [[1, 0], [1, 0], [1, 0], [0, 1]] + y = [0, 0, 1, 0] + # ignore the first 2 training samples by setting their weight to 0 + sample_weight = [0, 0, 1, 1] + for loss in ("log_loss", "exponential"): + gb = GradientBoostingClassifier(n_estimators=5, loss=loss) + gb.fit(X, y, sample_weight=sample_weight) + assert_array_equal(gb.predict([[1, 0]]), [1]) + + +@skip_if_32bit +@pytest.mark.parametrize( + "EstimatorClass", (GradientBoostingClassifier, GradientBoostingRegressor) +) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_sparse_input(EstimatorClass, sparse_container): + y, X = datasets.make_multilabel_classification( + random_state=0, n_samples=50, n_features=1, n_classes=20 + ) + y = y[:, 0] + X_sparse = sparse_container(X) + + dense = EstimatorClass( + n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7 + ).fit(X, y) + sparse = EstimatorClass( + n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7 + ).fit(X_sparse, y) + + assert_array_almost_equal(sparse.apply(X), dense.apply(X)) + assert_array_almost_equal(sparse.predict(X), dense.predict(X)) + 
assert_array_almost_equal(sparse.feature_importances_, dense.feature_importances_) + + assert_array_almost_equal(sparse.predict(X_sparse), dense.predict(X)) + assert_array_almost_equal(dense.predict(X_sparse), sparse.predict(X)) + + if issubclass(EstimatorClass, GradientBoostingClassifier): + assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X)) + assert_array_almost_equal( + sparse.predict_log_proba(X), dense.predict_log_proba(X) + ) + + assert_array_almost_equal( + sparse.decision_function(X_sparse), sparse.decision_function(X) + ) + assert_array_almost_equal( + dense.decision_function(X_sparse), sparse.decision_function(X) + ) + for res_sparse, res in zip( + sparse.staged_decision_function(X_sparse), + sparse.staged_decision_function(X), + ): + assert_array_almost_equal(res_sparse, res) + + +@pytest.mark.parametrize( + "GradientBoostingEstimator", [GradientBoostingClassifier, GradientBoostingRegressor] +) +def test_gradient_boosting_early_stopping(GradientBoostingEstimator): + # Check if early stopping works as expected, that is empirically check that the + # number of trained estimators is increasing when the tolerance decreases. + + X, y = make_classification(n_samples=1000, random_state=0) + n_estimators = 1000 + + gb_large_tol = GradientBoostingEstimator( + n_estimators=n_estimators, + n_iter_no_change=10, + learning_rate=0.1, + max_depth=3, + random_state=42, + tol=1e-1, + ) + + gb_small_tol = GradientBoostingEstimator( + n_estimators=n_estimators, + n_iter_no_change=10, + learning_rate=0.1, + max_depth=3, + random_state=42, + tol=1e-3, + ) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + gb_large_tol.fit(X_train, y_train) + gb_small_tol.fit(X_train, y_train) + + assert gb_large_tol.n_estimators_ < gb_small_tol.n_estimators_ < n_estimators + + assert gb_large_tol.score(X_test, y_test) > 0.7 + assert gb_small_tol.score(X_test, y_test) > 0.7 + + +def test_gradient_boosting_without_early_stopping(): + # When early stopping is not used, the number of trained estimators + # must be the one specified. + X, y = make_classification(n_samples=1000, random_state=0) + + gbc = GradientBoostingClassifier( + n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42 + ) + gbc.fit(X, y) + gbr = GradientBoostingRegressor( + n_estimators=30, learning_rate=0.1, max_depth=3, random_state=42 + ) + gbr.fit(X, y) + + # The number of trained estimators must be the one specified. 
+ assert gbc.n_estimators_ == 50 + assert gbr.n_estimators_ == 30 + + +def test_gradient_boosting_validation_fraction(): + X, y = make_classification(n_samples=1000, random_state=0) + + gbc = GradientBoostingClassifier( + n_estimators=100, + n_iter_no_change=10, + validation_fraction=0.1, + learning_rate=0.1, + max_depth=3, + random_state=42, + ) + gbc2 = clone(gbc).set_params(validation_fraction=0.3) + gbc3 = clone(gbc).set_params(n_iter_no_change=20) + + gbr = GradientBoostingRegressor( + n_estimators=100, + n_iter_no_change=10, + learning_rate=0.1, + max_depth=3, + validation_fraction=0.1, + random_state=42, + ) + gbr2 = clone(gbr).set_params(validation_fraction=0.3) + gbr3 = clone(gbr).set_params(n_iter_no_change=20) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + # Check if validation_fraction has an effect + gbc.fit(X_train, y_train) + gbc2.fit(X_train, y_train) + assert gbc.n_estimators_ != gbc2.n_estimators_ + + gbr.fit(X_train, y_train) + gbr2.fit(X_train, y_train) + assert gbr.n_estimators_ != gbr2.n_estimators_ + + # Check if n_estimators_ increase monotonically with n_iter_no_change + # Set validation + gbc3.fit(X_train, y_train) + gbr3.fit(X_train, y_train) + assert gbr.n_estimators_ < gbr3.n_estimators_ + assert gbc.n_estimators_ < gbc3.n_estimators_ + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + gbc = GradientBoostingClassifier(n_iter_no_change=5) + with pytest.raises( + ValueError, match="The least populated class in y has only 1 member" + ): + gbc.fit(X, y) + + +def _make_multiclass(): + return make_classification(n_classes=3, n_clusters_per_class=1) + + +@pytest.mark.parametrize( + "gb, dataset_maker, init_estimator", + [ + (GradientBoostingClassifier, make_classification, DummyClassifier), + (GradientBoostingClassifier, _make_multiclass, DummyClassifier), + (GradientBoostingRegressor, make_regression, DummyRegressor), + ], + ids=["binary classification", "multiclass classification", "regression"], +) +def test_gradient_boosting_with_init( + gb, dataset_maker, init_estimator, global_random_seed +): + # Check that GradientBoostingRegressor works when init is a sklearn + # estimator. + # Check that an error is raised if trying to fit with sample weight but + # initial estimator does not support sample weight + + X, y = dataset_maker() + sample_weight = np.random.RandomState(global_random_seed).rand(100) + + # init supports sample weights + init_est = init_estimator() + gb(init=init_est).fit(X, y, sample_weight=sample_weight) + + # init does not support sample weights + init_est = NoSampleWeightWrapper(init_estimator()) + gb(init=init_est).fit(X, y) # ok no sample weights + with pytest.raises(ValueError, match="estimator.*does not support sample weights"): + gb(init=init_est).fit(X, y, sample_weight=sample_weight) + + +def test_gradient_boosting_with_init_pipeline(): + # Check that the init estimator can be a pipeline (see issue #13466) + + X, y = make_regression(random_state=0) + init = make_pipeline(LinearRegression()) + gb = GradientBoostingRegressor(init=init) + gb.fit(X, y) # pipeline without sample_weight works fine + + with pytest.raises( + ValueError, + match="The initial estimator Pipeline does not support sample weights", + ): + gb.fit(X, y, sample_weight=np.ones(X.shape[0])) + + # Passing sample_weight to a pipeline raises a ValueError. 
This test makes + # sure we make the distinction between ValueError raised by a pipeline that + # was passed sample_weight, and a InvalidParameterError raised by a regular + # estimator whose input checking failed. + invalid_nu = 1.5 + err_msg = ( + "The 'nu' parameter of NuSVR must be a float in the" + f" range (0.0, 1.0]. Got {invalid_nu} instead." + ) + with pytest.raises(InvalidParameterError, match=re.escape(err_msg)): + # Note that NuSVR properly supports sample_weight + init = NuSVR(gamma="auto", nu=invalid_nu) + gb = GradientBoostingRegressor(init=init) + gb.fit(X, y, sample_weight=np.ones(X.shape[0])) + + +def test_early_stopping_n_classes(): + # when doing early stopping (_, , y_train, _ = train_test_split(X, y)) + # there might be classes in y that are missing in y_train. As the init + # estimator will be trained on y_train, we need to raise an error if this + # happens. + + X = [[1]] * 10 + y = [0, 0] + [1] * 8 # only 2 negative class over 10 samples + gb = GradientBoostingClassifier( + n_iter_no_change=5, random_state=0, validation_fraction=0.8 + ) + with pytest.raises( + ValueError, match="The training data after the early stopping split" + ): + gb.fit(X, y) + + # No error if we let training data be big enough + gb = GradientBoostingClassifier( + n_iter_no_change=5, random_state=0, validation_fraction=0.4 + ) + + +def test_gbr_degenerate_feature_importances(): + # growing an ensemble of single node trees. See #13620 + X = np.zeros((10, 10)) + y = np.ones((10,)) + gbr = GradientBoostingRegressor().fit(X, y) + assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64)) + + +def test_huber_vs_mean_and_median(): + """Check that huber lies between absolute and squared error.""" + n_rep = 100 + n_samples = 10 + y = np.tile(np.arange(n_samples), n_rep) + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + + rng = np.random.RandomState(42) + # We want an asymmetric distribution. + y = y + rng.exponential(scale=1, size=y.shape) + + gbt_absolute_error = GradientBoostingRegressor(loss="absolute_error").fit(X, y) + gbt_huber = GradientBoostingRegressor(loss="huber").fit(X, y) + gbt_squared_error = GradientBoostingRegressor().fit(X, y) + + gbt_huber_predictions = gbt_huber.predict(X) + assert np.all(gbt_absolute_error.predict(X) <= gbt_huber_predictions) + assert np.all(gbt_huber_predictions <= gbt_squared_error.predict(X)) + + +def test_safe_divide(): + """Test that _safe_divide handles division by zero.""" + with warnings.catch_warnings(): + warnings.simplefilter("error") + assert _safe_divide(np.float64(1e300), 0) == 0 + assert _safe_divide(np.float64(0.0), np.float64(0.0)) == 0 + with pytest.warns(RuntimeWarning, match="overflow"): + # np.finfo(float).max = 1.7976931348623157e+308 + _safe_divide(np.float64(1e300), 1e-10) + + +def test_squared_error_exact_backward_compat(): + """Test squared error GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. 
+ """ + n_samples = 10 + y = np.arange(n_samples) + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingRegressor(loss="squared_error", n_estimators=100).fit(X, y) + + pred_result = np.array( + [ + 1.39245726e-04, + 1.00010468e00, + 2.00007043e00, + 3.00004051e00, + 4.00000802e00, + 4.99998972e00, + 5.99996312e00, + 6.99993395e00, + 7.99989372e00, + 8.99985660e00, + ] + ) + assert_allclose(gbt.predict(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 4.87246390e-08, + 3.95590036e-08, + 3.21267865e-08, + 2.60970300e-08, + 2.11820178e-08, + 1.71995782e-08, + 1.39695549e-08, + 1.13391770e-08, + 9.19931587e-09, + 7.47000575e-09, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + # Same but with sample_weights + sample_weights = np.tile([1, 10], n_samples // 2) + gbt = GradientBoostingRegressor(loss="squared_error", n_estimators=100).fit( + X, y, sample_weight=sample_weights + ) + + pred_result = np.array( + [ + 1.52391462e-04, + 1.00011168e00, + 2.00007724e00, + 3.00004638e00, + 4.00001302e00, + 4.99999873e00, + 5.99997093e00, + 6.99994329e00, + 7.99991290e00, + 8.99988727e00, + ] + ) + assert_allclose(gbt.predict(X), pred_result, rtol=1e-6, atol=1e-5) + + train_score = np.array( + [ + 4.12445296e-08, + 3.34418322e-08, + 2.71151383e-08, + 2.19782469e-08, + 1.78173649e-08, + 1.44461976e-08, + 1.17120123e-08, + 9.49485678e-09, + 7.69772505e-09, + 6.24155316e-09, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-3, atol=1e-11) + + +@skip_if_32bit +def test_huber_exact_backward_compat(): + """Test huber GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. + """ + n_samples = 10 + y = np.arange(n_samples) + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingRegressor(loss="huber", n_estimators=100, alpha=0.8).fit(X, y) + + assert_allclose(gbt._loss.closs.delta, 0.0001655688041282133) + + pred_result = np.array( + [ + 1.48120765e-04, + 9.99949174e-01, + 2.00116957e00, + 2.99986716e00, + 4.00012064e00, + 5.00002462e00, + 5.99998898e00, + 6.99692549e00, + 8.00006356e00, + 8.99985099e00, + ] + ) + assert_allclose(gbt.predict(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 2.59484709e-07, + 2.19165900e-07, + 1.89644782e-07, + 1.64556454e-07, + 1.38705110e-07, + 1.20373736e-07, + 1.04746082e-07, + 9.13835687e-08, + 8.20245756e-08, + 7.17122188e-08, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + +def test_binomial_error_exact_backward_compat(): + """Test binary log_loss GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. 
+ """ + n_samples = 10 + y = np.arange(n_samples) % 2 + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingClassifier(loss="log_loss", n_estimators=100).fit(X, y) + + pred_result = np.array( + [ + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + [9.99978098e-01, 2.19017313e-05], + [2.19017313e-05, 9.99978098e-01], + ] + ) + assert_allclose(gbt.predict_proba(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 1.07742210e-04, + 9.74889078e-05, + 8.82113863e-05, + 7.98167784e-05, + 7.22210566e-05, + 6.53481907e-05, + 5.91293869e-05, + 5.35023988e-05, + 4.84109045e-05, + 4.38039423e-05, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + +def test_multinomial_error_exact_backward_compat(): + """Test multiclass log_loss GBT backward compat on a simple dataset. + + The results to compare against are taken from scikit-learn v1.2.0. + """ + n_samples = 10 + y = np.arange(n_samples) % 4 + x1 = np.minimum(y, n_samples / 2) + x2 = np.minimum(-y, -n_samples / 2) + X = np.c_[x1, x2] + gbt = GradientBoostingClassifier(loss="log_loss", n_estimators=100).fit(X, y) + + pred_result = np.array( + [ + [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08], + [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08], + [1.19417637e-07, 1.19417637e-07, 9.99999675e-01, 8.60526098e-08], + [1.19417637e-07, 1.19417637e-07, 8.60526088e-08, 9.99999675e-01], + [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08], + [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08], + [1.19417637e-07, 1.19417637e-07, 9.99999675e-01, 8.60526098e-08], + [1.19417637e-07, 1.19417637e-07, 8.60526088e-08, 9.99999675e-01], + [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08], + [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08], + ] + ) + assert_allclose(gbt.predict_proba(X), pred_result, rtol=1e-8) + + train_score = np.array( + [ + 1.13300150e-06, + 9.75183397e-07, + 8.39348103e-07, + 7.22433588e-07, + 6.21804338e-07, + 5.35191943e-07, + 4.60643966e-07, + 3.96479930e-07, + 3.41253434e-07, + 2.93719550e-07, + ] + ) + assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8) + + +def test_gb_denominator_zero(global_random_seed): + """Test _update_terminal_regions denominator is not zero. + + For instance for log loss based binary classification, the line search step might + become nan/inf as denominator = hessian = prob * (1 - prob) and prob = 0 or 1 can + happen. + Here, we create a situation were this happens (at least with roughly 80%) based + on the random seed. 
+ """ + X, y = datasets.make_hastie_10_2(n_samples=100, random_state=20) + + params = { + "learning_rate": 1.0, + "subsample": 0.5, + "n_estimators": 100, + "max_leaf_nodes": 4, + "max_depth": None, + "random_state": global_random_seed, + "min_samples_leaf": 2, + } + + clf = GradientBoostingClassifier(**params) + # _safe_devide would raise a RuntimeWarning + with warnings.catch_warnings(): + warnings.simplefilter("error") + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_iforest.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_iforest.py new file mode 100644 index 0000000000000000000000000000000000000000..19e34bbf51808931fd29b650a527ac0bc668dd9c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_iforest.py @@ -0,0 +1,393 @@ +""" +Testing for Isolation Forest algorithm (sklearn.ensemble.iforest). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from unittest.mock import Mock, patch + +import numpy as np +import pytest +from joblib import parallel_backend + +from sklearn.datasets import load_diabetes, load_iris, make_classification +from sklearn.ensemble import IsolationForest +from sklearn.ensemble._iforest import _average_path_length +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import ParameterGrid, train_test_split +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + +# load iris & diabetes dataset +iris = load_iris() +diabetes = load_diabetes() + + +def test_iforest(global_random_seed): + """Check Isolation Forest for various parameter settings.""" + X_train = np.array([[0, 1], [1, 2]]) + X_test = np.array([[2, 1], [1, 1]]) + + grid = ParameterGrid( + {"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]} + ) + + with ignore_warnings(): + for params in grid: + IsolationForest(random_state=global_random_seed, **params).fit( + X_train + ).predict(X_test) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_iforest_sparse(global_random_seed, sparse_container): + """Check IForest for various parameter settings on sparse input.""" + rng = check_random_state(global_random_seed) + X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng) + grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]}) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + + for params in grid: + # Trained on sparse format + sparse_classifier = IsolationForest( + n_estimators=10, random_state=global_random_seed, **params + ).fit(X_train_sparse) + sparse_results = sparse_classifier.predict(X_test_sparse) + + # Trained on dense format + dense_classifier = IsolationForest( + n_estimators=10, random_state=global_random_seed, **params + ).fit(X_train) + dense_results = dense_classifier.predict(X_test) + + assert_array_equal(sparse_results, dense_results) + + +def test_iforest_error(): + """Test that it gives proper exception on deficient input.""" + X = iris.data + + # The dataset has less than 256 samples, explicitly setting + # max_samples > n_samples should result in a warning. 
If not set + # explicitly there should be no warning + warn_msg = "max_samples will be set to n_samples for estimation" + with pytest.warns(UserWarning, match=warn_msg): + IsolationForest(max_samples=1000).fit(X) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + IsolationForest(max_samples="auto").fit(X) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + IsolationForest(max_samples=np.int64(2)).fit(X) + + # test X_test n_features match X_train one: + with pytest.raises(ValueError): + IsolationForest().fit(X).predict(X[:, 1:]) + + +def test_recalculate_max_depth(): + """Check max_depth recalculation when max_samples is reset to n_samples""" + X = iris.data + clf = IsolationForest().fit(X) + for est in clf.estimators_: + assert est.max_depth == int(np.ceil(np.log2(X.shape[0]))) + + +def test_max_samples_attribute(): + X = iris.data + clf = IsolationForest().fit(X) + assert clf.max_samples_ == X.shape[0] + + clf = IsolationForest(max_samples=500) + warn_msg = "max_samples will be set to n_samples for estimation" + with pytest.warns(UserWarning, match=warn_msg): + clf.fit(X) + assert clf.max_samples_ == X.shape[0] + + clf = IsolationForest(max_samples=0.4).fit(X) + assert clf.max_samples_ == 0.4 * X.shape[0] + + +def test_iforest_parallel_regression(global_random_seed): + """Check parallel regression.""" + rng = check_random_state(global_random_seed) + + X_train, X_test = train_test_split(diabetes.data, random_state=rng) + + ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train) + + ensemble.set_params(n_jobs=1) + y1 = ensemble.predict(X_test) + ensemble.set_params(n_jobs=2) + y2 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y2) + + ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train) + + y3 = ensemble.predict(X_test) + assert_array_almost_equal(y1, y3) + + +def test_iforest_performance(global_random_seed): + """Test Isolation Forest performs well""" + + # Generate train/test data + rng = check_random_state(global_random_seed) + X = 0.3 * rng.randn(600, 2) + X = rng.permutation(np.vstack((X + 2, X - 2))) + X_train = X[:1000] + + # Generate some abnormal novel observations + X_outliers = rng.uniform(low=-1, high=1, size=(200, 2)) + X_test = np.vstack((X[1000:], X_outliers)) + y_test = np.array([0] * 200 + [1] * 200) + + # fit the model + clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train) + + # predict scores (the lower, the more normal) + y_pred = -clf.decision_function(X_test) + + # check that there is at most 6 errors (false positive or false negative) + assert roc_auc_score(y_test, y_pred) > 0.98 + + +@pytest.mark.parametrize("contamination", [0.25, "auto"]) +def test_iforest_works(contamination, global_random_seed): + # toy sample (the last two samples are outliers) + X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]] + + # Test IsolationForest + clf = IsolationForest(random_state=global_random_seed, contamination=contamination) + clf.fit(X) + decision_func = -clf.decision_function(X) + pred = clf.predict(X) + # assert detect outliers: + assert np.min(decision_func[-2:]) > np.max(decision_func[:-2]) + assert_array_equal(pred, 6 * [1] + 2 * [-1]) + + +def test_max_samples_consistency(): + # Make sure validated max_samples in iforest and BaseBagging are identical + X = iris.data + clf = IsolationForest().fit(X) + assert clf.max_samples_ == clf._max_samples + + +def test_iforest_subsampled_features(): + # It tests 
non-regression for #5732 which failed at predict. + rng = check_random_state(0) + X_train, X_test, y_train, y_test = train_test_split( + diabetes.data[:50], diabetes.target[:50], random_state=rng + ) + clf = IsolationForest(max_features=0.8) + clf.fit(X_train, y_train) + clf.predict(X_test) + + +def test_iforest_average_path_length(): + # It tests non-regression for #8549 which used the wrong formula + # for average path length, strictly for the integer case + # Updated to check average path length when input is <= 2 (issue #11839) + result_one = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0 + result_two = 2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0 + assert_allclose(_average_path_length([0]), [0.0]) + assert_allclose(_average_path_length([1]), [0.0]) + assert_allclose(_average_path_length([2]), [1.0]) + assert_allclose(_average_path_length([5]), [result_one]) + assert_allclose(_average_path_length([999]), [result_two]) + assert_allclose( + _average_path_length(np.array([1, 2, 5, 999])), + [0.0, 1.0, result_one, result_two], + ) + # _average_path_length is increasing + avg_path_length = _average_path_length(np.arange(5)) + assert_array_equal(avg_path_length, np.sort(avg_path_length)) + + +def test_score_samples(): + X_train = [[1, 1], [1, 2], [2, 1]] + clf1 = IsolationForest(contamination=0.1).fit(X_train) + clf2 = IsolationForest().fit(X_train) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), + clf1.decision_function([[2.0, 2.0]]) + clf1.offset_, + ) + assert_array_equal( + clf2.score_samples([[2.0, 2.0]]), + clf2.decision_function([[2.0, 2.0]]) + clf2.offset_, + ) + assert_array_equal( + clf1.score_samples([[2.0, 2.0]]), clf2.score_samples([[2.0, 2.0]]) + ) + + +def test_iforest_warm_start(): + """Test iterative addition of iTrees to an iForest""" + + rng = check_random_state(0) + X = rng.randn(20, 2) + + # fit first 10 trees + clf = IsolationForest( + n_estimators=10, max_samples=20, random_state=rng, warm_start=True + ) + clf.fit(X) + # remember the 1st tree + tree_1 = clf.estimators_[0] + # fit another 10 trees + clf.set_params(n_estimators=20) + clf.fit(X) + # expecting 20 fitted trees and no overwritten trees + assert len(clf.estimators_) == 20 + assert clf.estimators_[0] is tree_1 + + +# mock get_chunk_n_rows to actually test more than one chunk (here one +# chunk has 3 rows): +@patch( + "sklearn.ensemble._iforest.get_chunk_n_rows", + side_effect=Mock(**{"return_value": 3}), +) +@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) +def test_iforest_chunks_works1( + mocked_get_chunk, contamination, n_predict_calls, global_random_seed +): + test_iforest_works(contamination, global_random_seed) + assert mocked_get_chunk.call_count == n_predict_calls + + +# idem with chunk_size = 10 rows +@patch( + "sklearn.ensemble._iforest.get_chunk_n_rows", + side_effect=Mock(**{"return_value": 10}), +) +@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) +def test_iforest_chunks_works2( + mocked_get_chunk, contamination, n_predict_calls, global_random_seed +): + test_iforest_works(contamination, global_random_seed) + assert mocked_get_chunk.call_count == n_predict_calls + + +def test_iforest_with_uniform_data(): + """Test whether iforest predicts inliers when using uniform data""" + + # 2-d array of all 1s + X = np.ones((100, 10)) + iforest = IsolationForest() + iforest.fit(X) + + rng = np.random.RandomState(0) + + assert all(iforest.predict(X) == 1) + assert all(iforest.predict(rng.randn(100, 10)) 
== 1) + assert all(iforest.predict(X + 1) == 1) + assert all(iforest.predict(X - 1) == 1) + + # 2-d array where columns contain the same value across rows + X = np.repeat(rng.randn(1, 10), 100, 0) + iforest = IsolationForest() + iforest.fit(X) + + assert all(iforest.predict(X) == 1) + assert all(iforest.predict(rng.randn(100, 10)) == 1) + assert all(iforest.predict(np.ones((100, 10))) == 1) + + # Single row + X = rng.randn(1, 10) + iforest = IsolationForest() + iforest.fit(X) + + assert all(iforest.predict(X) == 1) + assert all(iforest.predict(rng.randn(100, 10)) == 1) + assert all(iforest.predict(np.ones((100, 10))) == 1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_iforest_with_n_jobs_does_not_segfault(csc_container): + """Check that Isolation Forest does not segfault with n_jobs=2 + + Non-regression test for #23252 + """ + X, _ = make_classification(n_samples=85_000, n_features=100, random_state=0) + X = csc_container(X) + IsolationForest(n_estimators=10, max_samples=256, n_jobs=2).fit(X) + + +def test_iforest_preserve_feature_names(): + """Check that feature names are preserved when contamination is not "auto". + + Feature names are required for consistency checks during scoring. + + Non-regression test for Issue #25844 + """ + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(0) + + X = pd.DataFrame(data=rng.randn(4), columns=["a"]) + model = IsolationForest(random_state=0, contamination=0.05) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + model.fit(X) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_iforest_sparse_input_float_contamination(sparse_container): + """Check that `IsolationForest` accepts sparse matrix input and float value for + contamination. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27626 + """ + X, _ = make_classification(n_samples=50, n_features=4, random_state=0) + X = sparse_container(X) + X.sort_indices() + contamination = 0.1 + iforest = IsolationForest( + n_estimators=5, contamination=contamination, random_state=0 + ).fit(X) + + X_decision = iforest.decision_function(X) + assert (X_decision < 0).sum() / X.shape[0] == pytest.approx(contamination) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("contamination", [0.25, "auto"]) +def test_iforest_predict_parallel(global_random_seed, contamination, n_jobs): + """Check that `IsolationForest.predict` is parallelized.""" + # toy sample (the last two samples are outliers) + X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]] + + # Test IsolationForest + clf = IsolationForest( + random_state=global_random_seed, contamination=contamination, n_jobs=None + ) + clf.fit(X) + decision_func = -clf.decision_function(X) + pred = clf.predict(X) + + # assert detect outliers: + assert np.min(decision_func[-2:]) > np.max(decision_func[:-2]) + assert_array_equal(pred, 6 * [1] + 2 * [-1]) + + clf_parallel = IsolationForest( + random_state=global_random_seed, contamination=contamination, n_jobs=-1 + ) + clf_parallel.fit(X) + with parallel_backend("threading", n_jobs=n_jobs): + pred_paralell = clf_parallel.predict(X) + + # assert the same results as non-parallel + assert_array_equal(pred, pred_paralell) diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_stacking.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_stacking.py new file mode 100644 index 0000000000000000000000000000000000000000..e944ecc4abb528c9bffb1cf23674831fcd0fb7ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_stacking.py @@ -0,0 +1,1019 @@ +"""Test the stacking classifier and regressor.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +from unittest.mock import Mock + +import numpy as np +import pytest +from numpy.testing import assert_array_equal +from scipy import sparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone +from sklearn.datasets import ( + load_breast_cancer, + load_diabetes, + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + StackingClassifier, + StackingRegressor, +) +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + Ridge, + RidgeClassifier, +) +from sklearn.model_selection import KFold, StratifiedKFold, train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.preprocessing import scale +from sklearn.svm import SVC, LinearSVC, LinearSVR +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + _Registry, + check_recorded_metadata, +) +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + ignore_warnings, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +diabetes = load_diabetes() +X_diabetes, y_diabetes = 
diabetes.data, diabetes.target +iris = load_iris() +X_iris, y_iris = iris.data, iris.target +X_multilabel, y_multilabel = make_multilabel_classification( + n_classes=3, random_state=42 +) +X_binary, y_binary = make_classification(n_classes=2, random_state=42) + + +@pytest.mark.parametrize( + "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)] +) +@pytest.mark.parametrize( + "final_estimator", [None, RandomForestClassifier(random_state=42)] +) +@pytest.mark.parametrize("passthrough", [False, True]) +def test_stacking_classifier_iris(cv, final_estimator, passthrough): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, y_test = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())] + clf = StackingClassifier( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + passthrough=passthrough, + ) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + assert clf.score(X_test, y_test) > 0.8 + + X_trans = clf.transform(X_test) + expected_column_count = 10 if passthrough else 6 + assert X_trans.shape[1] == expected_column_count + if passthrough: + assert_allclose(X_test, X_trans[:, -4:]) + + clf.set_params(lr="drop") + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + if final_estimator is None: + # LogisticRegression has decision_function method + clf.decision_function(X_test) + + X_trans = clf.transform(X_test) + expected_column_count_drop = 7 if passthrough else 3 + assert X_trans.shape[1] == expected_column_count_drop + if passthrough: + assert_allclose(X_test, X_trans[:, -4:]) + + +def test_stacking_classifier_drop_column_binary_classification(): + # check that a column is dropped in binary classification + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, _ = train_test_split( + scale(X), y, stratify=y, random_state=42 + ) + + # both classifiers implement 'predict_proba' and will both drop one column + estimators = [ + ("lr", LogisticRegression()), + ("rf", RandomForestClassifier(random_state=42)), + ] + clf = StackingClassifier(estimators=estimators, cv=3) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + # LinearSVC does not implement 'predict_proba' and will not drop one column + estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())] + clf.set_params(estimators=estimators) + + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert X_trans.shape[1] == 2 + + +def test_stacking_classifier_drop_estimator(): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + estimators = [("lr", "drop"), ("svc", LinearSVC(random_state=0))] + rf = RandomForestClassifier(n_estimators=10, random_state=42) + clf = StackingClassifier( + estimators=[("svc", LinearSVC(random_state=0))], + final_estimator=rf, + cv=5, + ) + clf_drop = StackingClassifier(estimators=estimators, final_estimator=rf, cv=5) + + clf.fit(X_train, y_train) + clf_drop.fit(X_train, y_train) + assert_allclose(clf.predict(X_test), clf_drop.predict(X_test)) + assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test)) + assert_allclose(clf.transform(X_test), clf_drop.transform(X_test)) + + +def test_stacking_regressor_drop_estimator(): + # 
prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [("lr", "drop"), ("svr", LinearSVR(random_state=0))] + rf = RandomForestRegressor(n_estimators=10, random_state=42) + reg = StackingRegressor( + estimators=[("svr", LinearSVR(random_state=0))], + final_estimator=rf, + cv=5, + ) + reg_drop = StackingRegressor(estimators=estimators, final_estimator=rf, cv=5) + + reg.fit(X_train, y_train) + reg_drop.fit(X_train, y_train) + assert_allclose(reg.predict(X_test), reg_drop.predict(X_test)) + assert_allclose(reg.transform(X_test), reg_drop.transform(X_test)) + + +@pytest.mark.parametrize("cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)]) +@pytest.mark.parametrize( + "final_estimator, predict_params", + [ + (None, {}), + (RandomForestRegressor(random_state=42), {}), + (DummyRegressor(), {"return_std": True}), + ], +) +@pytest.mark.parametrize("passthrough", [False, True]) +def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough): + # prescale the data to avoid convergence warning without using a pipeline + # for later assert + X_train, X_test, y_train, _ = train_test_split( + scale(X_diabetes), y_diabetes, random_state=42 + ) + estimators = [("lr", LinearRegression()), ("svr", LinearSVR())] + reg = StackingRegressor( + estimators=estimators, + final_estimator=final_estimator, + cv=cv, + passthrough=passthrough, + ) + reg.fit(X_train, y_train) + result = reg.predict(X_test, **predict_params) + expected_result_length = 2 if predict_params else 1 + if predict_params: + assert len(result) == expected_result_length + + X_trans = reg.transform(X_test) + expected_column_count = 12 if passthrough else 2 + assert X_trans.shape[1] == expected_column_count + if passthrough: + assert_allclose(X_test, X_trans[:, -10:]) + + reg.set_params(lr="drop") + reg.fit(X_train, y_train) + reg.predict(X_test) + + X_trans = reg.transform(X_test) + expected_column_count_drop = 11 if passthrough else 1 + assert X_trans.shape[1] == expected_column_count_drop + if passthrough: + assert_allclose(X_test, X_trans[:, -10:]) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_stacking_regressor_sparse_passthrough(sparse_container): + # Check passthrough behavior on a sparse X matrix + X_train, X_test, y_train, _ = train_test_split( + sparse_container(scale(X_diabetes)), y_diabetes, random_state=42 + ) + estimators = [("lr", LinearRegression()), ("svr", LinearSVR())] + rf = RandomForestRegressor(n_estimators=10, random_state=42) + clf = StackingRegressor( + estimators=estimators, final_estimator=rf, cv=5, passthrough=True + ) + clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert_allclose_dense_sparse(X_test, X_trans[:, -10:]) + assert sparse.issparse(X_trans) + assert X_test.format == X_trans.format + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_stacking_classifier_sparse_passthrough(sparse_container): + # Check passthrough behavior on a sparse X matrix + X_train, X_test, y_train, _ = train_test_split( + sparse_container(scale(X_iris)), y_iris, random_state=42 + ) + estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())] + rf = RandomForestClassifier(n_estimators=10, random_state=42) + clf = StackingClassifier( + estimators=estimators, final_estimator=rf, cv=5, passthrough=True + ) + 
clf.fit(X_train, y_train) + X_trans = clf.transform(X_test) + assert_allclose_dense_sparse(X_test, X_trans[:, -4:]) + assert sparse.issparse(X_trans) + assert X_test.format == X_trans.format + + +def test_stacking_classifier_drop_binary_prob(): + # check that classifier will drop one of the probability column for + # binary classification problem + + # Select only the 2 first classes + X_, y_ = scale(X_iris[:100]), y_iris[:100] + + estimators = [("lr", LogisticRegression()), ("rf", RandomForestClassifier())] + clf = StackingClassifier(estimators=estimators) + clf.fit(X_, y_) + X_meta = clf.transform(X_) + assert X_meta.shape[1] == 2 + + +class NoWeightRegressor(RegressorMixin, BaseEstimator): + def fit(self, X, y): + self.reg = DummyRegressor() + return self.reg.fit(X, y) + + def predict(self, X): + return np.ones(X.shape[0]) + + +class NoWeightClassifier(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + self.clf = DummyClassifier(strategy="stratified") + return self.clf.fit(X, y) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [ + (y_iris, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"), + ( + y_iris, + { + "estimators": [ + ("lr", LogisticRegression()), + ("svm", SVC(max_iter=50_000)), + ], + "stack_method": "predict_proba", + }, + ValueError, + "does not implement the method predict_proba", + ), + ( + y_iris, + { + "estimators": [ + ("lr", LogisticRegression()), + ("cor", NoWeightClassifier()), + ] + }, + TypeError, + "does not support sample weight", + ), + ( + y_iris, + { + "estimators": [ + ("lr", LogisticRegression()), + ("cor", LinearSVC(max_iter=50_000)), + ], + "final_estimator": NoWeightClassifier(), + }, + TypeError, + "does not support sample weight", + ), + ], +) +def test_stacking_classifier_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + clf = StackingClassifier(**params, cv=3) + clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0])) + + +@pytest.mark.parametrize( + "y, params, type_err, msg_err", + [ + (y_diabetes, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"), + ( + y_diabetes, + {"estimators": [("lr", LinearRegression()), ("cor", NoWeightRegressor())]}, + TypeError, + "does not support sample weight", + ), + ( + y_diabetes, + { + "estimators": [ + ("lr", LinearRegression()), + ("cor", LinearSVR()), + ], + "final_estimator": NoWeightRegressor(), + }, + TypeError, + "does not support sample weight", + ), + ], +) +def test_stacking_regressor_error(y, params, type_err, msg_err): + with pytest.raises(type_err, match=msg_err): + reg = StackingRegressor(**params, cv=3) + reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0])) + + +@pytest.mark.parametrize( + "estimator, X, y", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("svm", LinearSVC(random_state=0)), + ] + ), + X_iris[:100], + y_iris[:100], + ), # keep only classes 0 and 1 + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=0)), + ] + ), + X_diabetes, + y_diabetes, + ), + ], + ids=["StackingClassifier", "StackingRegressor"], +) +def test_stacking_randomness(estimator, X, y): + # checking that fixing the random state of the CV will lead to the same + # results + estimator_full = clone(estimator) + estimator_full.set_params( + cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + estimator_drop = clone(estimator) + estimator_drop.set_params(lr="drop") + estimator_drop.set_params( 
+ cv=KFold(shuffle=True, random_state=np.random.RandomState(0)) + ) + + assert_allclose( + estimator_full.fit(X, y).transform(X)[:, 1:], + estimator_drop.fit(X, y).transform(X), + ) + + +def test_stacking_classifier_stratify_default(): + # check that we stratify the classes for the default CV + clf = StackingClassifier( + estimators=[ + ("lr", LogisticRegression(max_iter=10_000)), + ("svm", LinearSVC(max_iter=10_000)), + ] + ) + # since iris is not shuffled, a simple k-fold would not contain the + # 3 classes during training + clf.fit(X_iris, y_iris) + + +@pytest.mark.parametrize( + "stacker, X, y", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC(random_state=42)), + ], + final_estimator=LogisticRegression(), + cv=KFold(shuffle=True, random_state=42), + ), + *load_breast_cancer(return_X_y=True), + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=42)), + ], + final_estimator=LinearRegression(), + cv=KFold(shuffle=True, random_state=42), + ), + X_diabetes, + y_diabetes, + ), + ], + ids=["StackingClassifier", "StackingRegressor"], +) +def test_stacking_with_sample_weight(stacker, X, y): + # check that sample weights have an influence on the fitting + # note: ConvergenceWarnings are caught since we are not worried about + # convergence here + n_half_samples = len(y) // 2 + total_sample_weight = np.array( + [0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples) + ) + X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split( + X, y, total_sample_weight, random_state=42 + ) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train) + y_pred_no_weight = stacker.predict(X_test) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape)) + y_pred_unit_weight = stacker.predict(X_test) + + assert_allclose(y_pred_no_weight, y_pred_unit_weight) + + with ignore_warnings(category=ConvergenceWarning): + stacker.fit(X_train, y_train, sample_weight=sample_weight_train) + y_pred_biased = stacker.predict(X_test) + + assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0 + + +def test_stacking_classifier_sample_weight_fit_param(): + # check sample_weight is passed to all invocations of fit + stacker = StackingClassifier( + estimators=[("lr", CheckingClassifier(expected_sample_weight=True))], + final_estimator=CheckingClassifier(expected_sample_weight=True), + ) + stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0])) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize( + "stacker, X, y", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression()), + ("svm", LinearSVC(random_state=42)), + ], + final_estimator=LogisticRegression(), + ), + *load_breast_cancer(return_X_y=True), + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=42)), + ], + final_estimator=LinearRegression(), + ), + X_diabetes, + y_diabetes, + ), + ], + ids=["StackingClassifier", "StackingRegressor"], +) +def test_stacking_cv_influence(stacker, X, y): + # check that the stacking affects the fit of the final estimator but not + # the fit of the base estimators + # note: ConvergenceWarnings are caught since we are not worried about + # convergence here + stacker_cv_3 = clone(stacker) + stacker_cv_5 = clone(stacker) + + stacker_cv_3.set_params(cv=3) + stacker_cv_5.set_params(cv=5) + + 
stacker_cv_3.fit(X, y) + stacker_cv_5.fit(X, y) + + # the base estimators should be identical + for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_, stacker_cv_5.estimators_): + assert_allclose(est_cv_3.coef_, est_cv_5.coef_) + + # the final estimator should be different + with pytest.raises(AssertionError, match="Not equal"): + assert_allclose( + stacker_cv_3.final_estimator_.coef_, stacker_cv_5.final_estimator_.coef_ + ) + + +@pytest.mark.parametrize( + "Stacker, Estimator, stack_method, final_estimator, X, y", + [ + ( + StackingClassifier, + DummyClassifier, + "predict_proba", + LogisticRegression(random_state=42), + X_iris, + y_iris, + ), + ( + StackingRegressor, + DummyRegressor, + "predict", + LinearRegression(), + X_diabetes, + y_diabetes, + ), + ], +) +def test_stacking_prefit(Stacker, Estimator, stack_method, final_estimator, X, y): + """Check the behaviour of stacking when `cv='prefit'`""" + X_train1, X_train2, y_train1, y_train2 = train_test_split( + X, y, random_state=42, test_size=0.5 + ) + estimators = [ + ("d0", Estimator().fit(X_train1, y_train1)), + ("d1", Estimator().fit(X_train1, y_train1)), + ] + + # mock out fit and stack_method to be asserted later + for _, estimator in estimators: + estimator.fit = Mock(name="fit") + stack_func = getattr(estimator, stack_method) + predict_method_mocked = Mock(side_effect=stack_func) + # Mocking a method will not provide a `__name__` while Python methods + # do and we are using it in `_get_response_method`. + predict_method_mocked.__name__ = stack_method + setattr(estimator, stack_method, predict_method_mocked) + + stacker = Stacker( + estimators=estimators, cv="prefit", final_estimator=final_estimator + ) + stacker.fit(X_train2, y_train2) + + assert stacker.estimators_ == [estimator for _, estimator in estimators] + # fit was not called again + assert all(estimator.fit.call_count == 0 for estimator in stacker.estimators_) + + # stack method is called with the proper inputs + for estimator in stacker.estimators_: + stack_func_mock = getattr(estimator, stack_method) + stack_func_mock.assert_called_with(X_train2) + + +@pytest.mark.parametrize( + "stacker, X, y", + [ + ( + StackingClassifier( + estimators=[("lr", LogisticRegression()), ("svm", SVC())], + cv="prefit", + ), + X_iris, + y_iris, + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR()), + ], + cv="prefit", + ), + X_diabetes, + y_diabetes, + ), + ], +) +def test_stacking_prefit_error(stacker, X, y): + # check that NotFittedError is raised + # if base estimators are not fitted when cv="prefit" + with pytest.raises(NotFittedError): + stacker.fit(X, y) + + +@pytest.mark.parametrize( + "make_dataset, Stacking, Estimator", + [ + (make_classification, StackingClassifier, LogisticRegression), + (make_regression, StackingRegressor, LinearRegression), + ], +) +def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator): + # Stacking supports estimators without `n_features_in_`. 
Regression test + # for #17353 + + class MyEstimator(Estimator): + """Estimator without n_features_in_""" + + def fit(self, X, y): + super().fit(X, y) + del self.n_features_in_ + + X, y = make_dataset(random_state=0, n_samples=100) + stacker = Stacking(estimators=[("lr", MyEstimator())]) + + msg = f"{Stacking.__name__} object has no attribute n_features_in_" + with pytest.raises(AttributeError, match=msg): + stacker.n_features_in_ + + # Does not raise + stacker.fit(X, y) + + msg = "'MyEstimator' object has no attribute 'n_features_in_'" + with pytest.raises(AttributeError, match=msg): + stacker.n_features_in_ + + +@pytest.mark.parametrize( + "estimator", + [ + # output a 2D array of the probability of the positive class for each output + MLPClassifier(random_state=42), + # output a list of 2D array containing the probability of each class + # for each output + RandomForestClassifier(random_state=42), + ], + ids=["MLPClassifier", "RandomForestClassifier"], +) +def test_stacking_classifier_multilabel_predict_proba(estimator): + """Check the behaviour for the multilabel classification case and the + `predict_proba` stacking method. + + Estimators are not consistent with the output arrays and we need to ensure that + we handle all cases. + """ + X_train, X_test, y_train, y_test = train_test_split( + X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42 + ) + n_outputs = 3 + + estimators = [("est", estimator)] + stacker = StackingClassifier( + estimators=estimators, + final_estimator=KNeighborsClassifier(), + stack_method="predict_proba", + ).fit(X_train, y_train) + + X_trans = stacker.transform(X_test) + assert X_trans.shape == (X_test.shape[0], n_outputs) + # we should not have any collinear classes and thus nothing should sum to 1 + assert not any(np.isclose(X_trans.sum(axis=1), 1.0)) + + y_pred = stacker.predict(X_test) + assert y_pred.shape == y_test.shape + + +def test_stacking_classifier_multilabel_decision_function(): + """Check the behaviour for the multilabel classification case and the + `decision_function` stacking method. Only `RidgeClassifier` supports this + case. + """ + X_train, X_test, y_train, y_test = train_test_split( + X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42 + ) + n_outputs = 3 + + estimators = [("est", RidgeClassifier())] + stacker = StackingClassifier( + estimators=estimators, + final_estimator=KNeighborsClassifier(), + stack_method="decision_function", + ).fit(X_train, y_train) + + X_trans = stacker.transform(X_test) + assert X_trans.shape == (X_test.shape[0], n_outputs) + + y_pred = stacker.predict(X_test) + assert y_pred.shape == y_test.shape + + +@pytest.mark.parametrize("stack_method", ["auto", "predict"]) +@pytest.mark.parametrize("passthrough", [False, True]) +def test_stacking_classifier_multilabel_auto_predict(stack_method, passthrough): + """Check the behaviour for the multilabel classification case for stack methods + supported for all estimators or automatically picked up. 
+ """ + X_train, X_test, y_train, y_test = train_test_split( + X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42 + ) + y_train_before_fit = y_train.copy() + n_outputs = 3 + + estimators = [ + ("mlp", MLPClassifier(random_state=42)), + ("rf", RandomForestClassifier(random_state=42)), + ("ridge", RidgeClassifier()), + ] + final_estimator = KNeighborsClassifier() + + clf = StackingClassifier( + estimators=estimators, + final_estimator=final_estimator, + passthrough=passthrough, + stack_method=stack_method, + ).fit(X_train, y_train) + + # make sure we don't change `y_train` inplace + assert_array_equal(y_train_before_fit, y_train) + + y_pred = clf.predict(X_test) + assert y_pred.shape == y_test.shape + + if stack_method == "auto": + expected_stack_methods = ["predict_proba", "predict_proba", "decision_function"] + else: + expected_stack_methods = ["predict"] * len(estimators) + assert clf.stack_method_ == expected_stack_methods + + n_features_X_trans = n_outputs * len(estimators) + if passthrough: + n_features_X_trans += X_train.shape[1] + X_trans = clf.transform(X_test) + assert X_trans.shape == (X_test.shape[0], n_features_X_trans) + + assert_array_equal(clf.classes_, [np.array([0, 1])] * n_outputs) + + +@pytest.mark.parametrize( + "stacker, feature_names, X, y, expected_names", + [ + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("svm", LinearSVC(random_state=0)), + ] + ), + iris.feature_names, + X_iris, + y_iris, + [ + "stackingclassifier_lr0", + "stackingclassifier_lr1", + "stackingclassifier_lr2", + "stackingclassifier_svm0", + "stackingclassifier_svm1", + "stackingclassifier_svm2", + ], + ), + ( + StackingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("other", "drop"), + ("svm", LinearSVC(random_state=0)), + ] + ), + iris.feature_names, + X_iris[:100], + y_iris[:100], # keep only classes 0 and 1 + [ + "stackingclassifier_lr", + "stackingclassifier_svm", + ], + ), + ( + StackingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("svm", LinearSVR(random_state=0)), + ] + ), + diabetes.feature_names, + X_diabetes, + y_diabetes, + [ + "stackingregressor_lr", + "stackingregressor_svm", + ], + ), + ], + ids=[ + "StackingClassifier_multiclass", + "StackingClassifier_binary", + "StackingRegressor", + ], +) +@pytest.mark.parametrize("passthrough", [True, False]) +def test_get_feature_names_out( + stacker, feature_names, X, y, expected_names, passthrough +): + """Check get_feature_names_out works for stacking.""" + + stacker.set_params(passthrough=passthrough) + stacker.fit(scale(X), y) + + if passthrough: + expected_names = np.concatenate((expected_names, feature_names)) + + names_out = stacker.get_feature_names_out(feature_names) + assert_array_equal(names_out, expected_names) + + +def test_stacking_classifier_base_regressor(): + """Check that a regressor can be used as the first layer in `StackingClassifier`.""" + X_train, X_test, y_train, y_test = train_test_split( + scale(X_iris), y_iris, stratify=y_iris, random_state=42 + ) + clf = StackingClassifier(estimators=[("ridge", Ridge())]) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.predict_proba(X_test) + assert clf.score(X_test, y_test) > 0.8 + + +def test_stacking_final_estimator_attribute_error(): + """Check that we raise the proper AttributeError when the final estimator + does not implement the `decision_function` method, which is decorated with + `available_if`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + X, y = make_classification(random_state=42) + + estimators = [ + ("lr", LogisticRegression()), + ("rf", RandomForestClassifier(n_estimators=2, random_state=42)), + ] + # RandomForestClassifier does not implement 'decision_function' and should raise + # an AttributeError + final_estimator = RandomForestClassifier(n_estimators=2, random_state=42) + clf = StackingClassifier( + estimators=estimators, final_estimator=final_estimator, cv=3 + ) + + outer_msg = "This 'StackingClassifier' has no attribute 'decision_function'" + inner_msg = "'RandomForestClassifier' object has no attribute 'decision_function'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + clf.fit(X, y).decision_function(X) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +def test_routing_passed_metadata_not_supported(Estimator, Child): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + Estimator(["clf", Child()]).fit( + X_iris, y_iris, sample_weight=[1, 1, 1, 1, 1], metadata="a" + ) + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +@config_context(enable_metadata_routing=True) +def test_get_metadata_routing_without_fit(Estimator, Child): + # Test that metadata_routing() doesn't raise when called before fit. 
+ est = Estimator([("sub_est", Child())]) + est.get_metadata_routing() + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +@pytest.mark.parametrize( + "prop, prop_value", [("sample_weight", np.ones(X_iris.shape[0])), ("metadata", "a")] +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_for_stacking_estimators(Estimator, Child, prop, prop_value): + """Test that metadata is routed correctly for Stacking*.""" + + est = Estimator( + [ + ( + "sub_est1", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ( + "sub_est2", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ], + final_estimator=Child(registry=_Registry()).set_predict_request(**{prop: True}), + ) + + est.fit(X_iris, y_iris, **{prop: prop_value}) + est.fit_transform(X_iris, y_iris, **{prop: prop_value}) + + est.predict(X_iris, **{prop: prop_value}) + + for estimator in est.estimators: + # access sub-estimator in (name, est) with estimator[1]: + registry = estimator[1].registry + assert len(registry) + for sub_est in registry: + check_recorded_metadata( + obj=sub_est, + method="fit", + parent="fit", + split_params=(prop), + **{prop: prop_value}, + ) + # access final_estimator: + registry = est.final_estimator_.registry + assert len(registry) + check_recorded_metadata( + obj=registry[-1], + method="predict", + parent="predict", + split_params=(prop), + **{prop: prop_value}, + ) + + +@pytest.mark.parametrize( + "Estimator, Child", + [ + (StackingClassifier, ConsumingClassifier), + (StackingRegressor, ConsumingRegressor), + ], +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_error_for_stacking_estimators(Estimator, Child): + """Test that the right error is raised when metadata is not requested.""" + sample_weight, metadata = np.ones(X_iris.shape[0]), "a" + + est = Estimator([("sub_est", Child())]) + + error_message = ( + "[sample_weight, metadata] are passed but are not explicitly set as requested" + f" or not requested for {Child.__name__}.fit" + ) + + with pytest.raises(ValueError, match=re.escape(error_message)): + est.fit(X_iris, y_iris, sample_weight=sample_weight, metadata=metadata) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_voting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_voting.py new file mode 100644 index 0000000000000000000000000000000000000000..fc3fc82c2bee8a29d9b3da95a3cb231c86c3c71d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_voting.py @@ -0,0 +1,793 @@ +"""Testing for the VotingClassifier and VotingRegressor""" + +import re + +import numpy as np +import pytest + +from sklearn import config_context, datasets +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.datasets import make_multilabel_classification +from sklearn.dummy import DummyRegressor +from sklearn.ensemble import ( + RandomForestClassifier, + RandomForestRegressor, + VotingClassifier, + VotingRegressor, +) +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split +from sklearn.multiclass import OneVsRestClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing 
import StandardScaler +from sklearn.svm import SVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingRegressor, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +# Load datasets +iris = datasets.load_iris() +X, y = iris.data[:, 1:3], iris.target +# Scaled to solve ConvergenceWarning throw by Logistic Regression +X_scaled = StandardScaler().fit_transform(X) + +X_r, y_r = datasets.load_diabetes(return_X_y=True) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + {"estimators": []}, + "Invalid 'estimators' attribute, 'estimators' should be a non-empty list", + ), + ( + {"estimators": [LogisticRegression()]}, + "Invalid 'estimators' attribute, 'estimators' should be a non-empty list", + ), + ( + {"estimators": [(213, LogisticRegression())]}, + "Invalid 'estimators' attribute, 'estimators' should be a non-empty list", + ), + ( + {"estimators": [("lr", LogisticRegression())], "weights": [1, 2]}, + "Number of `estimators` and weights must be equal", + ), + ], +) +def test_voting_classifier_estimator_init(params, err_msg): + ensemble = VotingClassifier(**params) + with pytest.raises(ValueError, match=err_msg): + ensemble.fit(X, y) + + +def test_predictproba_hardvoting(): + eclf = VotingClassifier( + estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())], + voting="hard", + ) + + inner_msg = "predict_proba is not available when voting='hard'" + outer_msg = "'VotingClassifier' has no attribute 'predict_proba'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + eclf.predict_proba + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + assert not hasattr(eclf, "predict_proba") + eclf.fit(X_scaled, y) + assert not hasattr(eclf, "predict_proba") + + +def test_notfitted(): + eclf = VotingClassifier( + estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())], + voting="soft", + ) + ereg = VotingRegressor([("dr", DummyRegressor())]) + msg = ( + "This %s instance is not fitted yet. Call 'fit'" + " with appropriate arguments before using this estimator." 
+ ) + with pytest.raises(NotFittedError, match=msg % "VotingClassifier"): + eclf.predict(X) + with pytest.raises(NotFittedError, match=msg % "VotingClassifier"): + eclf.predict_proba(X) + with pytest.raises(NotFittedError, match=msg % "VotingClassifier"): + eclf.transform(X) + with pytest.raises(NotFittedError, match=msg % "VotingRegressor"): + ereg.predict(X_r) + with pytest.raises(NotFittedError, match=msg % "VotingRegressor"): + ereg.transform(X_r) + + +def test_majority_label_iris(global_random_seed): + """Check classification by majority label on dataset iris.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard" + ) + scores = cross_val_score(eclf, X, y, scoring="accuracy") + + assert scores.mean() >= 0.9 + + +def test_tie_situation(): + """Check voting classifier selects smaller class label in tie situation.""" + clf1 = LogisticRegression(random_state=123) + clf2 = RandomForestClassifier(random_state=123) + eclf = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], voting="hard") + assert clf1.fit(X, y).predict(X)[52] == 2 + assert clf2.fit(X, y).predict(X)[52] == 1 + assert eclf.fit(X, y).predict(X)[52] == 1 + + +def test_weights_iris(global_random_seed): + """Check classification by average probabilities on dataset iris.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + weights=[1, 2, 10], + ) + scores = cross_val_score(eclf, X_scaled, y, scoring="accuracy") + assert scores.mean() >= 0.9 + + +def test_weights_regressor(): + """Check weighted average regression prediction on diabetes dataset.""" + reg1 = DummyRegressor(strategy="mean") + reg2 = DummyRegressor(strategy="median") + reg3 = DummyRegressor(strategy="quantile", quantile=0.2) + ereg = VotingRegressor( + [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10] + ) + + X_r_train, X_r_test, y_r_train, y_r_test = train_test_split( + X_r, y_r, test_size=0.25 + ) + + reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test) + reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test) + reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test) + ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test) + + avg = np.average( + np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10] + ) + assert_almost_equal(ereg_pred, avg, decimal=2) + + ereg_weights_none = VotingRegressor( + [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None + ) + ereg_weights_equal = VotingRegressor( + [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1] + ) + ereg_weights_none.fit(X_r_train, y_r_train) + ereg_weights_equal.fit(X_r_train, y_r_train) + ereg_none_pred = ereg_weights_none.predict(X_r_test) + ereg_equal_pred = ereg_weights_equal.predict(X_r_test) + assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2) + + +def test_predict_on_toy_problem(global_random_seed): + """Manually check predicted class labels for toy dataset.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + + X = np.array( + [[-1.1, -1.5], 
[-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]] + ) + + y = np.array([1, 1, 1, 2, 2, 2]) + + assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="hard", + weights=[1, 1, 1], + ) + assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + weights=[1, 1, 1], + ) + assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2]) + + +def test_predict_proba_on_toy_problem(): + """Calculate predicted probabilities on toy dataset.""" + clf1 = LogisticRegression(random_state=123) + clf2 = RandomForestClassifier(random_state=123) + clf3 = GaussianNB() + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + clf1_res = np.array( + [ + [0.59790391, 0.40209609], + [0.57622162, 0.42377838], + [0.50728456, 0.49271544], + [0.40241774, 0.59758226], + ] + ) + + clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]]) + + clf3_res = np.array( + [[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]] + ) + + t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4 + t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4 + t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4 + t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4 + + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + weights=[2, 1, 1], + ) + eclf_res = eclf.fit(X, y).predict_proba(X) + + assert_almost_equal(t00, eclf_res[0][0], decimal=1) + assert_almost_equal(t11, eclf_res[1][1], decimal=1) + assert_almost_equal(t21, eclf_res[2][1], decimal=1) + assert_almost_equal(t31, eclf_res[3][1], decimal=1) + + inner_msg = "predict_proba is not available when voting='hard'" + outer_msg = "'VotingClassifier' has no attribute 'predict_proba'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard" + ) + eclf.fit(X, y).predict_proba(X) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +def test_multilabel(): + """Check if error is raised for multilabel classification.""" + X, y = make_multilabel_classification( + n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123 + ) + clf = OneVsRestClassifier(SVC(kernel="linear")) + + eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard") + + try: + eclf.fit(X, y) + except NotImplementedError: + return + + +def test_gridsearch(): + """Check GridSearch support.""" + clf1 = LogisticRegression(random_state=1) + clf2 = RandomForestClassifier(random_state=1, n_estimators=3) + clf3 = GaussianNB() + eclf = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft" + ) + + params = { + "lr__C": [1.0, 100.0], + "voting": ["soft", "hard"], + "weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]], + } + + grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2) + grid.fit(X_scaled, y) + + +def test_parallel_fit(global_random_seed): + """Check parallel backend of VotingClassifier on toy dataset.""" + clf1 = LogisticRegression(random_state=global_random_seed) + 
clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1 + ).fit(X, y) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2 + ).fit(X, y) + + assert_array_equal(eclf1.predict(X), eclf2.predict(X)) + assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) + + +def test_sample_weight(global_random_seed): + """Tests sample_weight parameter of VotingClassifier""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = SVC(probability=True, random_state=global_random_seed) + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft" + ).fit(X_scaled, y, sample_weight=np.ones((len(y),))) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft" + ).fit(X_scaled, y) + assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled)) + assert_array_almost_equal( + eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled) + ) + sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),)) + eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft") + eclf3.fit(X_scaled, y, sample_weight=sample_weight) + clf1.fit(X_scaled, y, sample_weight) + assert_array_equal(eclf3.predict(X_scaled), clf1.predict(X_scaled)) + assert_array_almost_equal( + eclf3.predict_proba(X_scaled), clf1.predict_proba(X_scaled) + ) + + # check that an error is raised and indicative if sample_weight is not + # supported. + clf4 = KNeighborsClassifier() + eclf3 = VotingClassifier( + estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft" + ) + msg = "Underlying estimator KNeighborsClassifier does not support sample weights." + with pytest.raises(TypeError, match=msg): + eclf3.fit(X_scaled, y, sample_weight=sample_weight) + + # check that _fit_single_estimator will raise the right error + # it should raise the original error if this is not linked to sample_weight + class ClassifierErrorFit(ClassifierMixin, BaseEstimator): + def fit(self, X_scaled, y, sample_weight): + raise TypeError("Error unrelated to sample_weight.") + + clf = ClassifierErrorFit() + with pytest.raises(TypeError, match="Error unrelated to sample_weight"): + clf.fit(X_scaled, y, sample_weight=sample_weight) + + +def test_sample_weight_kwargs(): + """Check that VotingClassifier passes sample_weight as kwargs""" + + class MockClassifier(ClassifierMixin, BaseEstimator): + """Mock Classifier to check that sample_weight is received as kwargs""" + + def fit(self, X, y, *args, **sample_weight): + assert "sample_weight" in sample_weight + + clf = MockClassifier() + eclf = VotingClassifier(estimators=[("mock", clf)], voting="soft") + + # Should not raise an error. 
+ eclf.fit(X, y, sample_weight=np.ones((len(y),))) + + +def test_voting_classifier_set_params(global_random_seed): + # check equivalence in the output when setting underlying estimators + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier( + n_estimators=10, random_state=global_random_seed, max_depth=None + ) + clf3 = GaussianNB() + + eclf1 = VotingClassifier( + [("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2] + ).fit(X_scaled, y) + eclf2 = VotingClassifier( + [("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2] + ) + eclf2.set_params(nb=clf2).fit(X_scaled, y) + + assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled)) + assert_array_almost_equal( + eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled) + ) + assert eclf2.estimators[0][1].get_params() == clf1.get_params() + assert eclf2.estimators[1][1].get_params() == clf2.get_params() + + +def test_set_estimator_drop(): + # VotingClassifier set_params should be able to set estimators as drop + # Test predict + clf1 = LogisticRegression(random_state=123) + clf2 = RandomForestClassifier(n_estimators=10, random_state=123) + clf3 = GaussianNB() + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)], + voting="hard", + weights=[1, 0, 0.5], + ).fit(X, y) + + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)], + voting="hard", + weights=[1, 1, 0.5], + ) + eclf2.set_params(rf="drop").fit(X, y) + + assert_array_equal(eclf1.predict(X), eclf2.predict(X)) + + assert dict(eclf2.estimators)["rf"] == "drop" + assert len(eclf2.estimators_) == 2 + assert all( + isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_ + ) + assert eclf2.get_params()["rf"] == "drop" + + eclf1.set_params(voting="soft").fit(X, y) + eclf2.set_params(voting="soft").fit(X, y) + + assert_array_equal(eclf1.predict(X), eclf2.predict(X)) + assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) + msg = "All estimators are dropped. 
At least one is required" + with pytest.raises(ValueError, match=msg): + eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y) + + # Test soft voting transform + X1 = np.array([[1], [2]]) + y1 = np.array([1, 2]) + eclf1 = VotingClassifier( + estimators=[("rf", clf2), ("nb", clf3)], + voting="soft", + weights=[0, 0.5], + flatten_transform=False, + ).fit(X1, y1) + + eclf2 = VotingClassifier( + estimators=[("rf", clf2), ("nb", clf3)], + voting="soft", + weights=[1, 0.5], + flatten_transform=False, + ) + eclf2.set_params(rf="drop").fit(X1, y1) + assert_array_almost_equal( + eclf1.transform(X1), + np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]), + ) + assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]])) + eclf1.set_params(voting="hard") + eclf2.set_params(voting="hard") + assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]])) + assert_array_equal(eclf2.transform(X1), np.array([[0], [1]])) + + +def test_estimator_weights_format(global_random_seed): + # Test estimator weights inputs as list and array + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft" + ) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft" + ) + eclf1.fit(X_scaled, y) + eclf2.fit(X_scaled, y) + assert_array_almost_equal( + eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled) + ) + + +def test_transform(global_random_seed): + """Check transform method of VotingClassifier on toy dataset.""" + clf1 = LogisticRegression(random_state=global_random_seed) + clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed) + clf3 = GaussianNB() + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + eclf1 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft" + ).fit(X, y) + eclf2 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + flatten_transform=True, + ).fit(X, y) + eclf3 = VotingClassifier( + estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], + voting="soft", + flatten_transform=False, + ).fit(X, y) + + assert_array_equal(eclf1.transform(X).shape, (4, 6)) + assert_array_equal(eclf2.transform(X).shape, (4, 6)) + assert_array_equal(eclf3.transform(X).shape, (3, 4, 2)) + assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X)) + assert_array_almost_equal( + eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X) + ) + + +@pytest.mark.parametrize( + "X, y, voter", + [ + ( + X, + y, + VotingClassifier( + [ + ("lr", LogisticRegression()), + ("rf", RandomForestClassifier(n_estimators=5)), + ] + ), + ), + ( + X_r, + y_r, + VotingRegressor( + [ + ("lr", LinearRegression()), + ("rf", RandomForestRegressor(n_estimators=5)), + ] + ), + ), + ], +) +def test_none_estimator_with_weights(X, y, voter): + # check that an estimator can be set to 'drop' and passing some weight + # regression test for + # https://github.com/scikit-learn/scikit-learn/issues/13777 + voter = clone(voter) + # Scaled to solve ConvergenceWarning throw by Logistic Regression + X_scaled = StandardScaler().fit_transform(X) + voter.fit(X_scaled, y, sample_weight=np.ones(y.shape)) + voter.set_params(lr="drop") + voter.fit(X_scaled, y, sample_weight=np.ones(y.shape)) + y_pred = 
voter.predict(X_scaled) + assert y_pred.shape == y.shape + + +@pytest.mark.parametrize( + "est", + [ + VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("tree", DecisionTreeRegressor(random_state=0)), + ] + ), + VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("tree", DecisionTreeClassifier(random_state=0)), + ] + ), + ], + ids=["VotingRegressor", "VotingClassifier"], +) +def test_n_features_in(est): + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + assert not hasattr(est, "n_features_in_") + est.fit(X, y) + assert est.n_features_in_ == 2 + + +@pytest.mark.parametrize( + "estimator", + [ + VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("rf", RandomForestRegressor(random_state=123)), + ], + verbose=True, + ), + VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=123)), + ("rf", RandomForestClassifier(random_state=123)), + ], + verbose=True, + ), + ], +) +def test_voting_verbose(estimator, capsys): + X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]]) + y = np.array([1, 1, 2, 2]) + + pattern = ( + r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n" + r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$" + ) + clone(estimator).fit(X, y) + assert re.match(pattern, capsys.readouterr()[0]) + + +def test_get_features_names_out_regressor(): + """Check get_feature_names_out output for regressor.""" + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + voting = VotingRegressor( + estimators=[ + ("lr", LinearRegression()), + ("tree", DecisionTreeRegressor(random_state=0)), + ("ignore", "drop"), + ] + ) + voting.fit(X, y) + + names_out = voting.get_feature_names_out() + expected_names = ["votingregressor_lr", "votingregressor_tree"] + assert_array_equal(names_out, expected_names) + + +@pytest.mark.parametrize( + "kwargs, expected_names", + [ + ( + {"voting": "soft", "flatten_transform": True}, + [ + "votingclassifier_lr0", + "votingclassifier_lr1", + "votingclassifier_lr2", + "votingclassifier_tree0", + "votingclassifier_tree1", + "votingclassifier_tree2", + ], + ), + ({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]), + ], +) +def test_get_features_names_out_classifier(kwargs, expected_names): + """Check get_feature_names_out for classifier for different settings.""" + X = [[1, 2], [3, 4], [5, 6], [1, 1.2]] + y = [0, 1, 2, 0] + + voting = VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("tree", DecisionTreeClassifier(random_state=0)), + ], + **kwargs, + ) + voting.fit(X, y) + X_trans = voting.transform(X) + names_out = voting.get_feature_names_out() + + assert X_trans.shape[1] == len(expected_names) + assert_array_equal(names_out, expected_names) + + +def test_get_features_names_out_classifier_error(): + """Check that error is raised when voting="soft" and flatten_transform=False.""" + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + voting = VotingClassifier( + estimators=[ + ("lr", LogisticRegression(random_state=0)), + ("tree", DecisionTreeClassifier(random_state=0)), + ], + voting="soft", + flatten_transform=False, + ) + voting.fit(X, y) + + msg = ( + "get_feature_names_out is not supported when `voting='soft'` and " + "`flatten_transform=False`" + ) + with pytest.raises(ValueError, match=msg): + voting.get_feature_names_out() + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +def 
test_routing_passed_metadata_not_supported(Estimator, Child): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + Estimator(["clf", Child()]).fit(X, y, sample_weight=[1, 1, 1], metadata="a") + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +@config_context(enable_metadata_routing=True) +def test_get_metadata_routing_without_fit(Estimator, Child): + # Test that metadata_routing() doesn't raise when called before fit. + est = Estimator([("sub_est", Child())]) + est.get_metadata_routing() + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +@pytest.mark.parametrize("prop", ["sample_weight", "metadata"]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_for_voting_estimators(Estimator, Child, prop): + """Test that metadata is routed correctly for Voting*.""" + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1, 1, 1], "a" + + est = Estimator( + [ + ( + "sub_est1", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ( + "sub_est2", + Child(registry=_Registry()).set_fit_request(**{prop: True}), + ), + ] + ) + + est.fit(X, y, **{prop: sample_weight if prop == "sample_weight" else metadata}) + + for estimator in est.estimators: + if prop == "sample_weight": + kwargs = {prop: sample_weight} + else: + kwargs = {prop: metadata} + # access sub-estimator in (name, est) with estimator[1] + registry = estimator[1].registry + assert len(registry) + for sub_est in registry: + check_recorded_metadata(obj=sub_est, method="fit", parent="fit", **kwargs) + + +@pytest.mark.parametrize( + "Estimator, Child", + [(VotingClassifier, ConsumingClassifier), (VotingRegressor, ConsumingRegressor)], +) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_error_for_voting_estimators(Estimator, Child): + """Test that the right error is raised when metadata is not requested.""" + X = np.array([[0, 1], [2, 2], [4, 6]]) + y = [1, 2, 3] + sample_weight, metadata = [1, 1, 1], "a" + + est = Estimator([("sub_est", Child())]) + + error_message = ( + "[sample_weight, metadata] are passed but are not explicitly set as requested" + f" or not requested for {Child.__name__}.fit" + ) + + with pytest.raises(ValueError, match=re.escape(error_message)): + est.fit(X, y, sample_weight=sample_weight, metadata=metadata) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_weight_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_weight_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..55825c438d76b29b74d8108970f72e3ebaa5e745 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/ensemble/tests/test_weight_boosting.py @@ -0,0 +1,639 @@ +"""Testing for the boost module (sklearn.ensemble.boost).""" + +import re + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.base import BaseEstimator, clone +from sklearn.dummy import DummyClassifier, DummyRegressor +from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor +from 
sklearn.ensemble._weight_boosting import _samme_proba +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import GridSearchCV, train_test_split +from sklearn.svm import SVC, SVR +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils import shuffle +from sklearn.utils._mocking import NoSampleWeightWrapper +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +# Common random state +rng = np.random.RandomState(0) + +# Toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels +y_regr = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +y_t_class = ["foo", 1, 1] +y_t_regr = [-1, 1, 1] + +# Load the iris dataset and randomly permute it +iris = datasets.load_iris() +perm = rng.permutation(iris.target.size) +iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng) + +# Load the diabetes dataset and randomly permute it +diabetes = datasets.load_diabetes() +diabetes.data, diabetes.target = shuffle( + diabetes.data, diabetes.target, random_state=rng +) + + +def test_samme_proba(): + # Test the `_samme_proba` helper function. + + # Define some example (bad) `predict_proba` output. + probs = np.array( + [[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]] + ) + probs /= np.abs(probs.sum(axis=1))[:, np.newaxis] + + # _samme_proba calls estimator.predict_proba. + # Make a mock object so I can control what gets returned. + class MockEstimator: + def predict_proba(self, X): + assert_array_equal(X.shape, probs.shape) + return probs + + mock = MockEstimator() + + samme_proba = _samme_proba(mock, 3, np.ones_like(probs)) + + assert_array_equal(samme_proba.shape, probs.shape) + assert np.isfinite(samme_proba).all() + + # Make sure that the correct elements come out as smallest -- + # `_samme_proba` should preserve the ordering in each example. + assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2]) + assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1]) + + +def test_oneclass_adaboost_proba(): + # Test predict_proba robustness for one class label input. + # In response to issue #7501 + # https://github.com/scikit-learn/scikit-learn/issues/7501 + y_t = np.ones(len(X)) + clf = AdaBoostClassifier().fit(X, y_t) + assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1))) + + +def test_classification_toy(): + # Check classification on a toy dataset. + clf = AdaBoostClassifier(random_state=0) + clf.fit(X, y_class) + assert_array_equal(clf.predict(T), y_t_class) + assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_) + assert clf.predict_proba(T).shape == (len(T), 2) + assert clf.decision_function(T).shape == (len(T),) + + +def test_regression_toy(): + # Check regression on a toy dataset. + clf = AdaBoostRegressor(random_state=0) + clf.fit(X, y_regr) + assert_array_equal(clf.predict(T), y_t_regr) + + +def test_iris(): + # Check consistency on dataset iris.
+ classes = np.unique(iris.target) + + clf = AdaBoostClassifier() + clf.fit(iris.data, iris.target) + + assert_array_equal(classes, clf.classes_) + proba = clf.predict_proba(iris.data) + + assert proba.shape[1] == len(classes) + assert clf.decision_function(iris.data).shape[1] == len(classes) + + score = clf.score(iris.data, iris.target) + assert score > 0.9, f"Failed with {score = }" + + # Check we used multiple estimators + assert len(clf.estimators_) > 1 + # Check for distinct random states (see issue #7408) + assert len(set(est.random_state for est in clf.estimators_)) == len(clf.estimators_) + + +@pytest.mark.parametrize("loss", ["linear", "square", "exponential"]) +def test_diabetes(loss): + # Check consistency on dataset diabetes. + reg = AdaBoostRegressor(loss=loss, random_state=0) + reg.fit(diabetes.data, diabetes.target) + score = reg.score(diabetes.data, diabetes.target) + assert score > 0.55 + + # Check we used multiple estimators + assert len(reg.estimators_) > 1 + # Check for distinct random states (see issue #7408) + assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_) + + +def test_staged_predict(): + # Check staged predictions. + rng = np.random.RandomState(0) + iris_weights = rng.randint(10, size=iris.target.shape) + diabetes_weights = rng.randint(10, size=diabetes.target.shape) + + clf = AdaBoostClassifier(n_estimators=10) + clf.fit(iris.data, iris.target, sample_weight=iris_weights) + + predictions = clf.predict(iris.data) + staged_predictions = [p for p in clf.staged_predict(iris.data)] + proba = clf.predict_proba(iris.data) + staged_probas = [p for p in clf.staged_predict_proba(iris.data)] + score = clf.score(iris.data, iris.target, sample_weight=iris_weights) + staged_scores = [ + s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights) + ] + + assert len(staged_predictions) == 10 + assert_array_almost_equal(predictions, staged_predictions[-1]) + assert len(staged_probas) == 10 + assert_array_almost_equal(proba, staged_probas[-1]) + assert len(staged_scores) == 10 + assert_array_almost_equal(score, staged_scores[-1]) + + # AdaBoost regression + clf = AdaBoostRegressor(n_estimators=10, random_state=0) + clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights) + + predictions = clf.predict(diabetes.data) + staged_predictions = [p for p in clf.staged_predict(diabetes.data)] + score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights) + staged_scores = [ + s + for s in clf.staged_score( + diabetes.data, diabetes.target, sample_weight=diabetes_weights + ) + ] + + assert len(staged_predictions) == 10 + assert_array_almost_equal(predictions, staged_predictions[-1]) + assert len(staged_scores) == 10 + assert_array_almost_equal(score, staged_scores[-1]) + + +def test_gridsearch(): + # Check that base trees can be grid-searched. + # AdaBoost classification + boost = AdaBoostClassifier(estimator=DecisionTreeClassifier()) + parameters = { + "n_estimators": (1, 2), + "estimator__max_depth": (1, 2), + } + clf = GridSearchCV(boost, parameters) + clf.fit(iris.data, iris.target) + + # AdaBoost regression + boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0) + parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)} + clf = GridSearchCV(boost, parameters) + clf.fit(diabetes.data, diabetes.target) + + +def test_pickle(): + # Check pickability. 
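+ # Round-trip fitted AdaBoost estimators through pickle.dumps/pickle.loads
+ # and check that the reloaded object has the same class and reproduces the
+ # exact training score.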
+ import pickle + + # Adaboost classifier + obj = AdaBoostClassifier() + obj.fit(iris.data, iris.target) + score = obj.score(iris.data, iris.target) + s = pickle.dumps(obj) + + obj2 = pickle.loads(s) + assert type(obj2) == obj.__class__ + score2 = obj2.score(iris.data, iris.target) + assert score == score2 + + # Adaboost regressor + obj = AdaBoostRegressor(random_state=0) + obj.fit(diabetes.data, diabetes.target) + score = obj.score(diabetes.data, diabetes.target) + s = pickle.dumps(obj) + + obj2 = pickle.loads(s) + assert type(obj2) == obj.__class__ + score2 = obj2.score(diabetes.data, diabetes.target) + assert score == score2 + + +def test_importances(): + # Check variable importances. + X, y = datasets.make_classification( + n_samples=2000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=1, + ) + + clf = AdaBoostClassifier() + + clf.fit(X, y) + importances = clf.feature_importances_ + + assert importances.shape[0] == 10 + assert (importances[:3, np.newaxis] >= importances[3:]).all() + + +def test_adaboost_classifier_sample_weight_error(): + # Test that it gives proper exception on incorrect sample weight. + clf = AdaBoostClassifier() + msg = re.escape("sample_weight.shape == (1,), expected (6,)") + with pytest.raises(ValueError, match=msg): + clf.fit(X, y_class, sample_weight=np.asarray([-1])) + + +def test_estimator(): + # Test different estimators. + from sklearn.ensemble import RandomForestClassifier + + # XXX doesn't work with y_class because RF doesn't support classes_ + # Shouldn't AdaBoost run a LabelBinarizer? + clf = AdaBoostClassifier(RandomForestClassifier()) + clf.fit(X, y_regr) + + clf = AdaBoostClassifier(SVC()) + clf.fit(X, y_class) + + from sklearn.ensemble import RandomForestRegressor + + clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0) + clf.fit(X, y_regr) + + clf = AdaBoostRegressor(SVR(), random_state=0) + clf.fit(X, y_regr) + + # Check that an empty discrete ensemble fails in fit, not predict. + X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]] + y_fail = ["foo", "bar", 1, 2] + clf = AdaBoostClassifier(SVC()) + with pytest.raises(ValueError, match="worse than random"): + clf.fit(X_fail, y_fail) + + +def test_sample_weights_infinite(): + msg = "Sample weights have reached infinite values" + clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0) + with pytest.warns(UserWarning, match=msg): + clf.fit(iris.data, iris.target) + + +@pytest.mark.parametrize( + "sparse_container, expected_internal_type", + zip( + [ + *CSC_CONTAINERS, + *CSR_CONTAINERS, + *LIL_CONTAINERS, + *COO_CONTAINERS, + *DOK_CONTAINERS, + ], + CSC_CONTAINERS + 4 * CSR_CONTAINERS, + ), +) +def test_sparse_classification(sparse_container, expected_internal_type): + # Check classification with sparse input. 
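+ # Strategy: a small SVC subclass records type(X) seen during fit; two
+ # AdaBoost models trained on sparse vs. dense inputs are then compared
+ # across predict, decision_function, predict_proba, score and their
+ # staged_* counterparts, and the recorded types confirm which sparse
+ # format the boosting loop actually used.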
+ + class CustomSVC(SVC): + """SVC variant that records the nature of the training set.""" + + def fit(self, X, y, sample_weight=None): + """Modification on fit caries data type for later verification.""" + super().fit(X, y, sample_weight=sample_weight) + self.data_type_ = type(X) + return self + + X, y = datasets.make_multilabel_classification( + n_classes=1, n_samples=15, n_features=5, random_state=42 + ) + # Flatten y to a 1d array + y = np.ravel(y) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + + # Trained on sparse format + sparse_classifier = AdaBoostClassifier( + estimator=CustomSVC(probability=True), + random_state=1, + ).fit(X_train_sparse, y_train) + + # Trained on dense format + dense_classifier = AdaBoostClassifier( + estimator=CustomSVC(probability=True), + random_state=1, + ).fit(X_train, y_train) + + # predict + sparse_clf_results = sparse_classifier.predict(X_test_sparse) + dense_clf_results = dense_classifier.predict(X_test) + assert_array_equal(sparse_clf_results, dense_clf_results) + + # decision_function + sparse_clf_results = sparse_classifier.decision_function(X_test_sparse) + dense_clf_results = dense_classifier.decision_function(X_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # predict_log_proba + sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse) + dense_clf_results = dense_classifier.predict_log_proba(X_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # predict_proba + sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse) + dense_clf_results = dense_classifier.predict_proba(X_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # score + sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test) + dense_clf_results = dense_classifier.score(X_test, y_test) + assert_array_almost_equal(sparse_clf_results, dense_clf_results) + + # staged_decision_function + sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse) + dense_clf_results = dense_classifier.staged_decision_function(X_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_almost_equal(sparse_clf_res, dense_clf_res) + + # staged_predict + sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse) + dense_clf_results = dense_classifier.staged_predict(X_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_equal(sparse_clf_res, dense_clf_res) + + # staged_predict_proba + sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse) + dense_clf_results = dense_classifier.staged_predict_proba(X_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_almost_equal(sparse_clf_res, dense_clf_res) + + # staged_score + sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test) + dense_clf_results = dense_classifier.staged_score(X_test, y_test) + for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): + assert_array_equal(sparse_clf_res, dense_clf_res) + + # Verify sparsity of data is maintained during training + types = [i.data_type_ for i in sparse_classifier.estimators_] + + assert all([t == expected_internal_type for t in types]) + + +@pytest.mark.parametrize( + "sparse_container, expected_internal_type", + zip( + [ + *CSC_CONTAINERS, + 
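+ # The enclosing zip pairs each input container with the format the
+ # boosting code is expected to see internally: CSC input stays CSC,
+ # while CSR, LIL, COO and DOK inputs are all converted to CSR (hence
+ # CSC_CONTAINERS + 4 * CSR_CONTAINERS on the other side of the zip).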
*CSR_CONTAINERS, + *LIL_CONTAINERS, + *COO_CONTAINERS, + *DOK_CONTAINERS, + ], + CSC_CONTAINERS + 4 * CSR_CONTAINERS, + ), +) +def test_sparse_regression(sparse_container, expected_internal_type): + # Check regression with sparse input. + + class CustomSVR(SVR): + """SVR variant that records the nature of the training set.""" + + def fit(self, X, y, sample_weight=None): + """Modification on fit caries data type for later verification.""" + super().fit(X, y, sample_weight=sample_weight) + self.data_type_ = type(X) + return self + + X, y = datasets.make_regression( + n_samples=15, n_features=50, n_targets=1, random_state=42 + ) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + X_train_sparse = sparse_container(X_train) + X_test_sparse = sparse_container(X_test) + + # Trained on sparse format + sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit( + X_train_sparse, y_train + ) + + # Trained on dense format + dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit( + X_train, y_train + ) + + # predict + sparse_regr_results = sparse_regressor.predict(X_test_sparse) + dense_regr_results = dense_regressor.predict(X_test) + assert_array_almost_equal(sparse_regr_results, dense_regr_results) + + # staged_predict + sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse) + dense_regr_results = dense_regressor.staged_predict(X_test) + for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results): + assert_array_almost_equal(sparse_regr_res, dense_regr_res) + + types = [i.data_type_ for i in sparse_regressor.estimators_] + + assert all([t == expected_internal_type for t in types]) + + +def test_sample_weight_adaboost_regressor(): + """ + AdaBoostRegressor should work without sample_weights in the base estimator + The random weighted sampling is done internally in the _boost method in + AdaBoostRegressor. 
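+ The DummyEstimator used below therefore only needs to implement fit and
+ predict, without accepting a sample_weight argument.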
+ """ + + class DummyEstimator(BaseEstimator): + def fit(self, X, y): + pass + + def predict(self, X): + return np.zeros(X.shape[0]) + + boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3) + boost.fit(X, y_regr) + assert len(boost.estimator_weights_) == len(boost.estimator_errors_) + + +def test_multidimensional_X(): + """ + Check that the AdaBoost estimators can work with n-dimensional + data matrix + """ + rng = np.random.RandomState(0) + + X = rng.randn(51, 3, 3) + yc = rng.choice([0, 1], 51) + yr = rng.randn(51) + + boost = AdaBoostClassifier(DummyClassifier(strategy="most_frequent")) + boost.fit(X, yc) + boost.predict(X) + boost.predict_proba(X) + + boost = AdaBoostRegressor(DummyRegressor()) + boost.fit(X, yr) + boost.predict(X) + + +def test_adaboostclassifier_without_sample_weight(): + X, y = iris.data, iris.target + estimator = NoSampleWeightWrapper(DummyClassifier()) + clf = AdaBoostClassifier(estimator=estimator) + err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__) + with pytest.raises(ValueError, match=err_msg): + clf.fit(X, y) + + +def test_adaboostregressor_sample_weight(): + # check that giving weight will have an influence on the error computed + # for a weak learner + rng = np.random.RandomState(42) + X = np.linspace(0, 100, num=1000) + y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001) + X = X.reshape(-1, 1) + + # add an arbitrary outlier + X[-1] *= 10 + y[-1] = 10000 + + # random_state=0 ensure that the underlying bootstrap will use the outlier + regr_no_outlier = AdaBoostRegressor( + estimator=LinearRegression(), n_estimators=1, random_state=0 + ) + regr_with_weight = clone(regr_no_outlier) + regr_with_outlier = clone(regr_no_outlier) + + # fit 3 models: + # - a model containing the outlier + # - a model without the outlier + # - a model containing the outlier but with a null sample-weight + regr_with_outlier.fit(X, y) + regr_no_outlier.fit(X[:-1], y[:-1]) + sample_weight = np.ones_like(y) + sample_weight[-1] = 0 + regr_with_weight.fit(X, y, sample_weight=sample_weight) + + score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1]) + score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1]) + score_with_weight = regr_with_weight.score(X[:-1], y[:-1]) + + assert score_with_outlier < score_no_outlier + assert score_with_outlier < score_with_weight + assert score_no_outlier == pytest.approx(score_with_weight) + + +def test_adaboost_consistent_predict(): + # check that predict_proba and predict give consistent results + # regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/14084 + X_train, X_test, y_train, y_test = train_test_split( + *datasets.load_digits(return_X_y=True), random_state=42 + ) + model = AdaBoostClassifier(random_state=42) + model.fit(X_train, y_train) + + assert_array_equal( + np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test) + ) + + +@pytest.mark.parametrize( + "model, X, y", + [ + (AdaBoostClassifier(), iris.data, iris.target), + (AdaBoostRegressor(), diabetes.data, diabetes.target), + ], +) +def test_adaboost_negative_weight_error(model, X, y): + sample_weight = np.ones_like(y) + sample_weight[-1] = -10 + + err_msg = "Negative values in data passed to `sample_weight`" + with pytest.raises(ValueError, match=err_msg): + model.fit(X, y, sample_weight=sample_weight) + + +def test_adaboost_numerically_stable_feature_importance_with_small_weights(): + """Check that we don't create NaN feature importance with numerically + instable inputs. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20320 + """ + rng = np.random.RandomState(42) + X = rng.normal(size=(1000, 10)) + y = rng.choice([0, 1], size=1000) + sample_weight = np.ones_like(y) * 1e-263 + tree = DecisionTreeClassifier(max_depth=10, random_state=12) + ada_model = AdaBoostClassifier(estimator=tree, n_estimators=20, random_state=12) + ada_model.fit(X, y, sample_weight=sample_weight) + assert np.isnan(ada_model.feature_importances_).sum() == 0 + + +def test_adaboost_decision_function(global_random_seed): + """Check that the decision function respects the symmetric constraint for weak + learners. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26520 + """ + n_classes = 3 + X, y = datasets.make_classification( + n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed + ) + clf = AdaBoostClassifier(n_estimators=1, random_state=global_random_seed).fit(X, y) + + y_score = clf.decision_function(X) + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + # With a single learner, we expect to have a decision function in + # {1, - 1 / (n_classes - 1)}. + assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)} + + # We can assert the same for staged_decision_function since we have a single learner + for y_score in clf.staged_decision_function(X): + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + # With a single learner, we expect to have a decision function in + # {1, - 1 / (n_classes - 1)}. + assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)} + + clf.set_params(n_estimators=5).fit(X, y) + + y_score = clf.decision_function(X) + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + for y_score in clf.staged_decision_function(X): + assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) + + +# TODO(1.8): remove +def test_deprecated_algorithm(): + adaboost_clf = AdaBoostClassifier(n_estimators=1, algorithm="SAMME") + with pytest.warns(FutureWarning, match="The parameter 'algorithm' is deprecated"): + adaboost_clf.fit(X, y_class) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..593d247e5bc403056808dafa8fba9d511457fbd0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/__init__.py @@ -0,0 +1,10 @@ +"""Importable modules that enable the use of experimental features or estimators. + +.. warning:: + + The features and estimators that are experimental aren't subject to + deprecation cycles. Use them at your own risks! +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_halving_search_cv.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_halving_search_cv.py new file mode 100644 index 0000000000000000000000000000000000000000..85f93b26459d0c5e154dc9e7000e81d586cb701e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_halving_search_cv.py @@ -0,0 +1,35 @@ +"""Enables Successive Halving search-estimators + +The API and results of these estimators might change without any deprecation +cycle. 
+ +Importing this file dynamically sets the +:class:`~sklearn.model_selection.HalvingRandomSearchCV` and +:class:`~sklearn.model_selection.HalvingGridSearchCV` as attributes of the +`model_selection` module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from sklearn.model_selection import HalvingGridSearchCV + + +The ``# noqa`` comment comment can be removed: it just tells linters like +flake8 to ignore the import, which appears as unused. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .. import model_selection +from ..model_selection._search_successive_halving import ( + HalvingGridSearchCV, + HalvingRandomSearchCV, +) + +# use settattr to avoid mypy errors when monkeypatching +setattr(model_selection, "HalvingRandomSearchCV", HalvingRandomSearchCV) +setattr(model_selection, "HalvingGridSearchCV", HalvingGridSearchCV) + +model_selection.__all__ += ["HalvingRandomSearchCV", "HalvingGridSearchCV"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..589348fe9bc21de2ae642d51be152de7958be0b1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py @@ -0,0 +1,23 @@ +"""This is now a no-op and can be safely removed from your code. + +It used to enable the use of +:class:`~sklearn.ensemble.HistGradientBoostingClassifier` and +:class:`~sklearn.ensemble.HistGradientBoostingRegressor` when they were still +:term:`experimental`, but these estimators are now stable and can be imported +normally from `sklearn.ensemble`. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Don't remove this file, we don't want to break users code just because the +# feature isn't experimental anymore. + +import warnings + +warnings.warn( + "Since version 1.0, " + "it is not needed to import enable_hist_gradient_boosting anymore. " + "HistGradientBoostingClassifier and HistGradientBoostingRegressor are now " + "stable and can be normally imported from sklearn.ensemble." +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_iterative_imputer.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_iterative_imputer.py new file mode 100644 index 0000000000000000000000000000000000000000..544e0d60eea2863a4516da89d5af475d94f4aba3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/enable_iterative_imputer.py @@ -0,0 +1,23 @@ +"""Enables IterativeImputer + +The API and results of this estimator might change without any deprecation +cycle. + +Importing this file dynamically sets :class:`~sklearn.impute.IterativeImputer` +as an attribute of the impute module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from impute + >>> from sklearn.impute import IterativeImputer +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .. 
import impute +from ..impute._iterative import IterativeImputer + +# use settattr to avoid mypy errors when monkeypatching +setattr(impute, "IterativeImputer", IterativeImputer) +impute.__all__ += ["IterativeImputer"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py new file mode 100644 index 0000000000000000000000000000000000000000..a247bfd3f64280cc338825c7695da9f9cb7688e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_hist_gradient_boosting.py @@ -0,0 +1,19 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +import pytest + +from sklearn.utils._testing import assert_run_python_script_without_output +from sklearn.utils.fixes import _IS_WASM + + +@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess") +def test_import_raises_warning(): + code = """ + import pytest + with pytest.warns(UserWarning, match="it is not needed to import"): + from sklearn.experimental import enable_hist_gradient_boosting # noqa + """ + pattern = "it is not needed to import enable_hist_gradient_boosting anymore" + assert_run_python_script_without_output(textwrap.dedent(code), pattern=pattern) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_iterative_imputer.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_iterative_imputer.py new file mode 100644 index 0000000000000000000000000000000000000000..17e9dfa0d037612d639a0e070fff8fd432b526a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_iterative_imputer.py @@ -0,0 +1,51 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +import pytest + +from sklearn.utils._testing import assert_run_python_script_without_output +from sklearn.utils.fixes import _IS_WASM + + +@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess") +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). 
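+ # Three scenarios are exercised below: importing the enabler and then
+ # IterativeImputer (works), importing sklearn.ensemble first and then the
+ # enabler (still works), and importing IterativeImputer without the
+ # enabler, which must raise an ImportError mentioning that the feature is
+ # experimental.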
+ pattern = "IterativeImputer is experimental" + good_import = """ + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import), pattern=pattern + ) + + good_import_with_ensemble_first = """ + import sklearn.ensemble + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import_with_ensemble_first), + pattern=pattern, + ) + + bad_imports = f""" + import pytest + + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.impute import IterativeImputer + + import sklearn.experimental + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.impute import IterativeImputer + """ + assert_run_python_script_without_output( + textwrap.dedent(bad_imports), + pattern=pattern, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_successive_halving.py b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..0ba273f94cc496550ab6b9a4d2b688a88d6fc43b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/experimental/tests/test_enable_successive_halving.py @@ -0,0 +1,53 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +import pytest + +from sklearn.utils._testing import assert_run_python_script_without_output +from sklearn.utils.fixes import _IS_WASM + + +@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess") +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). 
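+ # Same structure as the IterativeImputer test: the Halving*SearchCV
+ # classes are importable from sklearn.model_selection only after the
+ # enable_halving_search_cv import; otherwise an ImportError pointing to
+ # the experimental status is expected.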
+ pattern = "Halving(Grid|Random)SearchCV is experimental" + good_import = """ + from sklearn.experimental import enable_halving_search_cv + from sklearn.model_selection import HalvingGridSearchCV + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import), pattern=pattern + ) + + good_import_with_model_selection_first = """ + import sklearn.model_selection + from sklearn.experimental import enable_halving_search_cv + from sklearn.model_selection import HalvingGridSearchCV + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script_without_output( + textwrap.dedent(good_import_with_model_selection_first), + pattern=pattern, + ) + + bad_imports = f""" + import pytest + + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.model_selection import HalvingGridSearchCV + + import sklearn.experimental + with pytest.raises(ImportError, match={pattern!r}): + from sklearn.model_selection import HalvingRandomSearchCV + """ + assert_run_python_script_without_output( + textwrap.dedent(bad_imports), + pattern=pattern, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/README b/.venv/lib/python3.12/site-packages/sklearn/externals/README new file mode 100644 index 0000000000000000000000000000000000000000..eef7ba7dd652e73413dad8ed1c6096dc4066d214 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/README @@ -0,0 +1,7 @@ +This directory contains bundled external dependencies that are updated +every once in a while. + +Note for distribution packagers: if you want to remove the duplicated +code and depend on a packaged version, we suggest that you simply do a +symbolic link in this directory. + diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/externals/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97cda1858d5655b4179183372d271299298c62be --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/__init__.py @@ -0,0 +1,5 @@ + +""" +External, bundled dependencies. + +""" diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/_arff.py b/.venv/lib/python3.12/site-packages/sklearn/externals/_arff.py new file mode 100644 index 0000000000000000000000000000000000000000..7c9d51d0702ff5cbe70b80d405747e37a5e6cb1d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/_arff.py @@ -0,0 +1,1107 @@ +# ============================================================================= +# Federal University of Rio Grande do Sul (UFRGS) +# Connectionist Artificial Intelligence Laboratory (LIAC) +# Renato de Pontes Pereira - rppereira@inf.ufrgs.br +# ============================================================================= +# Copyright (c) 2011 Renato de Pontes Pereira, renato.ppontes at gmail dot com +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ============================================================================= + +''' +The liac-arff module implements functions to read and write ARFF files in +Python. It was created in the Connectionist Artificial Intelligence Laboratory +(LIAC), which takes place at the Federal University of Rio Grande do Sul +(UFRGS), in Brazil. + +ARFF (Attribute-Relation File Format) is an file format specially created for +describe datasets which are commonly used for machine learning experiments and +software. This file format was created to be used in Weka, the best +representative software for machine learning automated experiments. + +An ARFF file can be divided into two sections: header and data. The Header +describes the metadata of the dataset, including a general description of the +dataset, its name and its attributes. The source below is an example of a +header section in a XOR dataset:: + + % + % XOR Dataset + % + % Created by Renato Pereira + % rppereira@inf.ufrgs.br + % http://inf.ufrgs.br/~rppereira + % + % + @RELATION XOR + + @ATTRIBUTE input1 REAL + @ATTRIBUTE input2 REAL + @ATTRIBUTE y REAL + +The Data section of an ARFF file describes the observations of the dataset, in +the case of XOR dataset:: + + @DATA + 0.0,0.0,0.0 + 0.0,1.0,1.0 + 1.0,0.0,1.0 + 1.0,1.0,0.0 + % + % + % + +Notice that several lines are starting with an ``%`` symbol, denoting a +comment, thus, lines with ``%`` at the beginning will be ignored, except by the +description part at the beginning of the file. The declarations ``@RELATION``, +``@ATTRIBUTE``, and ``@DATA`` are all case insensitive and obligatory. + +For more information and details about the ARFF file description, consult +http://www.cs.waikato.ac.nz/~ml/weka/arff.html + + +ARFF Files in Python +~~~~~~~~~~~~~~~~~~~~ + +This module uses built-ins python objects to represent a deserialized ARFF +file. A dictionary is used as the container of the data and metadata of ARFF, +and have the following keys: + +- **description**: (OPTIONAL) a string with the description of the dataset. +- **relation**: (OBLIGATORY) a string with the name of the dataset. +- **attributes**: (OBLIGATORY) a list of attributes with the following + template:: + + (attribute_name, attribute_type) + + the attribute_name is a string, and attribute_type must be an string + or a list of strings. +- **data**: (OBLIGATORY) a list of data instances. Each data instance must be + a list with values, depending on the attributes. + +The above keys must follow the case which were described, i.e., the keys are +case sensitive. The attribute type ``attribute_type`` must be one of these +strings (they are not case sensitive): ``NUMERIC``, ``INTEGER``, ``REAL`` or +``STRING``. For nominal attributes, the ``atribute_type`` must be a list of +strings. 
+ +In this format, the XOR dataset presented above can be represented as a python +object as:: + + xor_dataset = { + 'description': 'XOR Dataset', + 'relation': 'XOR', + 'attributes': [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), + ], + 'data': [ + [0.0, 0.0, 0.0], + [0.0, 1.0, 1.0], + [1.0, 0.0, 1.0], + [1.0, 1.0, 0.0] + ] + } + + +Features +~~~~~~~~ + +This module provides several features, including: + +- Read and write ARFF files using python built-in structures, such dictionaries + and lists; +- Supports `scipy.sparse.coo `_ + and lists of dictionaries as used by SVMLight +- Supports the following attribute types: NUMERIC, REAL, INTEGER, STRING, and + NOMINAL; +- Has an interface similar to other built-in modules such as ``json``, or + ``zipfile``; +- Supports read and write the descriptions of files; +- Supports missing values and names with spaces; +- Supports unicode values and names; +- Fully compatible with Python 2.7+, Python 3.5+, pypy and pypy3; +- Under `MIT License `_ + +''' +__author__ = 'Renato de Pontes Pereira, Matthias Feurer, Joel Nothman' +__author_email__ = ('renato.ppontes@gmail.com, ' + 'feurerm@informatik.uni-freiburg.de, ' + 'joel.nothman@gmail.com') +__version__ = '2.4.0' + +import re +import csv +from typing import TYPE_CHECKING +from typing import Optional, List, Dict, Any, Iterator, Union, Tuple + +# CONSTANTS =================================================================== +_SIMPLE_TYPES = ['NUMERIC', 'REAL', 'INTEGER', 'STRING'] + +_TK_DESCRIPTION = '%' +_TK_COMMENT = '%' +_TK_RELATION = '@RELATION' +_TK_ATTRIBUTE = '@ATTRIBUTE' +_TK_DATA = '@DATA' + +_RE_RELATION = re.compile(r'^([^\{\}%,\s]*|\".*\"|\'.*\')$', re.UNICODE) +_RE_ATTRIBUTE = re.compile(r'^(\".*\"|\'.*\'|[^\{\}%,\s]*)\s+(.+)$', re.UNICODE) +_RE_QUOTE_CHARS = re.compile(r'["\'\\\s%,\000-\031]', re.UNICODE) +_RE_ESCAPE_CHARS = re.compile(r'(?=["\'\\%])|[\n\r\t\000-\031]') +_RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) +_RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) + +ArffDenseDataType = Iterator[List] +ArffSparseDataType = Tuple[List, ...] + + +if TYPE_CHECKING: + # typing_extensions is available when mypy is installed + from typing_extensions import TypedDict + + class ArffContainerType(TypedDict): + description: str + relation: str + attributes: List + data: Union[ArffDenseDataType, ArffSparseDataType] + +else: + ArffContainerType = Dict[str, Any] + + +def _build_re_values(): + quoted_re = r''' + " # open quote followed by zero or more of: + (?: + (?= len(conversors): + raise BadDataFormat(row) + # XXX: int 0 is used for implicit values, not '0' + values = [values[i] if i in values else 0 for i in + range(len(conversors))] + else: + if len(values) != len(conversors): + raise BadDataFormat(row) + + yield self._decode_values(values, conversors) + + @staticmethod + def _decode_values(values, conversors): + try: + values = [None if value is None else conversor(value) + for conversor, value + in zip(conversors, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + return values + + def encode_data(self, data, attributes): + '''(INTERNAL) Encodes a line of data. + + Data instances follow the csv format, i.e, attribute values are + delimited by commas. After converted from csv. + + :param data: a list of values. + :param attributes: a list of attributes. Used to check if data is valid. + :return: a string with the encoded data line. 
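+ Missing values (None, an empty string or NaN) are encoded as '?'.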
+ ''' + current_row = 0 + + for inst in data: + if len(inst) != len(attributes): + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, len(inst), len(attributes)) + ) + + new_data = [] + for value in inst: + if value is None or value == '' or value != value: + s = '?' + else: + s = encode_string(str(value)) + new_data.append(s) + + current_row += 1 + yield ','.join(new_data) + + +class _DataListMixin: + """Mixin to return a list from decode_rows instead of a generator""" + def decode_rows(self, stream, conversors): + return list(super().decode_rows(stream, conversors)) + + +class Data(_DataListMixin, DenseGeneratorData): + pass + + +class COOData: + def decode_rows(self, stream, conversors): + data, rows, cols = [], [], [] + for i, row in enumerate(stream): + values = _parse_values(row) + if not isinstance(values, dict): + raise BadLayout() + if not values: + continue + row_cols, values = zip(*sorted(values.items())) + try: + values = [value if value is None else conversors[key](value) + for key, value in zip(row_cols, values)] + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(row) + + data.extend(values) + rows.extend([i] * len(values)) + cols.extend(row_cols) + + return data, rows, cols + + def encode_data(self, data, attributes): + num_attributes = len(attributes) + new_data = [] + current_row = 0 + + row = data.row + col = data.col + data = data.data + + # Check if the rows are sorted + if not all(row[i] <= row[i + 1] for i in range(len(row) - 1)): + raise ValueError("liac-arff can only output COO matrices with " + "sorted rows.") + + for v, col, row in zip(data, col, row): + if row > current_row: + # Add empty rows if necessary + while current_row < row: + yield " ".join(["{", ','.join(new_data), "}"]) + new_data = [] + current_row += 1 + + if col >= num_attributes: + raise BadObject( + 'Instance %d has at least %d attributes, expected %d' % + (current_row, col + 1, num_attributes) + ) + + if v is None or v == '' or v != v: + s = '?' + else: + s = encode_string(str(v)) + new_data.append("%d %s" % (col, s)) + + yield " ".join(["{", ','.join(new_data), "}"]) + +class LODGeneratorData: + def decode_rows(self, stream, conversors): + for row in stream: + values = _parse_values(row) + + if not isinstance(values, dict): + raise BadLayout() + try: + yield {key: None if value is None else conversors[key](value) + for key, value in values.items()} + except ValueError as exc: + if 'float: ' in str(exc): + raise BadNumericalValue() + raise + except IndexError: + # conversor out of range + raise BadDataFormat(row) + + def encode_data(self, data, attributes): + current_row = 0 + + num_attributes = len(attributes) + for row in data: + new_data = [] + + if len(row) > 0 and max(row) >= num_attributes: + raise BadObject( + 'Instance %d has %d attributes, expected %d' % + (current_row, max(row) + 1, num_attributes) + ) + + for col in sorted(row): + v = row[col] + if v is None or v == '' or v != v: + s = '?' 
+ else: + s = encode_string(str(v)) + new_data.append("%d %s" % (col, s)) + + current_row += 1 + yield " ".join(["{", ','.join(new_data), "}"]) + +class LODData(_DataListMixin, LODGeneratorData): + pass + + +def _get_data_object_for_decoding(matrix_type): + if matrix_type == DENSE: + return Data() + elif matrix_type == COO: + return COOData() + elif matrix_type == LOD: + return LODData() + elif matrix_type == DENSE_GEN: + return DenseGeneratorData() + elif matrix_type == LOD_GEN: + return LODGeneratorData() + else: + raise ValueError("Matrix type %s not supported." % str(matrix_type)) + +def _get_data_object_for_encoding(matrix): + # Probably a scipy.sparse + if hasattr(matrix, 'format'): + if matrix.format == 'coo': + return COOData() + else: + raise ValueError('Cannot guess matrix format!') + elif isinstance(matrix[0], dict): + return LODData() + else: + return Data() + +# ============================================================================= + +# ADVANCED INTERFACE ========================================================== +class ArffDecoder: + '''An ARFF decoder.''' + + def __init__(self): + '''Constructor.''' + self._conversors = [] + self._current_line = 0 + + def _decode_comment(self, s): + '''(INTERNAL) Decodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded comment. + ''' + res = re.sub(r'^\%( )?', '', s) + return res + + def _decode_relation(self, s): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. The string must + start with alphabetic character and must be quoted if the name includes + spaces, otherwise this method will raise a `BadRelationFormat` exception. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a string with the decoded relation name. + ''' + _, v = s.split(' ', 1) + v = v.strip() + + if not _RE_RELATION.match(v): + raise BadRelationFormat() + + res = str(v.strip('"\'')) + return res + + def _decode_attribute(self, s): + '''(INTERNAL) Decodes an attribute line. + + The attribute is the most complex declaration in an arff file. All + attributes must follow the template:: + + @attribute + + where ``attribute-name`` is a string, quoted if the name contains any + whitespace, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + The nominal names follow the rules for the attribute names, i.e., they + must be quoted if the name contains whitespaces. + + This method must receive a normalized string, i.e., a string without + padding, including the "\r\n" characters. + + :param s: a normalized string. + :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES). 
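+ For simple types the second element is the upper-cased type name
+ (e.g. 'NUMERIC'); for nominal attributes it is the list of allowed values.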
+ ''' + _, v = s.split(' ', 1) + v = v.strip() + + # Verify the general structure of declaration + m = _RE_ATTRIBUTE.match(v) + if not m: + raise BadAttributeFormat() + + # Extracts the raw name and type + name, type_ = m.groups() + + # Extracts the final name + name = str(name.strip('"\'')) + + # Extracts the final type + if type_[:1] == "{" and type_[-1:] == "}": + try: + type_ = _parse_values(type_.strip('{} ')) + except Exception: + raise BadAttributeType() + if isinstance(type_, dict): + raise BadAttributeType() + + else: + # If not nominal, verify the type name + type_ = str(type_).upper() + if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']: + raise BadAttributeType() + + return (name, type_) + + def _decode(self, s, encode_nominal=False, matrix_type=DENSE): + '''Do the job the ``encode``.''' + + # Make sure this method is idempotent + self._current_line = 0 + + # If string, convert to a list of lines + if isinstance(s, str): + s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') + + # Create the return object + obj: ArffContainerType = { + 'description': '', + 'relation': '', + 'attributes': [], + 'data': [] + } + attribute_names = {} + + # Create the data helper object + data = _get_data_object_for_decoding(matrix_type) + + # Read all lines + STATE = _TK_DESCRIPTION + s = iter(s) + for row in s: + self._current_line += 1 + # Ignore empty lines + row = row.strip(' \r\n') + if not row: continue + + u_row = row.upper() + + # DESCRIPTION ----------------------------------------------------- + if u_row.startswith(_TK_DESCRIPTION) and STATE == _TK_DESCRIPTION: + obj['description'] += self._decode_comment(row) + '\n' + # ----------------------------------------------------------------- + + # RELATION -------------------------------------------------------- + elif u_row.startswith(_TK_RELATION): + if STATE != _TK_DESCRIPTION: + raise BadLayout() + + STATE = _TK_RELATION + obj['relation'] = self._decode_relation(row) + # ----------------------------------------------------------------- + + # ATTRIBUTE ------------------------------------------------------- + elif u_row.startswith(_TK_ATTRIBUTE): + if STATE != _TK_RELATION and STATE != _TK_ATTRIBUTE: + raise BadLayout() + + STATE = _TK_ATTRIBUTE + + attr = self._decode_attribute(row) + if attr[0] in attribute_names: + raise BadAttributeName(attr[0], attribute_names[attr[0]]) + else: + attribute_names[attr[0]] = self._current_line + obj['attributes'].append(attr) + + if isinstance(attr[1], (list, tuple)): + if encode_nominal: + conversor = EncodedNominalConversor(attr[1]) + else: + conversor = NominalConversor(attr[1]) + else: + CONVERSOR_MAP = {'STRING': str, + 'INTEGER': lambda x: int(float(x)), + 'NUMERIC': float, + 'REAL': float} + conversor = CONVERSOR_MAP[attr[1]] + + self._conversors.append(conversor) + # ----------------------------------------------------------------- + + # DATA ------------------------------------------------------------ + elif u_row.startswith(_TK_DATA): + if STATE != _TK_ATTRIBUTE: + raise BadLayout() + + break + # ----------------------------------------------------------------- + + # COMMENT --------------------------------------------------------- + elif u_row.startswith(_TK_COMMENT): + pass + # ----------------------------------------------------------------- + else: + # Never found @DATA + raise BadLayout() + + def stream(): + for row in s: + self._current_line += 1 + row = row.strip() + # Ignore empty lines and comment lines. 
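+ # The generator keeps consuming the same iterator as the header parsing
+ # loop above, so self._current_line stays accurate for error reporting
+ # in decode().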
+ if row and not row.startswith(_TK_COMMENT): + yield row + + # Alter the data object + obj['data'] = data.decode_rows(stream(), self._conversors) + if obj['description'].endswith('\n'): + obj['description'] = obj['description'][:-1] + + return obj + + def decode(self, s, encode_nominal=False, return_type=DENSE): + '''Returns the Python representation of a given ARFF file. + + When a file object is passed as an argument, this method reads lines + iteratively, avoiding to load unnecessary information to the memory. + + :param s: a string or file object with the ARFF file. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`, + `arff.DENSE_GEN` or `arff.LOD_GEN`. + Consult the sections on `working with sparse data`_ and `loading + progressively`_. + ''' + try: + return self._decode(s, encode_nominal=encode_nominal, + matrix_type=return_type) + except ArffException as e: + e.line = self._current_line + raise e + + +class ArffEncoder: + '''An ARFF encoder.''' + + def _encode_comment(self, s=''): + '''(INTERNAL) Encodes a comment line. + + Comments are single line strings starting, obligatorily, with the ``%`` + character, and can have any symbol, including whitespaces or special + characters. + + If ``s`` is None, this method will simply return an empty comment. + + :param s: (OPTIONAL) string. + :return: a string with the encoded comment line. + ''' + if s: + return '%s %s'%(_TK_COMMENT, s) + else: + return '%s' % _TK_COMMENT + + def _encode_relation(self, name): + '''(INTERNAL) Decodes a relation line. + + The relation declaration is a line with the format ``@RELATION + ``, where ``relation-name`` is a string. + + :param name: a string. + :return: a string with the encoded relation declaration. + ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + return '%s %s'%(_TK_RELATION, name) + + def _encode_attribute(self, name, type_): + '''(INTERNAL) Encodes an attribute line. + + The attribute follow the template:: + + @attribute + + where ``attribute-name`` is a string, and ``datatype`` can be: + + - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. + - Strings as ``STRING``. + - Dates (NOT IMPLEMENTED). + - Nominal attributes with format: + + {, , , ...} + + This method must receive a the name of the attribute and its type, if + the attribute type is nominal, ``type`` must be a list of values. + + :param name: a string. + :param type_: a string or a list of string. + :return: a string with the encoded attribute declaration. + ''' + for char in ' %{},': + if char in name: + name = '"%s"'%name + break + + if isinstance(type_, (tuple, list)): + type_tmp = ['%s' % encode_string(type_k) for type_k in type_] + type_ = '{%s}'%(', '.join(type_tmp)) + + return '%s %s %s'%(_TK_ATTRIBUTE, name, type_) + + def encode(self, obj): + '''Encodes a given object to an ARFF file. + + :param obj: the object containing the ARFF information. + :return: the ARFF file as an string. + ''' + data = [row for row in self.iter_encode(obj)] + + return '\n'.join(data) + + def iter_encode(self, obj): + '''The iterative version of `arff.ArffEncoder.encode`. + + This encodes iteratively a given object and return, one-by-one, the + lines of the ARFF file. + + :param obj: the object containing the ARFF information. + :return: (yields) the ARFF file as strings. 
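+ Lines are yielded in ARFF order: the description comments, the
+ @RELATION declaration, one @ATTRIBUTE line per attribute, the @DATA
+ marker and finally one encoded line per data row.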
+ ''' + # DESCRIPTION + if obj.get('description', None): + for row in obj['description'].split('\n'): + yield self._encode_comment(row) + + # RELATION + if not obj.get('relation'): + raise BadObject('Relation name not found or with invalid value.') + + yield self._encode_relation(obj['relation']) + yield '' + + # ATTRIBUTES + if not obj.get('attributes'): + raise BadObject('Attributes not found.') + + attribute_names = set() + for attr in obj['attributes']: + # Verify for bad object format + if not isinstance(attr, (tuple, list)) or \ + len(attr) != 2 or \ + not isinstance(attr[0], str): + raise BadObject('Invalid attribute declaration "%s"'%str(attr)) + + if isinstance(attr[1], str): + # Verify for invalid types + if attr[1] not in _SIMPLE_TYPES: + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify for bad object format + elif not isinstance(attr[1], (tuple, list)): + raise BadObject('Invalid attribute type "%s"'%str(attr)) + + # Verify attribute name is not used twice + if attr[0] in attribute_names: + raise BadObject('Trying to use attribute name "%s" for the ' + 'second time.' % str(attr[0])) + else: + attribute_names.add(attr[0]) + + yield self._encode_attribute(attr[0], attr[1]) + yield '' + attributes = obj['attributes'] + + # DATA + yield _TK_DATA + if 'data' in obj: + data = _get_data_object_for_encoding(obj.get('data')) + yield from data.encode_data(obj.get('data'), attributes) + + yield '' + +# ============================================================================= + +# BASIC INTERFACE ============================================================= +def load(fp, encode_nominal=False, return_type=DENSE): + '''Load a file-like object containing the ARFF document and convert it into + a Python object. + + :param fp: a file-like object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`, + `arff.DENSE_GEN` or `arff.LOD_GEN`. + Consult the sections on `working with sparse data`_ and `loading + progressively`_. + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(fp, encode_nominal=encode_nominal, + return_type=return_type) + +def loads(s, encode_nominal=False, return_type=DENSE): + '''Convert a string instance containing the ARFF document into a Python + object. + + :param s: a string object. + :param encode_nominal: boolean, if True perform a label encoding + while reading the .arff file. + :param return_type: determines the data structure used to store the + dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`, + `arff.DENSE_GEN` or `arff.LOD_GEN`. + Consult the sections on `working with sparse data`_ and `loading + progressively`_. + :return: a dictionary. + ''' + decoder = ArffDecoder() + return decoder.decode(s, encode_nominal=encode_nominal, + return_type=return_type) + +def dump(obj, fp): + '''Serialize an object representing the ARFF document to a given file-like + object. + + :param obj: a dictionary. + :param fp: a file-like object. + ''' + encoder = ArffEncoder() + generator = encoder.iter_encode(obj) + + last_row = next(generator) + for row in generator: + fp.write(last_row + '\n') + last_row = row + fp.write(last_row) + + return fp + +def dumps(obj): + '''Serialize an object representing the ARFF document, returning a string. + + :param obj: a dictionary. + :return: a string with the ARFF document. 
+ ''' + encoder = ArffEncoder() + return encoder.encode(obj) +# ============================================================================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/_array_api_compat_vendor.py b/.venv/lib/python3.12/site-packages/sklearn/externals/_array_api_compat_vendor.py new file mode 100644 index 0000000000000000000000000000000000000000..38cefd2fe6f3f51cb76caa0137eef1af927b9e45 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/_array_api_compat_vendor.py @@ -0,0 +1,5 @@ +# DO NOT RENAME THIS FILE +# This is a hook for array_api_extra/_lib/_compat.py +# to co-vendor array_api_compat and potentially override its functions. + +from .array_api_compat import * # noqa: F403 diff --git a/.venv/lib/python3.12/site-packages/sklearn/externals/conftest.py b/.venv/lib/python3.12/site-packages/sklearn/externals/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..c763d9761a438dca43e5856d6eaf9747cdeed2bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/externals/conftest.py @@ -0,0 +1,6 @@ +# Do not collect any tests in externals. This is more robust than using +# --ignore because --ignore needs a path and it is not convenient to pass in +# the externals path (very long install-dependent path in site-packages) when +# using --pyargs +def pytest_ignore_collect(collection_path, config): + return True diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0f8c53b4ffb6b5c0784743e414d6053ca0ddfa65 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/__init__.py @@ -0,0 +1,18 @@ +"""Feature extraction from raw data.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from . import image, text +from ._dict_vectorizer import DictVectorizer +from ._hash import FeatureHasher +from .image import grid_to_graph, img_to_graph + +__all__ = [ + "DictVectorizer", + "FeatureHasher", + "grid_to_graph", + "image", + "img_to_graph", + "text", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_dict_vectorizer.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_dict_vectorizer.py new file mode 100644 index 0000000000000000000000000000000000000000..689146bd229d83b463511a6578a4dab9bec7fa72 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_dict_vectorizer.py @@ -0,0 +1,459 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from array import array +from collections.abc import Iterable, Mapping +from numbers import Number +from operator import itemgetter + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import metadata_routing + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array +from ..utils.validation import check_is_fitted + + +class DictVectorizer(TransformerMixin, BaseEstimator): + """Transforms lists of feature-value mappings to vectors. + + This transformer turns lists of mappings (dict-like objects) of feature + names to feature values into Numpy arrays or scipy.sparse matrices for use + with scikit-learn estimators. 
+ + When feature values are strings, this transformer will do a binary one-hot + (aka one-of-K) coding: one boolean-valued feature is constructed for each + of the possible string values that the feature can take on. For instance, + a feature "f" that can take on the values "ham" and "spam" will become two + features in the output, one signifying "f=ham", the other "f=spam". + + If a feature value is a sequence or set of strings, this transformer + will iterate over the values and will count the occurrences of each string + value. + + However, note that this transformer will only do a binary one-hot encoding + when feature values are of type string. If categorical features are + represented as numeric values such as int or iterables of strings, the + DictVectorizer can be followed by + :class:`~sklearn.preprocessing.OneHotEncoder` to complete + binary one-hot encoding. + + Features that do not occur in a sample (mapping) will have a zero value + in the resulting array/matrix. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + dtype : dtype, default=np.float64 + The type of feature values. Passed to Numpy array/scipy.sparse matrix + constructors as the dtype argument. + separator : str, default="=" + Separator string used when constructing new features for one-hot + coding. + sparse : bool, default=True + Whether transform should produce scipy.sparse matrices. + sort : bool, default=True + Whether ``feature_names_`` and ``vocabulary_`` should be + sorted when fitting. + + Attributes + ---------- + vocabulary_ : dict + A dictionary mapping feature names to feature indices. + + feature_names_ : list + A list of length n_features containing the feature names (e.g., "f=ham" + and "f=spam"). + + See Also + -------- + FeatureHasher : Performs vectorization using only a hash function. + sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical + features encoded as columns of arbitrary data types. + + Examples + -------- + >>> from sklearn.feature_extraction import DictVectorizer + >>> v = DictVectorizer(sparse=False) + >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + >>> X = v.fit_transform(D) + >>> X + array([[2., 0., 1.], + [0., 1., 3.]]) + >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0}, + ... {'baz': 1.0, 'foo': 3.0}] + True + >>> v.transform({'foo': 4, 'unseen_feature': 3}) + array([[0., 0., 4.]]) + """ + + # This isn't something that people should be routing / using in a pipeline. + __metadata_request__inverse_transform = {"dict_type": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "dtype": "no_validation", # validation delegated to numpy, + "separator": [str], + "sparse": ["boolean"], + "sort": ["boolean"], + } + + def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True): + self.dtype = dtype + self.separator = separator + self.sparse = sparse + self.sort = sort + + def _add_iterable_element( + self, + f, + v, + feature_names, + vocab, + *, + fitting=True, + transforming=False, + indices=None, + values=None, + ): + """Add feature names for iterable of strings""" + for vv in v: + if isinstance(vv, str): + feature_name = "%s%s%s" % (f, self.separator, vv) + vv = 1 + else: + raise TypeError( + f"Unsupported type {type(vv)} in iterable " + "value. Only iterables of string are " + "supported." 
+ ) + if fitting and feature_name not in vocab: + vocab[feature_name] = len(feature_names) + feature_names.append(feature_name) + + if transforming and feature_name in vocab: + indices.append(vocab[feature_name]) + values.append(self.dtype(vv)) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn a list of feature name -> indices mappings. + + Parameters + ---------- + X : Mapping or iterable over Mappings + Dict(s) or Mapping(s) from feature names (arbitrary Python + objects) to feature values (strings or convertible to dtype). + + .. versionchanged:: 0.24 + Accepts multiple string values for one categorical feature. + + y : (ignored) + Ignored parameter. + + Returns + ------- + self : object + DictVectorizer class instance. + """ + feature_names = [] + vocab = {} + + for x in X: + for f, v in x.items(): + if isinstance(v, str): + feature_name = "%s%s%s" % (f, self.separator, v) + elif isinstance(v, Number) or (v is None): + feature_name = f + elif isinstance(v, Mapping): + raise TypeError( + f"Unsupported value type {type(v)} " + f"for {f}: {v}.\n" + "Mapping objects are not supported." + ) + elif isinstance(v, Iterable): + feature_name = None + self._add_iterable_element(f, v, feature_names, vocab) + + if feature_name is not None: + if feature_name not in vocab: + vocab[feature_name] = len(feature_names) + feature_names.append(feature_name) + + if self.sort: + feature_names.sort() + vocab = {f: i for i, f in enumerate(feature_names)} + + self.feature_names_ = feature_names + self.vocabulary_ = vocab + + return self + + def _transform(self, X, fitting): + # Sanity check: Python's array has no way of explicitly requesting the + # signed 32-bit integers that scipy.sparse needs, so we use the next + # best thing: typecode "i" (int). However, if that gives larger or + # smaller integers than 32-bit ones, np.frombuffer screws up. + assert array("i").itemsize == 4, ( + "sizeof(int) != 4 on your platform; please report this at" + " https://github.com/scikit-learn/scikit-learn/issues and" + " include the output from platform.platform() in your bug report" + ) + + dtype = self.dtype + if fitting: + feature_names = [] + vocab = {} + else: + feature_names = self.feature_names_ + vocab = self.vocabulary_ + + transforming = True + + # Process everything as sparse regardless of setting + X = [X] if isinstance(X, Mapping) else X + + indices = array("i") + indptr = [0] + # XXX we could change values to an array.array as well, but it + # would require (heuristic) conversion of dtype to typecode... + values = [] + + # collect all the possible feature names and build sparse matrix at + # same time + for x in X: + for f, v in x.items(): + if isinstance(v, str): + feature_name = "%s%s%s" % (f, self.separator, v) + v = 1 + elif isinstance(v, Number) or (v is None): + feature_name = f + elif not isinstance(v, Mapping) and isinstance(v, Iterable): + feature_name = None + self._add_iterable_element( + f, + v, + feature_names, + vocab, + fitting=fitting, + transforming=transforming, + indices=indices, + values=values, + ) + else: + raise TypeError( + f"Unsupported value Type {type(v)} " + f"for {f}: {v}.\n" + f"{type(v)} objects are not supported." 
+ ) + + if feature_name is not None: + if fitting and feature_name not in vocab: + vocab[feature_name] = len(feature_names) + feature_names.append(feature_name) + + if feature_name in vocab: + indices.append(vocab[feature_name]) + values.append(self.dtype(v)) + + indptr.append(len(indices)) + + if len(indptr) == 1: + raise ValueError("Sample sequence X is empty.") + + indices = np.frombuffer(indices, dtype=np.intc) + shape = (len(indptr) - 1, len(vocab)) + + result_matrix = sp.csr_matrix( + (values, indices, indptr), shape=shape, dtype=dtype + ) + + # Sort everything if asked + if fitting and self.sort: + feature_names.sort() + map_index = np.empty(len(feature_names), dtype=np.int32) + for new_val, f in enumerate(feature_names): + map_index[new_val] = vocab[f] + vocab[f] = new_val + result_matrix = result_matrix[:, map_index] + + if self.sparse: + result_matrix.sort_indices() + else: + result_matrix = result_matrix.toarray() + + if fitting: + self.feature_names_ = feature_names + self.vocabulary_ = vocab + + return result_matrix + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Learn a list of feature name -> indices mappings and transform X. + + Like fit(X) followed by transform(X), but does not require + materializing X in memory. + + Parameters + ---------- + X : Mapping or iterable over Mappings + Dict(s) or Mapping(s) from feature names (arbitrary Python + objects) to feature values (strings or convertible to dtype). + + .. versionchanged:: 0.24 + Accepts multiple string values for one categorical feature. + + y : (ignored) + Ignored parameter. + + Returns + ------- + Xa : {array, sparse matrix} + Feature vectors; always 2-d. + """ + return self._transform(X, fitting=True) + + def inverse_transform(self, X, dict_type=dict): + """Transform array or sparse matrix X back to feature mappings. + + X must have been produced by this DictVectorizer's transform or + fit_transform method; it may only have passed through transformers + that preserve the number of features and their order. + + In the case of one-hot/one-of-K coding, the constructed feature + names and values are returned rather than the original ones. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample matrix. + dict_type : type, default=dict + Constructor for feature mappings. Must conform to the + collections.Mapping API. + + Returns + ------- + X_original : list of dict_type objects of shape (n_samples,) + Feature mappings for the samples in X. + """ + check_is_fitted(self, "feature_names_") + + # COO matrix is not subscriptable + X = check_array(X, accept_sparse=["csr", "csc"]) + n_samples = X.shape[0] + + names = self.feature_names_ + dicts = [dict_type() for _ in range(n_samples)] + + if sp.issparse(X): + for i, j in zip(*X.nonzero()): + dicts[i][names[j]] = X[i, j] + else: + for i, d in enumerate(dicts): + for j, v in enumerate(X[i, :]): + if v != 0: + d[names[j]] = X[i, j] + + return dicts + + def transform(self, X): + """Transform feature->value dicts to array or sparse matrix. + + Named features not encountered during fit or fit_transform will be + silently ignored. + + Parameters + ---------- + X : Mapping or iterable over Mappings of shape (n_samples,) + Dict(s) or Mapping(s) from feature names (arbitrary Python + objects) to feature values (strings or convertible to dtype). + + Returns + ------- + Xa : {array, sparse matrix} + Feature vectors; always 2-d. 
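The `_transform` assembly above is plain CSR construction: per-sample column indices and values are appended to flat buffers while `indptr` records where each row starts and ends. A stripped-down sketch of the same pattern (a toy vocabulary of my own, not scikit-learn internals), which also shows why features missing from the fitted vocabulary are silently dropped at transform time:

```python
import scipy.sparse as sp

vocab = {"bar": 0, "baz": 1, "foo": 2}                     # fitted feature -> column
samples = [{"foo": 1, "bar": 2},
           {"foo": 3, "baz": 1, "unseen": 7}]              # "unseen" is not in vocab

indices, values, indptr = [], [], [0]
for x in samples:
    for f, v in x.items():
        if f in vocab:                                     # unseen features are silently skipped
            indices.append(vocab[f])
            values.append(float(v))
    indptr.append(len(indices))                            # close the current row

X = sp.csr_matrix((values, indices, indptr), shape=(len(samples), len(vocab)))
print(X.toarray())
# [[2. 0. 1.]
#  [0. 1. 3.]]
```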
+ """ + check_is_fitted(self, ["feature_names_", "vocabulary_"]) + return self._transform(X, fitting=False) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "feature_names_") + if any(not isinstance(name, str) for name in self.feature_names_): + feature_names = [str(name) for name in self.feature_names_] + else: + feature_names = self.feature_names_ + return np.asarray(feature_names, dtype=object) + + def restrict(self, support, indices=False): + """Restrict the features to those in support using feature selection. + + This function modifies the estimator in-place. + + Parameters + ---------- + support : array-like + Boolean mask or list of indices (as returned by the get_support + member of feature selectors). + indices : bool, default=False + Whether support is a list of indices. + + Returns + ------- + self : object + DictVectorizer class instance. + + Examples + -------- + >>> from sklearn.feature_extraction import DictVectorizer + >>> from sklearn.feature_selection import SelectKBest, chi2 + >>> v = DictVectorizer() + >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + >>> X = v.fit_transform(D) + >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1]) + >>> v.get_feature_names_out() + array(['bar', 'baz', 'foo'], ...) + >>> v.restrict(support.get_support()) + DictVectorizer() + >>> v.get_feature_names_out() + array(['bar', 'foo'], ...) + """ + check_is_fitted(self, "feature_names_") + + if not indices: + support = np.where(support)[0] + + names = self.feature_names_ + new_vocab = {} + for i in support: + new_vocab[names[i]] = len(new_vocab) + + self.vocabulary_ = new_vocab + self.feature_names_ = [ + f for f, i in sorted(new_vocab.items(), key=itemgetter(1)) + ] + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.dict = True + tags.input_tags.two_d_array = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hash.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hash.py new file mode 100644 index 0000000000000000000000000000000000000000..34756fa06eb4e701cd1f0364d604e6a432ebea68 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hash.py @@ -0,0 +1,209 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import chain +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import metadata_routing + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._param_validation import Interval, StrOptions +from ._hashing_fast import transform as _hashing_transform + + +def _iteritems(d): + """Like d.iteritems, but accepts any collections.Mapping.""" + return d.iteritems() if hasattr(d, "iteritems") else d.items() + + +class FeatureHasher(TransformerMixin, BaseEstimator): + """Implements feature hashing, aka the hashing trick. + + This class turns sequences of symbolic feature names (strings) into + scipy.sparse matrices, using a hash function to compute the matrix column + corresponding to a name. The hash function employed is the signed 32-bit + version of Murmurhash3. 
+ + Feature names of type byte string are used as-is. Unicode strings are + converted to UTF-8 first, but no Unicode normalization is done. + Feature values must be (finite) numbers. + + This class is a low-memory alternative to DictVectorizer and + CountVectorizer, intended for large-scale (online) learning and situations + where memory is tight, e.g. when running prediction code on embedded + devices. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.13 + + Parameters + ---------- + n_features : int, default=2**20 + The number of features (columns) in the output matrices. Small numbers + of features are likely to cause hash collisions, but large numbers + will cause larger coefficient dimensions in linear learners. + input_type : str, default='dict' + Choose a string from {'dict', 'pair', 'string'}. + Either "dict" (the default) to accept dictionaries over + (feature_name, value); "pair" to accept pairs of (feature_name, value); + or "string" to accept single strings. + feature_name should be a string, while value should be a number. + In the case of "string", a value of 1 is implied. + The feature_name is hashed to find the appropriate column for the + feature. The value's sign might be flipped in the output (but see + non_negative, below). + dtype : numpy dtype, default=np.float64 + The type of feature values. Passed to scipy.sparse matrix constructors + as the dtype argument. Do not set this to bool, np.boolean or any + unsigned integer type. + alternate_sign : bool, default=True + When True, an alternating sign is added to the features as to + approximately conserve the inner product in the hashed space even for + small n_features. This approach is similar to sparse random projection. + + .. versionchanged:: 0.19 + ``alternate_sign`` replaces the now deprecated ``non_negative`` + parameter. + + See Also + -------- + DictVectorizer : Vectorizes string-valued features using a hash table. + sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. 
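The column-index computation the docstring describes (signed 32-bit MurmurHash3 of the feature name, taken modulo `n_features`, with `alternate_sign` flipping the value's sign) can be reproduced with the public `sklearn.utils.murmurhash3_32` helper. A rough sketch, not the compiled implementation; the helper name `hashed_column` is mine:

```python
import numpy as np
from sklearn.utils import murmurhash3_32

def hashed_column(feature_name, n_features=2**20, alternate_sign=True, seed=0):
    # signed 32-bit MurmurHash3 of the UTF-8 encoded feature name
    h = np.int32(murmurhash3_32(feature_name.encode("utf-8"), seed=seed, positive=False))
    if h == np.iinfo(np.int32).min:
        # abs(-2**31) does not fit in int32; special-cased like the compiled helper
        col = (np.iinfo(np.int32).max - (n_features - 1)) % n_features
    else:
        col = abs(int(h)) % n_features
    sign = -1 if (alternate_sign and h < 0) else 1
    return col, sign

print(hashed_column("dog", n_features=10))   # (column, sign) for this feature name
```

The compiled `_hashing_fast` extension included further down applies the same computation per (feature, value) pair while filling the CSR buffers.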
+ + Examples + -------- + >>> from sklearn.feature_extraction import FeatureHasher + >>> h = FeatureHasher(n_features=10) + >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}] + >>> f = h.transform(D) + >>> f.toarray() + array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.], + [ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]]) + + With `input_type="string"`, the input must be an iterable over iterables of + strings: + + >>> h = FeatureHasher(n_features=8, input_type="string") + >>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]] + >>> f = h.transform(raw_X) + >>> f.toarray() + array([[ 0., 0., 0., -1., 0., -1., 0., 1.], + [ 0., 0., 0., -1., 0., -1., 0., 0.], + [ 0., -1., 0., 0., 0., 0., 0., 1.]]) + """ + + # raw_X should have been called X + __metadata_request__transform = {"raw_X": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="both")], + "input_type": [StrOptions({"dict", "pair", "string"})], + "dtype": "no_validation", # delegate to numpy + "alternate_sign": ["boolean"], + } + + def __init__( + self, + n_features=(2**20), + *, + input_type="dict", + dtype=np.float64, + alternate_sign=True, + ): + self.dtype = dtype + self.input_type = input_type + self.n_features = n_features + self.alternate_sign = alternate_sign + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X=None, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : Ignored + Not used, present here for API consistency by convention. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + FeatureHasher class instance. + """ + return self + + def transform(self, raw_X): + """Transform a sequence of instances to a scipy.sparse matrix. + + Parameters + ---------- + raw_X : iterable over iterable over raw features, length = n_samples + Samples. Each sample must be iterable an (e.g., a list or tuple) + containing/generating feature names (and optionally values, see + the input_type constructor argument) which will be hashed. + raw_X need not support the len function, so it can be the result + of a generator; n_samples is determined on the fly. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Feature matrix, for use with estimators or further transformers. + """ + raw_X = iter(raw_X) + if self.input_type == "dict": + raw_X = (_iteritems(d) for d in raw_X) + elif self.input_type == "string": + first_raw_X = next(raw_X) + if isinstance(first_raw_X, str): + raise ValueError( + "Samples can not be a single string. The input must be an iterable" + " over iterables of strings." 
+ ) + raw_X_ = chain([first_raw_X], raw_X) + raw_X = (((f, 1) for f in x) for x in raw_X_) + + indices, indptr, values = _hashing_transform( + raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0 + ) + n_samples = indptr.shape[0] - 1 + + if n_samples == 0: + raise ValueError("Cannot vectorize empty sequence.") + + X = sp.csr_matrix( + (values, indices, indptr), + dtype=self.dtype, + shape=(n_samples, self.n_features), + ) + X.sum_duplicates() # also sorts the indices + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + if self.input_type == "string": + tags.input_tags.string = True + elif self.input_type == "dict": + tags.input_tags.dict = True + tags.requires_fit = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..56d8ee4948c7d16355c73ecf22d7c43d93e6b2e9 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..5069d555d60eae0ccc4cbfb04c03fbfce78b87bc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_hashing_fast.pyx @@ -0,0 +1,89 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.stdlib cimport abs +from libcpp.vector cimport vector + +cimport numpy as cnp +import numpy as np +from ..utils._typedefs cimport int32_t, int64_t +from ..utils.murmurhash cimport murmurhash3_bytes_s32 +from ..utils._vector_sentinel cimport vector_to_nd_array + +cnp.import_array() + + +def transform(raw_X, Py_ssize_t n_features, dtype, + bint alternate_sign=1, unsigned int seed=0): + """Guts of FeatureHasher.transform. + + Returns + ------- + n_samples : integer + indices, indptr, values : lists + For constructing a scipy.sparse.csr_matrix. + + """ + cdef int32_t h + cdef double value + + cdef vector[int32_t] indices + cdef vector[int64_t] indptr + indptr.push_back(0) + + # Since Python array does not understand Numpy dtypes, we grow the indices + # and values arrays ourselves. Use a Py_ssize_t capacity for safety. + cdef Py_ssize_t capacity = 8192 # arbitrary + cdef int64_t size = 0 + cdef cnp.ndarray values = np.empty(capacity, dtype=dtype) + + for x in raw_X: + for f, v in x: + if isinstance(v, (str, unicode)): + f = "%s%s%s" % (f, '=', v) + value = 1 + else: + value = v + + if value == 0: + continue + + if isinstance(f, unicode): + f = (f).encode("utf-8") + # Need explicit type check because Murmurhash does not propagate + # all exceptions. Add "except *" there? 
+ elif not isinstance(f, bytes): + raise TypeError("feature names must be strings") + + h = murmurhash3_bytes_s32(f, seed) + + if h == - 2147483648: + # abs(-2**31) is undefined behavior because h is a `np.int32` + # The following is defined such that it is equal to: abs(-2**31) % n_features + indices.push_back((2147483647 - (n_features - 1)) % n_features) + else: + indices.push_back(abs(h) % n_features) + # improve inner product preservation in the hashed space + if alternate_sign: + value *= (h >= 0) * 2 - 1 + values[size] = value + size += 1 + + if size == capacity: + capacity *= 2 + # can't use resize member because there might be multiple + # references to the arrays due to Cython's error checking + values = np.resize(values, capacity) + + indptr.push_back(size) + + indices_array = vector_to_nd_array(&indices) + indptr_array = vector_to_nd_array(&indptr) + + if indptr_array[indptr_array.shape[0]-1] > np.iinfo(np.int32).max: # = 2**31 - 1 + # both indices and indptr have the same dtype in CSR arrays + indices_array = indices_array.astype(np.int64, copy=False) + else: + indptr_array = indptr_array.astype(np.int32, copy=False) + + return (indices_array, indptr_array, values[:size]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_stop_words.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_stop_words.py new file mode 100644 index 0000000000000000000000000000000000000000..6bc8e6d2f37dc06cf834cb42b363594901a86d1f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/_stop_words.py @@ -0,0 +1,328 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# This list of English stop words is taken from the "Glasgow Information +# Retrieval Group". The original list can be found at +# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words +ENGLISH_STOP_WORDS = frozenset( + [ + "a", + "about", + "above", + "across", + "after", + "afterwards", + "again", + "against", + "all", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "amoungst", + "amount", + "an", + "and", + "another", + "any", + "anyhow", + "anyone", + "anything", + "anyway", + "anywhere", + "are", + "around", + "as", + "at", + "back", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "below", + "beside", + "besides", + "between", + "beyond", + "bill", + "both", + "bottom", + "but", + "by", + "call", + "can", + "cannot", + "cant", + "co", + "con", + "could", + "couldnt", + "cry", + "de", + "describe", + "detail", + "do", + "done", + "down", + "due", + "during", + "each", + "eg", + "eight", + "either", + "eleven", + "else", + "elsewhere", + "empty", + "enough", + "etc", + "even", + "ever", + "every", + "everyone", + "everything", + "everywhere", + "except", + "few", + "fifteen", + "fifty", + "fill", + "find", + "fire", + "first", + "five", + "for", + "former", + "formerly", + "forty", + "found", + "four", + "from", + "front", + "full", + "further", + "get", + "give", + "go", + "had", + "has", + "hasnt", + "have", + "he", + "hence", + "her", + "here", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "however", + "hundred", + "i", + "ie", + "if", + "in", + "inc", + "indeed", + "interest", + "into", + "is", + "it", + "its", + "itself", + "keep", + "last", + "latter", + "latterly", + "least", + "less", + "ltd", + 
"made", + "many", + "may", + "me", + "meanwhile", + "might", + "mill", + "mine", + "more", + "moreover", + "most", + "mostly", + "move", + "much", + "must", + "my", + "myself", + "name", + "namely", + "neither", + "never", + "nevertheless", + "next", + "nine", + "no", + "nobody", + "none", + "noone", + "nor", + "not", + "nothing", + "now", + "nowhere", + "of", + "off", + "often", + "on", + "once", + "one", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "our", + "ours", + "ourselves", + "out", + "over", + "own", + "part", + "per", + "perhaps", + "please", + "put", + "rather", + "re", + "same", + "see", + "seem", + "seemed", + "seeming", + "seems", + "serious", + "several", + "she", + "should", + "show", + "side", + "since", + "sincere", + "six", + "sixty", + "so", + "some", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhere", + "still", + "such", + "system", + "take", + "ten", + "than", + "that", + "the", + "their", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "therefore", + "therein", + "thereupon", + "these", + "they", + "thick", + "thin", + "third", + "this", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "top", + "toward", + "towards", + "twelve", + "twenty", + "two", + "un", + "under", + "until", + "up", + "upon", + "us", + "very", + "via", + "was", + "we", + "well", + "were", + "what", + "whatever", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "with", + "within", + "without", + "would", + "yet", + "you", + "your", + "yours", + "yourself", + "yourselves", + ] +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/image.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/image.py new file mode 100644 index 0000000000000000000000000000000000000000..b571215de47be973d81ae1b4dbab517b4de571c6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/image.py @@ -0,0 +1,687 @@ +"""Utilities to extract features from images.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import product +from numbers import Integral, Number, Real + +import numpy as np +from numpy.lib.stride_tricks import as_strided +from scipy import sparse + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array, check_random_state +from ..utils._param_validation import Hidden, Interval, RealNotInt, validate_params + +__all__ = [ + "PatchExtractor", + "extract_patches_2d", + "grid_to_graph", + "img_to_graph", + "reconstruct_from_patches_2d", +] + +from ..utils.validation import validate_data + +############################################################################### +# From an image to a graph + + +def _make_edges_3d(n_x, n_y, n_z=1): + """Returns a list of edges for a 3D image. + + Parameters + ---------- + n_x : int + The size of the grid in the x direction. + n_y : int + The size of the grid in the y direction. 
+ n_z : integer, default=1 + The size of the grid in the z direction, defaults to 1 + """ + vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z)) + edges_deep = np.vstack((vertices[:, :, :-1].ravel(), vertices[:, :, 1:].ravel())) + edges_right = np.vstack((vertices[:, :-1].ravel(), vertices[:, 1:].ravel())) + edges_down = np.vstack((vertices[:-1].ravel(), vertices[1:].ravel())) + edges = np.hstack((edges_deep, edges_right, edges_down)) + return edges + + +def _compute_gradient_3d(edges, img): + _, n_y, n_z = img.shape + gradient = np.abs( + img[ + edges[0] // (n_y * n_z), + (edges[0] % (n_y * n_z)) // n_z, + (edges[0] % (n_y * n_z)) % n_z, + ] + - img[ + edges[1] // (n_y * n_z), + (edges[1] % (n_y * n_z)) // n_z, + (edges[1] % (n_y * n_z)) % n_z, + ] + ) + return gradient + + +# XXX: Why mask the image after computing the weights? + + +def _mask_edges_weights(mask, edges, weights=None): + """Apply a mask to edges (weighted or not)""" + inds = np.arange(mask.size) + inds = inds[mask.ravel()] + ind_mask = np.logical_and(np.isin(edges[0], inds), np.isin(edges[1], inds)) + edges = edges[:, ind_mask] + if weights is not None: + weights = weights[ind_mask] + if len(edges.ravel()): + maxval = edges.max() + else: + maxval = 0 + order = np.searchsorted(np.flatnonzero(mask), np.arange(maxval + 1)) + edges = order[edges] + if weights is None: + return edges + else: + return edges, weights + + +def _to_graph( + n_x, n_y, n_z, mask=None, img=None, return_as=sparse.coo_matrix, dtype=None +): + """Auxiliary function for img_to_graph and grid_to_graph""" + edges = _make_edges_3d(n_x, n_y, n_z) + + if dtype is None: # To not overwrite input dtype + if img is None: + dtype = int + else: + dtype = img.dtype + + if img is not None: + img = np.atleast_3d(img) + weights = _compute_gradient_3d(edges, img) + if mask is not None: + edges, weights = _mask_edges_weights(mask, edges, weights) + diag = img.squeeze()[mask] + else: + diag = img.ravel() + n_voxels = diag.size + else: + if mask is not None: + mask = mask.astype(dtype=bool, copy=False) + edges = _mask_edges_weights(mask, edges) + n_voxels = np.sum(mask) + else: + n_voxels = n_x * n_y * n_z + weights = np.ones(edges.shape[1], dtype=dtype) + diag = np.ones(n_voxels, dtype=dtype) + + diag_idx = np.arange(n_voxels) + i_idx = np.hstack((edges[0], edges[1])) + j_idx = np.hstack((edges[1], edges[0])) + graph = sparse.coo_matrix( + ( + np.hstack((weights, weights, diag)), + (np.hstack((i_idx, diag_idx)), np.hstack((j_idx, diag_idx))), + ), + (n_voxels, n_voxels), + dtype=dtype, + ) + if return_as is np.ndarray: + return graph.toarray() + return return_as(graph) + + +@validate_params( + { + "img": ["array-like"], + "mask": [None, np.ndarray], + "return_as": [type], + "dtype": "no_validation", # validation delegated to numpy + }, + prefer_skip_nested_validation=True, +) +def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None): + """Graph of the pixel-to-pixel gradient connections. + + Edges are weighted with the gradient values. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + img : array-like of shape (height, width) or (height, width, channel) + 2D or 3D image. + mask : ndarray of shape (height, width) or \ + (height, width, channel), dtype=bool, default=None + An optional mask of the image, to consider only part of the + pixels. + return_as : np.ndarray or a sparse matrix class, \ + default=sparse.coo_matrix + The class to use to build the returned adjacency matrix. 
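`_make_edges_3d` and `_to_graph` above build the pixel adjacency by wiring each voxel to its neighbour along every axis. For a 2x2 grid (vertices numbered 0..3 in row-major order) this yields the edges (0, 1), (2, 3) along one axis and (0, 2), (1, 3) along the other, which the public `grid_to_graph` wrapper (defined just below) turns into a symmetric adjacency matrix with a unit diagonal:

```python
import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

A = grid_to_graph(n_x=2, n_y=2, return_as=np.ndarray)
print(A)
# [[1 1 1 0]
#  [1 1 0 1]
#  [1 0 1 1]
#  [0 1 1 1]]
```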
+ dtype : dtype, default=None + The data of the returned sparse matrix. By default it is the + dtype of img. + + Returns + ------- + graph : ndarray or a sparse matrix class + The computed adjacency matrix. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.feature_extraction.image import img_to_graph + >>> img = np.array([[0, 0], [0, 1]]) + >>> img_to_graph(img, return_as=np.ndarray) + array([[0, 0, 0, 0], + [0, 0, 0, 1], + [0, 0, 0, 1], + [0, 1, 1, 1]]) + """ + img = np.atleast_3d(img) + n_x, n_y, n_z = img.shape + return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype) + + +@validate_params( + { + "n_x": [Interval(Integral, left=1, right=None, closed="left")], + "n_y": [Interval(Integral, left=1, right=None, closed="left")], + "n_z": [Interval(Integral, left=1, right=None, closed="left")], + "mask": [None, np.ndarray], + "return_as": [type], + "dtype": "no_validation", # validation delegated to numpy + }, + prefer_skip_nested_validation=True, +) +def grid_to_graph( + n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int +): + """Graph of the pixel-to-pixel connections. + + Edges exist if 2 voxels are connected. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_x : int + Dimension in x axis. + n_y : int + Dimension in y axis. + n_z : int, default=1 + Dimension in z axis. + mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None + An optional mask of the image, to consider only part of the + pixels. + return_as : np.ndarray or a sparse matrix class, \ + default=sparse.coo_matrix + The class to use to build the returned adjacency matrix. + dtype : dtype, default=int + The data of the returned sparse matrix. By default it is int. + + Returns + ------- + graph : np.ndarray or a sparse matrix class + The computed adjacency matrix. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.feature_extraction.image import grid_to_graph + >>> shape_img = (4, 4, 1) + >>> mask = np.zeros(shape=shape_img, dtype=bool) + >>> mask[[1, 2], [1, 2], :] = True + >>> graph = grid_to_graph(*shape_img, mask=mask) + >>> print(graph) + + Coords Values + (0, 0) 1 + (1, 1) 1 + """ + return _to_graph(n_x, n_y, n_z, mask=mask, return_as=return_as, dtype=dtype) + + +############################################################################### +# From an image to a set of small image patches + + +def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None): + """Compute the number of patches that will be extracted in an image. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + i_h : int + The image height + i_w : int + The image with + p_h : int + The height of a patch + p_w : int + The width of a patch + max_patches : int or float, default=None + The maximum number of patches to extract. If `max_patches` is a float + between 0 and 1, it is taken to be a proportion of the total number + of patches. If `max_patches` is None, all possible patches are extracted. 
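As implemented just below in `_compute_n_patches`, the patch count is simply the number of valid top-left corners, `(i_h - p_h + 1) * (i_w - p_w + 1)`, optionally capped by `max_patches` (an absolute count or a fraction of that total). A worked example matching the 427x640 sample image used in the docstrings further down:

```python
i_h, i_w = 427, 640     # image height and width
p_h, p_w = 2, 2         # patch height and width
n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
print(n_patches)        # 272214, the count reported by extract_patches_2d below
```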
+ """ + n_h = i_h - p_h + 1 + n_w = i_w - p_w + 1 + all_patches = n_h * n_w + + if max_patches: + if isinstance(max_patches, (Integral)) and max_patches < all_patches: + return max_patches + elif isinstance(max_patches, (Integral)) and max_patches >= all_patches: + return all_patches + elif isinstance(max_patches, (Real)) and 0 < max_patches < 1: + return int(max_patches * all_patches) + else: + raise ValueError("Invalid value for max_patches: %r" % max_patches) + else: + return all_patches + + +def _extract_patches(arr, patch_shape=8, extraction_step=1): + """Extracts patches of any n-dimensional array in place using strides. + + Given an n-dimensional array it will return a 2n-dimensional array with + the first n dimensions indexing patch position and the last n indexing + the patch content. This operation is immediate (O(1)). A reshape + performed on the first n dimensions will cause numpy to copy data, leading + to a list of extracted patches. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + arr : ndarray + n-dimensional array of which patches are to be extracted + + patch_shape : int or tuple of length arr.ndim.default=8 + Indicates the shape of the patches to be extracted. If an + integer is given, the shape will be a hypercube of + sidelength given by its value. + + extraction_step : int or tuple of length arr.ndim, default=1 + Indicates step size at which extraction shall be performed. + If integer is given, then the step is uniform in all dimensions. + + + Returns + ------- + patches : strided ndarray + 2n-dimensional array indexing patches on first n dimensions and + containing patches on the last n dimensions. These dimensions + are fake, but this way no data is copied. A simple reshape invokes + a copying operation to obtain a list of patches: + result.reshape([-1] + list(patch_shape)) + """ + + arr_ndim = arr.ndim + + if isinstance(patch_shape, Number): + patch_shape = tuple([patch_shape] * arr_ndim) + if isinstance(extraction_step, Number): + extraction_step = tuple([extraction_step] * arr_ndim) + + patch_strides = arr.strides + + slices = tuple(slice(None, None, st) for st in extraction_step) + indexing_strides = arr[slices].strides + + patch_indices_shape = ( + (np.array(arr.shape) - np.array(patch_shape)) // np.array(extraction_step) + ) + 1 + + shape = tuple(list(patch_indices_shape) + list(patch_shape)) + strides = tuple(list(indexing_strides) + list(patch_strides)) + + patches = as_strided(arr, shape=shape, strides=strides) + return patches + + +@validate_params( + { + "image": [np.ndarray], + "patch_size": [tuple, list], + "max_patches": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def extract_patches_2d(image, patch_size, *, max_patches=None, random_state=None): + """Reshape a 2D image into a collection of patches. + + The resulting patches are allocated in a dedicated array. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + image : ndarray of shape (image_height, image_width) or \ + (image_height, image_width, n_channels) + The original image data. For color images, the last dimension specifies + the channel: a RGB image would have `n_channels=3`. + + patch_size : tuple of int (patch_height, patch_width) + The dimensions of one patch. + + max_patches : int or float, default=None + The maximum number of patches to extract. 
If `max_patches` is a float + between 0 and 1, it is taken to be a proportion of the total number + of patches. If `max_patches` is None it corresponds to the total number + of patches that can be extracted. + + random_state : int, RandomState instance, default=None + Determines the random number generator used for random sampling when + `max_patches` is not None. Use an int to make the randomness + deterministic. + See :term:`Glossary `. + + Returns + ------- + patches : array of shape (n_patches, patch_height, patch_width) or \ + (n_patches, patch_height, patch_width, n_channels) + The collection of patches extracted from the image, where `n_patches` + is either `max_patches` or the total number of patches that can be + extracted. + + Examples + -------- + >>> from sklearn.datasets import load_sample_image + >>> from sklearn.feature_extraction import image + >>> # Use the array data from the first image in this dataset: + >>> one_image = load_sample_image("china.jpg") + >>> print('Image shape: {}'.format(one_image.shape)) + Image shape: (427, 640, 3) + >>> patches = image.extract_patches_2d(one_image, (2, 2)) + >>> print('Patches shape: {}'.format(patches.shape)) + Patches shape: (272214, 2, 2, 3) + >>> # Here are just two of these patches: + >>> print(patches[1]) + [[[174 201 231] + [174 201 231]] + [[173 200 230] + [173 200 230]]] + >>> print(patches[800]) + [[[187 214 243] + [188 215 244]] + [[187 214 243] + [188 215 244]]] + """ + i_h, i_w = image.shape[:2] + p_h, p_w = patch_size + + if p_h > i_h: + raise ValueError( + "Height of the patch should be less than the height of the image." + ) + + if p_w > i_w: + raise ValueError( + "Width of the patch should be less than the width of the image." + ) + + image = check_array(image, allow_nd=True) + image = image.reshape((i_h, i_w, -1)) + n_colors = image.shape[-1] + + extracted_patches = _extract_patches( + image, patch_shape=(p_h, p_w, n_colors), extraction_step=1 + ) + + n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches) + if max_patches: + rng = check_random_state(random_state) + i_s = rng.randint(i_h - p_h + 1, size=n_patches) + j_s = rng.randint(i_w - p_w + 1, size=n_patches) + patches = extracted_patches[i_s, j_s, 0] + else: + patches = extracted_patches + + patches = patches.reshape(-1, p_h, p_w, n_colors) + # remove the color dimension if useless + if patches.shape[-1] == 1: + return patches.reshape((n_patches, p_h, p_w)) + else: + return patches + + +@validate_params( + {"patches": [np.ndarray], "image_size": [tuple, Hidden(list)]}, + prefer_skip_nested_validation=True, +) +def reconstruct_from_patches_2d(patches, image_size): + """Reconstruct the image from all of its patches. + + Patches are assumed to overlap and the image is constructed by filling in + the patches from left to right, top to bottom, averaging the overlapping + regions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + patches : ndarray of shape (n_patches, patch_height, patch_width) or \ + (n_patches, patch_height, patch_width, n_channels) + The complete set of patches. If the patches contain colour information, + channels are indexed along the last dimension: RGB patches would + have `n_channels=3`. + + image_size : tuple of int (image_height, image_width) or \ + (image_height, image_width, n_channels) + The size of the image that will be reconstructed. + + Returns + ------- + image : ndarray of shape image_size + The reconstructed image. 
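A minimal round-trip sketch of the averaging reconstruction described above: when every overlapping patch is kept, each pixel's accumulated sum divided by the number of patches covering it recovers the original image exactly (up to floating point).

```python
import numpy as np
from sklearn.feature_extraction.image import (
    extract_patches_2d, reconstruct_from_patches_2d)

img = np.arange(25, dtype=float).reshape(5, 5)
patches = extract_patches_2d(img, (3, 3))          # all (5-3+1)**2 = 9 patches
print(patches.shape)                                # (9, 3, 3)

rec = reconstruct_from_patches_2d(patches, img.shape)
print(np.allclose(rec, img))                        # True: exact reconstruction
```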
+ + Examples + -------- + >>> from sklearn.datasets import load_sample_image + >>> from sklearn.feature_extraction import image + >>> one_image = load_sample_image("china.jpg") + >>> print('Image shape: {}'.format(one_image.shape)) + Image shape: (427, 640, 3) + >>> image_patches = image.extract_patches_2d(image=one_image, patch_size=(10, 10)) + >>> print('Patches shape: {}'.format(image_patches.shape)) + Patches shape: (263758, 10, 10, 3) + >>> image_reconstructed = image.reconstruct_from_patches_2d( + ... patches=image_patches, + ... image_size=one_image.shape + ... ) + >>> print(f"Reconstructed shape: {image_reconstructed.shape}") + Reconstructed shape: (427, 640, 3) + """ + i_h, i_w = image_size[:2] + p_h, p_w = patches.shape[1:3] + img = np.zeros(image_size) + # compute the dimensions of the patches array + n_h = i_h - p_h + 1 + n_w = i_w - p_w + 1 + for p, (i, j) in zip(patches, product(range(n_h), range(n_w))): + img[i : i + p_h, j : j + p_w] += p + + for i in range(i_h): + for j in range(i_w): + # divide by the amount of overlap + # XXX: is this the most efficient way? memory-wise yes, cpu wise? + img[i, j] /= float(min(i + 1, p_h, i_h - i) * min(j + 1, p_w, i_w - j)) + return img + + +class PatchExtractor(TransformerMixin, BaseEstimator): + """Extracts patches from a collection of images. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + patch_size : tuple of int (patch_height, patch_width), default=None + The dimensions of one patch. If set to None, the patch size will be + automatically set to `(img_height // 10, img_width // 10)`, where + `img_height` and `img_width` are the dimensions of the input images. + + max_patches : int or float, default=None + The maximum number of patches per image to extract. If `max_patches` is + a float in (0, 1), it is taken to mean a proportion of the total number + of patches. If set to None, extract all possible patches. + + random_state : int, RandomState instance, default=None + Determines the random number generator used for random sampling when + `max_patches is not None`. Use an int to make the randomness + deterministic. + See :term:`Glossary `. + + See Also + -------- + reconstruct_from_patches_2d : Reconstruct image from all of its patches. + + Notes + ----- + This estimator is stateless and does not need to be fitted. However, we + recommend to call :meth:`fit_transform` instead of :meth:`transform`, as + parameter validation is only performed in :meth:`fit`. + + Examples + -------- + >>> from sklearn.datasets import load_sample_images + >>> from sklearn.feature_extraction import image + >>> # Use the array data from the second image in this dataset: + >>> X = load_sample_images().images[1] + >>> X = X[None, ...] 
+ >>> print(f"Image shape: {X.shape}") + Image shape: (1, 427, 640, 3) + >>> pe = image.PatchExtractor(patch_size=(10, 10)) + >>> pe_trans = pe.transform(X) + >>> print(f"Patches shape: {pe_trans.shape}") + Patches shape: (263758, 10, 10, 3) + >>> X_reconstructed = image.reconstruct_from_patches_2d(pe_trans, X.shape[1:]) + >>> print(f"Reconstructed shape: {X_reconstructed.shape}") + Reconstructed shape: (427, 640, 3) + """ + + _parameter_constraints: dict = { + "patch_size": [tuple, None], + "max_patches": [ + None, + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(Integral, 1, None, closed="left"), + ], + "random_state": ["random_state"], + } + + def __init__(self, *, patch_size=None, max_patches=None, random_state=None): + self.patch_size = patch_size + self.max_patches = max_patches + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validate the parameters of the estimator. + + This method allows to: (i) validate the parameters of the estimator and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : ndarray of shape (n_samples, image_height, image_width) or \ + (n_samples, image_height, image_width, n_channels) + Array of images from which to extract patches. For color images, + the last dimension specifies the channel: a RGB image would have + `n_channels=3`. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + return self + + def transform(self, X): + """Transform the image samples in `X` into a matrix of patch data. + + Parameters + ---------- + X : ndarray of shape (n_samples, image_height, image_width) or \ + (n_samples, image_height, image_width, n_channels) + Array of images from which to extract patches. For color images, + the last dimension specifies the channel: a RGB image would have + `n_channels=3`. + + Returns + ------- + patches : array of shape (n_patches, patch_height, patch_width) or \ + (n_patches, patch_height, patch_width, n_channels) + The collection of patches extracted from the images, where + `n_patches` is either `n_samples * max_patches` or the total + number of patches that can be extracted. + """ + X = validate_data( + self, + X=X, + ensure_2d=False, + allow_nd=True, + ensure_min_samples=1, + ensure_min_features=1, + reset=False, + ) + random_state = check_random_state(self.random_state) + n_imgs, img_height, img_width = X.shape[:3] + if self.patch_size is None: + patch_size = img_height // 10, img_width // 10 + else: + if len(self.patch_size) != 2: + raise ValueError( + "patch_size must be a tuple of two integers. Got" + f" {self.patch_size} instead." 
+ ) + patch_size = self.patch_size + + n_imgs, img_height, img_width = X.shape[:3] + X = np.reshape(X, (n_imgs, img_height, img_width, -1)) + n_channels = X.shape[-1] + + # compute the dimensions of the patches array + patch_height, patch_width = patch_size + n_patches = _compute_n_patches( + img_height, img_width, patch_height, patch_width, self.max_patches + ) + patches_shape = (n_imgs * n_patches,) + patch_size + if n_channels > 1: + patches_shape += (n_channels,) + + # extract the patches + patches = np.empty(patches_shape) + for ii, image in enumerate(X): + patches[ii * n_patches : (ii + 1) * n_patches] = extract_patches_2d( + image, + patch_size, + max_patches=self.max_patches, + random_state=random_state, + ) + return patches + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.input_tags.three_d_array = True + tags.requires_fit = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/meson.build b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..f810d7b28576c82945ac4f285b55ab4ffc6c8fe9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/meson.build @@ -0,0 +1,7 @@ +py.extension_module( + '_hashing_fast', + [cython_gen_cpp.process('_hashing_fast.pyx'), utils_cython_tree], + dependencies: [np_dep], + subdir: 'sklearn/feature_extraction', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py new file mode 100644 index 0000000000000000000000000000000000000000..d32248978a97ae85b7f3feee71fa234cbdeab9c6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_extraction/text.py @@ -0,0 +1,2137 @@ +"""Utilities to build feature vectors from text documents.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import re +import unicodedata +import warnings +from collections import defaultdict +from collections.abc import Mapping +from functools import partial +from numbers import Integral +from operator import itemgetter + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import metadata_routing + +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context +from ..exceptions import NotFittedError +from ..preprocessing import normalize +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils.fixes import _IS_32BIT +from ..utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, validate_data +from ._hash import FeatureHasher +from ._stop_words import ENGLISH_STOP_WORDS + +__all__ = [ + "ENGLISH_STOP_WORDS", + "CountVectorizer", + "HashingVectorizer", + "TfidfTransformer", + "TfidfVectorizer", + "strip_accents_ascii", + "strip_accents_unicode", + "strip_tags", +] + + +def _preprocess(doc, accent_function=None, lower=False): + """Chain together an optional series of text preprocessing steps to + apply to a document. + + Parameters + ---------- + doc: str + The string to preprocess + accent_function: callable, default=None + Function for handling accented characters. Common strategies include + normalizing and removing. 
+ lower: bool, default=False + Whether to use str.lower to lowercase all of the text + + Returns + ------- + doc: str + preprocessed string + """ + if lower: + doc = doc.lower() + if accent_function is not None: + doc = accent_function(doc) + return doc + + +def _analyze( + doc, + analyzer=None, + tokenizer=None, + ngrams=None, + preprocessor=None, + decoder=None, + stop_words=None, +): + """Chain together an optional series of text processing steps to go from + a single document to ngrams, with or without tokenizing or preprocessing. + + If analyzer is used, only the decoder argument is used, as the analyzer is + intended to replace the preprocessor, tokenizer, and ngrams steps. + + Parameters + ---------- + analyzer: callable, default=None + tokenizer: callable, default=None + ngrams: callable, default=None + preprocessor: callable, default=None + decoder: callable, default=None + stop_words: list, default=None + + Returns + ------- + ngrams: list + A sequence of tokens, possibly with pairs, triples, etc. + """ + + if decoder is not None: + doc = decoder(doc) + if analyzer is not None: + doc = analyzer(doc) + else: + if preprocessor is not None: + doc = preprocessor(doc) + if tokenizer is not None: + doc = tokenizer(doc) + if ngrams is not None: + if stop_words is not None: + doc = ngrams(doc, stop_words) + else: + doc = ngrams(doc) + return doc + + +def strip_accents_unicode(s): + """Transform accentuated unicode symbols into their simple counterpart. + + Warning: the python-level loop and join operations make this + implementation 20 times slower than the strip_accents_ascii basic + normalization. + + Parameters + ---------- + s : str + The string to strip. + + Returns + ------- + s : str + The stripped string. + + See Also + -------- + strip_accents_ascii : Remove accentuated char for any unicode symbol that + has a direct ASCII equivalent. + """ + try: + # If `s` is ASCII-compatible, then it does not contain any accented + # characters and we can avoid an expensive list comprehension + s.encode("ASCII", errors="strict") + return s + except UnicodeEncodeError: + normalized = unicodedata.normalize("NFKD", s) + return "".join([c for c in normalized if not unicodedata.combining(c)]) + + +def strip_accents_ascii(s): + """Transform accentuated unicode symbols into ascii or nothing. + + Warning: this solution is only suited for languages that have a direct + transliteration to ASCII symbols. + + Parameters + ---------- + s : str + The string to strip. + + Returns + ------- + s : str + The stripped string. + + See Also + -------- + strip_accents_unicode : Remove accentuated char for any unicode symbol. + """ + nkfd_form = unicodedata.normalize("NFKD", s) + return nkfd_form.encode("ASCII", "ignore").decode("ASCII") + + +def strip_tags(s): + """Basic regexp based HTML / XML tag stripper function. + + For serious HTML/XML preprocessing you should rather use an external + library such as lxml or BeautifulSoup. + + Parameters + ---------- + s : str + The string to strip. + + Returns + ------- + s : str + The stripped string. 
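A quick illustration of the difference between the two accent-stripping helpers above: `strip_accents_unicode` removes combining marks but keeps characters that have no ASCII counterpart, while `strip_accents_ascii` drops anything that cannot be transliterated.

```python
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode

print(strip_accents_unicode("àéî"))    # 'aei'
print(strip_accents_ascii("àéî"))      # 'aei'
print(strip_accents_unicode("これ"))    # 'これ'  (kept: no combining marks to strip)
print(strip_accents_ascii("これ"))      # ''     (dropped: no ASCII transliteration)
```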
+ """ + return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s) + + +def _check_stop_list(stop): + if stop == "english": + return ENGLISH_STOP_WORDS + elif isinstance(stop, str): + raise ValueError("not a built-in stop list: %s" % stop) + elif stop is None: + return None + else: # assume it's a collection + return frozenset(stop) + + +class _VectorizerMixin: + """Provides common code for text vectorizers (tokenization logic).""" + + _white_spaces = re.compile(r"\s\s+") + + def decode(self, doc): + """Decode the input into a string of unicode symbols. + + The decoding strategy depends on the vectorizer parameters. + + Parameters + ---------- + doc : bytes or str + The string to decode. + + Returns + ------- + doc: str + A string of unicode symbols. + """ + if self.input == "filename": + with open(doc, "rb") as fh: + doc = fh.read() + + elif self.input == "file": + doc = doc.read() + + if isinstance(doc, bytes): + doc = doc.decode(self.encoding, self.decode_error) + + if doc is np.nan: + raise ValueError( + "np.nan is an invalid document, expected byte or unicode string." + ) + + return doc + + def _word_ngrams(self, tokens, stop_words=None): + """Turn tokens into a sequence of n-grams after stop words filtering""" + # handle stop words + if stop_words is not None: + tokens = [w for w in tokens if w not in stop_words] + + # handle token n-grams + min_n, max_n = self.ngram_range + if max_n != 1: + original_tokens = tokens + if min_n == 1: + # no need to do any slicing for unigrams + # just iterate through the original tokens + tokens = list(original_tokens) + min_n += 1 + else: + tokens = [] + + n_original_tokens = len(original_tokens) + + # bind method outside of loop to reduce overhead + tokens_append = tokens.append + space_join = " ".join + + for n in range(min_n, min(max_n + 1, n_original_tokens + 1)): + for i in range(n_original_tokens - n + 1): + tokens_append(space_join(original_tokens[i : i + n])) + + return tokens + + def _char_ngrams(self, text_document): + """Tokenize text_document into a sequence of character n-grams""" + # normalize white spaces + text_document = self._white_spaces.sub(" ", text_document) + + text_len = len(text_document) + min_n, max_n = self.ngram_range + if min_n == 1: + # no need to do any slicing for unigrams + # iterate through the string + ngrams = list(text_document) + min_n += 1 + else: + ngrams = [] + + # bind method outside of loop to reduce overhead + ngrams_append = ngrams.append + + for n in range(min_n, min(max_n + 1, text_len + 1)): + for i in range(text_len - n + 1): + ngrams_append(text_document[i : i + n]) + return ngrams + + def _char_wb_ngrams(self, text_document): + """Whitespace sensitive char-n-gram tokenization. + + Tokenize text_document into a sequence of character n-grams + operating only inside word boundaries. n-grams at the edges + of words are padded with space.""" + # normalize white spaces + text_document = self._white_spaces.sub(" ", text_document) + + min_n, max_n = self.ngram_range + ngrams = [] + + # bind method outside of loop to reduce overhead + ngrams_append = ngrams.append + + for w in text_document.split(): + w = " " + w + " " + w_len = len(w) + for n in range(min_n, max_n + 1): + offset = 0 + ngrams_append(w[offset : offset + n]) + while offset + n < w_len: + offset += 1 + ngrams_append(w[offset : offset + n]) + if offset == 0: # count a short word (w_len < n) only once + break + return ngrams + + def build_preprocessor(self): + """Return a function to preprocess the text before tokenization. 
+ + Returns + ------- + preprocessor: callable + A function to preprocess the text before tokenization. + """ + if self.preprocessor is not None: + return self.preprocessor + + # accent stripping + if not self.strip_accents: + strip_accents = None + elif callable(self.strip_accents): + strip_accents = self.strip_accents + elif self.strip_accents == "ascii": + strip_accents = strip_accents_ascii + elif self.strip_accents == "unicode": + strip_accents = strip_accents_unicode + else: + raise ValueError( + 'Invalid value for "strip_accents": %s' % self.strip_accents + ) + + return partial(_preprocess, accent_function=strip_accents, lower=self.lowercase) + + def build_tokenizer(self): + """Return a function that splits a string into a sequence of tokens. + + Returns + ------- + tokenizer: callable + A function to split a string into a sequence of tokens. + """ + if self.tokenizer is not None: + return self.tokenizer + token_pattern = re.compile(self.token_pattern) + + if token_pattern.groups > 1: + raise ValueError( + "More than 1 capturing group in token pattern. Only a single " + "group should be captured." + ) + + return token_pattern.findall + + def get_stop_words(self): + """Build or fetch the effective stop words list. + + Returns + ------- + stop_words: list or None + A list of stop words. + """ + return _check_stop_list(self.stop_words) + + def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): + """Check if stop words are consistent + + Returns + ------- + is_consistent : True if stop words are consistent with the preprocessor + and tokenizer, False if they are not, None if the check + was previously performed, "error" if it could not be + performed (e.g. because of the use of a custom + preprocessor / tokenizer) + """ + if id(self.stop_words) == getattr(self, "_stop_words_id", None): + # Stop words are were previously validated + return None + + # NB: stop_words is validated, unlike self.stop_words + try: + inconsistent = set() + for w in stop_words or (): + tokens = list(tokenize(preprocess(w))) + for token in tokens: + if token not in stop_words: + inconsistent.add(token) + self._stop_words_id = id(self.stop_words) + + if inconsistent: + warnings.warn( + "Your stop_words may be inconsistent with " + "your preprocessing. Tokenizing the stop " + "words generated tokens %r not in " + "stop_words." % sorted(inconsistent) + ) + return not inconsistent + except Exception: + # Failed to check stop words consistency (e.g. because a custom + # preprocessor or tokenizer was used) + self._stop_words_id = id(self.stop_words) + return "error" + + def build_analyzer(self): + """Return a callable to process input data. + + The callable handles preprocessing, tokenization, and n-grams generation. + + Returns + ------- + analyzer: callable + A function to handle preprocessing, tokenization + and n-grams generation. 
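As a quick sketch of the analyzer machinery above (word n-grams from _word_ngrams, whitespace-padded character n-grams from _char_wb_ngrams), the callable returned by build_analyzer can be applied directly to a string; the expected outputs in the comments assume the default token pattern and lowercasing:

from sklearn.feature_extraction.text import CountVectorizer

# Word analyzer with unigrams + bigrams.
analyze = CountVectorizer(ngram_range=(1, 2)).build_analyzer()
analyze("The quick fox")
# ['the', 'quick', 'fox', 'the quick', 'quick fox']

# 'char_wb' builds character n-grams only inside word boundaries and pads
# the word edges with spaces.
analyze_wb = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)).build_analyzer()
analyze_wb("fox")
# [' fo', 'fox', 'ox ']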
+ """ + + if callable(self.analyzer): + return partial(_analyze, analyzer=self.analyzer, decoder=self.decode) + + preprocess = self.build_preprocessor() + + if self.analyzer == "char": + return partial( + _analyze, + ngrams=self._char_ngrams, + preprocessor=preprocess, + decoder=self.decode, + ) + + elif self.analyzer == "char_wb": + return partial( + _analyze, + ngrams=self._char_wb_ngrams, + preprocessor=preprocess, + decoder=self.decode, + ) + + elif self.analyzer == "word": + stop_words = self.get_stop_words() + tokenize = self.build_tokenizer() + self._check_stop_words_consistency(stop_words, preprocess, tokenize) + return partial( + _analyze, + ngrams=self._word_ngrams, + tokenizer=tokenize, + preprocessor=preprocess, + decoder=self.decode, + stop_words=stop_words, + ) + + else: + raise ValueError( + "%s is not a valid tokenization scheme/analyzer" % self.analyzer + ) + + def _validate_vocabulary(self): + vocabulary = self.vocabulary + if vocabulary is not None: + if isinstance(vocabulary, set): + vocabulary = sorted(vocabulary) + if not isinstance(vocabulary, Mapping): + vocab = {} + for i, t in enumerate(vocabulary): + if vocab.setdefault(t, i) != i: + msg = "Duplicate term in vocabulary: %r" % t + raise ValueError(msg) + vocabulary = vocab + else: + indices = set(vocabulary.values()) + if len(indices) != len(vocabulary): + raise ValueError("Vocabulary contains repeated indices.") + for i in range(len(vocabulary)): + if i not in indices: + msg = "Vocabulary of size %d doesn't contain index %d." % ( + len(vocabulary), + i, + ) + raise ValueError(msg) + if not vocabulary: + raise ValueError("empty vocabulary passed to fit") + self.fixed_vocabulary_ = True + self.vocabulary_ = dict(vocabulary) + else: + self.fixed_vocabulary_ = False + + def _check_vocabulary(self): + """Check if vocabulary is empty or missing (not fitted)""" + if not hasattr(self, "vocabulary_"): + self._validate_vocabulary() + if not self.fixed_vocabulary_: + raise NotFittedError("Vocabulary not fitted or provided") + + if len(self.vocabulary_) == 0: + raise ValueError("Vocabulary is empty") + + def _validate_ngram_range(self): + """Check validity of ngram_range parameter""" + min_n, max_m = self.ngram_range + if min_n > max_m: + raise ValueError( + "Invalid value for ngram_range=%s " + "lower boundary larger than the upper boundary." 
% str(self.ngram_range) + ) + + def _warn_for_unused_params(self): + if self.tokenizer is not None and self.token_pattern is not None: + warnings.warn( + "The parameter 'token_pattern' will not be used" + " since 'tokenizer' is not None'" + ) + + if self.preprocessor is not None and callable(self.analyzer): + warnings.warn( + "The parameter 'preprocessor' will not be used" + " since 'analyzer' is callable'" + ) + + if ( + self.ngram_range != (1, 1) + and self.ngram_range is not None + and callable(self.analyzer) + ): + warnings.warn( + "The parameter 'ngram_range' will not be used" + " since 'analyzer' is callable'" + ) + if self.analyzer != "word" or callable(self.analyzer): + if self.stop_words is not None: + warnings.warn( + "The parameter 'stop_words' will not be used" + " since 'analyzer' != 'word'" + ) + if ( + self.token_pattern is not None + and self.token_pattern != r"(?u)\b\w\w+\b" + ): + warnings.warn( + "The parameter 'token_pattern' will not be used" + " since 'analyzer' != 'word'" + ) + if self.tokenizer is not None: + warnings.warn( + "The parameter 'tokenizer' will not be used" + " since 'analyzer' != 'word'" + ) + + +class HashingVectorizer( + TransformerMixin, _VectorizerMixin, BaseEstimator, auto_wrap_output_keys=None +): + r"""Convert a collection of text documents to a matrix of token occurrences. + + It turns a collection of text documents into a scipy.sparse matrix holding + token occurrence counts (or binary occurrence information), possibly + normalized as token frequencies if norm='l1' or projected on the euclidean + unit sphere if norm='l2'. + + This text vectorizer implementation uses the hashing trick to find the + token string name to feature integer index mapping. + + This strategy has several advantages: + + - it is very low memory scalable to large datasets as there is no need to + store a vocabulary dictionary in memory. + + - it is fast to pickle and un-pickle as it holds no state besides the + constructor parameters. + + - it can be used in a streaming (partial fit) or parallel pipeline as there + is no state computed during fit. + + There are also a couple of cons (vs using a CountVectorizer with an + in-memory vocabulary): + + - there is no way to compute the inverse transform (from feature indices to + string feature names) which can be a problem when trying to introspect + which features are most important to a model. + + - there can be collisions: distinct tokens can be mapped to the same + feature index. However in practice this is rarely an issue if n_features + is large enough (e.g. 2 ** 18 for text classification problems). + + - no IDF weighting as this would render the transformer stateful. + + The hash function employed is the signed 32-bit version of Murmurhash3. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + For an example of document clustering and comparison with + :class:`~sklearn.feature_extraction.text.TfidfVectorizer`, see + :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + input : {'filename', 'file', 'content'}, default='content' + - If `'filename'`, the sequence passed as an argument to fit is + expected to be a list of filenames that need reading to fetch + the raw content to analyze. + + - If `'file'`, the sequence items must have a 'read' method (file-like + object) that is called to fetch the bytes in memory. 
+ + - If `'content'`, the input is expected to be a sequence of items that + can be of type string or byte. + + encoding : str, default='utf-8' + If bytes or files are given to analyze, this encoding is used to + decode. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. By default, it is + 'strict', meaning that a UnicodeDecodeError will be raised. Other + values are 'ignore' and 'replace'. + + strip_accents : {'ascii', 'unicode'} or callable, default=None + Remove accents and perform other character normalization + during the preprocessing step. + 'ascii' is a fast method that only works on characters that have + a direct ASCII mapping. + 'unicode' is a slightly slower method that works on any character. + None (default) means no character normalization is performed. + + Both 'ascii' and 'unicode' use NFKD normalization from + :func:`unicodedata.normalize`. + + lowercase : bool, default=True + Convert all characters to lowercase before tokenizing. + + preprocessor : callable, default=None + Override the preprocessing (string transformation) stage while + preserving the tokenizing and n-grams generation steps. + Only applies if ``analyzer`` is not callable. + + tokenizer : callable, default=None + Override the string tokenization step while preserving the + preprocessing and n-grams generation steps. + Only applies if ``analyzer == 'word'``. + + stop_words : {'english'}, list, default=None + If 'english', a built-in stop word list for English is used. + There are several known issues with 'english' and you should + consider an alternative (see :ref:`stop_words`). + + If a list, that list is assumed to contain stop words, all of which + will be removed from the resulting tokens. + Only applies if ``analyzer == 'word'``. + + token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b" + Regular expression denoting what constitutes a "token", only used + if ``analyzer == 'word'``. The default regexp selects tokens of 2 + or more alphanumeric characters (punctuation is completely ignored + and always treated as a token separator). + + If there is a capturing group in token_pattern then the + captured group content, not the entire match, becomes the token. + At most one capturing group is permitted. + + ngram_range : tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different + n-grams to be extracted. All values of n such that min_n <= n <= max_n + will be used. For example an ``ngram_range`` of ``(1, 1)`` means only + unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means + only bigrams. + Only applies if ``analyzer`` is not callable. + + analyzer : {'word', 'char', 'char_wb'} or callable, default='word' + Whether the feature should be made of word or character n-grams. + Option 'char_wb' creates character n-grams only from text inside + word boundaries; n-grams at the edges of words are padded with space. + + If a callable is passed it is used to extract the sequence of features + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data + is first read from the file and then passed to the given callable + analyzer. + + n_features : int, default=(2 ** 20) + The number of features (columns) in the output matrices. 
Small numbers + of features are likely to cause hash collisions, but large numbers + will cause larger coefficient dimensions in linear learners. + + binary : bool, default=False + If True, all non zero counts are set to 1. This is useful for discrete + probabilistic models that model binary events rather than integer + counts. + + norm : {'l1', 'l2'}, default='l2' + Norm used to normalize term vectors. None for no normalization. + + alternate_sign : bool, default=True + When True, an alternating sign is added to the features as to + approximately conserve the inner product in the hashed space even for + small n_features. This approach is similar to sparse random projection. + + .. versionadded:: 0.19 + + dtype : type, default=np.float64 + Type of the matrix returned by fit_transform() or transform(). + + See Also + -------- + CountVectorizer : Convert a collection of text documents to a matrix of + token counts. + TfidfVectorizer : Convert a collection of raw documents to a matrix of + TF-IDF features. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.feature_extraction.text import HashingVectorizer + >>> corpus = [ + ... 'This is the first document.', + ... 'This document is the second document.', + ... 'And this is the third one.', + ... 'Is this the first document?', + ... ] + >>> vectorizer = HashingVectorizer(n_features=2**4) + >>> X = vectorizer.fit_transform(corpus) + >>> print(X.shape) + (4, 16) + """ + + _parameter_constraints: dict = { + "input": [StrOptions({"filename", "file", "content"})], + "encoding": [str], + "decode_error": [StrOptions({"strict", "ignore", "replace"})], + "strip_accents": [StrOptions({"ascii", "unicode"}), None, callable], + "lowercase": ["boolean"], + "preprocessor": [callable, None], + "tokenizer": [callable, None], + "stop_words": [StrOptions({"english"}), list, None], + "token_pattern": [str, None], + "ngram_range": [tuple], + "analyzer": [StrOptions({"word", "char", "char_wb"}), callable], + "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="left")], + "binary": ["boolean"], + "norm": [StrOptions({"l1", "l2"}), None], + "alternate_sign": ["boolean"], + "dtype": "no_validation", # delegate to numpy + } + + def __init__( + self, + *, + input="content", + encoding="utf-8", + decode_error="strict", + strip_accents=None, + lowercase=True, + preprocessor=None, + tokenizer=None, + stop_words=None, + token_pattern=r"(?u)\b\w\w+\b", + ngram_range=(1, 1), + analyzer="word", + n_features=(2**20), + binary=False, + norm="l2", + alternate_sign=True, + dtype=np.float64, + ): + self.input = input + self.encoding = encoding + self.decode_error = decode_error + self.strip_accents = strip_accents + self.preprocessor = preprocessor + self.tokenizer = tokenizer + self.analyzer = analyzer + self.lowercase = lowercase + self.token_pattern = token_pattern + self.stop_words = stop_words + self.n_features = n_features + self.ngram_range = ngram_range + self.binary = binary + self.norm = norm + self.alternate_sign = alternate_sign + self.dtype = dtype + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. 
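A minimal sketch of the statelessness described above: because the token-to-column mapping is a hash function rather than a learned vocabulary, transform can be called without fitting, and with alternate_sign=False and norm=None the matrix simply holds raw occurrence counts:

from sklearn.feature_extraction.text import HashingVectorizer

docs = ["the cat sat", "the dog sat"]
vec = HashingVectorizer(n_features=2**8, norm=None, alternate_sign=False)
X = vec.transform(docs)   # no fit needed; the estimator holds no learned state
X.shape                   # (2, 256)
X.sum()                   # 6.0 -- six token occurrences in total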
+ + Parameters + ---------- + X : ndarray of shape [n_samples, n_features] + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + HashingVectorizer instance. + """ + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : ndarray of shape [n_samples, n_features] + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + HashingVectorizer instance. + """ + # triggers a parameter validation + if isinstance(X, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + + self._warn_for_unused_params() + self._validate_ngram_range() + + self._get_hasher().fit(X, y=y) + return self + + def transform(self, X): + """Transform a sequence of documents to a document-term matrix. + + Parameters + ---------- + X : iterable over raw text documents, length = n_samples + Samples. Each sample must be a text document (either bytes or + unicode strings, file name or file object depending on the + constructor argument) which will be tokenized and hashed. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Document-term matrix. + """ + if isinstance(X, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + + self._validate_ngram_range() + + analyzer = self.build_analyzer() + X = self._get_hasher().transform(analyzer(doc) for doc in X) + if self.binary: + X.data.fill(1) + if self.norm is not None: + X = normalize(X, norm=self.norm, copy=False) + return X + + def fit_transform(self, X, y=None): + """Transform a sequence of documents to a document-term matrix. + + Parameters + ---------- + X : iterable over raw text documents, length = n_samples + Samples. Each sample must be a text document (either bytes or + unicode strings, file name or file object depending on the + constructor argument) which will be tokenized and hashed. + y : any + Ignored. This parameter exists only for compatibility with + sklearn.pipeline.Pipeline. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Document-term matrix. + """ + return self.fit(X, y).transform(X) + + def _get_hasher(self): + return FeatureHasher( + n_features=self.n_features, + input_type="string", + dtype=self.dtype, + alternate_sign=self.alternate_sign, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + tags.requires_fit = False + return tags + + +def _document_frequency(X): + """Count the number of non-zero values for each feature in sparse X.""" + if sp.issparse(X) and X.format == "csr": + return np.bincount(X.indices, minlength=X.shape[1]) + else: + return np.diff(X.indptr) + + +class CountVectorizer(_VectorizerMixin, BaseEstimator): + r"""Convert a collection of text documents to a matrix of token counts. + + This implementation produces a sparse representation of the counts using + scipy.sparse.csr_matrix. + + If you do not provide an a-priori dictionary and you do not use an analyzer + that does some kind of feature selection then the number of features will + be equal to the vocabulary size found by analyzing the data. 
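To illustrate the point above, a small sketch: with no a-priori vocabulary, the number of columns equals the number of distinct terms found by the default word analyzer (expected values in comments):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["to be or not to be", "that is the question"]
vec = CountVectorizer()
X = vec.fit_transform(docs)
len(vec.vocabulary_)   # 8 distinct terms of two or more characters
X.shape                # (2, 8) -- one column per vocabulary entry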
+ + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + input : {'filename', 'file', 'content'}, default='content' + - If `'filename'`, the sequence passed as an argument to fit is + expected to be a list of filenames that need reading to fetch + the raw content to analyze. + + - If `'file'`, the sequence items must have a 'read' method (file-like + object) that is called to fetch the bytes in memory. + + - If `'content'`, the input is expected to be a sequence of items that + can be of type string or byte. + + encoding : str, default='utf-8' + If bytes or files are given to analyze, this encoding is used to + decode. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. By default, it is + 'strict', meaning that a UnicodeDecodeError will be raised. Other + values are 'ignore' and 'replace'. + + strip_accents : {'ascii', 'unicode'} or callable, default=None + Remove accents and perform other character normalization + during the preprocessing step. + 'ascii' is a fast method that only works on characters that have + a direct ASCII mapping. + 'unicode' is a slightly slower method that works on any characters. + None (default) means no character normalization is performed. + + Both 'ascii' and 'unicode' use NFKD normalization from + :func:`unicodedata.normalize`. + + lowercase : bool, default=True + Convert all characters to lowercase before tokenizing. + + preprocessor : callable, default=None + Override the preprocessing (strip_accents and lowercase) stage while + preserving the tokenizing and n-grams generation steps. + Only applies if ``analyzer`` is not callable. + + tokenizer : callable, default=None + Override the string tokenization step while preserving the + preprocessing and n-grams generation steps. + Only applies if ``analyzer == 'word'``. + + stop_words : {'english'}, list, default=None + If 'english', a built-in stop word list for English is used. + There are several known issues with 'english' and you should + consider an alternative (see :ref:`stop_words`). + + If a list, that list is assumed to contain stop words, all of which + will be removed from the resulting tokens. + Only applies if ``analyzer == 'word'``. + + If None, no stop words will be used. In this case, setting `max_df` + to a higher value, such as in the range (0.7, 1.0), can automatically detect + and filter stop words based on intra corpus document frequency of terms. + + token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b" + Regular expression denoting what constitutes a "token", only used + if ``analyzer == 'word'``. The default regexp select tokens of 2 + or more alphanumeric characters (punctuation is completely ignored + and always treated as a token separator). + + If there is a capturing group in token_pattern then the + captured group content, not the entire match, becomes the token. + At most one capturing group is permitted. + + ngram_range : tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different + word n-grams or char n-grams to be extracted. All values of n such + such that min_n <= n <= max_n will be used. 
For example an + ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means + unigrams and bigrams, and ``(2, 2)`` means only bigrams. + Only applies if ``analyzer`` is not callable. + + analyzer : {'word', 'char', 'char_wb'} or callable, default='word' + Whether the feature should be made of word n-gram or character + n-grams. + Option 'char_wb' creates character n-grams only from text inside + word boundaries; n-grams at the edges of words are padded with space. + + If a callable is passed it is used to extract the sequence of features + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is + first read from the file and then passed to the given callable + analyzer. + + max_df : float in range [0.0, 1.0] or int, default=1.0 + When building the vocabulary ignore terms that have a document + frequency strictly higher than the given threshold (corpus-specific + stop words). + If float, the parameter represents a proportion of documents, integer + absolute counts. + This parameter is ignored if vocabulary is not None. + + min_df : float in range [0.0, 1.0] or int, default=1 + When building the vocabulary ignore terms that have a document + frequency strictly lower than the given threshold. This value is also + called cut-off in the literature. + If float, the parameter represents a proportion of documents, integer + absolute counts. + This parameter is ignored if vocabulary is not None. + + max_features : int, default=None + If not None, build a vocabulary that only consider the top + `max_features` ordered by term frequency across the corpus. + Otherwise, all features are used. + + This parameter is ignored if vocabulary is not None. + + vocabulary : Mapping or iterable, default=None + Either a Mapping (e.g., a dict) where keys are terms and values are + indices in the feature matrix, or an iterable over terms. If not + given, a vocabulary is determined from the input documents. Indices + in the mapping should not be repeated and should not have any gap + between 0 and the largest index. + + binary : bool, default=False + If True, all non zero counts are set to 1. This is useful for discrete + probabilistic models that model binary events rather than integer + counts. + + dtype : dtype, default=np.int64 + Type of the matrix returned by fit_transform() or transform(). + + Attributes + ---------- + vocabulary_ : dict + A mapping of terms to feature indices. + + fixed_vocabulary_ : bool + True if a fixed vocabulary of term to indices mapping + is provided by the user. + + See Also + -------- + HashingVectorizer : Convert a collection of text documents to a + matrix of token counts. + + TfidfVectorizer : Convert a collection of raw documents to a matrix + of TF-IDF features. + + Examples + -------- + >>> from sklearn.feature_extraction.text import CountVectorizer + >>> corpus = [ + ... 'This is the first document.', + ... 'This document is the second document.', + ... 'And this is the third one.', + ... 'Is this the first document?', + ... ] + >>> vectorizer = CountVectorizer() + >>> X = vectorizer.fit_transform(corpus) + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', + 'this'], ...) 
+ >>> print(X.toarray()) + [[0 1 1 1 0 0 1 0 1] + [0 2 0 1 0 1 1 0 1] + [1 0 0 1 1 0 1 1 1] + [0 1 1 1 0 0 1 0 1]] + >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) + >>> X2 = vectorizer2.fit_transform(corpus) + >>> vectorizer2.get_feature_names_out() + array(['and this', 'document is', 'first document', 'is the', 'is this', + 'second document', 'the first', 'the second', 'the third', 'third one', + 'this document', 'this is', 'this the'], ...) + >>> print(X2.toarray()) + [[0 0 1 1 0 0 1 0 0 0 0 1 0] + [0 1 0 1 0 1 0 1 0 0 1 0 0] + [1 0 0 1 0 0 0 0 1 1 0 1 0] + [0 0 1 0 1 0 1 0 0 0 0 0 1]] + """ + + # raw_documents should not be in the routing mechanism. It should have been + # called X in the first place. + __metadata_request__fit = {"raw_documents": metadata_routing.UNUSED} + __metadata_request__transform = {"raw_documents": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "input": [StrOptions({"filename", "file", "content"})], + "encoding": [str], + "decode_error": [StrOptions({"strict", "ignore", "replace"})], + "strip_accents": [StrOptions({"ascii", "unicode"}), None, callable], + "lowercase": ["boolean"], + "preprocessor": [callable, None], + "tokenizer": [callable, None], + "stop_words": [StrOptions({"english"}), list, None], + "token_pattern": [str, None], + "ngram_range": [tuple], + "analyzer": [StrOptions({"word", "char", "char_wb"}), callable], + "max_df": [ + Interval(RealNotInt, 0, 1, closed="both"), + Interval(Integral, 1, None, closed="left"), + ], + "min_df": [ + Interval(RealNotInt, 0, 1, closed="both"), + Interval(Integral, 1, None, closed="left"), + ], + "max_features": [Interval(Integral, 1, None, closed="left"), None], + "vocabulary": [Mapping, HasMethods("__iter__"), None], + "binary": ["boolean"], + "dtype": "no_validation", # delegate to numpy + } + + def __init__( + self, + *, + input="content", + encoding="utf-8", + decode_error="strict", + strip_accents=None, + lowercase=True, + preprocessor=None, + tokenizer=None, + stop_words=None, + token_pattern=r"(?u)\b\w\w+\b", + ngram_range=(1, 1), + analyzer="word", + max_df=1.0, + min_df=1, + max_features=None, + vocabulary=None, + binary=False, + dtype=np.int64, + ): + self.input = input + self.encoding = encoding + self.decode_error = decode_error + self.strip_accents = strip_accents + self.preprocessor = preprocessor + self.tokenizer = tokenizer + self.analyzer = analyzer + self.lowercase = lowercase + self.token_pattern = token_pattern + self.stop_words = stop_words + self.max_df = max_df + self.min_df = min_df + self.max_features = max_features + self.ngram_range = ngram_range + self.vocabulary = vocabulary + self.binary = binary + self.dtype = dtype + + def _sort_features(self, X, vocabulary): + """Sort features by name + + Returns a reordered matrix and modifies the vocabulary in place + """ + sorted_features = sorted(vocabulary.items()) + map_index = np.empty(len(sorted_features), dtype=X.indices.dtype) + for new_val, (term, old_val) in enumerate(sorted_features): + vocabulary[term] = new_val + map_index[old_val] = new_val + + X.indices = map_index.take(X.indices, mode="clip") + return X + + def _limit_features(self, X, vocabulary, high=None, low=None, limit=None): + """Remove too rare or too common features. + + Prune features that are non zero in more samples than high or less + documents than low, modifying the vocabulary, and restricting it to + at most the limit most frequent. + + This does not prune samples with zero features. 
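A small sketch of the document-frequency pruning that _limit_features implements, driven by the min_df / max_df parameters described earlier (expected output in the comment):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["red apple", "red banana", "red cherry", "blue grape"]
vec = CountVectorizer(min_df=2, max_df=3).fit(docs)
# Only 'red' appears in at least 2 and at most 3 of the 4 documents;
# every other term is pruned from the vocabulary.
vec.get_feature_names_out()   # array(['red'], ...)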
+ """ + if high is None and low is None and limit is None: + return X, set() + + # Calculate a mask based on document frequencies + dfs = _document_frequency(X) + mask = np.ones(len(dfs), dtype=bool) + if high is not None: + mask &= dfs <= high + if low is not None: + mask &= dfs >= low + if limit is not None and mask.sum() > limit: + tfs = np.asarray(X.sum(axis=0)).ravel() + mask_inds = (-tfs[mask]).argsort()[:limit] + new_mask = np.zeros(len(dfs), dtype=bool) + new_mask[np.where(mask)[0][mask_inds]] = True + mask = new_mask + + new_indices = np.cumsum(mask) - 1 # maps old indices to new + for term, old_index in list(vocabulary.items()): + if mask[old_index]: + vocabulary[term] = new_indices[old_index] + else: + del vocabulary[term] + kept_indices = np.where(mask)[0] + if len(kept_indices) == 0: + raise ValueError( + "After pruning, no terms remain. Try a lower min_df or a higher max_df." + ) + return X[:, kept_indices] + + def _count_vocab(self, raw_documents, fixed_vocab): + """Create sparse feature matrix, and vocabulary where fixed_vocab=False""" + if fixed_vocab: + vocabulary = self.vocabulary_ + else: + # Add a new value when a new vocabulary item is seen + vocabulary = defaultdict() + vocabulary.default_factory = vocabulary.__len__ + + analyze = self.build_analyzer() + j_indices = [] + indptr = [] + + values = _make_int_array() + indptr.append(0) + for doc in raw_documents: + feature_counter = {} + for feature in analyze(doc): + try: + feature_idx = vocabulary[feature] + if feature_idx not in feature_counter: + feature_counter[feature_idx] = 1 + else: + feature_counter[feature_idx] += 1 + except KeyError: + # Ignore out-of-vocabulary items for fixed_vocab=True + continue + + j_indices.extend(feature_counter.keys()) + values.extend(feature_counter.values()) + indptr.append(len(j_indices)) + + if not fixed_vocab: + # disable defaultdict behaviour + vocabulary = dict(vocabulary) + if not vocabulary: + raise ValueError( + "empty vocabulary; perhaps the documents only contain stop words" + ) + + if indptr[-1] > np.iinfo(np.int32).max: # = 2**31 - 1 + if _IS_32BIT: + raise ValueError( + ( + "sparse CSR array has {} non-zero " + "elements and requires 64 bit indexing, " + "which is unsupported with 32 bit Python." + ).format(indptr[-1]) + ) + indices_dtype = np.int64 + + else: + indices_dtype = np.int32 + j_indices = np.asarray(j_indices, dtype=indices_dtype) + indptr = np.asarray(indptr, dtype=indices_dtype) + values = np.frombuffer(values, dtype=np.intc) + + X = sp.csr_matrix( + (values, j_indices, indptr), + shape=(len(indptr) - 1, len(vocabulary)), + dtype=self.dtype, + ) + X.sort_indices() + return vocabulary, X + + def fit(self, raw_documents, y=None): + """Learn a vocabulary dictionary of all tokens in the raw documents. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is ignored. + + Returns + ------- + self : object + Fitted vectorizer. + """ + self.fit_transform(raw_documents) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, raw_documents, y=None): + """Learn the vocabulary dictionary and return document-term matrix. + + This is equivalent to fit followed by transform, but more efficiently + implemented. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is ignored. 
+ + Returns + ------- + X : array of shape (n_samples, n_features) + Document-term matrix. + """ + # We intentionally don't call the transform method to make + # fit_transform overridable without unwanted side effects in + # TfidfVectorizer. + if isinstance(raw_documents, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + + self._validate_ngram_range() + self._warn_for_unused_params() + self._validate_vocabulary() + max_df = self.max_df + min_df = self.min_df + max_features = self.max_features + + if self.fixed_vocabulary_ and self.lowercase: + for term in self.vocabulary: + if any(map(str.isupper, term)): + warnings.warn( + "Upper case characters found in" + " vocabulary while 'lowercase'" + " is True. These entries will not" + " be matched with any documents" + ) + break + + vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_) + + if self.binary: + X.data.fill(1) + + if not self.fixed_vocabulary_: + n_doc = X.shape[0] + max_doc_count = max_df if isinstance(max_df, Integral) else max_df * n_doc + min_doc_count = min_df if isinstance(min_df, Integral) else min_df * n_doc + if max_doc_count < min_doc_count: + raise ValueError("max_df corresponds to < documents than min_df") + if max_features is not None: + X = self._sort_features(X, vocabulary) + X = self._limit_features( + X, vocabulary, max_doc_count, min_doc_count, max_features + ) + if max_features is None: + X = self._sort_features(X, vocabulary) + self.vocabulary_ = vocabulary + + return X + + def transform(self, raw_documents): + """Transform documents to document-term matrix. + + Extract token counts out of raw text documents using the vocabulary + fitted with fit or the one provided to the constructor. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + Returns + ------- + X : sparse matrix of shape (n_samples, n_features) + Document-term matrix. + """ + if isinstance(raw_documents, str): + raise ValueError( + "Iterable over raw text documents expected, string object received." + ) + self._check_vocabulary() + + # use the same matrix-building strategy as fit_transform + _, X = self._count_vocab(raw_documents, fixed_vocab=True) + if self.binary: + X.data.fill(1) + return X + + def inverse_transform(self, X): + """Return terms per document with nonzero entries in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document-term matrix. + + Returns + ------- + X_original : list of arrays of shape (n_samples,) + List of arrays of terms. + """ + self._check_vocabulary() + # We need CSR format for fast row manipulations. + X = check_array(X, accept_sparse="csr") + n_samples = X.shape[0] + + terms = np.array(list(self.vocabulary_.keys())) + indices = np.array(list(self.vocabulary_.values())) + inverse_vocabulary = terms[np.argsort(indices)] + + if sp.issparse(X): + return [ + inverse_vocabulary[X[i, :].nonzero()[1]].ravel() + for i in range(n_samples) + ] + else: + return [ + inverse_vocabulary[np.flatnonzero(X[i, :])].ravel() + for i in range(n_samples) + ] + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
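A short sketch of the two introspection helpers above on a fitted CountVectorizer; get_feature_names_out returns terms in column order, and inverse_transform recovers the distinct terms present in each row (expected outputs in comments):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["cat dog", "dog bird"]
vec = CountVectorizer()
X = vec.fit_transform(docs)
vec.get_feature_names_out()   # array(['bird', 'cat', 'dog'], ...)
vec.inverse_transform(X)      # [array(['cat', 'dog'], ...), array(['bird', 'dog'], ...)]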
+ """ + self._check_vocabulary() + return np.asarray( + [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))], + dtype=object, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + return tags + + +def _make_int_array(): + """Construct an array.array of a type suitable for scipy.sparse indices.""" + return array.array(str("i")) + + +class TfidfTransformer( + OneToOneFeatureMixin, TransformerMixin, BaseEstimator, auto_wrap_output_keys=None +): + """Transform a count matrix to a normalized tf or tf-idf representation. + + Tf means term-frequency while tf-idf means term-frequency times inverse + document-frequency. This is a common term weighting scheme in information + retrieval, that has also found good use in document classification. + + The goal of using tf-idf instead of the raw frequencies of occurrence of a + token in a given document is to scale down the impact of tokens that occur + very frequently in a given corpus and that are hence empirically less + informative than features that occur in a small fraction of the training + corpus. + + The formula that is used to compute the tf-idf for a term t of a document d + in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is + computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where + n is the total number of documents in the document set and df(t) is the + document frequency of t; the document frequency is the number of documents + in the document set that contain the term t. The effect of adding "1" to + the idf in the equation above is that terms with zero idf, i.e., terms + that occur in all documents in a training set, will not be entirely + ignored. + (Note that the idf formula above differs from the standard textbook + notation that defines the idf as + idf(t) = log [ n / (df(t) + 1) ]). + + If ``smooth_idf=True`` (the default), the constant "1" is added to the + numerator and denominator of the idf as if an extra document was seen + containing every term in the collection exactly once, which prevents + zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1. + + Furthermore, the formulas used to compute tf and idf depend + on parameter settings that correspond to the SMART notation used in IR + as follows: + + Tf is "n" (natural) by default, "l" (logarithmic) when + ``sublinear_tf=True``. + Idf is "t" when use_idf is given, "n" (none) otherwise. + Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) + when ``norm=None``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + norm : {'l1', 'l2'} or None, default='l2' + Each output row will have unit norm, either: + + - 'l2': Sum of squares of vector elements is 1. The cosine + similarity between two vectors is their dot product when l2 norm has + been applied. + - 'l1': Sum of absolute values of vector elements is 1. + See :func:`~sklearn.preprocessing.normalize`. + - None: No normalization. + + use_idf : bool, default=True + Enable inverse-document-frequency reweighting. If False, idf(t) = 1. + + smooth_idf : bool, default=True + Smooth idf weights by adding one to document frequencies, as if an + extra document was seen containing every term in the collection + exactly once. Prevents zero divisions. + + sublinear_tf : bool, default=False + Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 
+ + Attributes + ---------- + idf_ : array of shape (n_features) + The inverse document frequency (IDF) vector; only defined + if ``use_idf`` is True. + + .. versionadded:: 0.20 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + CountVectorizer : Transforms text into a sparse matrix of n-gram counts. + + TfidfVectorizer : Convert a collection of raw documents to a matrix of + TF-IDF features. + + HashingVectorizer : Convert a collection of text documents to a matrix + of token occurrences. + + References + ---------- + .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern + Information Retrieval. Addison Wesley, pp. 68-74. + + .. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008). + Introduction to Information Retrieval. Cambridge University + Press, pp. 118-120. + + Examples + -------- + >>> from sklearn.feature_extraction.text import TfidfTransformer + >>> from sklearn.feature_extraction.text import CountVectorizer + >>> from sklearn.pipeline import Pipeline + >>> corpus = ['this is the first document', + ... 'this document is the second document', + ... 'and this is the third one', + ... 'is this the first document'] + >>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the', + ... 'and', 'one'] + >>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)), + ... ('tfid', TfidfTransformer())]).fit(corpus) + >>> pipe['count'].transform(corpus).toarray() + array([[1, 1, 1, 1, 0, 1, 0, 0], + [1, 2, 0, 1, 1, 1, 0, 0], + [1, 0, 0, 1, 0, 1, 1, 1], + [1, 1, 1, 1, 0, 1, 0, 0]]) + >>> pipe['tfid'].idf_ + array([1. , 1.22314355, 1.51082562, 1. , 1.91629073, + 1. , 1.91629073, 1.91629073]) + >>> pipe.transform(corpus).shape + (4, 8) + """ + + _parameter_constraints: dict = { + "norm": [StrOptions({"l1", "l2"}), None], + "use_idf": ["boolean"], + "smooth_idf": ["boolean"], + "sublinear_tf": ["boolean"], + } + + def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False): + self.norm = norm + self.use_idf = use_idf + self.smooth_idf = smooth_idf + self.sublinear_tf = sublinear_tf + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn the idf vector (global term weights). + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + A matrix of term/token counts. + + y : None + This parameter is not needed to compute tf-idf. + + Returns + ------- + self : object + Fitted transformer. + """ + # large sparse data is not supported for 32bit platforms because + # _document_frequency uses np.bincount which works on arrays of + # dtype NPY_INTP which is int32 for 32bit platforms. See #20923 + X = validate_data( + self, X, accept_sparse=("csr", "csc"), accept_large_sparse=not _IS_32BIT + ) + if not sp.issparse(X): + X = sp.csr_matrix(X) + dtype = X.dtype if X.dtype in (np.float64, np.float32) else np.float64 + + if self.use_idf: + n_samples, _ = X.shape + df = _document_frequency(X) + df = df.astype(dtype, copy=False) + + # perform idf smoothing if required + df += float(self.smooth_idf) + n_samples += int(self.smooth_idf) + + # log+1 instead of log makes sure terms with zero idf don't get + # suppressed entirely. + # Force the dtype of `idf_` to be the same as `df`. 
In NumPy < 2, the dtype + # was depending on the value of `n_samples`. + self.idf_ = np.full_like(df, fill_value=n_samples, dtype=dtype) + self.idf_ /= df + # `np.log` preserves the dtype of `df` and thus `dtype`. + np.log(self.idf_, out=self.idf_) + self.idf_ += 1.0 + + return self + + def transform(self, X, copy=True): + """Transform a count matrix to a tf or tf-idf representation. + + Parameters + ---------- + X : sparse matrix of (n_samples, n_features) + A matrix of term/token counts. + + copy : bool, default=True + Whether to copy X and operate on the copy or perform in-place + operations. `copy=False` will only be effective with CSR sparse matrix. + + Returns + ------- + vectors : sparse matrix of shape (n_samples, n_features) + Tf-idf-weighted document-term matrix. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + copy=copy, + reset=False, + ) + if not sp.issparse(X): + X = sp.csr_matrix(X, dtype=X.dtype) + + if self.sublinear_tf: + np.log(X.data, X.data) + X.data += 1.0 + + if hasattr(self, "idf_"): + # the columns of X (CSR matrix) can be accessed with `X.indices `and + # multiplied with the corresponding `idf` value + X.data *= self.idf_[X.indices] + + if self.norm is not None: + X = normalize(X, norm=self.norm, copy=False) + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + # FIXME: np.float16 could be preserved if _inplace_csr_row_normalize_l2 + # accepted it. + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class TfidfVectorizer(CountVectorizer): + r"""Convert a collection of raw documents to a matrix of TF-IDF features. + + Equivalent to :class:`CountVectorizer` followed by + :class:`TfidfTransformer`. + + For an example of usage, see + :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`. + + For an efficiency comparison of the different feature extractors, see + :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`. + + For an example of document clustering and comparison with + :class:`~sklearn.feature_extraction.text.HashingVectorizer`, see + :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + input : {'filename', 'file', 'content'}, default='content' + - If `'filename'`, the sequence passed as an argument to fit is + expected to be a list of filenames that need reading to fetch + the raw content to analyze. + + - If `'file'`, the sequence items must have a 'read' method (file-like + object) that is called to fetch the bytes in memory. + + - If `'content'`, the input is expected to be a sequence of items that + can be of type string or byte. + + encoding : str, default='utf-8' + If bytes or files are given to analyze, this encoding is used to + decode. + + decode_error : {'strict', 'ignore', 'replace'}, default='strict' + Instruction on what to do if a byte sequence is given to analyze that + contains characters not of the given `encoding`. By default, it is + 'strict', meaning that a UnicodeDecodeError will be raised. Other + values are 'ignore' and 'replace'. + + strip_accents : {'ascii', 'unicode'} or callable, default=None + Remove accents and perform other character normalization + during the preprocessing step. + 'ascii' is a fast method that only works on characters that have + a direct ASCII mapping. + 'unicode' is a slightly slower method that works on any characters. 
+ None (default) means no character normalization is performed. + + Both 'ascii' and 'unicode' use NFKD normalization from + :func:`unicodedata.normalize`. + + lowercase : bool, default=True + Convert all characters to lowercase before tokenizing. + + preprocessor : callable, default=None + Override the preprocessing (string transformation) stage while + preserving the tokenizing and n-grams generation steps. + Only applies if ``analyzer`` is not callable. + + tokenizer : callable, default=None + Override the string tokenization step while preserving the + preprocessing and n-grams generation steps. + Only applies if ``analyzer == 'word'``. + + analyzer : {'word', 'char', 'char_wb'} or callable, default='word' + Whether the feature should be made of word or character n-grams. + Option 'char_wb' creates character n-grams only from text inside + word boundaries; n-grams at the edges of words are padded with space. + + If a callable is passed it is used to extract the sequence of features + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data + is first read from the file and then passed to the given callable + analyzer. + + stop_words : {'english'}, list, default=None + If a string, it is passed to _check_stop_list and the appropriate stop + list is returned. 'english' is currently the only supported string + value. + There are several known issues with 'english' and you should + consider an alternative (see :ref:`stop_words`). + + If a list, that list is assumed to contain stop words, all of which + will be removed from the resulting tokens. + Only applies if ``analyzer == 'word'``. + + If None, no stop words will be used. In this case, setting `max_df` + to a higher value, such as in the range (0.7, 1.0), can automatically detect + and filter stop words based on intra corpus document frequency of terms. + + token_pattern : str, default=r"(?u)\\b\\w\\w+\\b" + Regular expression denoting what constitutes a "token", only used + if ``analyzer == 'word'``. The default regexp selects tokens of 2 + or more alphanumeric characters (punctuation is completely ignored + and always treated as a token separator). + + If there is a capturing group in token_pattern then the + captured group content, not the entire match, becomes the token. + At most one capturing group is permitted. + + ngram_range : tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different + n-grams to be extracted. All values of n such that min_n <= n <= max_n + will be used. For example an ``ngram_range`` of ``(1, 1)`` means only + unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means + only bigrams. + Only applies if ``analyzer`` is not callable. + + max_df : float or int, default=1.0 + When building the vocabulary ignore terms that have a document + frequency strictly higher than the given threshold (corpus-specific + stop words). + If float in range [0.0, 1.0], the parameter represents a proportion of + documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. + + min_df : float or int, default=1 + When building the vocabulary ignore terms that have a document + frequency strictly lower than the given threshold. This value is also + called cut-off in the literature. + If float in range of [0.0, 1.0], the parameter represents a proportion + of documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. 
+ + max_features : int, default=None + If not None, build a vocabulary that only consider the top + `max_features` ordered by term frequency across the corpus. + Otherwise, all features are used. + + This parameter is ignored if vocabulary is not None. + + vocabulary : Mapping or iterable, default=None + Either a Mapping (e.g., a dict) where keys are terms and values are + indices in the feature matrix, or an iterable over terms. If not + given, a vocabulary is determined from the input documents. + + binary : bool, default=False + If True, all non-zero term counts are set to 1. This does not mean + outputs will have only 0/1 values, only that the tf term in tf-idf + is binary. (Set `binary` to True, `use_idf` to False and + `norm` to None to get 0/1 outputs). + + dtype : dtype, default=float64 + Type of the matrix returned by fit_transform() or transform(). + + norm : {'l1', 'l2'} or None, default='l2' + Each output row will have unit norm, either: + + - 'l2': Sum of squares of vector elements is 1. The cosine + similarity between two vectors is their dot product when l2 norm has + been applied. + - 'l1': Sum of absolute values of vector elements is 1. + See :func:`~sklearn.preprocessing.normalize`. + - None: No normalization. + + use_idf : bool, default=True + Enable inverse-document-frequency reweighting. If False, idf(t) = 1. + + smooth_idf : bool, default=True + Smooth idf weights by adding one to document frequencies, as if an + extra document was seen containing every term in the collection + exactly once. Prevents zero divisions. + + sublinear_tf : bool, default=False + Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). + + Attributes + ---------- + vocabulary_ : dict + A mapping of terms to feature indices. + + fixed_vocabulary_ : bool + True if a fixed vocabulary of term to indices mapping + is provided by the user. + + idf_ : array of shape (n_features,) + The inverse document frequency (IDF) vector; only defined + if ``use_idf`` is True. + + See Also + -------- + CountVectorizer : Transforms text into a sparse matrix of n-gram counts. + + TfidfTransformer : Performs the TF-IDF transformation from a provided + matrix of counts. + + Examples + -------- + >>> from sklearn.feature_extraction.text import TfidfVectorizer + >>> corpus = [ + ... 'This is the first document.', + ... 'This document is the second document.', + ... 'And this is the third one.', + ... 'Is this the first document?', + ... ] + >>> vectorizer = TfidfVectorizer() + >>> X = vectorizer.fit_transform(corpus) + >>> vectorizer.get_feature_names_out() + array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', + 'this'], ...) 
+ >>> print(X.shape) + (4, 9) + """ + + _parameter_constraints: dict = {**CountVectorizer._parameter_constraints} + _parameter_constraints.update( + { + "norm": [StrOptions({"l1", "l2"}), None], + "use_idf": ["boolean"], + "smooth_idf": ["boolean"], + "sublinear_tf": ["boolean"], + } + ) + + def __init__( + self, + *, + input="content", + encoding="utf-8", + decode_error="strict", + strip_accents=None, + lowercase=True, + preprocessor=None, + tokenizer=None, + analyzer="word", + stop_words=None, + token_pattern=r"(?u)\b\w\w+\b", + ngram_range=(1, 1), + max_df=1.0, + min_df=1, + max_features=None, + vocabulary=None, + binary=False, + dtype=np.float64, + norm="l2", + use_idf=True, + smooth_idf=True, + sublinear_tf=False, + ): + super().__init__( + input=input, + encoding=encoding, + decode_error=decode_error, + strip_accents=strip_accents, + lowercase=lowercase, + preprocessor=preprocessor, + tokenizer=tokenizer, + analyzer=analyzer, + stop_words=stop_words, + token_pattern=token_pattern, + ngram_range=ngram_range, + max_df=max_df, + min_df=min_df, + max_features=max_features, + vocabulary=vocabulary, + binary=binary, + dtype=dtype, + ) + self.norm = norm + self.use_idf = use_idf + self.smooth_idf = smooth_idf + self.sublinear_tf = sublinear_tf + + # Broadcast the TF-IDF parameters to the underlying transformer instance + # for easy grid search and repr + + @property + def idf_(self): + """Inverse document frequency vector, only defined if `use_idf=True`. + + Returns + ------- + ndarray of shape (n_features,) + """ + if not hasattr(self, "_tfidf"): + raise NotFittedError( + f"{self.__class__.__name__} is not fitted yet. Call 'fit' with " + "appropriate arguments before using this attribute." + ) + return self._tfidf.idf_ + + @idf_.setter + def idf_(self, value): + if not self.use_idf: + raise ValueError("`idf_` cannot be set when `user_idf=False`.") + if not hasattr(self, "_tfidf"): + # We should support transferring `idf_` from another `TfidfTransformer` + # and therefore, we need to create the transformer instance it does not + # exist yet. + self._tfidf = TfidfTransformer( + norm=self.norm, + use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf, + ) + self._validate_vocabulary() + if hasattr(self, "vocabulary_"): + if len(self.vocabulary_) != len(value): + raise ValueError( + "idf length = %d must be equal to vocabulary size = %d" + % (len(value), len(self.vocabulary)) + ) + self._tfidf.idf_ = value + + def _check_params(self): + if self.dtype not in FLOAT_DTYPES: + warnings.warn( + "Only {} 'dtype' should be used. {} 'dtype' will " + "be converted to np.float64.".format(FLOAT_DTYPES, self.dtype), + UserWarning, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, raw_documents, y=None): + """Learn vocabulary and idf from training set. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is not needed to compute tfidf. + + Returns + ------- + self : object + Fitted vectorizer. + """ + self._check_params() + self._warn_for_unused_params() + self._tfidf = TfidfTransformer( + norm=self.norm, + use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf, + ) + X = super().fit_transform(raw_documents) + self._tfidf.fit(X) + return self + + def fit_transform(self, raw_documents, y=None): + """Learn vocabulary and idf, return document-term matrix. 
+ + This is equivalent to fit followed by transform, but more efficiently + implemented. + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + y : None + This parameter is ignored. + + Returns + ------- + X : sparse matrix of (n_samples, n_features) + Tf-idf-weighted document-term matrix. + """ + self._check_params() + self._tfidf = TfidfTransformer( + norm=self.norm, + use_idf=self.use_idf, + smooth_idf=self.smooth_idf, + sublinear_tf=self.sublinear_tf, + ) + X = super().fit_transform(raw_documents) + self._tfidf.fit(X) + # X is already a transformed view of raw_documents so + # we set copy to False + return self._tfidf.transform(X, copy=False) + + def transform(self, raw_documents): + """Transform documents to document-term matrix. + + Uses the vocabulary and document frequencies (df) learned by fit (or + fit_transform). + + Parameters + ---------- + raw_documents : iterable + An iterable which generates either str, unicode or file objects. + + Returns + ------- + X : sparse matrix of (n_samples, n_features) + Tf-idf-weighted document-term matrix. + """ + check_is_fitted(self, msg="The TF-IDF vectorizer is not fitted") + + X = super().transform(raw_documents) + return self._tfidf.transform(X, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.string = True + tags.input_tags.two_d_array = False + tags._skip_test = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0d2dcee909f4741d7ba79093812118dd14459d8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/__init__.py @@ -0,0 +1,50 @@ +"""Feature selection algorithms. + +These include univariate filter selection methods and the recursive feature elimination +algorithm. 
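As a minimal sketch of the univariate filter methods this module exposes (using the public SelectKBest and f_classif imported below; the iris dataset is only an illustrative choice):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_iris(return_X_y=True)
# Keep the two features with the highest ANOVA F-statistic.
SelectKBest(f_classif, k=2).fit_transform(X, y).shape   # (150, 2)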
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._base import SelectorMixin +from ._from_model import SelectFromModel +from ._mutual_info import mutual_info_classif, mutual_info_regression +from ._rfe import RFE, RFECV +from ._sequential import SequentialFeatureSelector +from ._univariate_selection import ( + GenericUnivariateSelect, + SelectFdr, + SelectFpr, + SelectFwe, + SelectKBest, + SelectPercentile, + chi2, + f_classif, + f_oneway, + f_regression, + r_regression, +) +from ._variance_threshold import VarianceThreshold + +__all__ = [ + "RFE", + "RFECV", + "GenericUnivariateSelect", + "SelectFdr", + "SelectFpr", + "SelectFromModel", + "SelectFwe", + "SelectKBest", + "SelectPercentile", + "SelectorMixin", + "SequentialFeatureSelector", + "VarianceThreshold", + "chi2", + "f_classif", + "f_oneway", + "f_regression", + "mutual_info_classif", + "mutual_info_regression", + "r_regression", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_base.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..56e50e49ca30c6970366b1c7799dcca46deef859 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_base.py @@ -0,0 +1,267 @@ +"""Generic feature selection mixin""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from operator import attrgetter + +import numpy as np +from scipy.sparse import csc_matrix, issparse + +from ..base import TransformerMixin +from ..utils import _safe_indexing, check_array, safe_sqr +from ..utils._set_output import _get_output_config +from ..utils._tags import get_tags +from ..utils.validation import ( + _check_feature_names_in, + _is_pandas_df, + check_is_fitted, + validate_data, +) + + +class SelectorMixin(TransformerMixin, metaclass=ABCMeta): + """ + Transformer mixin that performs feature selection given a support mask + + This mixin provides a feature selector implementation with `transform` and + `inverse_transform` functionality given an implementation of + `_get_support_mask`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> from sklearn.base import BaseEstimator + >>> from sklearn.feature_selection import SelectorMixin + >>> class FeatureSelector(SelectorMixin, BaseEstimator): + ... def fit(self, X, y=None): + ... self.n_features_in_ = X.shape[1] + ... return self + ... def _get_support_mask(self): + ... mask = np.zeros(self.n_features_in_, dtype=bool) + ... mask[:2] = True # select the first two features + ... return mask + >>> X, y = load_iris(return_X_y=True) + >>> FeatureSelector().fit_transform(X, y).shape + (150, 2) + """ + + def get_support(self, indices=False): + """ + Get a mask, or integer index, of the features selected. + + Parameters + ---------- + indices : bool, default=False + If True, the return value will be an array of integers, rather + than a boolean mask. + + Returns + ------- + support : array + An index that selects the retained features from a feature vector. + If `indices` is False, this is a boolean array of shape + [# input features], in which an element is True iff its + corresponding feature is selected for retention. If `indices` is + True, this is an integer array of shape [# output features] whose + values are indices into the input feature vector. 
+ """ + mask = self._get_support_mask() + return mask if not indices else np.nonzero(mask)[0] + + @abstractmethod + def _get_support_mask(self): + """ + Get the boolean mask indicating which features are selected + + Returns + ------- + support : boolean array of shape [# input features] + An element is True iff its corresponding feature is selected for + retention. + """ + + def transform(self, X): + """Reduce X to the selected features. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + Returns + ------- + X_r : array of shape [n_samples, n_selected_features] + The input samples with only the selected features. + """ + # Preserve X when X is a dataframe and the output is configured to + # be pandas. + output_config_dense = _get_output_config("transform", estimator=self)["dense"] + preserve_X = output_config_dense != "default" and _is_pandas_df(X) + + # note: we use get_tags instead of __sklearn_tags__ because this is a + # public Mixin. + X = validate_data( + self, + X, + dtype=None, + accept_sparse="csr", + ensure_all_finite=not get_tags(self).input_tags.allow_nan, + skip_check_array=preserve_X, + reset=False, + ) + return self._transform(X) + + def _transform(self, X): + """Reduce X to the selected features.""" + mask = self.get_support() + if not mask.any(): + warnings.warn( + ( + "No features were selected: either the data is" + " too noisy or the selection test too strict." + ), + UserWarning, + ) + if hasattr(X, "iloc"): + return X.iloc[:, :0] + return np.empty(0, dtype=X.dtype).reshape((X.shape[0], 0)) + return _safe_indexing(X, mask, axis=1) + + def inverse_transform(self, X): + """Reverse the transformation operation. + + Parameters + ---------- + X : array of shape [n_samples, n_selected_features] + The input samples. + + Returns + ------- + X_original : array of shape [n_samples, n_original_features] + `X` with columns of zeros inserted where features would have + been removed by :meth:`transform`. + """ + if issparse(X): + X = X.tocsc() + # insert additional entries in indptr: + # e.g. if transform changed indptr from [0 2 6 7] to [0 2 3] + # col_nonzeros here will be [2 0 1] so indptr becomes [0 2 2 3] + it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1)) + col_nonzeros = it.ravel() + indptr = np.concatenate([[0], np.cumsum(col_nonzeros)]) + Xt = csc_matrix( + (X.data, X.indices, indptr), + shape=(X.shape[0], len(indptr) - 1), + dtype=X.dtype, + ) + return Xt + + support = self.get_support() + X = check_array(X, dtype=None) + if support.sum() != X.shape[1]: + raise ValueError("X has a different shape than during fitting.") + + if X.ndim == 1: + X = X[None, :] + Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype) + Xt[:, support] = X + return Xt + + def get_feature_names_out(self, input_features=None): + """Mask feature names according to selected features. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + return input_features[self.get_support()] + + +def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1): + """ + Retrieve and aggregate (ndim > 1) the feature importances + from an estimator. Also optionally applies transformation. + + Parameters + ---------- + estimator : estimator + A scikit-learn estimator from which we want to get the feature + importances. + + getter : "auto", str or callable + An attribute or a callable to get the feature importance. If `"auto"`, + `estimator` is expected to expose `coef_` or `feature_importances`. + + transform_func : {"norm", "square"}, default=None + The transform to apply to the feature importances. By default (`None`) + no transformation is applied. + + norm_order : int, default=1 + The norm order to apply when `transform_func="norm"`. Only applied + when `importances.ndim > 1`. + + Returns + ------- + importances : ndarray of shape (n_features,) + The features importances, optionally transformed. + """ + if isinstance(getter, str): + if getter == "auto": + if hasattr(estimator, "coef_"): + getter = attrgetter("coef_") + elif hasattr(estimator, "feature_importances_"): + getter = attrgetter("feature_importances_") + else: + raise ValueError( + "when `importance_getter=='auto'`, the underlying " + f"estimator {estimator.__class__.__name__} should have " + "`coef_` or `feature_importances_` attribute. Either " + "pass a fitted estimator to feature selector or call fit " + "before calling transform." + ) + else: + getter = attrgetter(getter) + elif not callable(getter): + raise ValueError("`importance_getter` has to be a string or `callable`") + + importances = getter(estimator) + + if transform_func is None: + return importances + elif transform_func == "norm": + if importances.ndim == 1: + importances = np.abs(importances) + else: + importances = np.linalg.norm(importances, axis=0, ord=norm_order) + elif transform_func == "square": + if importances.ndim == 1: + importances = safe_sqr(importances) + else: + importances = safe_sqr(importances).sum(axis=0) + else: + raise ValueError( + "Valid values for `transform_func` are " + "None, 'norm' and 'square'. 
Those two " + "transformation are only supported now" + ) + + return importances diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_from_model.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_from_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3b2c73c6cbfaeeca449af4b0c04388dbe10be8b7 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_from_model.py @@ -0,0 +1,513 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from copy import deepcopy +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone +from ..exceptions import NotFittedError +from ..utils._param_validation import HasMethods, Interval, Options +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.validation import ( + _check_feature_names, + _estimator_has, + _num_features, + check_is_fitted, + check_scalar, +) +from ._base import SelectorMixin, _get_feature_importances + + +def _calculate_threshold(estimator, importances, threshold): + """Interpret the threshold value""" + + if threshold is None: + # determine default from estimator + est_name = estimator.__class__.__name__ + is_l1_penalized = hasattr(estimator, "penalty") and estimator.penalty == "l1" + is_lasso = "Lasso" in est_name + is_elasticnet_l1_penalized = est_name == "ElasticNet" and ( + hasattr(estimator, "l1_ratio") and np.isclose(estimator.l1_ratio, 1.0) + ) + is_elasticnetcv_l1_penalized = est_name == "ElasticNetCV" and ( + hasattr(estimator, "l1_ratio_") and np.isclose(estimator.l1_ratio_, 1.0) + ) + if ( + is_l1_penalized + or is_lasso + or is_elasticnet_l1_penalized + or is_elasticnetcv_l1_penalized + ): + # the natural default threshold is 0 when l1 penalty was used + threshold = 1e-5 + else: + threshold = "mean" + + if isinstance(threshold, str): + if "*" in threshold: + scale, reference = threshold.split("*") + scale = float(scale.strip()) + reference = reference.strip() + + if reference == "median": + reference = np.median(importances) + elif reference == "mean": + reference = np.mean(importances) + else: + raise ValueError("Unknown reference: " + reference) + + threshold = scale * reference + + elif threshold == "median": + threshold = np.median(importances) + + elif threshold == "mean": + threshold = np.mean(importances) + + else: + raise ValueError( + "Expected threshold='mean' or threshold='median' got %s" % threshold + ) + + else: + threshold = float(threshold) + + return threshold + + +class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): + """Meta-transformer for selecting features based on importance weights. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object + The base estimator from which the transformer is built. + This can be both a fitted (if ``prefit`` is set to True) + or a non-fitted estimator. The estimator should have a + ``feature_importances_`` or ``coef_`` attribute after fitting. + Otherwise, the ``importance_getter`` parameter should be used. + + threshold : str or float, default=None + The threshold value to use for feature selection. Features whose + absolute importance value is greater or equal are kept while the others + are discarded. If "median" (resp. 
"mean"), then the ``threshold`` value + is the median (resp. the mean) of the feature importances. A scaling + factor (e.g., "1.25*mean") may also be used. If None and if the + estimator has a parameter penalty set to l1, either explicitly + or implicitly (e.g, Lasso), the threshold used is 1e-5. + Otherwise, "mean" is used by default. + + prefit : bool, default=False + Whether a prefit model is expected to be passed into the constructor + directly or not. + If `True`, `estimator` must be a fitted estimator. + If `False`, `estimator` is fitted and updated by calling + `fit` and `partial_fit`, respectively. + + norm_order : non-zero int, inf, -inf, default=1 + Order of the norm used to filter the vectors of coefficients below + ``threshold`` in the case where the ``coef_`` attribute of the + estimator is of dimension 2. + + max_features : int, callable, default=None + The maximum number of features to select. + + - If an integer, then it specifies the maximum number of features to + allow. + - If a callable, then it specifies how to calculate the maximum number of + features allowed by using the output of `max_features(X)`. + - If `None`, then all features are kept. + + To only select based on ``max_features``, set ``threshold=-np.inf``. + + .. versionadded:: 0.20 + .. versionchanged:: 1.1 + `max_features` accepts a callable. + + importance_getter : str or callable, default='auto' + If 'auto', uses the feature importance either through a ``coef_`` + attribute or ``feature_importances_`` attribute of estimator. + + Also accepts a string that specifies an attribute name/path + for extracting feature importance (implemented with `attrgetter`). + For example, give `regressor_.coef_` in case of + :class:`~sklearn.compose.TransformedTargetRegressor` or + `named_steps.clf.feature_importances_` in case of + :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. + + If `callable`, overrides the default feature importance getter. + The callable is passed with the fitted estimator and it should + return importance for each feature. + + .. versionadded:: 0.24 + + Attributes + ---------- + estimator_ : estimator + The base estimator from which the transformer is built. This attribute + exist only when `fit` has been called. + + - If `prefit=True`, it is a deep copy of `estimator`. + - If `prefit=False`, it is a clone of `estimator` and fit on the data + passed to `fit` or `partial_fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + max_features_ : int + Maximum number of features calculated during :term:`fit`. Only defined + if the ``max_features`` is not `None`. + + - If `max_features` is an `int`, then `max_features_ = max_features`. + - If `max_features` is a callable, then `max_features_ = max_features(X)`. + + .. versionadded:: 1.1 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + threshold_ : float + The threshold value used for feature selection. + + See Also + -------- + RFE : Recursive feature elimination based on importance weights. + RFECV : Recursive feature elimination with built-in cross-validated + selection of the best number of features. + SequentialFeatureSelector : Sequential cross-validation based feature + selection. Does not rely on importance weights. 
+ + Notes + ----- + Allows NaN/Inf in the input if the underlying estimator does as well. + + Examples + -------- + >>> from sklearn.feature_selection import SelectFromModel + >>> from sklearn.linear_model import LogisticRegression + >>> X = [[ 0.87, -1.34, 0.31 ], + ... [-2.79, -0.02, -0.85 ], + ... [-1.34, -0.48, -2.55 ], + ... [ 1.92, 1.48, 0.65 ]] + >>> y = [0, 1, 0, 1] + >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) + >>> selector.estimator_.coef_ + array([[-0.3252, 0.8345, 0.4976]]) + >>> selector.threshold_ + np.float64(0.55249) + >>> selector.get_support() + array([False, True, False]) + >>> selector.transform(X) + array([[-1.34], + [-0.02], + [-0.48], + [ 1.48]]) + + Using a callable to create a selector that can use no more than half + of the input features. + + >>> def half_callable(X): + ... return round(len(X[0]) / 2) + >>> half_selector = SelectFromModel(estimator=LogisticRegression(), + ... max_features=half_callable) + >>> _ = half_selector.fit(X, y) + >>> half_selector.max_features_ + 2 + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods("fit")], + "threshold": [Interval(Real, None, None, closed="both"), str, None], + "prefit": ["boolean"], + "norm_order": [ + Interval(Integral, None, -1, closed="right"), + Interval(Integral, 1, None, closed="left"), + Options(Real, {np.inf, -np.inf}), + ], + "max_features": [Interval(Integral, 0, None, closed="left"), callable, None], + "importance_getter": [str, callable], + } + + def __init__( + self, + estimator, + *, + threshold=None, + prefit=False, + norm_order=1, + max_features=None, + importance_getter="auto", + ): + self.estimator = estimator + self.threshold = threshold + self.prefit = prefit + self.importance_getter = importance_getter + self.norm_order = norm_order + self.max_features = max_features + + def _get_support_mask(self): + estimator = getattr(self, "estimator_", self.estimator) + max_features = getattr(self, "max_features_", self.max_features) + + if self.prefit: + try: + check_is_fitted(self.estimator) + except NotFittedError as exc: + raise NotFittedError( + "When `prefit=True`, `estimator` is expected to be a fitted " + "estimator." + ) from exc + if callable(max_features): + # This branch is executed when `transform` is called directly and thus + # `max_features_` is not set and we fallback using `self.max_features` + # that is not validated + raise NotFittedError( + "When `prefit=True` and `max_features` is a callable, call `fit` " + "before calling `transform`." + ) + elif max_features is not None and not isinstance(max_features, Integral): + raise ValueError( + f"`max_features` must be an integer. Got `max_features={max_features}` " + "instead." 
+ ) + + scores = _get_feature_importances( + estimator=estimator, + getter=self.importance_getter, + transform_func="norm", + norm_order=self.norm_order, + ) + threshold = _calculate_threshold(estimator, scores, self.threshold) + if self.max_features is not None: + mask = np.zeros_like(scores, dtype=bool) + candidate_indices = np.argsort(-scores, kind="mergesort")[:max_features] + mask[candidate_indices] = True + else: + mask = np.ones_like(scores, dtype=bool) + mask[scores < threshold] = False + return mask + + def _check_max_features(self, X): + if self.max_features is not None: + n_features = _num_features(X) + + if callable(self.max_features): + max_features = self.max_features(X) + else: # int + max_features = self.max_features + + check_scalar( + max_features, + "max_features", + Integral, + min_val=0, + max_val=n_features, + ) + self.max_features_ = max_features + + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **fit_params): + """Fit the SelectFromModel meta-transformer. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,), default=None + The target values (integers that correspond to classes in + classification, real numbers in regression). + + **fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `fit` method of the sub-estimator. They are ignored if + `prefit=True`. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the sub-estimator. They are ignored if `prefit=True`. + + .. versionchanged:: 1.4 + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + self._check_max_features(X) + + if self.prefit: + try: + check_is_fitted(self.estimator) + except NotFittedError as exc: + raise NotFittedError( + "When `prefit=True`, `estimator` is expected to be a fitted " + "estimator." + ) from exc + self.estimator_ = deepcopy(self.estimator) + else: + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator) + self.estimator_.fit(X, y, **routed_params.estimator.fit) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + self.estimator_ = clone(self.estimator) + self.estimator_.fit(X, y, **fit_params) + + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + else: + _check_feature_names(self, X, reset=True) + + return self + + @property + def threshold_(self): + """Threshold value used for feature selection.""" + scores = _get_feature_importances( + estimator=self.estimator_, + getter=self.importance_getter, + transform_func="norm", + norm_order=self.norm_order, + ) + return _calculate_threshold(self.estimator, scores, self.threshold) + + @available_if(_estimator_has("partial_fit")) + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def partial_fit(self, X, y=None, **partial_fit_params): + """Fit the SelectFromModel meta-transformer only once. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,), default=None + The target values (integers that correspond to classes in + classification, real numbers in regression). 
+ + **partial_fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the `partial_fit` method of the sub-estimator. + + - If `enable_metadata_routing=True`: Parameters passed to the `partial_fit` + method of the sub-estimator. They are ignored if `prefit=True`. + + .. versionchanged:: 1.4 + + `**partial_fit_params` are routed to the sub-estimator, if + `enable_metadata_routing=True` is set via + :func:`~sklearn.set_config`, which allows for aliasing. + + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + first_call = not hasattr(self, "estimator_") + + if first_call: + self._check_max_features(X) + + if self.prefit: + if first_call: + try: + check_is_fitted(self.estimator) + except NotFittedError as exc: + raise NotFittedError( + "When `prefit=True`, `estimator` is expected to be a fitted " + "estimator." + ) from exc + self.estimator_ = deepcopy(self.estimator) + return self + + if first_call: + self.estimator_ = clone(self.estimator) + if _routing_enabled(): + routed_params = process_routing(self, "partial_fit", **partial_fit_params) + self.estimator_ = clone(self.estimator) + self.estimator_.partial_fit(X, y, **routed_params.estimator.partial_fit) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + self.estimator_.partial_fit(X, y, **partial_fit_params) + + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + else: + _check_feature_names(self, X, reset=first_call) + + return self + + @property + def n_features_in_(self): + """Number of features seen during `fit`.""" + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.estimator_.n_features_in_ + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
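+
+        Examples
+        --------
+        A sketch of how routing is typically used (editor's illustration; assumes
+        metadata routing is enabled and that the sub-estimator's ``fit`` accepts
+        ``sample_weight``):
+
+        >>> import sklearn
+        >>> from sklearn.feature_selection import SelectFromModel
+        >>> from sklearn.linear_model import LogisticRegression
+        >>> with sklearn.config_context(enable_metadata_routing=True):
+        ...     log_reg = LogisticRegression().set_fit_request(sample_weight=True)
+        ...     selector = SelectFromModel(log_reg)
+        ...     # selector.fit(X, y, sample_weight=w) would now route
+        ...     # `sample_weight` to LogisticRegression.fit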
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="partial_fit", callee="partial_fit") + .add(caller="fit", callee="fit"), + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_mutual_info.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_mutual_info.py new file mode 100644 index 0000000000000000000000000000000000000000..aef9097879fcaf02efa50f7c5e3d33f492e14495 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_mutual_info.py @@ -0,0 +1,580 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral + +import numpy as np +from scipy.sparse import issparse +from scipy.special import digamma + +from ..metrics.cluster import mutual_info_score +from ..neighbors import KDTree, NearestNeighbors +from ..preprocessing import scale +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.validation import check_array, check_X_y + + +def _compute_mi_cc(x, y, n_neighbors): + """Compute mutual information between two continuous variables. + + Parameters + ---------- + x, y : ndarray, shape (n_samples,) + Samples of two continuous random variables, must have an identical + shape. + + n_neighbors : int + Number of nearest neighbors to search for each point, see [1]_. + + Returns + ------- + mi : float + Estimated mutual information in nat units. If it turned out to be + negative it is replaced by 0. + + Notes + ----- + True mutual information can't be negative. If its estimate by a numerical + method is negative, it means (providing the method is adequate) that the + mutual information is close to 0 and replacing it by 0 is a reasonable + strategy. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + """ + n_samples = x.size + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + xy = np.hstack((x, y)) + + # Here we rely on NearestNeighbors to select the fastest algorithm. + nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors) + + nn.fit(xy) + radius = nn.kneighbors()[0] + radius = np.nextafter(radius[:, -1], 0) + + # KDTree is explicitly fit to allow for the querying of number of + # neighbors within a specified radius + kd = KDTree(x, metric="chebyshev") + nx = kd.query_radius(x, radius, count_only=True, return_distance=False) + nx = np.array(nx) - 1.0 + + kd = KDTree(y, metric="chebyshev") + ny = kd.query_radius(y, radius, count_only=True, return_distance=False) + ny = np.array(ny) - 1.0 + + mi = ( + digamma(n_samples) + + digamma(n_neighbors) + - np.mean(digamma(nx + 1)) + - np.mean(digamma(ny + 1)) + ) + + return max(0, mi) + + +def _compute_mi_cd(c, d, n_neighbors): + """Compute mutual information between continuous and discrete variables. + + Parameters + ---------- + c : ndarray, shape (n_samples,) + Samples of a continuous random variable. + + d : ndarray, shape (n_samples,) + Samples of a discrete random variable. 
+ + n_neighbors : int + Number of nearest neighbors to search for each point, see [1]_. + + Returns + ------- + mi : float + Estimated mutual information in nat units. If it turned out to be + negative it is replaced by 0. + + Notes + ----- + True mutual information can't be negative. If its estimate by a numerical + method is negative, it means (providing the method is adequate) that the + mutual information is close to 0 and replacing it by 0 is a reasonable + strategy. + + References + ---------- + .. [1] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + n_samples = c.shape[0] + c = c.reshape((-1, 1)) + + radius = np.empty(n_samples) + label_counts = np.empty(n_samples) + k_all = np.empty(n_samples) + nn = NearestNeighbors() + for label in np.unique(d): + mask = d == label + count = np.sum(mask) + if count > 1: + k = min(n_neighbors, count - 1) + nn.set_params(n_neighbors=k) + nn.fit(c[mask]) + r = nn.kneighbors()[0] + radius[mask] = np.nextafter(r[:, -1], 0) + k_all[mask] = k + label_counts[mask] = count + + # Ignore points with unique labels. + mask = label_counts > 1 + n_samples = np.sum(mask) + label_counts = label_counts[mask] + k_all = k_all[mask] + c = c[mask] + radius = radius[mask] + + kd = KDTree(c) + m_all = kd.query_radius(c, radius, count_only=True, return_distance=False) + m_all = np.array(m_all) + + mi = ( + digamma(n_samples) + + np.mean(digamma(k_all)) + - np.mean(digamma(label_counts)) + - np.mean(digamma(m_all)) + ) + + return max(0, mi) + + +def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3): + """Compute mutual information between two variables. + + This is a simple wrapper which selects a proper function to call based on + whether `x` and `y` are discrete or not. + """ + if x_discrete and y_discrete: + return mutual_info_score(x, y) + elif x_discrete and not y_discrete: + return _compute_mi_cd(y, x, n_neighbors) + elif not x_discrete and y_discrete: + return _compute_mi_cd(x, y, n_neighbors) + else: + return _compute_mi_cc(x, y, n_neighbors) + + +def _iterate_columns(X, columns=None): + """Iterate over columns of a matrix. + + Parameters + ---------- + X : ndarray or csc_matrix, shape (n_samples, n_features) + Matrix over which to iterate. + + columns : iterable or None, default=None + Indices of columns to iterate over. If None, iterate over all columns. + + Yields + ------ + x : ndarray, shape (n_samples,) + Columns of `X` in dense format. + """ + if columns is None: + columns = range(X.shape[1]) + + if issparse(X): + for i in columns: + x = np.zeros(X.shape[0]) + start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1] + x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr] + yield x + else: + for i in columns: + yield X[:, i] + + +def _estimate_mi( + X, + y, + *, + discrete_features="auto", + discrete_target=False, + n_neighbors=3, + copy=True, + random_state=None, + n_jobs=None, +): + """Estimate mutual information between the features and the target. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Feature matrix. + + y : array-like of shape (n_samples,) + Target vector. + + discrete_features : {'auto', bool, array-like}, default='auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. 
+ + discrete_target : bool, default=False + Whether to consider `y` as a discrete variable. + + n_neighbors : int, default=3 + Number of neighbors to use for MI estimation for continuous variables, + see [1]_ and [2]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default=True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for adding small noise to + continuous variables in order to remove repeated values. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of jobs to use for computing the mutual information. + The parallelization is done on the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 1.5 + + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target in + nat units. A negative value will be replaced by 0. + + References + ---------- + .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [2] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + """ + X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target) + n_samples, n_features = X.shape + + if isinstance(discrete_features, (str, bool)): + if isinstance(discrete_features, str): + if discrete_features == "auto": + discrete_features = issparse(X) + else: + raise ValueError("Invalid string value for discrete_features.") + discrete_mask = np.empty(n_features, dtype=bool) + discrete_mask.fill(discrete_features) + else: + discrete_features = check_array(discrete_features, ensure_2d=False) + if discrete_features.dtype != "bool": + discrete_mask = np.zeros(n_features, dtype=bool) + discrete_mask[discrete_features] = True + else: + discrete_mask = discrete_features + + continuous_mask = ~discrete_mask + if np.any(continuous_mask) and issparse(X): + raise ValueError("Sparse matrix `X` can't have continuous features.") + + rng = check_random_state(random_state) + if np.any(continuous_mask): + X = X.astype(np.float64, copy=copy) + X[:, continuous_mask] = scale( + X[:, continuous_mask], with_mean=False, copy=False + ) + + # Add small noise to continuous features as advised in Kraskov et. al. 
+ means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0)) + X[:, continuous_mask] += ( + 1e-10 + * means + * rng.standard_normal(size=(n_samples, np.sum(continuous_mask))) + ) + + if not discrete_target: + y = scale(y, with_mean=False) + y += ( + 1e-10 + * np.maximum(1, np.mean(np.abs(y))) + * rng.standard_normal(size=n_samples) + ) + + mi = Parallel(n_jobs=n_jobs)( + delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors) + for x, discrete_feature in zip(_iterate_columns(X), discrete_mask) + ) + + return np.array(mi) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "copy": ["boolean"], + "random_state": ["random_state"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def mutual_info_regression( + X, + y, + *, + discrete_features="auto", + n_neighbors=3, + copy=True, + random_state=None, + n_jobs=None, +): + """Estimate mutual information for a continuous target variable. + + Mutual information (MI) [1]_ between two random variables is a non-negative + value, which measures the dependency between the variables. It is equal + to zero if and only if two random variables are independent, and higher + values mean higher dependency. + + The function relies on nonparametric methods based on entropy estimation + from k-nearest neighbors distances as described in [2]_ and [3]_. Both + methods are based on the idea originally proposed in [4]_. + + It can be used for univariate features selection, read more in the + :ref:`User Guide `. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Feature matrix. + + y : array-like of shape (n_samples,) + Target vector. + + discrete_features : {'auto', bool, array-like}, default='auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. + + n_neighbors : int, default=3 + Number of neighbors to use for MI estimation for continuous variables, + see [2]_ and [3]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default=True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for adding small noise to + continuous variables in order to remove repeated values. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of jobs to use for computing the mutual information. + The parallelization is done on the columns of `X`. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 1.5 + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target in + nat units. + + Notes + ----- + 1. The term "discrete features" is used instead of naming them + "categorical", because it describes the essence more accurately. 
+ For example, pixel intensities of an image are discrete features + (but hardly categorical) and you will get better results if mark them + as such. Also note, that treating a continuous variable as discrete and + vice versa will usually give incorrect results, so be attentive about + that. + 2. True mutual information can't be negative. If its estimate turns out + to be negative, it is replaced by zero. + + References + ---------- + .. [1] `Mutual Information + `_ + on Wikipedia. + .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [3] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy + of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16 + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.feature_selection import mutual_info_regression + >>> X, y = make_regression( + ... n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42 + ... ) + >>> mutual_info_regression(X, y) + array([0.117, 2.645, 0.0287]) + """ + return _estimate_mi( + X, + y, + discrete_features=discrete_features, + discrete_target=False, + n_neighbors=n_neighbors, + copy=copy, + random_state=random_state, + n_jobs=n_jobs, + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "copy": ["boolean"], + "random_state": ["random_state"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def mutual_info_classif( + X, + y, + *, + discrete_features="auto", + n_neighbors=3, + copy=True, + random_state=None, + n_jobs=None, +): + """Estimate mutual information for a discrete target variable. + + Mutual information (MI) [1]_ between two random variables is a non-negative + value, which measures the dependency between the variables. It is equal + to zero if and only if two random variables are independent, and higher + values mean higher dependency. + + The function relies on nonparametric methods based on entropy estimation + from k-nearest neighbors distances as described in [2]_ and [3]_. Both + methods are based on the idea originally proposed in [4]_. + + It can be used for univariate features selection, read more in the + :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Feature matrix. + + y : array-like of shape (n_samples,) + Target vector. + + discrete_features : 'auto', bool or array-like, default='auto' + If bool, then determines whether to consider all features discrete + or continuous. If array, then it should be either a boolean mask + with shape (n_features,) or array with indices of discrete features. + If 'auto', it is assigned to False for dense `X` and to True for + sparse `X`. + + n_neighbors : int, default=3 + Number of neighbors to use for MI estimation for continuous variables, + see [2]_ and [3]_. Higher values reduce variance of the estimation, but + could introduce a bias. + + copy : bool, default=True + Whether to make a copy of the given data. If set to False, the initial + data will be overwritten. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for adding small noise to + continuous variables in order to remove repeated values. 
+ Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + The number of jobs to use for computing the mutual information. + The parallelization is done on the columns of `X`. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 1.5 + + Returns + ------- + mi : ndarray, shape (n_features,) + Estimated mutual information between each feature and the target in + nat units. + + Notes + ----- + 1. The term "discrete features" is used instead of naming them + "categorical", because it describes the essence more accurately. + For example, pixel intensities of an image are discrete features + (but hardly categorical) and you will get better results if mark them + as such. Also note, that treating a continuous variable as discrete and + vice versa will usually give incorrect results, so be attentive about + that. + 2. True mutual information can't be negative. If its estimate turns out + to be negative, it is replaced by zero. + + References + ---------- + .. [1] `Mutual Information + `_ + on Wikipedia. + .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual + information". Phys. Rev. E 69, 2004. + .. [3] B. C. Ross "Mutual Information between Discrete and Continuous + Data Sets". PLoS ONE 9(2), 2014. + .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy + of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16 + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.feature_selection import mutual_info_classif + >>> X, y = make_classification( + ... n_samples=100, n_features=10, n_informative=2, n_clusters_per_class=1, + ... shuffle=False, random_state=42 + ... ) + >>> mutual_info_classif(X, y) + array([0.589, 0.107, 0.196, 0.0968 , 0., + 0. , 0. , 0. , 0. 
, 0.]) + """ + check_classification_targets(y) + return _estimate_mi( + X, + y, + discrete_features=discrete_features, + discrete_target=True, + n_neighbors=n_neighbors, + copy=copy, + random_state=random_state, + n_jobs=n_jobs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_rfe.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_rfe.py new file mode 100644 index 0000000000000000000000000000000000000000..d647ad0ca19b10d36bcf4bb9f5ccf698f506f24b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_rfe.py @@ -0,0 +1,1025 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +"""Recursive feature elimination for feature ranking""" + +import warnings +from copy import deepcopy +from numbers import Integral + +import numpy as np +from joblib import effective_n_jobs + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..metrics import get_scorer +from ..model_selection import check_cv +from ..model_selection._validation import _score +from ..utils import Bunch, metadata_routing +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import HasMethods, Interval, RealNotInt +from ..utils._tags import get_tags +from ..utils.metaestimators import _safe_split, available_if +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _deprecate_positional_args, + _estimator_has, + check_is_fitted, + validate_data, +) +from ._base import SelectorMixin, _get_feature_importances + + +def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer, routed_params): + """ + Return the score and n_features per step for a fit across one fold. + """ + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + fit_params = _check_method_params( + X, params=routed_params.estimator.fit, indices=train + ) + score_params = _check_method_params( + X=X, params=routed_params.scorer.score, indices=test + ) + + rfe._fit( + X_train, + y_train, + lambda estimator, features: _score( + estimator, + X_test[:, features], + y_test, + scorer, + score_params=score_params, + ), + **fit_params, + ) + + return rfe.step_scores_, rfe.step_support_, rfe.step_ranking_, rfe.step_n_features_ + + +class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): + """Feature ranking with recursive feature elimination. + + Given an external estimator that assigns weights to features (e.g., the + coefficients of a linear model), the goal of recursive feature elimination + (RFE) is to select features by recursively considering smaller and smaller + sets of features. First, the estimator is trained on the initial set of + features and the importance of each feature is obtained either through + any specific attribute or callable. + Then, the least important features are pruned from current set of features. + That procedure is recursively repeated on the pruned set until the desired + number of features to select is eventually reached. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : ``Estimator`` instance + A supervised learning estimator with a ``fit`` method that provides + information about feature importance + (e.g. `coef_`, `feature_importances_`). + + n_features_to_select : int or float, default=None + The number of features to select. 
If `None`, half of the features are + selected. If integer, the parameter is the absolute number of features + to select. If float between 0 and 1, it is the fraction of features to + select. + + .. versionchanged:: 0.24 + Added float values for fractions. + + step : int or float, default=1 + If greater than or equal to 1, then ``step`` corresponds to the + (integer) number of features to remove at each iteration. + If within (0.0, 1.0), then ``step`` corresponds to the percentage + (rounded down) of features to remove at each iteration. + + verbose : int, default=0 + Controls verbosity of output. + + importance_getter : str or callable, default='auto' + If 'auto', uses the feature importance either through a `coef_` + or `feature_importances_` attributes of estimator. + + Also accepts a string that specifies an attribute name/path + for extracting feature importance (implemented with `attrgetter`). + For example, give `regressor_.coef_` in case of + :class:`~sklearn.compose.TransformedTargetRegressor` or + `named_steps.clf.feature_importances_` in case of + class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. + + If `callable`, overrides the default feature importance getter. + The callable is passed with the fitted estimator and it should + return importance for each feature. + + .. versionadded:: 0.24 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The classes labels. Only available when `estimator` is a classifier. + + estimator_ : ``Estimator`` instance + The fitted estimator used to select features. + + n_features_ : int + The number of selected features. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + ranking_ : ndarray of shape (n_features,) + The feature ranking, such that ``ranking_[i]`` corresponds to the + ranking position of the i-th feature. Selected (i.e., estimated + best) features are assigned rank 1. + + support_ : ndarray of shape (n_features,) + The mask of selected features. + + See Also + -------- + RFECV : Recursive feature elimination with built-in cross-validated + selection of the best number of features. + SelectFromModel : Feature selection based on thresholds of importance + weights. + SequentialFeatureSelector : Sequential cross-validation based feature + selection. Does not rely on importance weights. + + Notes + ----- + Allows NaN/Inf in the input if the underlying estimator does as well. + + References + ---------- + + .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection + for cancer classification using support vector machines", + Mach. Learn., 46(1-3), 389--422, 2002. + + Examples + -------- + The following example shows how to retrieve the 5 most informative + features in the Friedman #1 dataset. 
+ + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.feature_selection import RFE + >>> from sklearn.svm import SVR + >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + >>> estimator = SVR(kernel="linear") + >>> selector = RFE(estimator, n_features_to_select=5, step=1) + >>> selector = selector.fit(X, y) + >>> selector.support_ + array([ True, True, True, True, True, False, False, False, False, + False]) + >>> selector.ranking_ + array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5]) + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "n_features_to_select": [ + None, + Interval(RealNotInt, 0, 1, closed="right"), + Interval(Integral, 0, None, closed="neither"), + ], + "step": [ + Interval(Integral, 0, None, closed="neither"), + Interval(RealNotInt, 0, 1, closed="neither"), + ], + "verbose": ["verbose"], + "importance_getter": [str, callable], + } + + def __init__( + self, + estimator, + *, + n_features_to_select=None, + step=1, + verbose=0, + importance_getter="auto", + ): + self.estimator = estimator + self.n_features_to_select = n_features_to_select + self.step = step + self.importance_getter = importance_getter + self.verbose = verbose + + # TODO(1.8) remove this property + @property + def _estimator_type(self): + return self.estimator._estimator_type + + @property + def classes_(self): + """Classes labels available when `estimator` is a classifier. + + Returns + ------- + ndarray of shape (n_classes,) + """ + return self.estimator_.classes_ + + @_fit_context( + # RFE.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **fit_params): + """Fit the RFE model and then the underlying estimator on the selected features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) + The target values. + + **fit_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the ``fit`` method of the underlying estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the ``fit`` + method of the underlying estimator. + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + self : object + Fitted estimator. + """ + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch(estimator=Bunch(fit=fit_params)) + + return self._fit(X, y, **routed_params.estimator.fit) + + def _fit(self, X, y, step_score=None, **fit_params): + # Parameter step_score controls the calculation of self.step_scores_ + # step_score is not exposed to users and is used when implementing RFECV + # self.step_scores_ will not be calculated when calling _fit through fit + + X, y = validate_data( + self, + X, + y, + accept_sparse="csc", + ensure_min_features=2, + ensure_all_finite=False, + multi_output=True, + ) + + # Initialization + n_features = X.shape[1] + if self.n_features_to_select is None: + n_features_to_select = n_features // 2 + elif isinstance(self.n_features_to_select, Integral): # int + n_features_to_select = self.n_features_to_select + if n_features_to_select > n_features: + warnings.warn( + ( + f"Found {n_features_to_select=} > {n_features=}. There will be" + " no feature selection and all features will be kept." 
+ ), + UserWarning, + ) + else: # float + n_features_to_select = int(n_features * self.n_features_to_select) + + if 0.0 < self.step < 1.0: + step = int(max(1, self.step * n_features)) + else: + step = int(self.step) + + support_ = np.ones(n_features, dtype=bool) + ranking_ = np.ones(n_features, dtype=int) + + if step_score: + self.step_n_features_ = [] + self.step_scores_ = [] + self.step_support_ = [] + self.step_ranking_ = [] + + # Elimination + while np.sum(support_) > n_features_to_select: + # Remaining features + features = np.arange(n_features)[support_] + + # Rank the remaining features + estimator = clone(self.estimator) + if self.verbose > 0: + print("Fitting estimator with %d features." % np.sum(support_)) + + estimator.fit(X[:, features], y, **fit_params) + + # Compute step values on the previous selection iteration because + # 'estimator' must use features that have not been eliminated yet + if step_score: + self.step_n_features_.append(len(features)) + self.step_scores_.append(step_score(estimator, features)) + self.step_support_.append(list(support_)) + self.step_ranking_.append(list(ranking_)) + + # Get importance and rank them + importances = _get_feature_importances( + estimator, + self.importance_getter, + transform_func="square", + ) + ranks = np.argsort(importances) + + # for sparse case ranks is matrix + ranks = np.ravel(ranks) + + # Eliminate the worse features + threshold = min(step, np.sum(support_) - n_features_to_select) + + support_[features[ranks][:threshold]] = False + ranking_[np.logical_not(support_)] += 1 + + # Set final attributes + features = np.arange(n_features)[support_] + self.estimator_ = clone(self.estimator) + self.estimator_.fit(X[:, features], y, **fit_params) + + # Compute step values when only n_features_to_select features left + if step_score: + self.step_n_features_.append(len(features)) + self.step_scores_.append(step_score(self.estimator_, features)) + self.step_support_.append(support_) + self.step_ranking_.append(ranking_) + self.n_features_ = support_.sum() + self.support_ = support_ + self.ranking_ = ranking_ + + return self + + @available_if(_estimator_has("predict")) + def predict(self, X, **predict_params): + """Reduce X to the selected features and predict using the estimator. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + **predict_params : dict + Parameters to route to the ``predict`` method of the + underlying estimator. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + y : array of shape [n_samples] + The predicted target values. + """ + _raise_for_params(predict_params, self, "predict") + check_is_fitted(self) + if _routing_enabled(): + routed_params = process_routing(self, "predict", **predict_params) + else: + routed_params = Bunch(estimator=Bunch(predict={})) + + return self.estimator_.predict( + self.transform(X), **routed_params.estimator.predict + ) + + @available_if(_estimator_has("score")) + def score(self, X, y, **score_params): + """Reduce X to the selected features and return the score of the estimator. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + y : array of shape [n_samples] + The target values. 
+ + **score_params : dict + - If `enable_metadata_routing=False` (default): Parameters directly passed + to the ``score`` method of the underlying estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `score` + method of the underlying estimator. + + .. versionadded:: 1.0 + + .. versionchanged:: 1.6 + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + score : float + Score of the underlying base estimator computed with the selected + features returned by `rfe.transform(X)` and `y`. + """ + check_is_fitted(self) + if _routing_enabled(): + routed_params = process_routing(self, "score", **score_params) + else: + routed_params = Bunch(estimator=Bunch(score=score_params)) + + return self.estimator_.score( + self.transform(X), y, **routed_params.estimator.score + ) + + def _get_support_mask(self): + check_is_fitted(self) + return self.support_ + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : {array-like or sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + score : array, shape = [n_samples, n_classes] or [n_samples] + The decision function of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + Regression and binary classification produce an array of shape + [n_samples]. + """ + check_is_fitted(self) + return self.estimator_.decision_function(self.transform(X)) + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : {array-like or sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + Returns + ------- + p : array of shape (n_samples, n_classes) + The class probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. + """ + check_is_fitted(self) + return self.estimator_.predict_proba(self.transform(X)) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict class log-probabilities for X. + + Parameters + ---------- + X : array of shape [n_samples, n_features] + The input samples. + + Returns + ------- + p : array of shape (n_samples, n_classes) + The class log-probabilities of the input samples. The order of the + classes corresponds to that in the attribute :term:`classes_`. 
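[Editorial aside] The delegating methods above simply reduce ``X`` to the selected columns and forward the call to the fitted sub-estimator; a minimal sketch of that equivalence (dataset and estimator are illustrative only):

    >>> import numpy as np
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.feature_selection import RFE
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> rfe = RFE(LogisticRegression(max_iter=5000), n_features_to_select=10).fit(X, y)
    >>> np.allclose(rfe.predict_proba(X),
    ...             rfe.estimator_.predict_proba(rfe.transform(X)))
    True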
+ """ + check_is_fitted(self) + return self.estimator_.predict_log_proba(self.transform(X)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + sub_estimator_tags = get_tags(self.estimator) + tags.estimator_type = sub_estimator_tags.estimator_type + tags.classifier_tags = deepcopy(sub_estimator_tags.classifier_tags) + tags.regressor_tags = deepcopy(sub_estimator_tags.regressor_tags) + if tags.classifier_tags is not None: + tags.classifier_tags.poor_score = True + if tags.regressor_tags is not None: + tags.regressor_tags.poor_score = True + tags.target_tags.required = True + tags.input_tags.sparse = sub_estimator_tags.input_tags.sparse + tags.input_tags.allow_nan = sub_estimator_tags.input_tags.allow_nan + return tags + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="predict", callee="predict") + .add(caller="score", callee="score"), + ) + return router + + +class RFECV(RFE): + """Recursive feature elimination with cross-validation to select features. + + The number of features selected is tuned automatically by fitting an :class:`RFE` + selector on the different cross-validation splits (provided by the `cv` parameter). + The performance of each :class:`RFE` selector is evaluated using `scoring` for + different numbers of selected features and aggregated together. Finally, the scores + are averaged across folds and the number of features selected is set to the number + of features that maximize the cross-validation score. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : ``Estimator`` instance + A supervised learning estimator with a ``fit`` method that provides + information about feature importance either through a ``coef_`` + attribute or through a ``feature_importances_`` attribute. + + step : int or float, default=1 + If greater than or equal to 1, then ``step`` corresponds to the + (integer) number of features to remove at each iteration. + If within (0.0, 1.0), then ``step`` corresponds to the percentage + (rounded down) of features to remove at each iteration. + Note that the last iteration may remove fewer than ``step`` features in + order to reach ``min_features_to_select``. + + min_features_to_select : int, default=1 + The minimum number of features to be selected. This number of features + will always be scored, even if the difference between the original + feature count and ``min_features_to_select`` isn't divisible by + ``step``. + + .. versionadded:: 0.20 + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if ``y`` is binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. 
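[Editorial aside] A sketch of the routing declared above, assuming metadata routing is enabled globally and the wrapped estimator explicitly requests ``sample_weight`` (the weights below are arbitrary and purely illustrative):

    >>> import numpy as np
    >>> import sklearn
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.feature_selection import RFE
    >>> sklearn.set_config(enable_metadata_routing=True)
    >>> X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    >>> weights = np.where(y == 1, 2.0, 1.0)
    >>> est = LogisticRegression().set_fit_request(sample_weight=True)
    >>> selector = RFE(est, n_features_to_select=3).fit(X, y, sample_weight=weights)
    >>> sklearn.set_config(enable_metadata_routing=False)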
If the + estimator is not a classifier or if ``y`` is neither binary nor multiclass, + :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value of None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + Scoring method to evaluate the :class:`RFE` selectors' performance. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + verbose : int, default=0 + Controls verbosity of output. + + n_jobs : int or None, default=None + Number of cores to run in parallel while fitting across folds. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.18 + + importance_getter : str or callable, default='auto' + If 'auto', uses the feature importance either through a `coef_` + or `feature_importances_` attributes of estimator. + + Also accepts a string that specifies an attribute name/path + for extracting feature importance. + For example, give `regressor_.coef_` in case of + :class:`~sklearn.compose.TransformedTargetRegressor` or + `named_steps.clf.feature_importances_` in case of + :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`. + + If `callable`, overrides the default feature importance getter. + The callable is passed with the fitted estimator and it should + return importance for each feature. + + .. versionadded:: 0.24 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The classes labels. Only available when `estimator` is a classifier. + + estimator_ : ``Estimator`` instance + The fitted estimator used to select features. + + cv_results_ : dict of ndarrays + All arrays (values of the dictionary) are sorted in ascending order + by the number of features used (i.e., the first element of the array + represents the models that used the least number of features, while the + last element represents the models that used all available features). + + .. versionadded:: 1.0 + + This dictionary contains the following keys: + + split(k)_test_score : ndarray of shape (n_subsets_of_features,) + The cross-validation scores across (k)th fold. + + mean_test_score : ndarray of shape (n_subsets_of_features,) + Mean of scores over the folds. + + std_test_score : ndarray of shape (n_subsets_of_features,) + Standard deviation of scores over the folds. + + n_features : ndarray of shape (n_subsets_of_features,) + Number of features used at each step. + + .. versionadded:: 1.5 + + split(k)_ranking : ndarray of shape (n_subsets_of_features,) + The cross-validation rankings across (k)th fold. + Selected (i.e., estimated best) features are assigned rank 1. + Illustration in + :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py` + + .. versionadded:: 1.7 + + split(k)_support : ndarray of shape (n_subsets_of_features,) + The cross-validation supports across (k)th fold. The support + is the mask of selected features. + + .. versionadded:: 1.7 + + n_features_ : int + The number of selected features with cross-validation. + + n_features_in_ : int + Number of features seen during :term:`fit`. 
Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + ranking_ : narray of shape (n_features,) + The feature ranking, such that `ranking_[i]` + corresponds to the ranking + position of the i-th feature. + Selected (i.e., estimated best) + features are assigned rank 1. + + support_ : ndarray of shape (n_features,) + The mask of selected features. + + See Also + -------- + RFE : Recursive feature elimination. + + Notes + ----- + The size of all values in ``cv_results_`` is equal to + ``ceil((n_features - min_features_to_select) / step) + 1``, + where step is the number of features removed at each iteration. + + Allows NaN/Inf in the input if the underlying estimator does as well. + + References + ---------- + + .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection + for cancer classification using support vector machines", + Mach. Learn., 46(1-3), 389--422, 2002. + + Examples + -------- + The following example shows how to retrieve the a-priori not known 5 + informative features in the Friedman #1 dataset. + + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.feature_selection import RFECV + >>> from sklearn.svm import SVR + >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + >>> estimator = SVR(kernel="linear") + >>> selector = RFECV(estimator, step=1, cv=5) + >>> selector = selector.fit(X, y) + >>> selector.support_ + array([ True, True, True, True, True, False, False, False, False, + False]) + >>> selector.ranking_ + array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5]) + + For a detailed example of using RFECV to select features when training a + :class:`~sklearn.linear_model.LogisticRegression`, see + :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py`. + """ + + _parameter_constraints: dict = { + **RFE._parameter_constraints, + "min_features_to_select": [Interval(Integral, 0, None, closed="neither")], + "cv": ["cv_object"], + "scoring": [None, str, callable], + "n_jobs": [None, Integral], + } + _parameter_constraints.pop("n_features_to_select") + __metadata_request__fit = {"groups": metadata_routing.UNUSED} + + def __init__( + self, + estimator, + *, + step=1, + min_features_to_select=1, + cv=None, + scoring=None, + verbose=0, + n_jobs=None, + importance_getter="auto", + ): + self.estimator = estimator + self.step = step + self.importance_getter = importance_getter + self.cv = cv + self.scoring = scoring + self.verbose = verbose + self.n_jobs = n_jobs + self.min_features_to_select = min_features_to_select + + # TODO(1.8): remove `groups` from the signature after deprecation cycle. + @_deprecate_positional_args(version="1.8") + @_fit_context( + # RFECV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, *, groups=None, **params): + """Fit the RFE model and automatically tune the number of selected features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the total number of features. + + y : array-like of shape (n_samples,) + Target values (integers for classification, real numbers for + regression). 
+ + groups : array-like of shape (n_samples,) or None, default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). + + .. versionadded:: 0.20 + + **params : dict of str -> object + Parameters passed to the ``fit`` method of the estimator, + the scorer, and the CV splitter. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(params, self, "fit") + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + ensure_min_features=2, + ensure_all_finite=False, + multi_output=True, + ) + + if _routing_enabled(): + if groups is not None: + params.update({"groups": groups}) + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch( + estimator=Bunch(fit={}), + splitter=Bunch(split={"groups": groups}), + scorer=Bunch(score={}), + ) + + # Initialization + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + scorer = self._get_scorer() + + # Build an RFE object, which will evaluate and score each possible + # feature count, down to self.min_features_to_select + n_features = X.shape[1] + if self.min_features_to_select > n_features: + warnings.warn( + ( + f"Found min_features_to_select={self.min_features_to_select} > " + f"{n_features=}. There will be no feature selection and all " + "features will be kept." + ), + UserWarning, + ) + rfe = RFE( + estimator=self.estimator, + n_features_to_select=min(self.min_features_to_select, n_features), + importance_getter=self.importance_getter, + step=self.step, + verbose=self.verbose, + ) + + # Determine the number of subsets of features by fitting across + # the train folds and choosing the "features_to_select" parameter + # that gives the least averaged error across all folds. + + # Note that joblib raises a non-picklable error for bound methods + # even if n_jobs is set to 1 with the default multiprocessing + # backend. + # This branching is done so that to + # make sure that user code that sets n_jobs to 1 + # and provides bound methods as scorers is not broken with the + # addition of n_jobs parameter in version 0.18. + + if effective_n_jobs(self.n_jobs) == 1: + parallel, func = list, _rfe_single_fit + else: + parallel = Parallel(n_jobs=self.n_jobs) + func = delayed(_rfe_single_fit) + + step_results = parallel( + func(clone(rfe), self.estimator, X, y, train, test, scorer, routed_params) + for train, test in cv.split(X, y, **routed_params.splitter.split) + ) + scores, supports, rankings, step_n_features = zip(*step_results) + + step_n_features_rev = np.array(step_n_features[0])[::-1] + scores = np.array(scores) + rankings = np.array(rankings) + supports = np.array(supports) + + # Reverse order such that lowest number of features is selected in case of tie. 
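        # Editorial note (illustrative values, not part of the original code):
        # `scores` has shape (n_folds, n_feature_subsets) and the subsets are
        # visited from the full feature set down to `min_features_to_select`,
        # so the columns are ordered by *decreasing* feature count.  Reversing
        # below reorders everything by increasing feature count, and
        # `np.argmax` then returns the *first* maximum, i.e. the smallest
        # subset among the tied best ones.  For example, with
        # step_n_features_rev = [4, 6, 8, 10] and summed scores
        # [2.5, 2.9, 2.9, 2.7], argmax picks index 1, so 6 features are
        # selected rather than 8.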
+ scores_sum_rev = np.sum(scores, axis=0)[::-1] + n_features_to_select = step_n_features_rev[np.argmax(scores_sum_rev)] + + # Re-execute an elimination with best_k over the whole set + rfe = RFE( + estimator=self.estimator, + n_features_to_select=n_features_to_select, + step=self.step, + importance_getter=self.importance_getter, + verbose=self.verbose, + ) + + rfe.fit(X, y, **routed_params.estimator.fit) + + # Set final attributes + self.support_ = rfe.support_ + self.n_features_ = rfe.n_features_ + self.ranking_ = rfe.ranking_ + self.estimator_ = clone(self.estimator) + self.estimator_.fit(self._transform(X), y, **routed_params.estimator.fit) + + # reverse to stay consistent with before + scores_rev = scores[:, ::-1] + supports_rev = supports[:, ::-1] + rankings_rev = rankings[:, ::-1] + self.cv_results_ = { + "mean_test_score": np.mean(scores_rev, axis=0), + "std_test_score": np.std(scores_rev, axis=0), + **{f"split{i}_test_score": scores_rev[i] for i in range(scores.shape[0])}, + **{f"split{i}_ranking": rankings_rev[i] for i in range(rankings.shape[0])}, + **{f"split{i}_support": supports_rev[i] for i in range(supports.shape[0])}, + "n_features": step_n_features_rev, + } + return self + + def score(self, X, y, **score_params): + """Score using the `scoring` option on the given test data and labels. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True labels for X. + + **score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` + for more details. + + Returns + ------- + score : float + Score of self.predict(X) w.r.t. y defined by `scoring`. + """ + _raise_for_params(score_params, self, "score") + scoring = self._get_scorer() + if _routing_enabled(): + routed_params = process_routing(self, "score", **score_params) + else: + routed_params = Bunch() + routed_params.scorer = Bunch(score={}) + + return scoring(self, X, y, **routed_params.scorer.score) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
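[Editorial aside] To make the shape of the ``cv_results_`` dict assembled in ``fit`` above concrete, a short sketch reusing the Friedman #1 toy data (purely illustrative):

    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.svm import SVR
    >>> from sklearn.feature_selection import RFECV
    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    >>> selector = RFECV(SVR(kernel="linear"), step=1, cv=5).fit(X, y)
    >>> sorted(k for k in selector.cv_results_ if not k.startswith("split"))
    ['mean_test_score', 'n_features', 'std_test_score']
    >>> best = selector.cv_results_["mean_test_score"].argmax()
    >>> selector.cv_results_["n_features"][best]   # equals selector.n_features_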
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + router.add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add( + caller="fit", + callee="split", + ), + ) + router.add( + scorer=self._get_scorer(), + method_mapping=MethodMapping() + .add(caller="fit", callee="score") + .add(caller="score", callee="score"), + ) + + return router + + def _get_scorer(self): + if self.scoring is None: + scoring = "accuracy" if is_classifier(self.estimator) else "r2" + else: + scoring = self.scoring + return get_scorer(scoring) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d6ed9e2e72e278bee29638945bc9a2456826f6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_sequential.py @@ -0,0 +1,363 @@ +""" +Sequential feature selection +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..metrics import check_scoring, get_scorer_names +from ..model_selection import check_cv, cross_val_score +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._tags import get_tags +from ..utils.validation import check_is_fitted, validate_data +from ._base import SelectorMixin + + +class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator): + """Transformer that performs Sequential Feature Selection. + + This Sequential Feature Selector adds (forward selection) or + removes (backward selection) features to form a feature subset in a + greedy fashion. At each stage, this estimator chooses the best feature to + add or remove based on the cross-validation score of an estimator. In + the case of unsupervised learning, this Sequential Feature Selector + looks only at the features (X), not the desired outputs (y). + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + Parameters + ---------- + estimator : estimator instance + An unfitted estimator. + + n_features_to_select : "auto", int or float, default="auto" + If `"auto"`, the behaviour depends on the `tol` parameter: + + - if `tol` is not `None`, then features are selected while the score + change does not exceed `tol`. + - otherwise, half of the features are selected. + + If integer, the parameter is the absolute number of features to select. + If float between 0 and 1, it is the fraction of features to select. + + .. versionadded:: 1.1 + The option `"auto"` was added in version 1.1. + + .. versionchanged:: 1.3 + The default changed from `"warn"` to `"auto"` in 1.3. + + tol : float, default=None + If the score is not incremented by at least `tol` between two + consecutive feature additions or removals, stop adding or removing. + + `tol` can be negative when removing features using `direction="backward"`. + `tol` is required to be strictly positive when doing forward selection. + It can be useful to reduce the number of features at the cost of a small + decrease in the score. 
+ + `tol` is enabled only when `n_features_to_select` is `"auto"`. + + .. versionadded:: 1.1 + + direction : {'forward', 'backward'}, default='forward' + Whether to perform forward selection or backward selection. + + scoring : str or callable, default=None + Scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)`` that returns a single value. + See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other + cases, :class:`~sklearn.model_selection.KFold` is used. These splitters + are instantiated with `shuffle=False` so the splits will be the same + across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + n_jobs : int, default=None + Number of jobs to run in parallel. When evaluating a new feature to + add or remove, the cross-validation procedure is parallel over the + folds. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_to_select_ : int + The number of features that were selected. + + support_ : ndarray of shape (n_features,), dtype=bool + The mask of selected features. + + See Also + -------- + GenericUnivariateSelect : Univariate feature selector with configurable + strategy. + RFE : Recursive feature elimination based on importance weights. + RFECV : Recursive feature elimination based on importance weights, with + automatic selection of the number of features. + SelectFromModel : Feature selection based on thresholds of importance + weights. 
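[Editorial aside] As a complement to the example below, a hedged sketch of the ``tol``-based stopping described above (backward direction; the estimator and threshold are illustrative only):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.feature_selection import SequentialFeatureSelector
    >>> X, y = load_iris(return_X_y=True)
    >>> sfs = SequentialFeatureSelector(
    ...     LogisticRegression(max_iter=1000),
    ...     n_features_to_select="auto", tol=0.01, direction="backward")
    >>> sfs = sfs.fit(X, y)
    >>> sfs.n_features_to_select_   # determined by the tol criterion, not fixed a priori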
+ + Examples + -------- + >>> from sklearn.feature_selection import SequentialFeatureSelector + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3) + >>> sfs.fit(X, y) + SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3), + n_features_to_select=3) + >>> sfs.get_support() + array([ True, False, True, True]) + >>> sfs.transform(X).shape + (150, 3) + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "n_features_to_select": [ + StrOptions({"auto"}), + Interval(RealNotInt, 0, 1, closed="right"), + Interval(Integral, 0, None, closed="neither"), + ], + "tol": [None, Interval(Real, None, None, closed="neither")], + "direction": [StrOptions({"forward", "backward"})], + "scoring": [None, StrOptions(set(get_scorer_names())), callable], + "cv": ["cv_object"], + "n_jobs": [None, Integral], + } + + def __init__( + self, + estimator, + *, + n_features_to_select="auto", + tol=None, + direction="forward", + scoring=None, + cv=5, + n_jobs=None, + ): + self.estimator = estimator + self.n_features_to_select = n_features_to_select + self.tol = tol + self.direction = direction + self.scoring = scoring + self.cv = cv + self.n_jobs = n_jobs + + @_fit_context( + # SequentialFeatureSelector.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Learn the features to select from X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of predictors. + + y : array-like of shape (n_samples,), default=None + Target values. This parameter may be ignored for + unsupervised learning. + + **params : dict, default=None + Parameters to be passed to the underlying `estimator`, `cv` + and `scorer` objects. + + .. versionadded:: 1.6 + + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns the instance itself. + """ + _raise_for_params(params, self, "fit") + tags = self.__sklearn_tags__() + X = validate_data( + self, + X, + accept_sparse="csc", + ensure_min_features=2, + ensure_all_finite=not tags.input_tags.allow_nan, + ) + n_features = X.shape[1] + + if self.n_features_to_select == "auto": + if self.tol is not None: + # With auto feature selection, `n_features_to_select_` will be updated + # to `support_.sum()` after features are selected. 
+ self.n_features_to_select_ = n_features - 1 + else: + self.n_features_to_select_ = n_features // 2 + elif isinstance(self.n_features_to_select, Integral): + if self.n_features_to_select >= n_features: + raise ValueError("n_features_to_select must be < n_features.") + self.n_features_to_select_ = self.n_features_to_select + elif isinstance(self.n_features_to_select, Real): + self.n_features_to_select_ = int(n_features * self.n_features_to_select) + + if self.tol is not None and self.tol < 0 and self.direction == "forward": + raise ValueError( + "tol must be strictly positive when doing forward selection" + ) + + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + + cloned_estimator = clone(self.estimator) + + # the current mask corresponds to the set of features: + # - that we have already *selected* if we do forward selection + # - that we have already *excluded* if we do backward selection + current_mask = np.zeros(shape=n_features, dtype=bool) + n_iterations = ( + self.n_features_to_select_ + if self.n_features_to_select == "auto" or self.direction == "forward" + else n_features - self.n_features_to_select_ + ) + + old_score = -np.inf + is_auto_select = self.tol is not None and self.n_features_to_select == "auto" + + # We only need to verify the routing here and not use the routed params + # because internally the actual routing will also take place inside the + # `cross_val_score` function. + if _routing_enabled(): + process_routing(self, "fit", **params) + for _ in range(n_iterations): + new_feature_idx, new_score = self._get_best_new_feature_score( + cloned_estimator, X, y, cv, current_mask, **params + ) + if is_auto_select and ((new_score - old_score) < self.tol): + break + + old_score = new_score + current_mask[new_feature_idx] = True + + if self.direction == "backward": + current_mask = ~current_mask + + self.support_ = current_mask + self.n_features_to_select_ = self.support_.sum() + + return self + + def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask, **params): + # Return the best new feature and its score to add to the current_mask, + # i.e. return the best new feature and its score to add (resp. remove) + # when doing forward selection (resp. backward selection). + # Feature will be added if the current score and past score are greater + # than tol when n_feature is auto, + candidate_feature_indices = np.flatnonzero(~current_mask) + scores = {} + for feature_idx in candidate_feature_indices: + candidate_mask = current_mask.copy() + candidate_mask[feature_idx] = True + if self.direction == "backward": + candidate_mask = ~candidate_mask + X_new = X[:, candidate_mask] + scores[feature_idx] = cross_val_score( + estimator, + X_new, + y, + cv=cv, + scoring=self.scoring, + n_jobs=self.n_jobs, + params=params, + ).mean() + new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx]) + return new_feature_idx, scores[new_feature_idx] + + def _get_support_mask(self): + check_is_fitted(self) + return self.support_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = get_tags(self.estimator).input_tags.allow_nan + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. 
versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + router.add( + splitter=check_cv(self.cv, classifier=is_classifier(self.estimator)), + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + router.add( + scorer=check_scoring(self.estimator, scoring=self.scoring), + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_univariate_selection.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_univariate_selection.py new file mode 100644 index 0000000000000000000000000000000000000000..7671a7ad7921d618cfdb98ba6baa60f24e3a9316 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_univariate_selection.py @@ -0,0 +1,1171 @@ +"""Univariate features selection.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import special, stats +from scipy.sparse import issparse + +from ..base import BaseEstimator, _fit_context +from ..preprocessing import LabelBinarizer +from ..utils import as_float_array, check_array, check_X_y, safe_mask, safe_sqr +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils.validation import check_is_fitted, validate_data +from ._base import SelectorMixin + + +def _clean_nans(scores): + """ + Fixes Issue #1240: NaNs can't be properly compared, so change them to the + smallest value of scores's dtype. -inf seems to be unreliable. + """ + # XXX where should this function be called? fit? scoring functions + # themselves? + scores = as_float_array(scores, copy=True) + scores[np.isnan(scores)] = np.finfo(scores.dtype).min + return scores + + +###################################################################### +# Scoring functions + + +# The following function is a rewriting of scipy.stats.f_oneway +# Contrary to the scipy.stats.f_oneway implementation it does not +# copy the data while keeping the inputs unchanged. +def f_oneway(*args): + """Perform a 1-way ANOVA. + + The one-way ANOVA tests the null hypothesis that 2 or more groups have + the same population mean. The test is applied to samples from two or + more groups, possibly with differing sizes. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *args : {array-like, sparse matrix} + Sample1, sample2... The sample measurements should be given as + arguments. + + Returns + ------- + f_statistic : float + The computed F-value of the test. + p_value : float + The associated p-value from the F-distribution. + + Notes + ----- + The ANOVA test has important assumptions that must be satisfied in order + for the associated p-value to be valid. + + 1. The samples are independent + 2. Each sample is from a normally distributed population + 3. The population standard deviations of the groups are all equal. This + property is known as homoscedasticity. + + If these assumptions are not true for a given set of data, it may still be + possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although + with some loss of power. 
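[Editorial aside] A brief illustration of the ANOVA statistic computed by this helper, via the public :func:`f_classif` wrapper defined below (synthetic data; values are illustrative only):

    >>> import numpy as np
    >>> from sklearn.feature_selection import f_classif
    >>> rng = np.random.RandomState(0)
    >>> X = rng.normal(size=(60, 3))
    >>> y = np.repeat([0, 1, 2], 20)
    >>> X[:, 0] += 2.0 * y               # only the first feature varies with the class
    >>> F, pval = f_classif(X, y)
    >>> int(F.argmax())                  # the class-dependent feature gets the largest F
    0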
+ + The algorithm is from Heiman[2], pp.394-7. + + See ``scipy.stats.f_oneway`` that should give the same results while + being less efficient. + + References + ---------- + .. [1] Lowry, Richard. "Concepts and Applications of Inferential + Statistics". Chapter 14. + http://vassarstats.net/textbook + + .. [2] Heiman, G.W. Research Methods in Statistics. 2002. + """ + n_classes = len(args) + args = [as_float_array(a) for a in args] + n_samples_per_class = np.array([a.shape[0] for a in args]) + n_samples = np.sum(n_samples_per_class) + ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args) + sums_args = [np.asarray(a.sum(axis=0)) for a in args] + square_of_sums_alldata = sum(sums_args) ** 2 + square_of_sums_args = [s**2 for s in sums_args] + sstot = ss_alldata - square_of_sums_alldata / float(n_samples) + ssbn = 0.0 + for k, _ in enumerate(args): + ssbn += square_of_sums_args[k] / n_samples_per_class[k] + ssbn -= square_of_sums_alldata / float(n_samples) + sswn = sstot - ssbn + dfbn = n_classes - 1 + dfwn = n_samples - n_classes + msb = ssbn / float(dfbn) + msw = sswn / float(dfwn) + constant_features_idx = np.where(msw == 0.0)[0] + if np.nonzero(msb)[0].size != msb.size and constant_features_idx.size: + warnings.warn("Features %s are constant." % constant_features_idx, UserWarning) + f = msb / msw + # flatten matrix to vector in sparse case + f = np.asarray(f).ravel() + prob = special.fdtrc(dfbn, dfwn, f) + return f, prob + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def f_classif(X, y): + """Compute the ANOVA F-value for the provided sample. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The set of regressors that will be tested sequentially. + + y : array-like of shape (n_samples,) + The target vector. + + Returns + ------- + f_statistic : ndarray of shape (n_features,) + F-statistic for each feature. + + p_values : ndarray of shape (n_features,) + P-values associated with the F-statistic. + + See Also + -------- + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.feature_selection import f_classif + >>> X, y = make_classification( + ... n_samples=100, n_features=10, n_informative=2, n_clusters_per_class=1, + ... shuffle=False, random_state=42 + ... ) + >>> f_statistic, p_values = f_classif(X, y) + >>> f_statistic + array([2.21e+02, 7.02e-01, 1.70e+00, 9.31e-01, + 5.41e+00, 3.25e-01, 4.71e-02, 5.72e-01, + 7.54e-01, 8.90e-02]) + >>> p_values + array([7.14e-27, 4.04e-01, 1.96e-01, 3.37e-01, + 2.21e-02, 5.70e-01, 8.29e-01, 4.51e-01, + 3.87e-01, 7.66e-01]) + """ + X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"]) + args = [X[safe_mask(X, y == k)] for k in np.unique(y)] + return f_oneway(*args) + + +def _chisquare(f_obs, f_exp): + """Fast replacement for scipy.stats.chisquare. + + Version from https://github.com/scipy/scipy/pull/2525 with additional + optimizations. 
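[Editorial aside] The :func:`chi2` scorer defined next expects non-negative, count-like features; continuous inputs can be discretised first, for example with :class:`~sklearn.preprocessing.KBinsDiscretizer` (a sketch; the binning settings are arbitrary):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> from sklearn.feature_selection import chi2
    >>> X, y = load_iris(return_X_y=True)
    >>> binner = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
    >>> X_binned = binner.fit_transform(X)   # ordinal codes 0..4, valid chi2 input
    >>> chi2_stats, p_values = chi2(X_binned, y)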
+ """ + f_obs = np.asarray(f_obs, dtype=np.float64) + + k = len(f_obs) + # Reuse f_obs for chi-squared statistics + chisq = f_obs + chisq -= f_exp + chisq **= 2 + with np.errstate(invalid="ignore"): + chisq /= f_exp + chisq = chisq.sum(axis=0) + return chisq, special.chdtrc(k - 1, chisq) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def chi2(X, y): + """Compute chi-squared stats between each non-negative feature and class. + + This score can be used to select the `n_features` features with the + highest values for the test chi-squared statistic from X, which must + contain only **non-negative integer feature values** such as booleans or frequencies + (e.g., term counts in document classification), relative to the classes. + + If some of your features are continuous, you need to bin them, for + example by using :class:`~sklearn.preprocessing.KBinsDiscretizer`. + + Recall that the chi-square test measures dependence between stochastic + variables, so using this function "weeds out" the features that are the + most likely to be independent of class and therefore irrelevant for + classification. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample vectors. + + y : array-like of shape (n_samples,) + Target vector (class labels). + + Returns + ------- + chi2 : ndarray of shape (n_features,) + Chi2 statistics for each feature. + + p_values : ndarray of shape (n_features,) + P-values for each feature. + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + f_regression : F-value between label/feature for regression tasks. + + Notes + ----- + Complexity of this algorithm is O(n_classes * n_features). + + Examples + -------- + >>> import numpy as np + >>> from sklearn.feature_selection import chi2 + >>> X = np.array([[1, 1, 3], + ... [0, 1, 5], + ... [5, 4, 1], + ... [6, 6, 2], + ... [1, 4, 0], + ... [0, 0, 0]]) + >>> y = np.array([1, 1, 0, 0, 2, 2]) + >>> chi2_stats, p_values = chi2(X, y) + >>> chi2_stats + array([15.3, 6.5 , 8.9]) + >>> p_values + array([0.000456, 0.0387, 0.0116 ]) + """ + + # XXX: we might want to do some of the following in logspace instead for + # numerical stability. + # Converting X to float allows getting better performance for the + # safe_sparse_dot call made below. + X = check_array(X, accept_sparse="csr", dtype=(np.float64, np.float32)) + if np.any((X.data if issparse(X) else X) < 0): + raise ValueError("Input X must be non-negative.") + + # Use a sparse representation for Y by default to reduce memory usage when + # y has many unique classes. + Y = LabelBinarizer(sparse_output=True).fit_transform(y) + if Y.shape[1] == 1: + Y = Y.toarray() + Y = np.append(1 - Y, Y, axis=1) + + observed = safe_sparse_dot(Y.T, X) # n_classes * n_features + + if issparse(observed): + # convert back to a dense array before calling _chisquare + # XXX: could _chisquare be reimplement to accept sparse matrices for + # cases where both n_classes and n_features are large (and X is + # sparse)? 
+ observed = observed.toarray() + + feature_count = X.sum(axis=0).reshape(1, -1) + class_prob = Y.mean(axis=0).reshape(1, -1) + expected = np.dot(class_prob.T, feature_count) + + return _chisquare(observed, expected) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "center": ["boolean"], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def r_regression(X, y, *, center=True, force_finite=True): + """Compute Pearson's r for each features and the target. + + Pearson's r is also known as the Pearson correlation coefficient. + + Linear model for testing the individual effect of each of many regressors. + This is a scoring function to be used in a feature selection procedure, not + a free standing feature selection procedure. + + The cross correlation between each regressor and the target is computed + as:: + + E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y)) + + For more on usage see the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix. + + y : array-like of shape (n_samples,) + The target vector. + + center : bool, default=True + Whether or not to center the data matrix `X` and the target vector `y`. + By default, `X` and `y` will be centered. + + force_finite : bool, default=True + Whether or not to force the Pearson's R correlation to be finite. + In the particular case where some features in `X` or the target `y` + are constant, the Pearson's R correlation is not defined. When + `force_finite=False`, a correlation of `np.nan` is returned to + acknowledge this case. When `force_finite=True`, this value will be + forced to a minimal correlation of `0.0`. + + .. versionadded:: 1.1 + + Returns + ------- + correlation_coefficient : ndarray of shape (n_features,) + Pearson's R correlation coefficients of features. + + See Also + -------- + f_regression: Univariate linear regression tests returning f-statistic + and p-values. + mutual_info_regression: Mutual information for a continuous target. + f_classif: ANOVA F-value between label/feature for classification tasks. + chi2: Chi-squared stats of non-negative features for classification tasks. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.feature_selection import r_regression + >>> X, y = make_regression( + ... n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42 + ... ) + >>> r_regression(X, y) + array([-0.157, 1. , -0.229]) + """ + X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"], dtype=np.float64) + n_samples = X.shape[0] + + # Compute centered values + # Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we + # need not center X + if center: + y = y - np.mean(y) + # TODO: for Scipy <= 1.10, `isspmatrix(X)` returns `True` for sparse arrays. + # Here, we check the output of the `.mean` operation that returns a `np.matrix` + # for sparse matrices while a `np.array` for dense and sparse arrays. 
+ # We can reconsider using `isspmatrix` when the minimum version is + # SciPy >= 1.11 + X_means = X.mean(axis=0) + X_means = X_means.getA1() if isinstance(X_means, np.matrix) else X_means + # Compute the scaled standard deviations via moments + X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2) + else: + X_norms = row_norms(X.T) + + correlation_coefficient = safe_sparse_dot(y, X) + with np.errstate(divide="ignore", invalid="ignore"): + correlation_coefficient /= X_norms + correlation_coefficient /= np.linalg.norm(y) + + if force_finite and not np.isfinite(correlation_coefficient).all(): + # case where the target or some features are constant + # the correlation coefficient(s) is/are set to the minimum (i.e. 0.0) + nan_mask = np.isnan(correlation_coefficient) + correlation_coefficient[nan_mask] = 0.0 + return correlation_coefficient + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "center": ["boolean"], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def f_regression(X, y, *, center=True, force_finite=True): + """Univariate linear regression tests returning F-statistic and p-values. + + Quick linear model for testing the effect of a single regressor, + sequentially for many regressors. + + This is done in 2 steps: + + 1. The cross correlation between each regressor and the target is computed + using :func:`r_regression` as:: + + E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y)) + + 2. It is converted to an F score and then to a p-value. + + :func:`f_regression` is derived from :func:`r_regression` and will rank + features in the same order if all the features are positively correlated + with the target. + + Note however that contrary to :func:`f_regression`, :func:`r_regression` + values lie in [-1, 1] and can thus be negative. :func:`f_regression` is + therefore recommended as a feature selection criterion to identify + potentially predictive feature for a downstream classifier, irrespective of + the sign of the association with the target variable. + + Furthermore :func:`f_regression` returns p-values while + :func:`r_regression` does not. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix. + + y : array-like of shape (n_samples,) + The target vector. + + center : bool, default=True + Whether or not to center the data matrix `X` and the target vector `y`. + By default, `X` and `y` will be centered. + + force_finite : bool, default=True + Whether or not to force the F-statistics and associated p-values to + be finite. There are two cases where the F-statistic is expected to not + be finite: + + - when the target `y` or some features in `X` are constant. In this + case, the Pearson's R correlation is not defined leading to obtain + `np.nan` values in the F-statistic and p-value. When + `force_finite=True`, the F-statistic is set to `0.0` and the + associated p-value is set to `1.0`. + - when a feature in `X` is perfectly correlated (or + anti-correlated) with the target `y`. In this case, the F-statistic + is expected to be `np.inf`. When `force_finite=True`, the F-statistic + is set to `np.finfo(dtype).max` and the associated p-value is set to + `0.0`. + + .. versionadded:: 1.1 + + Returns + ------- + f_statistic : ndarray of shape (n_features,) + F-statistic for each feature. + + p_values : ndarray of shape (n_features,) + P-values associated with the F-statistic. 
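[Editorial aside] The relation between the two functions spelled out above can be checked directly; a sketch on moderately noisy synthetic data, chosen so that no correlation is degenerate:

    >>> import numpy as np
    >>> from sklearn.datasets import make_regression
    >>> from sklearn.feature_selection import f_regression, r_regression
    >>> X, y = make_regression(n_samples=100, n_features=3, n_informative=3,
    ...                        noise=10.0, random_state=0)
    >>> r = r_regression(X, y)
    >>> F, p = f_regression(X, y)
    >>> deg_of_freedom = y.size - 2          # centered case
    >>> bool(np.allclose(F, r**2 / (1 - r**2) * deg_of_freedom))
    True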
+ + See Also + -------- + r_regression: Pearson's R between label/feature for regression tasks. + f_classif: ANOVA F-value between label/feature for classification tasks. + chi2: Chi-squared stats of non-negative features for classification tasks. + SelectKBest: Select features based on the k highest scores. + SelectFpr: Select features based on a false positive rate test. + SelectFdr: Select features based on an estimated false discovery rate. + SelectFwe: Select features based on family-wise error rate. + SelectPercentile: Select features based on percentile of the highest + scores. + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.feature_selection import f_regression + >>> X, y = make_regression( + ... n_samples=50, n_features=3, n_informative=1, noise=1e-4, random_state=42 + ... ) + >>> f_statistic, p_values = f_regression(X, y) + >>> f_statistic + array([1.21, 2.67e13, 2.66]) + >>> p_values + array([0.276, 1.54e-283, 0.11]) + """ + correlation_coefficient = r_regression( + X, y, center=center, force_finite=force_finite + ) + deg_of_freedom = y.size - (2 if center else 1) + + corr_coef_squared = correlation_coefficient**2 + + with np.errstate(divide="ignore", invalid="ignore"): + f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom + p_values = stats.f.sf(f_statistic, 1, deg_of_freedom) + + if force_finite and not np.isfinite(f_statistic).all(): + # case where there is a perfect (anti-)correlation + # f-statistics can be set to the maximum and p-values to zero + mask_inf = np.isinf(f_statistic) + f_statistic[mask_inf] = np.finfo(f_statistic.dtype).max + # case where the target or some features are constant + # f-statistics would be minimum and thus p-values large + mask_nan = np.isnan(f_statistic) + f_statistic[mask_nan] = 0.0 + p_values[mask_nan] = 1.0 + return f_statistic, p_values + + +###################################################################### +# Base classes + + +class _BaseFilter(SelectorMixin, BaseEstimator): + """Initialize the univariate feature selection. + + Parameters + ---------- + score_func : callable + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues) or a single array with scores. + """ + + _parameter_constraints: dict = {"score_func": [callable]} + + def __init__(self, score_func): + self.score_func = score_func + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Run score function on (X, y) and get the appropriate features. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. + + y : array-like of shape (n_samples,) or None + The target values (class labels in classification, real numbers in + regression). If the selector is unsupervised then `y` can be set to `None`. + + Returns + ------- + self : object + Returns the instance itself. 
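[Editorial aside] A note on the two accepted ``score_func`` signatures handled above: when the callable returns a single array of scores, ``pvalues_`` is left as ``None``. For instance with mutual information and the :class:`SelectKBest` filter defined below (sketch):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.feature_selection import SelectKBest, mutual_info_classif
    >>> X, y = load_iris(return_X_y=True)
    >>> selector = SelectKBest(mutual_info_classif, k=2).fit(X, y)
    >>> selector.pvalues_ is None            # mutual information yields scores only
    True
    >>> selector.scores_.shape
    (4,)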
+ """ + if y is None: + X = validate_data(self, X, accept_sparse=["csr", "csc"]) + else: + X, y = validate_data( + self, X, y, accept_sparse=["csr", "csc"], multi_output=True + ) + + self._check_params(X, y) + score_func_ret = self.score_func(X, y) + if isinstance(score_func_ret, (list, tuple)): + self.scores_, self.pvalues_ = score_func_ret + self.pvalues_ = np.asarray(self.pvalues_) + else: + self.scores_ = score_func_ret + self.pvalues_ = None + + self.scores_ = np.asarray(self.scores_) + + return self + + def _check_params(self, X, y): + pass + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + tags.input_tags.sparse = True + return tags + + +###################################################################### +# Specific filters +###################################################################### +class SelectPercentile(_BaseFilter): + """Select features according to a percentile of the highest scores. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues) or a single array with scores. + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + .. versionadded:: 0.18 + + percentile : int, default=10 + Percent of features to keep. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores, None if `score_func` returned only scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + mutual_info_classif : Mutual information for a discrete target. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. + + Notes + ----- + Ties between features with equal scores will be broken in an unspecified + way. + + This filter supports unsupervised feature selection that only requests `X` for + computing the scores. 
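[Editorial aside] To make the percentile threshold concrete: with continuous, distinct scores the selector keeps ``int(n_features * percentile / 100)`` features (a sketch; data and settings are illustrative):

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.feature_selection import SelectPercentile, f_classif
    >>> X, y = make_classification(n_samples=200, n_features=20, n_informative=4,
    ...                            random_state=0)
    >>> selector = SelectPercentile(f_classif, percentile=25).fit(X, y)
    >>> int(selector.get_support().sum())    # 25% of 20 features
    5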
+ + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.feature_selection import SelectPercentile, chi2 + >>> X, y = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y) + >>> X_new.shape + (1797, 7) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "percentile": [Interval(Real, 0, 100, closed="both")], + } + + def __init__(self, score_func=f_classif, *, percentile=10): + super().__init__(score_func=score_func) + self.percentile = percentile + + def _get_support_mask(self): + check_is_fitted(self) + + # Cater for NaNs + if self.percentile == 100: + return np.ones(len(self.scores_), dtype=bool) + elif self.percentile == 0: + return np.zeros(len(self.scores_), dtype=bool) + + scores = _clean_nans(self.scores_) + threshold = np.percentile(scores, 100 - self.percentile) + mask = scores > threshold + ties = np.where(scores == threshold)[0] + if len(ties): + max_feats = int(len(scores) * self.percentile / 100) + kept_ties = ties[: max_feats - mask.sum()] + mask[kept_ties] = True + return mask + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = False + return tags + + +class SelectKBest(_BaseFilter): + """Select features according to the k highest scores. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues) or a single array with scores. + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + .. versionadded:: 0.18 + + k : int or "all", default=10 + Number of top features to select. + The "all" option bypasses selection, for use in a parameter search. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores, None if `score_func` returned only scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif: ANOVA F-value between label/feature for classification tasks. + mutual_info_classif: Mutual information for a discrete target. + chi2: Chi-squared stats of non-negative features for classification tasks. + f_regression: F-value between label/feature for regression tasks. + mutual_info_regression: Mutual information for a continuous target. + SelectPercentile: Select features based on percentile of the highest + scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. + + Notes + ----- + Ties between features with equal scores will be broken in an unspecified + way. + + This filter supports unsupervised feature selection that only requests `X` for + computing the scores. 
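[Editorial aside] A sketch of the ``k="all"`` bypass mentioned above, used so that a parameter search can also try "no selection" (the pipeline and grid are illustrative only):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.feature_selection import SelectKBest, f_classif
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.model_selection import GridSearchCV
    >>> from sklearn.pipeline import Pipeline
    >>> X, y = load_iris(return_X_y=True)
    >>> pipe = Pipeline([("select", SelectKBest(f_classif)),
    ...                  ("clf", LogisticRegression(max_iter=1000))])
    >>> search = GridSearchCV(pipe, {"select__k": [1, 2, 3, "all"]}, cv=3).fit(X, y)
    >>> search.best_params_["select__k"]     # may legitimately be "all"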
+ + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.feature_selection import SelectKBest, chi2 + >>> X, y = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y) + >>> X_new.shape + (1797, 20) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "k": [StrOptions({"all"}), Interval(Integral, 0, None, closed="left")], + } + + def __init__(self, score_func=f_classif, *, k=10): + super().__init__(score_func=score_func) + self.k = k + + def _check_params(self, X, y): + if not isinstance(self.k, str) and self.k > X.shape[1]: + warnings.warn( + f"k={self.k} is greater than n_features={X.shape[1]}. " + "All the features will be returned." + ) + + def _get_support_mask(self): + check_is_fitted(self) + + if self.k == "all": + return np.ones(self.scores_.shape, dtype=bool) + elif self.k == 0: + return np.zeros(self.scores_.shape, dtype=bool) + else: + scores = _clean_nans(self.scores_) + mask = np.zeros(scores.shape, dtype=bool) + + # Request a stable sort. Mergesort takes more memory (~40MB per + # megafeature on x86-64). + mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1 + return mask + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = False + return tags + + +class SelectFpr(_BaseFilter): + """Filter: Select the pvalues below alpha based on a FPR test. + + FPR test stands for False Positive Rate test. It controls the total + amount of false detections. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + alpha : float, default=5e-2 + Features with p-values less than `alpha` are selected. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + chi2 : Chi-squared stats of non-negative features for classification tasks. + mutual_info_classif: Mutual information for a discrete target. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. 
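[Editorial aside] To illustrate what controlling the false positive rate means here: on purely uninformative features, roughly an ``alpha`` fraction passes the test (a sketch on random data; the exact fraction varies from run to run):

    >>> import numpy as np
    >>> from sklearn.feature_selection import SelectFpr, f_classif
    >>> rng = np.random.RandomState(0)
    >>> X = rng.normal(size=(100, 1000))     # 1000 pure-noise features
    >>> y = rng.randint(0, 2, size=100)
    >>> selected = SelectFpr(f_classif, alpha=0.05).fit(X, y).get_support()
    >>> selected.mean()                      # close to alpha = 0.05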
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import SelectFpr, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y) + >>> X_new.shape + (569, 16) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "alpha": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) + self.alpha = alpha + + def _get_support_mask(self): + check_is_fitted(self) + + return self.pvalues_ < self.alpha + + +class SelectFdr(_BaseFilter): + """Filter: Select the p-values for an estimated false discovery rate. + + This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound + on the expected false discovery rate. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + alpha : float, default=5e-2 + The highest uncorrected p-value for features to keep. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + mutual_info_classif : Mutual information for a discrete target. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFwe : Select features based on family-wise error rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. 
+ + References + ---------- + https://en.wikipedia.org/wiki/False_discovery_rate + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import SelectFdr, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y) + >>> X_new.shape + (569, 16) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "alpha": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) + self.alpha = alpha + + def _get_support_mask(self): + check_is_fitted(self) + + n_features = len(self.pvalues_) + sv = np.sort(self.pvalues_) + selected = sv[ + sv <= float(self.alpha) / n_features * np.arange(1, n_features + 1) + ] + if selected.size == 0: + return np.zeros_like(self.pvalues_, dtype=bool) + return self.pvalues_ <= selected.max() + + +class SelectFwe(_BaseFilter): + """Filter: Select the p-values corresponding to Family-wise error rate. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). + Default is f_classif (see below "See Also"). The default function only + works with classification tasks. + + alpha : float, default=5e-2 + The highest uncorrected p-value for features to keep. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + GenericUnivariateSelect : Univariate feature selector with configurable + mode. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import SelectFwe, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> X_new = SelectFwe(chi2, alpha=0.01).fit_transform(X, y) + >>> X_new.shape + (569, 15) + """ + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "alpha": [Interval(Real, 0, 1, closed="both")], + } + + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) + self.alpha = alpha + + def _get_support_mask(self): + check_is_fitted(self) + + return self.pvalues_ < self.alpha / len(self.pvalues_) + + +###################################################################### +# Generic filter +###################################################################### + + +# TODO this class should fit on either p-values or scores, +# depending on the mode. 
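# Illustrative sketch (editorial addition, not part of the upstream scikit-learn
# sources): contrast the three p-value cutoffs implemented above on a toy array
# of p-values. SelectFpr keeps p < alpha, SelectFwe applies the Bonferroni-style
# cut p < alpha / n_features, and SelectFdr applies the Benjamini-Hochberg
# step-up rule used in SelectFdr._get_support_mask. Reuses this module's
# ``import numpy as np``.
def _pvalue_cutoffs_demo(alpha=0.05):
    pvalues = np.array([0.001, 0.015, 0.025, 0.045, 0.70])
    n = len(pvalues)

    fpr_mask = pvalues < alpha        # [ True,  True,  True,  True, False]
    fwe_mask = pvalues < alpha / n    # [ True, False, False, False, False]

    # Benjamini-Hochberg: the largest sorted p-value lying under the step-up
    # line alpha * rank / n becomes the effective threshold (0.025 here).
    sv = np.sort(pvalues)
    below = sv[sv <= alpha / n * np.arange(1, n + 1)]
    fdr_mask = pvalues <= below.max()  # [ True,  True,  True, False, False]
    return fpr_mask, fdr_mask, fwe_mask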
+class GenericUnivariateSelect(_BaseFilter): + """Univariate feature selector with configurable strategy. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable, default=f_classif + Function taking two arrays X and y, and returning a pair of arrays + (scores, pvalues). For modes 'percentile' or 'kbest' it can return + a single array scores. + + mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}, default='percentile' + Feature selection mode. Note that the `'percentile'` and `'kbest'` + modes are supporting unsupervised feature selection (when `y` is `None`). + + param : "all", float or int, default=1e-5 + Parameter of the corresponding mode. + + Attributes + ---------- + scores_ : array-like of shape (n_features,) + Scores of features. + + pvalues_ : array-like of shape (n_features,) + p-values of feature scores, None if `score_func` returned scores only. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + f_classif : ANOVA F-value between label/feature for classification tasks. + mutual_info_classif : Mutual information for a discrete target. + chi2 : Chi-squared stats of non-negative features for classification tasks. + f_regression : F-value between label/feature for regression tasks. + mutual_info_regression : Mutual information for a continuous target. + SelectPercentile : Select features based on percentile of the highest + scores. + SelectKBest : Select features based on the k highest scores. + SelectFpr : Select features based on a false positive rate test. + SelectFdr : Select features based on an estimated false discovery rate. + SelectFwe : Select features based on family-wise error rate. 
+ + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.feature_selection import GenericUnivariateSelect, chi2 + >>> X, y = load_breast_cancer(return_X_y=True) + >>> X.shape + (569, 30) + >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20) + >>> X_new = transformer.fit_transform(X, y) + >>> X_new.shape + (569, 20) + """ + + _selection_modes: dict = { + "percentile": SelectPercentile, + "k_best": SelectKBest, + "fpr": SelectFpr, + "fdr": SelectFdr, + "fwe": SelectFwe, + } + + _parameter_constraints: dict = { + **_BaseFilter._parameter_constraints, + "mode": [StrOptions(set(_selection_modes.keys()))], + "param": [Interval(Real, 0, None, closed="left"), StrOptions({"all"})], + } + + def __init__(self, score_func=f_classif, *, mode="percentile", param=1e-5): + super().__init__(score_func=score_func) + self.mode = mode + self.param = param + + def _make_selector(self): + selector = self._selection_modes[self.mode](score_func=self.score_func) + + # Now perform some acrobatics to set the right named parameter in + # the selector + possible_params = selector._get_param_names() + possible_params.remove("score_func") + selector.set_params(**{possible_params[0]: self.param}) + + return selector + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + def _check_params(self, X, y): + self._make_selector()._check_params(X, y) + + def _get_support_mask(self): + check_is_fitted(self) + + selector = self._make_selector() + selector.pvalues_ = self.pvalues_ + selector.scores_ = self.scores_ + return selector._get_support_mask() diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_variance_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_variance_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..f26d70ecf8f82ab317103ba73b52f85b3af5e524 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/_variance_threshold.py @@ -0,0 +1,141 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +import numpy as np + +from ..base import BaseEstimator, _fit_context +from ..utils._param_validation import Interval +from ..utils.sparsefuncs import mean_variance_axis, min_max_axis +from ..utils.validation import check_is_fitted, validate_data +from ._base import SelectorMixin + + +class VarianceThreshold(SelectorMixin, BaseEstimator): + """Feature selector that removes all low-variance features. + + This feature selection algorithm looks only at the features (X), not the + desired outputs (y), and can thus be used for unsupervised learning. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float, default=0 + Features with a training-set variance lower than this threshold will + be removed. The default is to keep all features with non-zero variance, + i.e. remove the features that have the same value in all samples. + + Attributes + ---------- + variances_ : array, shape (n_features,) + Variances of individual features. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + SelectFromModel: Meta-transformer for selecting features based on + importance weights. + SelectPercentile : Select features according to a percentile of the highest + scores. + SequentialFeatureSelector : Transformer that performs Sequential Feature + Selection. + + Notes + ----- + Allows NaN in the input. + Raises ValueError if no feature in X meets the variance threshold. + + Examples + -------- + The following dataset has integer features, two of which are the same + in every sample. These are removed with the default setting for threshold:: + + >>> from sklearn.feature_selection import VarianceThreshold + >>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]] + >>> selector = VarianceThreshold() + >>> selector.fit_transform(X) + array([[2, 0], + [1, 4], + [1, 1]]) + """ + + _parameter_constraints: dict = { + "threshold": [Interval(Real, 0, None, closed="left")] + } + + def __init__(self, threshold=0.0): + self.threshold = threshold + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Learn empirical variances from X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data from which to compute variances, where `n_samples` is + the number of samples and `n_features` is the number of features. + + y : any, default=None + Ignored. This parameter exists only for compatibility with + sklearn.pipeline.Pipeline. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=np.float64, + ensure_all_finite="allow-nan", + ) + + if hasattr(X, "toarray"): # sparse matrix + _, self.variances_ = mean_variance_axis(X, axis=0) + if self.threshold == 0: + mins, maxes = min_max_axis(X, axis=0) + peak_to_peaks = maxes - mins + else: + self.variances_ = np.nanvar(X, axis=0) + if self.threshold == 0: + peak_to_peaks = np.ptp(X, axis=0) + + if self.threshold == 0: + # Use peak-to-peak to avoid numeric precision issues + # for constant features + compare_arr = np.array([self.variances_, peak_to_peaks]) + self.variances_ = np.nanmin(compare_arr, axis=0) + + if np.all(~np.isfinite(self.variances_) | (self.variances_ <= self.threshold)): + msg = "No feature in X meets the variance threshold {0:.5f}" + if X.shape[0] == 1: + msg += " (X contains only one sample)" + raise ValueError(msg.format(self.threshold)) + + return self + + def _get_support_mask(self): + check_is_fitted(self) + + return self.variances_ > self.threshold + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..0bf51a80f01baa1a4340335ce397aeb0ca3e4b5f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_base.py @@ -0,0 +1,154 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.base import BaseEstimator +from sklearn.feature_selection._base import 
SelectorMixin +from sklearn.utils.fixes import CSC_CONTAINERS +from sklearn.utils.validation import validate_data + + +class StepSelector(SelectorMixin, BaseEstimator): + """Retain every `step` features (beginning with 0). + + If `step < 1`, then no features are selected. + """ + + def __init__(self, step=2): + self.step = step + + def fit(self, X, y=None): + X = validate_data(self, X, accept_sparse="csc") + return self + + def _get_support_mask(self): + mask = np.zeros(self.n_features_in_, dtype=bool) + if self.step >= 1: + mask[:: self.step] = True + return mask + + +support = [True, False] * 5 +support_inds = [0, 2, 4, 6, 8] +X = np.arange(20).reshape(2, 10) +Xt = np.arange(0, 20, 2).reshape(2, 5) +Xinv = X.copy() +Xinv[:, 1::2] = 0 +y = [0, 1] +feature_names = list("ABCDEFGHIJ") +feature_names_t = feature_names[::2] +feature_names_inv = np.array(feature_names) +feature_names_inv[1::2] = "" + + +def test_transform_dense(): + sel = StepSelector() + Xt_actual = sel.fit(X, y).transform(X) + Xt_actual2 = StepSelector().fit_transform(X, y) + assert_array_equal(Xt, Xt_actual) + assert_array_equal(Xt, Xt_actual2) + + # Check dtype matches + assert np.int32 == sel.transform(X.astype(np.int32)).dtype + assert np.float32 == sel.transform(X.astype(np.float32)).dtype + + # Check 1d list and other dtype: + names_t_actual = sel.transform([feature_names]) + assert_array_equal(feature_names_t, names_t_actual.ravel()) + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.transform(np.array([[1], [2]])) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_transform_sparse(csc_container): + X_sp = csc_container(X) + sel = StepSelector() + Xt_actual = sel.fit(X_sp).transform(X_sp) + Xt_actual2 = sel.fit_transform(X_sp) + assert_array_equal(Xt, Xt_actual.toarray()) + assert_array_equal(Xt, Xt_actual2.toarray()) + + # Check dtype matches + assert np.int32 == sel.transform(X_sp.astype(np.int32)).dtype + assert np.float32 == sel.transform(X_sp.astype(np.float32)).dtype + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.transform(np.array([[1], [2]])) + + +def test_inverse_transform_dense(): + sel = StepSelector() + Xinv_actual = sel.fit(X, y).inverse_transform(Xt) + assert_array_equal(Xinv, Xinv_actual) + + # Check dtype matches + assert np.int32 == sel.inverse_transform(Xt.astype(np.int32)).dtype + assert np.float32 == sel.inverse_transform(Xt.astype(np.float32)).dtype + + # Check 1d list and other dtype: + names_inv_actual = sel.inverse_transform([feature_names_t]) + assert_array_equal(feature_names_inv, names_inv_actual.ravel()) + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.inverse_transform(np.array([[1], [2]])) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_inverse_transform_sparse(csc_container): + X_sp = csc_container(X) + Xt_sp = csc_container(Xt) + sel = StepSelector() + Xinv_actual = sel.fit(X_sp).inverse_transform(Xt_sp) + assert_array_equal(Xinv, Xinv_actual.toarray()) + + # Check dtype matches + assert np.int32 == sel.inverse_transform(Xt_sp.astype(np.int32)).dtype + assert np.float32 == sel.inverse_transform(Xt_sp.astype(np.float32)).dtype + + # Check wrong shape raises error + with pytest.raises(ValueError): + sel.inverse_transform(np.array([[1], [2]])) + + +def test_get_support(): + sel = StepSelector() + sel.fit(X, y) + assert_array_equal(support, sel.get_support()) + assert_array_equal(support_inds, sel.get_support(indices=True)) + + +def test_output_dataframe(): + 
"""Check output dtypes for dataframes is consistent with the input dtypes.""" + pd = pytest.importorskip("pandas") + + X = pd.DataFrame( + { + "a": pd.Series([1.0, 2.4, 4.5], dtype=np.float32), + "b": pd.Series(["a", "b", "a"], dtype="category"), + "c": pd.Series(["j", "b", "b"], dtype="category"), + "d": pd.Series([3.0, 2.4, 1.2], dtype=np.float64), + } + ) + + for step in [2, 3]: + sel = StepSelector(step=step).set_output(transform="pandas") + sel.fit(X) + + output = sel.transform(X) + for name, dtype in output.dtypes.items(): + assert dtype == X.dtypes[name] + + # step=0 will select nothing + sel0 = StepSelector(step=0).set_output(transform="pandas") + sel0.fit(X, y) + + msg = "No features were selected" + with pytest.warns(UserWarning, match=msg): + output0 = sel0.transform(X) + + assert_array_equal(output0.index, X.index) + assert output0.shape == (X.shape[0], 0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_chi2.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_chi2.py new file mode 100644 index 0000000000000000000000000000000000000000..c50def36f1b6c281e6c96019355b901bf4326a38 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_chi2.py @@ -0,0 +1,93 @@ +""" +Tests for chi2, currently the only feature selection function designed +specifically to work with sparse matrices. +""" + +import warnings + +import numpy as np +import pytest +import scipy.stats + +from sklearn.feature_selection import SelectKBest, chi2 +from sklearn.feature_selection._univariate_selection import _chisquare +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS + +# Feature 0 is highly informative for class 1; +# feature 1 is the same everywhere; +# feature 2 is a bit informative for class 2. +X = [[2, 1, 2], [9, 1, 1], [6, 1, 2], [0, 1, 2]] +y = [0, 1, 2, 2] + + +def mkchi2(k): + """Make k-best chi2 selector""" + return SelectKBest(chi2, k=k) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_chi2(csr_container): + # Test Chi2 feature extraction + + chi2 = mkchi2(k=1).fit(X, y) + chi2 = mkchi2(k=1).fit(X, y) + assert_array_equal(chi2.get_support(indices=True), [0]) + assert_array_equal(chi2.transform(X), np.array(X)[:, [0]]) + + chi2 = mkchi2(k=2).fit(X, y) + assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) + + Xsp = csr_container(X, dtype=np.float64) + chi2 = mkchi2(k=2).fit(Xsp, y) + assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2]) + Xtrans = chi2.transform(Xsp) + assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2]) + + # == doesn't work on scipy.sparse matrices + Xtrans = Xtrans.toarray() + Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray() + assert_array_almost_equal(Xtrans, Xtrans2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_chi2_coo(coo_container): + # Check that chi2 works with a COO matrix + # (as returned by CountVectorizer, DictVectorizer) + Xcoo = coo_container(X) + mkchi2(k=2).fit_transform(Xcoo, y) + # if we got here without an exception, we're safe + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_chi2_negative(csr_container): + # Check for proper error on negative numbers in the input X. 
+ X, y = [[0, 1], [-1e-20, 1]], [0, 1] + for X in (X, np.array(X), csr_container(X)): + with pytest.raises(ValueError): + chi2(X, y) + + +def test_chi2_unused_feature(): + # Unused feature should evaluate to NaN + # and should issue no runtime warning + with warnings.catch_warnings(record=True) as warned: + warnings.simplefilter("always") + chi, p = chi2([[1, 0], [0, 0]], [1, 0]) + for w in warned: + if "divide by zero" in repr(w): + raise AssertionError("Found unexpected warning %s" % w) + assert_array_equal(chi, [1, np.nan]) + assert_array_equal(p[1], np.nan) + + +def test_chisquare(): + # Test replacement for scipy.stats.chisquare against the original. + obs = np.array([[2.0, 2.0], [1.0, 1.0]]) + exp = np.array([[1.5, 1.5], [1.5, 1.5]]) + # call SciPy first because our version overwrites obs + chi_scp, p_scp = scipy.stats.chisquare(obs, exp) + chi_our, p_our = _chisquare(obs, exp) + + assert_array_almost_equal(chi_scp, chi_our) + assert_array_almost_equal(p_scp, p_our) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_feature_select.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_feature_select.py new file mode 100644 index 0000000000000000000000000000000000000000..d7bffec5159bfc7ba8faf452a218d5147906419c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_feature_select.py @@ -0,0 +1,1018 @@ +""" +Todo: cross-check the F-value with stats model +""" + +import itertools +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import sparse, stats + +from sklearn.datasets import load_iris, make_classification, make_regression +from sklearn.feature_selection import ( + GenericUnivariateSelect, + SelectFdr, + SelectFpr, + SelectFwe, + SelectKBest, + SelectPercentile, + chi2, + f_classif, + f_oneway, + f_regression, + mutual_info_classif, + mutual_info_regression, + r_regression, +) +from sklearn.utils import safe_mask +from sklearn.utils._testing import ( + _convert_container, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +############################################################################## +# Test the score functions + + +def test_f_oneway_vs_scipy_stats(): + # Test that our f_oneway gives the same result as scipy.stats + rng = np.random.RandomState(0) + X1 = rng.randn(10, 3) + X2 = 1 + rng.randn(10, 3) + f, pv = stats.f_oneway(X1, X2) + f2, pv2 = f_oneway(X1, X2) + assert np.allclose(f, f2) + assert np.allclose(pv, pv2) + + +def test_f_oneway_ints(): + # Smoke test f_oneway on integers: that it does raise casting errors + # with recent numpys + rng = np.random.RandomState(0) + X = rng.randint(10, size=(10, 10)) + y = np.arange(10) + fint, pint = f_oneway(X, y) + + # test that is gives the same result as with float + f, p = f_oneway(X.astype(float), y) + assert_array_almost_equal(f, fint, decimal=4) + assert_array_almost_equal(p, pint, decimal=4) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_f_classif(csr_container): + # Test whether the F test yields meaningful results + # on a simple simulated classification problem + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + F, pv = f_classif(X, y) + F_sparse, pv_sparse = 
f_classif(csr_container(X), y) + assert (F > 0).all() + assert (pv > 0).all() + assert (pv < 1).all() + assert (pv[:5] < 0.05).all() + assert (pv[5:] > 1.0e-4).all() + assert_array_almost_equal(F_sparse, F) + assert_array_almost_equal(pv_sparse, pv) + + +@pytest.mark.parametrize("center", [True, False]) +def test_r_regression(center): + X, y = make_regression( + n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + corr_coeffs = r_regression(X, y, center=center) + assert (-1 < corr_coeffs).all() + assert (corr_coeffs < 1).all() + + sparse_X = _convert_container(X, "sparse") + + sparse_corr_coeffs = r_regression(sparse_X, y, center=center) + assert_allclose(sparse_corr_coeffs, corr_coeffs) + + # Testing against numpy for reference + Z = np.hstack((X, y[:, np.newaxis])) + correlation_matrix = np.corrcoef(Z, rowvar=False) + np_corr_coeffs = correlation_matrix[:-1, -1] + assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_f_regression(csr_container): + # Test whether the F test yields meaningful results + # on a simple simulated regression problem + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + F, pv = f_regression(X, y) + assert (F > 0).all() + assert (pv > 0).all() + assert (pv < 1).all() + assert (pv[:5] < 0.05).all() + assert (pv[5:] > 1.0e-4).all() + + # with centering, compare with sparse + F, pv = f_regression(X, y, center=True) + F_sparse, pv_sparse = f_regression(csr_container(X), y, center=True) + assert_allclose(F_sparse, F) + assert_allclose(pv_sparse, pv) + + # again without centering, compare with sparse + F, pv = f_regression(X, y, center=False) + F_sparse, pv_sparse = f_regression(csr_container(X), y, center=False) + assert_allclose(F_sparse, F) + assert_allclose(pv_sparse, pv) + + +def test_f_regression_input_dtype(): + # Test whether f_regression returns the same value + # for any numeric data_type + rng = np.random.RandomState(0) + X = rng.rand(10, 20) + y = np.arange(10).astype(int) + + F1, pv1 = f_regression(X, y) + F2, pv2 = f_regression(X, y.astype(float)) + assert_allclose(F1, F2, 5) + assert_allclose(pv1, pv2, 5) + + +def test_f_regression_center(): + # Test whether f_regression preserves dof according to 'center' argument + # We use two centered variates so we have a simple relationship between + # F-score with variates centering and F-score without variates centering. 
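    # Editorial note: centering removes one extra residual degree of freedom
    # (n - 2 instead of n - 1). On this zero-mean toy data the correlation is
    # unchanged, so the two F-scores differ exactly by the factor
    # (n - 1) / (n - 2), which is what the assertion below verifies.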
+ # Create toy example + X = np.arange(-5, 6).reshape(-1, 1) # X has zero mean + n_samples = X.size + Y = np.ones(n_samples) + Y[::2] *= -1.0 + Y[0] = 0.0 # have Y mean being null + + F1, _ = f_regression(X, Y, center=True) + F2, _ = f_regression(X, Y, center=False) + assert_allclose(F1 * (n_samples - 1.0) / (n_samples - 2.0), F2) + assert_almost_equal(F2[0], 0.232558139) # value from statsmodels OLS + + +@pytest.mark.parametrize( + "X, y, expected_corr_coef, force_finite", + [ + ( + # A feature in X is constant - forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([0.0, 0.32075]), + True, + ), + ( + # The target y is constant - forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([0.0, 0.0]), + True, + ), + ( + # A feature in X is constant - not forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([np.nan, 0.32075]), + False, + ), + ( + # The target y is constant - not forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([np.nan, np.nan]), + False, + ), + ], +) +def test_r_regression_force_finite(X, y, expected_corr_coef, force_finite): + """Check the behaviour of `force_finite` for some corner cases with `r_regression`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/15672 + """ + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + corr_coef = r_regression(X, y, force_finite=force_finite) + np.testing.assert_array_almost_equal(corr_coef, expected_corr_coef) + + +@pytest.mark.parametrize( + "X, y, expected_f_statistic, expected_p_values, force_finite", + [ + ( + # A feature in X is constant - forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([0.0, 0.2293578]), + np.array([1.0, 0.67924985]), + True, + ), + ( + # The target y is constant - forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([0.0, 0.0]), + np.array([1.0, 1.0]), + True, + ), + ( + # Feature in X correlated with y - forcing finite + np.array([[0, 1], [1, 0], [2, 10], [3, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.finfo(np.float64).max, 0.845433]), + np.array([0.0, 0.454913]), + True, + ), + ( + # Feature in X anti-correlated with y - forcing finite + np.array([[3, 1], [2, 0], [1, 10], [0, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.finfo(np.float64).max, 0.845433]), + np.array([0.0, 0.454913]), + True, + ), + ( + # A feature in X is constant - not forcing finite + np.array([[2, 1], [2, 0], [2, 10], [2, 4]]), + np.array([0, 1, 1, 0]), + np.array([np.nan, 0.2293578]), + np.array([np.nan, 0.67924985]), + False, + ), + ( + # The target y is constant - not forcing finite + np.array([[5, 1], [3, 0], [2, 10], [8, 4]]), + np.array([0, 0, 0, 0]), + np.array([np.nan, np.nan]), + np.array([np.nan, np.nan]), + False, + ), + ( + # Feature in X correlated with y - not forcing finite + np.array([[0, 1], [1, 0], [2, 10], [3, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.inf, 0.845433]), + np.array([0.0, 0.454913]), + False, + ), + ( + # Feature in X anti-correlated with y - not forcing finite + np.array([[3, 1], [2, 0], [1, 10], [0, 4]]), + np.array([0, 1, 2, 3]), + np.array([np.inf, 0.845433]), + np.array([0.0, 0.454913]), + False, + ), + ], +) +def test_f_regression_corner_case( + X, y, expected_f_statistic, expected_p_values, force_finite +): + """Check the behaviour of `force_finite` for 
some corner cases with `f_regression`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/15672 + """ + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + f_statistic, p_values = f_regression(X, y, force_finite=force_finite) + np.testing.assert_array_almost_equal(f_statistic, expected_f_statistic) + np.testing.assert_array_almost_equal(p_values, expected_p_values) + + +def test_f_classif_multi_class(): + # Test whether the F test yields meaningful results + # on a simple simulated classification problem + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + F, pv = f_classif(X, y) + assert (F > 0).all() + assert (pv > 0).all() + assert (pv < 1).all() + assert (pv[:5] < 0.05).all() + assert (pv[5:] > 1.0e-4).all() + + +def test_select_percentile_classif(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the percentile heuristic + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + univariate_filter = SelectPercentile(f_classif, percentile=25) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="percentile", param=25) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_select_percentile_classif_sparse(csr_container): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the percentile heuristic + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + X = csr_container(X) + univariate_filter = SelectPercentile(f_classif, percentile=25) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="percentile", param=25) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r.toarray(), X_r2.toarray()) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + X_r2inv = univariate_filter.inverse_transform(X_r2) + assert sparse.issparse(X_r2inv) + support_mask = safe_mask(X_r2inv, support) + assert X_r2inv.shape == X.shape + assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray()) + # Check other columns are empty + assert X_r2inv.nnz == X_r.nnz + + +############################################################################## +# Test univariate selection in classification settings + + +def test_select_kbest_classif(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the k best heuristic + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) 
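    # Editorial note: with shuffle=False the 3 informative + 2 redundant
    # columns are the first five features, so the k=5 filter below is expected
    # to select exactly columns 0-4 (the gtruth mask).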
+ + univariate_filter = SelectKBest(f_classif, k=5) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="k_best", param=5) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + +def test_select_kbest_all(): + # Test whether k="all" correctly returns all features. + X, y = make_classification( + n_samples=20, n_features=10, shuffle=False, random_state=0 + ) + + univariate_filter = SelectKBest(f_classif, k="all") + X_r = univariate_filter.fit(X, y).transform(X) + assert_array_equal(X, X_r) + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/24949 + X_r2 = ( + GenericUnivariateSelect(f_classif, mode="k_best", param="all") + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + + +@pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) +def test_select_kbest_zero(dtype_in): + # Test whether k=0 correctly returns no features. + X, y = make_classification( + n_samples=20, n_features=10, shuffle=False, random_state=0 + ) + X = X.astype(dtype_in) + + univariate_filter = SelectKBest(f_classif, k=0) + univariate_filter.fit(X, y) + support = univariate_filter.get_support() + gtruth = np.zeros(10, dtype=bool) + assert_array_equal(support, gtruth) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = univariate_filter.transform(X) + assert X_selected.shape == (20, 0) + assert X_selected.dtype == dtype_in + + +def test_select_heuristics_classif(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple classification problem + # with the fdr, fwe and fpr heuristics + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + univariate_filter = SelectFwe(f_classif, alpha=0.01) + X_r = univariate_filter.fit(X, y).transform(X) + gtruth = np.zeros(20) + gtruth[:5] = 1 + for mode in ["fdr", "fpr", "fwe"]: + X_r2 = ( + GenericUnivariateSelect(f_classif, mode=mode, param=0.01) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + assert_allclose(support, gtruth) + + +############################################################################## +# Test univariate selection in regression settings + + +def assert_best_scores_kept(score_filter): + scores = score_filter.scores_ + support = score_filter.get_support() + assert_allclose(np.sort(scores[support]), np.sort(scores)[-support.sum() :]) + + +def test_select_percentile_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the percentile heuristic + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + univariate_filter = SelectPercentile(f_regression, percentile=25) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="percentile", param=25) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + X_2 = X.copy() + X_2[:, np.logical_not(support)] = 0 + 
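    # Editorial note: inverse_transform maps the reduced matrix back to the
    # original feature space, filling the dropped columns with zeros, so it
    # should equal X with the unselected columns zeroed out (X_2 above).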
assert_array_equal(X_2, univariate_filter.inverse_transform(X_r)) + # Check inverse_transform respects dtype + assert_array_equal( + X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool)) + ) + + +def test_select_percentile_regression_full(): + # Test whether the relative univariate feature selection + # selects all features when '100%' is asked. + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + univariate_filter = SelectPercentile(f_regression, percentile=100) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="percentile", param=100) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.ones(20) + assert_array_equal(support, gtruth) + + +def test_select_kbest_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the k best heuristic + X, y = make_regression( + n_samples=200, + n_features=20, + n_informative=5, + shuffle=False, + random_state=0, + noise=10, + ) + + univariate_filter = SelectKBest(f_regression, k=5) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="k_best", param=5) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support, gtruth) + + +def test_select_heuristics_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the fpr, fdr or fwe heuristics + X, y = make_regression( + n_samples=200, + n_features=20, + n_informative=5, + shuffle=False, + random_state=0, + noise=10, + ) + + univariate_filter = SelectFpr(f_regression, alpha=0.01) + X_r = univariate_filter.fit(X, y).transform(X) + gtruth = np.zeros(20) + gtruth[:5] = 1 + for mode in ["fdr", "fpr", "fwe"]: + X_r2 = ( + GenericUnivariateSelect(f_regression, mode=mode, param=0.01) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + assert_array_equal(support[:5], np.ones((5,), dtype=bool)) + assert np.sum(support[5:] == 1) < 3 + + +def test_boundary_case_ch2(): + # Test boundary case, and always aim to select 1 feature. 
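    # Editorial note: for these toy counts chi2 returns scores of roughly
    # (4.0, 0.71) and p-values of roughly (0.046, 0.398), so every selector
    # configured below (fdr, k-best, percentile, fpr, fwe) should keep only
    # the first feature.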
+ X = np.array([[10, 20], [20, 20], [20, 30]]) + y = np.array([[1], [0], [0]]) + scores, pvalues = chi2(X, y) + assert_array_almost_equal(scores, np.array([4.0, 0.71428571])) + assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472])) + + filter_fdr = SelectFdr(chi2, alpha=0.1) + filter_fdr.fit(X, y) + support_fdr = filter_fdr.get_support() + assert_array_equal(support_fdr, np.array([True, False])) + + filter_kbest = SelectKBest(chi2, k=1) + filter_kbest.fit(X, y) + support_kbest = filter_kbest.get_support() + assert_array_equal(support_kbest, np.array([True, False])) + + filter_percentile = SelectPercentile(chi2, percentile=50) + filter_percentile.fit(X, y) + support_percentile = filter_percentile.get_support() + assert_array_equal(support_percentile, np.array([True, False])) + + filter_fpr = SelectFpr(chi2, alpha=0.1) + filter_fpr.fit(X, y) + support_fpr = filter_fpr.get_support() + assert_array_equal(support_fpr, np.array([True, False])) + + filter_fwe = SelectFwe(chi2, alpha=0.1) + filter_fwe.fit(X, y) + support_fwe = filter_fwe.get_support() + assert_array_equal(support_fwe, np.array([True, False])) + + +@pytest.mark.parametrize("alpha", [0.001, 0.01, 0.1]) +@pytest.mark.parametrize("n_informative", [1, 5, 10]) +def test_select_fdr_regression(alpha, n_informative): + # Test that fdr heuristic actually has low FDR. + def single_fdr(alpha, n_informative, random_state): + X, y = make_regression( + n_samples=150, + n_features=20, + n_informative=n_informative, + shuffle=False, + random_state=random_state, + noise=10, + ) + + with warnings.catch_warnings(record=True): + # Warnings can be raised when no features are selected + # (low alpha or very noisy data) + univariate_filter = SelectFdr(f_regression, alpha=alpha) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="fdr", param=alpha) + .fit(X, y) + .transform(X) + ) + + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + num_false_positives = np.sum(support[n_informative:] == 1) + num_true_positives = np.sum(support[:n_informative] == 1) + + if num_false_positives == 0: + return 0.0 + false_discovery_rate = num_false_positives / ( + num_true_positives + num_false_positives + ) + return false_discovery_rate + + # As per Benjamini-Hochberg, the expected false discovery rate + # should be lower than alpha: + # FDR = E(FP / (TP + FP)) <= alpha + false_discovery_rate = np.mean( + [single_fdr(alpha, n_informative, random_state) for random_state in range(100)] + ) + assert alpha >= false_discovery_rate + + # Make sure that the empirical false discovery rate increases + # with alpha: + if false_discovery_rate != 0: + assert false_discovery_rate > alpha / 10 + + +def test_select_fwe_regression(): + # Test whether the relative univariate feature selection + # gets the correct items in a simple regression problem + # with the fwe heuristic + X, y = make_regression( + n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0 + ) + + univariate_filter = SelectFwe(f_regression, alpha=0.01) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(f_regression, mode="fwe", param=0.01) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(20) + gtruth[:5] = 1 + assert_array_equal(support[:5], np.ones((5,), dtype=bool)) + assert np.sum(support[5:] == 1) < 2 + + +def test_selectkbest_tiebreaking(): + # Test whether SelectKBest actually selects k 
features in case of ties. + # Prior to 0.11, SelectKBest would return more features than requested. + Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] + y = [1] + dummy_score = lambda X, y: (X[0], X[0]) + for X in Xs: + sel = SelectKBest(dummy_score, k=1) + X1 = ignore_warnings(sel.fit_transform)([X], y) + assert X1.shape[1] == 1 + assert_best_scores_kept(sel) + + sel = SelectKBest(dummy_score, k=2) + X2 = ignore_warnings(sel.fit_transform)([X], y) + assert X2.shape[1] == 2 + assert_best_scores_kept(sel) + + +def test_selectpercentile_tiebreaking(): + # Test if SelectPercentile selects the right n_features in case of ties. + Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]] + y = [1] + dummy_score = lambda X, y: (X[0], X[0]) + for X in Xs: + sel = SelectPercentile(dummy_score, percentile=34) + X1 = ignore_warnings(sel.fit_transform)([X], y) + assert X1.shape[1] == 1 + assert_best_scores_kept(sel) + + sel = SelectPercentile(dummy_score, percentile=67) + X2 = ignore_warnings(sel.fit_transform)([X], y) + assert X2.shape[1] == 2 + assert_best_scores_kept(sel) + + +def test_tied_pvalues(): + # Test whether k-best and percentiles work with tied pvalues from chi2. + # chi2 will return the same p-values for the following features, but it + # will return different scores. + X0 = np.array([[10000, 9999, 9998], [1, 1, 1]]) + y = [0, 1] + + for perm in itertools.permutations((0, 1, 2)): + X = X0[:, perm] + Xt = SelectKBest(chi2, k=2).fit_transform(X, y) + assert Xt.shape == (2, 2) + assert 9998 not in Xt + + Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y) + assert Xt.shape == (2, 2) + assert 9998 not in Xt + + +def test_scorefunc_multilabel(): + # Test whether k-best and percentiles works with multilabels with chi2. + + X = np.array([[10000, 9999, 0], [100, 9999, 0], [1000, 99, 0]]) + y = [[1, 1], [0, 1], [1, 0]] + + Xt = SelectKBest(chi2, k=2).fit_transform(X, y) + assert Xt.shape == (3, 2) + assert 0 not in Xt + + Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y) + assert Xt.shape == (3, 2) + assert 0 not in Xt + + +def test_tied_scores(): + # Test for stable sorting in k-best with tied scores. + X_train = np.array([[0, 0, 0], [1, 1, 1]]) + y_train = [0, 1] + + for n_features in [1, 2, 3]: + sel = SelectKBest(chi2, k=n_features).fit(X_train, y_train) + X_test = sel.transform([[0, 1, 2]]) + assert_array_equal(X_test[0], np.arange(3)[-n_features:]) + + +def test_nans(): + # Assert that SelectKBest and SelectPercentile can handle NaNs. + # First feature has zero variance to confuse f_classif (ANOVA) and + # make it return a NaN. + X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]] + y = [1, 0, 1] + + for select in ( + SelectKBest(f_classif, k=2), + SelectPercentile(f_classif, percentile=67), + ): + ignore_warnings(select.fit)(X, y) + assert_array_equal(select.get_support(indices=True), np.array([1, 2])) + + +def test_invalid_k(): + X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]] + y = [1, 0, 1] + + msg = "k=4 is greater than n_features=3. All the features will be returned." + with pytest.warns(UserWarning, match=msg): + SelectKBest(k=4).fit(X, y) + with pytest.warns(UserWarning, match=msg): + GenericUnivariateSelect(mode="k_best", param=4).fit(X, y) + + +def test_f_classif_constant_feature(): + # Test that f_classif warns if a feature is constant throughout. 
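    # Editorial note: a constant column has zero variance both within and
    # between classes, so the ANOVA F-statistic is ill-defined for it and
    # f_classif is expected to emit a UserWarning about the constant feature.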
+ + X, y = make_classification(n_samples=10, n_features=5) + X[:, 0] = 2.0 + with pytest.warns(UserWarning): + f_classif(X, y) + + +def test_no_feature_selected(): + rng = np.random.RandomState(0) + + # Generate random uncorrelated data: a strict univariate test should + # rejects all the features + X = rng.rand(40, 10) + y = rng.randint(0, 4, size=40) + strict_selectors = [ + SelectFwe(alpha=0.01).fit(X, y), + SelectFdr(alpha=0.01).fit(X, y), + SelectFpr(alpha=0.01).fit(X, y), + SelectPercentile(percentile=0).fit(X, y), + SelectKBest(k=0).fit(X, y), + ] + for selector in strict_selectors: + assert_array_equal(selector.get_support(), np.zeros(10)) + with pytest.warns(UserWarning, match="No features were selected"): + X_selected = selector.transform(X) + assert X_selected.shape == (40, 0) + + +def test_mutual_info_classif(): + X, y = make_classification( + n_samples=100, + n_features=5, + n_informative=1, + n_redundant=1, + n_repeated=0, + n_classes=2, + n_clusters_per_class=1, + flip_y=0.0, + class_sep=10, + shuffle=False, + random_state=0, + ) + + # Test in KBest mode. + univariate_filter = SelectKBest(mutual_info_classif, k=2) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(5) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + # Test in Percentile mode. + univariate_filter = SelectPercentile(mutual_info_classif, percentile=40) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(5) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + +def test_mutual_info_regression(): + X, y = make_regression( + n_samples=100, + n_features=10, + n_informative=2, + shuffle=False, + random_state=0, + noise=10, + ) + + # Test in KBest mode. + univariate_filter = SelectKBest(mutual_info_regression, k=2) + X_r = univariate_filter.fit(X, y).transform(X) + assert_best_scores_kept(univariate_filter) + X_r2 = ( + GenericUnivariateSelect(mutual_info_regression, mode="k_best", param=2) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(10) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + # Test in Percentile mode. + univariate_filter = SelectPercentile(mutual_info_regression, percentile=20) + X_r = univariate_filter.fit(X, y).transform(X) + X_r2 = ( + GenericUnivariateSelect(mutual_info_regression, mode="percentile", param=20) + .fit(X, y) + .transform(X) + ) + assert_array_equal(X_r, X_r2) + support = univariate_filter.get_support() + gtruth = np.zeros(10) + gtruth[:2] = 1 + assert_array_equal(support, gtruth) + + +def test_dataframe_output_dtypes(): + """Check that the output datafarme dtypes are the same as the input. + + Non-regression test for gh-24860. 
+ """ + pd = pytest.importorskip("pandas") + + X, y = load_iris(return_X_y=True, as_frame=True) + X = X.astype( + { + "petal length (cm)": np.float32, + "petal width (cm)": np.float64, + } + ) + X["petal_width_binned"] = pd.cut(X["petal width (cm)"], bins=10) + + column_order = X.columns + + def selector(X, y): + ranking = { + "sepal length (cm)": 1, + "sepal width (cm)": 2, + "petal length (cm)": 3, + "petal width (cm)": 4, + "petal_width_binned": 5, + } + return np.asarray([ranking[name] for name in column_order]) + + univariate_filter = SelectKBest(selector, k=3).set_output(transform="pandas") + output = univariate_filter.fit_transform(X, y) + + assert_array_equal( + output.columns, ["petal length (cm)", "petal width (cm)", "petal_width_binned"] + ) + for name, dtype in output.dtypes.items(): + assert dtype == X.dtypes[name] + + +@pytest.mark.parametrize( + "selector", + [ + SelectKBest(k=4), + SelectPercentile(percentile=80), + GenericUnivariateSelect(mode="k_best", param=4), + GenericUnivariateSelect(mode="percentile", param=80), + ], +) +def test_unsupervised_filter(selector): + """Check support for unsupervised feature selection for the filter that could + require only `X`. + """ + rng = np.random.RandomState(0) + X = rng.randn(10, 5) + + def score_func(X, y=None): + return np.array([1, 1, 1, 1, 0]) + + selector.set_params(score_func=score_func) + selector.fit(X) + X_trans = selector.transform(X) + assert_allclose(X_trans, X[:, :4]) + X_trans = selector.fit_transform(X) + assert_allclose(X_trans, X[:, :4]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_from_model.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_from_model.py new file mode 100644 index 0000000000000000000000000000000000000000..17bedf44748fbcf9e65e9b2aee1a94621d1b709e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_from_model.py @@ -0,0 +1,704 @@ +import re +import warnings +from unittest.mock import Mock + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.base import BaseEstimator +from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression +from sklearn.datasets import make_friedman1, make_regression +from sklearn.decomposition import PCA +from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier +from sklearn.exceptions import NotFittedError +from sklearn.feature_selection import SelectFromModel +from sklearn.linear_model import ( + ElasticNet, + ElasticNetCV, + Lasso, + LassoCV, + LinearRegression, + LogisticRegression, + PassiveAggressiveClassifier, + SGDClassifier, +) +from sklearn.pipeline import make_pipeline +from sklearn.svm import LinearSVC +from sklearn.utils._testing import ( + MinimalClassifier, + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) + + +class NaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +class NoNaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class NaNTagRandomForest(RandomForestClassifier): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +iris = datasets.load_iris() +data, y = iris.data, iris.target + + +def test_invalid_input(): + clf = SGDClassifier( + alpha=0.1, max_iter=10, shuffle=True, random_state=None, 
tol=None + ) + for threshold in ["gobbledigook", ".5 * gobbledigook"]: + model = SelectFromModel(clf, threshold=threshold) + model.fit(data, y) + with pytest.raises(ValueError): + model.transform(data) + + +def test_input_estimator_unchanged(): + # Test that SelectFromModel fits on a clone of the estimator. + est = RandomForestClassifier() + transformer = SelectFromModel(estimator=est) + transformer.fit(data, y) + assert transformer.estimator is est + + +@pytest.mark.parametrize( + "max_features, err_type, err_msg", + [ + ( + data.shape[1] + 1, + ValueError, + "max_features ==", + ), + ( + lambda X: 1.5, + TypeError, + "max_features must be an instance of int, not float.", + ), + ( + lambda X: data.shape[1] + 1, + ValueError, + "max_features ==", + ), + ( + lambda X: -1, + ValueError, + "max_features ==", + ), + ], +) +def test_max_features_error(max_features, err_type, err_msg): + err_msg = re.escape(err_msg) + clf = RandomForestClassifier(n_estimators=5, random_state=0) + + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + with pytest.raises(err_type, match=err_msg): + transformer.fit(data, y) + + +@pytest.mark.parametrize("max_features", [0, 2, data.shape[1], None]) +def test_inferred_max_features_integer(max_features): + """Check max_features_ and output shape for integer max_features.""" + clf = RandomForestClassifier(n_estimators=5, random_state=0) + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + X_trans = transformer.fit_transform(data, y) + if max_features is not None: + assert transformer.max_features_ == max_features + assert X_trans.shape[1] == transformer.max_features_ + else: + assert not hasattr(transformer, "max_features_") + assert X_trans.shape[1] == data.shape[1] + + +@pytest.mark.parametrize( + "max_features", + [lambda X: 1, lambda X: X.shape[1], lambda X: min(X.shape[1], 10000)], +) +def test_inferred_max_features_callable(max_features): + """Check max_features_ and output shape for callable max_features.""" + clf = RandomForestClassifier(n_estimators=5, random_state=0) + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + X_trans = transformer.fit_transform(data, y) + assert transformer.max_features_ == max_features(data) + assert X_trans.shape[1] == transformer.max_features_ + + +@pytest.mark.parametrize("max_features", [lambda X: round(len(X[0]) / 2), 2]) +def test_max_features_array_like(max_features): + X = [ + [0.87, -1.34, 0.31], + [-2.79, -0.02, -0.85], + [-1.34, -0.48, -2.55], + [1.92, 1.48, 0.65], + ] + y = [0, 1, 0, 1] + + clf = RandomForestClassifier(n_estimators=5, random_state=0) + transformer = SelectFromModel( + estimator=clf, max_features=max_features, threshold=-np.inf + ) + X_trans = transformer.fit_transform(X, y) + assert X_trans.shape[1] == transformer.max_features_ + + +@pytest.mark.parametrize( + "max_features", + [lambda X: min(X.shape[1], 10000), lambda X: X.shape[1], lambda X: 1], +) +def test_max_features_callable_data(max_features): + """Tests that the callable passed to `fit` is called on X.""" + clf = RandomForestClassifier(n_estimators=50, random_state=0) + m = Mock(side_effect=max_features) + transformer = SelectFromModel(estimator=clf, max_features=m, threshold=-np.inf) + transformer.fit_transform(data, y) + m.assert_called_with(data) + + +class FixedImportanceEstimator(BaseEstimator): + def __init__(self, importances): + self.importances = importances + + def fit(self, X, y=None): + 
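        # Editorial note: expose the fixed importances as feature_importances_
        # so that SelectFromModel can rank features deterministically without
        # doing any real fitting.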
self.feature_importances_ = np.array(self.importances) + + +def test_max_features(): + # Test max_features parameter using various values + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + max_features = X.shape[1] + est = RandomForestClassifier(n_estimators=50, random_state=0) + + transformer1 = SelectFromModel(estimator=est, threshold=-np.inf) + transformer2 = SelectFromModel( + estimator=est, max_features=max_features, threshold=-np.inf + ) + X_new1 = transformer1.fit_transform(X, y) + X_new2 = transformer2.fit_transform(X, y) + assert_allclose(X_new1, X_new2) + + # Test max_features against actual model. + transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025, random_state=42)) + X_new1 = transformer1.fit_transform(X, y) + scores1 = np.abs(transformer1.estimator_.coef_) + candidate_indices1 = np.argsort(-scores1, kind="mergesort") + + for n_features in range(1, X_new1.shape[1] + 1): + transformer2 = SelectFromModel( + estimator=Lasso(alpha=0.025, random_state=42), + max_features=n_features, + threshold=-np.inf, + ) + X_new2 = transformer2.fit_transform(X, y) + scores2 = np.abs(transformer2.estimator_.coef_) + candidate_indices2 = np.argsort(-scores2, kind="mergesort") + assert_allclose( + X[:, candidate_indices1[:n_features]], X[:, candidate_indices2[:n_features]] + ) + assert_allclose(transformer1.estimator_.coef_, transformer2.estimator_.coef_) + + +def test_max_features_tiebreak(): + # Test if max_features can break tie among feature importance + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + max_features = X.shape[1] + + feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1]) + for n_features in range(1, max_features + 1): + transformer = SelectFromModel( + FixedImportanceEstimator(feature_importances), + max_features=n_features, + threshold=-np.inf, + ) + X_new = transformer.fit_transform(X, y) + selected_feature_indices = np.where(transformer._get_support_mask())[0] + assert_array_equal(selected_feature_indices, np.arange(n_features)) + assert X_new.shape[1] == n_features + + +def test_threshold_and_max_features(): + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + est = RandomForestClassifier(n_estimators=50, random_state=0) + + transformer1 = SelectFromModel(estimator=est, max_features=3, threshold=-np.inf) + X_new1 = transformer1.fit_transform(X, y) + + transformer2 = SelectFromModel(estimator=est, threshold=0.04) + X_new2 = transformer2.fit_transform(X, y) + + transformer3 = SelectFromModel(estimator=est, max_features=3, threshold=0.04) + X_new3 = transformer3.fit_transform(X, y) + assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1]) + selected_indices = transformer3.transform(np.arange(X.shape[1])[np.newaxis, :]) + assert_allclose(X_new3, X[:, selected_indices[0]]) + + +@skip_if_32bit +def test_feature_importances(): + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + + est = RandomForestClassifier(n_estimators=50, random_state=0) + for threshold, func in zip(["mean", "median"], [np.mean, np.median]): + transformer = SelectFromModel(estimator=est, threshold=threshold) + transformer.fit(X, y) + 
assert hasattr(transformer.estimator_, "feature_importances_") + + X_new = transformer.transform(X) + assert X_new.shape[1] < X.shape[1] + importances = transformer.estimator_.feature_importances_ + + feature_mask = np.abs(importances) > func(importances) + assert_array_almost_equal(X_new, X[:, feature_mask]) + + +def test_sample_weight(): + # Ensure sample weights are passed to underlying estimator + X, y = datasets.make_classification( + n_samples=100, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + + # Check with sample weights + sample_weight = np.ones(y.shape) + sample_weight[y == 1] *= 100 + + est = LogisticRegression(random_state=0, fit_intercept=False) + transformer = SelectFromModel(estimator=est) + transformer.fit(X, y, sample_weight=None) + mask = transformer._get_support_mask() + transformer.fit(X, y, sample_weight=sample_weight) + weighted_mask = transformer._get_support_mask() + assert not np.all(weighted_mask == mask) + transformer.fit(X, y, sample_weight=3 * sample_weight) + reweighted_mask = transformer._get_support_mask() + assert np.all(weighted_mask == reweighted_mask) + + +@pytest.mark.parametrize( + "estimator", + [ + Lasso(alpha=0.1, random_state=42), + LassoCV(random_state=42), + ElasticNet(l1_ratio=1, random_state=42), + ElasticNetCV(l1_ratio=[1], random_state=42), + ], +) +def test_coef_default_threshold(estimator): + X, y = datasets.make_classification( + n_samples=100, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) + + # For the Lasso and related models, the threshold defaults to 1e-5 + transformer = SelectFromModel(estimator=estimator) + transformer.fit(X, y) + X_new = transformer.transform(X) + mask = np.abs(transformer.estimator_.coef_) > 1e-5 + assert_array_almost_equal(X_new, X[:, mask]) + + +@skip_if_32bit +def test_2d_coef(): + X, y = datasets.make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + n_classes=4, + ) + + est = LogisticRegression() + for threshold, func in zip(["mean", "median"], [np.mean, np.median]): + for order in [1, 2, np.inf]: + # Fit SelectFromModel a multi-class problem + transformer = SelectFromModel( + estimator=LogisticRegression(), threshold=threshold, norm_order=order + ) + transformer.fit(X, y) + assert hasattr(transformer.estimator_, "coef_") + X_new = transformer.transform(X) + assert X_new.shape[1] < X.shape[1] + + # Manually check that the norm is correctly performed + est.fit(X, y) + importances = np.linalg.norm(est.coef_, axis=0, ord=order) + feature_mask = importances > func(importances) + assert_array_almost_equal(X_new, X[:, feature_mask]) + + +def test_partial_fit(): + est = PassiveAggressiveClassifier( + random_state=0, shuffle=False, max_iter=5, tol=None + ) + transformer = SelectFromModel(estimator=est) + transformer.partial_fit(data, y, classes=np.unique(y)) + old_model = transformer.estimator_ + transformer.partial_fit(data, y, classes=np.unique(y)) + new_model = transformer.estimator_ + assert old_model is new_model + + X_transform = transformer.transform(data) + transformer.fit(np.vstack((data, data)), np.concatenate((y, y))) + assert_array_almost_equal(X_transform, transformer.transform(data)) + + # check that if est doesn't have partial_fit, neither does SelectFromModel + transformer = SelectFromModel(estimator=RandomForestClassifier()) + assert not hasattr(transformer, "partial_fit") + + +def 
test_calling_fit_reinitializes(): + est = LinearSVC(random_state=0) + transformer = SelectFromModel(estimator=est) + transformer.fit(data, y) + transformer.set_params(estimator__C=100) + transformer.fit(data, y) + assert transformer.estimator_.C == 100 + + +def test_prefit(): + # Test all possible combinations of the prefit parameter. + + # Passing a prefit parameter with the selected model + # and fitting a unfit model with prefit=False should give same results. + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None) + model = SelectFromModel(clf) + model.fit(data, y) + X_transform = model.transform(data) + clf.fit(data, y) + model = SelectFromModel(clf, prefit=True) + assert_array_almost_equal(model.transform(data), X_transform) + model.fit(data, y) + assert model.estimator_ is not clf + + # Check that the model is rewritten if prefit=False and a fitted model is + # passed + model = SelectFromModel(clf, prefit=False) + model.fit(data, y) + assert_array_almost_equal(model.transform(data), X_transform) + + # Check that passing an unfitted estimator with `prefit=True` raises a + # `ValueError` + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None) + model = SelectFromModel(clf, prefit=True) + err_msg = "When `prefit=True`, `estimator` is expected to be a fitted estimator." + with pytest.raises(NotFittedError, match=err_msg): + model.fit(data, y) + with pytest.raises(NotFittedError, match=err_msg): + model.partial_fit(data, y) + with pytest.raises(NotFittedError, match=err_msg): + model.transform(data) + + # Check that the internal parameters of prefitted model are not changed + # when calling `fit` or `partial_fit` with `prefit=True` + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, tol=None).fit(data, y) + model = SelectFromModel(clf, prefit=True) + model.fit(data, y) + assert_allclose(model.estimator_.coef_, clf.coef_) + model.partial_fit(data, y) + assert_allclose(model.estimator_.coef_, clf.coef_) + + +def test_prefit_max_features(): + """Check the interaction between `prefit` and `max_features`.""" + # case 1: an error should be raised at `transform` if `fit` was not called to + # validate the attributes + estimator = RandomForestClassifier(n_estimators=5, random_state=0) + estimator.fit(data, y) + model = SelectFromModel(estimator, prefit=True, max_features=lambda X: X.shape[1]) + + err_msg = ( + "When `prefit=True` and `max_features` is a callable, call `fit` " + "before calling `transform`." + ) + with pytest.raises(NotFittedError, match=err_msg): + model.transform(data) + + # case 2: `max_features` is not validated and different from an integer + # FIXME: we cannot validate the upper bound of the attribute at transform + # and we should force calling `fit` if we intend to force the attribute + # to have such an upper bound. + max_features = 2.5 + model.set_params(max_features=max_features) + with pytest.raises(ValueError, match="`max_features` must be an integer"): + model.transform(data) + + +def test_get_feature_names_out_elasticnetcv(): + """Check if ElasticNetCV works with a list of floats. 
+ + Non-regression test for #30936.""" + X, y = make_regression(n_features=5, n_informative=3, random_state=0) + estimator = ElasticNetCV(l1_ratio=[0.25, 0.5, 0.75], random_state=0) + selector = SelectFromModel(estimator=estimator) + selector.fit(X, y) + + names_out = selector.get_feature_names_out() + mask = selector.get_support() + expected = np.array([f"x{i}" for i in range(X.shape[1])])[mask] + assert_array_equal(names_out, expected) + + +def test_prefit_get_feature_names_out(): + """Check the interaction between prefit and the feature names.""" + clf = RandomForestClassifier(n_estimators=2, random_state=0) + clf.fit(data, y) + model = SelectFromModel(clf, prefit=True, max_features=1) + + name = type(model).__name__ + err_msg = ( + f"This {name} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=err_msg): + model.get_feature_names_out() + + model.fit(data, y) + feature_names = model.get_feature_names_out() + assert feature_names == ["x3"] + + +def test_threshold_string(): + est = RandomForestClassifier(n_estimators=50, random_state=0) + model = SelectFromModel(est, threshold="0.5*mean") + model.fit(data, y) + X_transform = model.transform(data) + + # Calculate the threshold from the estimator directly. + est.fit(data, y) + threshold = 0.5 * np.mean(est.feature_importances_) + mask = est.feature_importances_ > threshold + assert_array_almost_equal(X_transform, data[:, mask]) + + +def test_threshold_without_refitting(): + # Test that the threshold can be set without refitting the model. + clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None) + model = SelectFromModel(clf, threshold="0.1 * mean") + model.fit(data, y) + X_transform = model.transform(data) + + # Set a higher threshold to filter out more features. + model.threshold = "1.0 * mean" + assert X_transform.shape[1] > model.transform(data).shape[1] + + +def test_fit_accepts_nan_inf(): + # Test that fit doesn't check for np.inf and np.nan values. + clf = HistGradientBoostingClassifier(random_state=0) + + model = SelectFromModel(estimator=clf) + + nan_data = data.copy() + nan_data[0] = np.nan + nan_data[1] = np.inf + + model.fit(data, y) + + +def test_transform_accepts_nan_inf(): + # Test that transform doesn't check for np.inf and np.nan values. 
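The string thresholds exercised just above ("0.5*mean", "0.1 * mean", "1.0 * mean") are parsed by SelectFromModel as a scaling factor applied to the mean (or median) of the fitted feature importances. A minimal usage sketch of that behaviour, on assumed iris data and with placeholder names such as X_demo (not taken from these tests):

    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel

    X_demo, y_demo = load_iris(return_X_y=True)
    sfm = SelectFromModel(
        RandomForestClassifier(n_estimators=10, random_state=0),
        threshold="0.5*mean",
    ).fit(X_demo, y_demo)
    sfm.get_support()            # mask of features whose importance clears 0.5 * mean importance
    X_sel = sfm.transform(X_demo)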
+ clf = NaNTagRandomForest(n_estimators=100, random_state=0) + nan_data = data.copy() + + model = SelectFromModel(estimator=clf) + model.fit(nan_data, y) + + nan_data[0] = np.nan + nan_data[1] = np.inf + + model.transform(nan_data) + + +def test_allow_nan_tag_comes_from_estimator(): + allow_nan_est = NaNTag() + model = SelectFromModel(estimator=allow_nan_est) + assert model.__sklearn_tags__().input_tags.allow_nan is True + + no_nan_est = NoNaNTag() + model = SelectFromModel(estimator=no_nan_est) + assert model.__sklearn_tags__().input_tags.allow_nan is False + + +def _pca_importances(pca_estimator): + return np.abs(pca_estimator.explained_variance_) + + +@pytest.mark.parametrize( + "estimator, importance_getter", + [ + ( + make_pipeline(PCA(random_state=0), LogisticRegression()), + "named_steps.logisticregression.coef_", + ), + (PCA(random_state=0), _pca_importances), + ], +) +def test_importance_getter(estimator, importance_getter): + selector = SelectFromModel( + estimator, threshold="mean", importance_getter=importance_getter + ) + selector.fit(data, y) + assert selector.transform(data).shape[1] == 1 + + +@pytest.mark.parametrize("PLSEstimator", [CCA, PLSCanonical, PLSRegression]) +def test_select_from_model_pls(PLSEstimator): + """Check the behaviour of SelectFromModel with PLS estimators. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/12410 + """ + X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + estimator = PLSEstimator(n_components=1) + model = make_pipeline(SelectFromModel(estimator), estimator).fit(X, y) + assert model.score(X, y) > 0.5 + + +def test_estimator_does_not_support_feature_names(): + """SelectFromModel works with estimators that do not support feature_names_in_. + + Non-regression test for #21949. 
+ """ + pytest.importorskip("pandas") + X, y = datasets.load_iris(as_frame=True, return_X_y=True) + all_feature_names = set(X.columns) + + def importance_getter(estimator): + return np.arange(X.shape[1]) + + selector = SelectFromModel( + MinimalClassifier(), importance_getter=importance_getter + ).fit(X, y) + + # selector learns the feature names itself + assert_array_equal(selector.feature_names_in_, X.columns) + + feature_names_out = set(selector.get_feature_names_out()) + assert feature_names_out < all_feature_names + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + + selector.transform(X.iloc[1:3]) + + +@pytest.mark.parametrize( + "error, err_msg, max_features", + ( + [ValueError, "max_features == 10, must be <= 4", 10], + [ValueError, "max_features == 5, must be <= 4", lambda x: x.shape[1] + 1], + ), +) +def test_partial_fit_validate_max_features(error, err_msg, max_features): + """Test that partial_fit from SelectFromModel validates `max_features`.""" + X, y = datasets.make_classification( + n_samples=100, + n_features=4, + random_state=0, + ) + + with pytest.raises(error, match=err_msg): + SelectFromModel( + estimator=SGDClassifier(), max_features=max_features + ).partial_fit(X, y, classes=[0, 1]) + + +@pytest.mark.parametrize("as_frame", [True, False]) +def test_partial_fit_validate_feature_names(as_frame): + """Test that partial_fit from SelectFromModel validates `feature_names_in_`.""" + pytest.importorskip("pandas") + X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True) + + selector = SelectFromModel(estimator=SGDClassifier(), max_features=4).partial_fit( + X, y, classes=[0, 1, 2] + ) + if as_frame: + assert_array_equal(selector.feature_names_in_, X.columns) + else: + assert not hasattr(selector, "feature_names_in_") + + +def test_from_model_estimator_attribute_error(): + """Check that we raise the proper AttributeError when the estimator + does not implement the `partial_fit` method, which is decorated with + `available_if`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + # `LinearRegression` does not implement 'partial_fit' and should raise an + # AttributeError + from_model = SelectFromModel(estimator=LinearRegression()) + + outer_msg = "This 'SelectFromModel' has no attribute 'partial_fit'" + inner_msg = "'LinearRegression' object has no attribute 'partial_fit'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + from_model.fit(data, y).partial_fit(data) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_mutual_info.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_mutual_info.py new file mode 100644 index 0000000000000000000000000000000000000000..4922b7e4e57b352456e8295d7dba44feb4eef535 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_mutual_info.py @@ -0,0 +1,270 @@ +import numpy as np +import pytest + +from sklearn.datasets import make_classification, make_regression +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression +from sklearn.feature_selection._mutual_info import _compute_mi +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_compute_mi_dd(): + # In discrete case computations are straightforward and can be done + # by hand on given vectors. + x = np.array([0, 1, 1, 0, 0]) + y = np.array([1, 0, 0, 0, 1]) + + H_x = H_y = -(3 / 5) * np.log(3 / 5) - (2 / 5) * np.log(2 / 5) + H_xy = -1 / 5 * np.log(1 / 5) - 2 / 5 * np.log(2 / 5) - 2 / 5 * np.log(2 / 5) + I_xy = H_x + H_y - H_xy + + assert_allclose(_compute_mi(x, y, x_discrete=True, y_discrete=True), I_xy) + + +def test_compute_mi_cc(global_dtype): + # For two continuous variables a good approach is to test on bivariate + # normal distribution, where mutual information is known. + + # Mean of the distribution, irrelevant for mutual information. + mean = np.zeros(2) + + # Setup covariance matrix with correlation coeff. equal 0.5. + sigma_1 = 1 + sigma_2 = 10 + corr = 0.5 + cov = np.array( + [ + [sigma_1**2, corr * sigma_1 * sigma_2], + [corr * sigma_1 * sigma_2, sigma_2**2], + ] + ) + + # True theoretical mutual information. + I_theory = np.log(sigma_1) + np.log(sigma_2) - 0.5 * np.log(np.linalg.det(cov)) + + rng = check_random_state(0) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False) + + x, y = Z[:, 0], Z[:, 1] + + # Theory and computed values won't be very close + # We here check with a large relative tolerance + for n_neighbors in [3, 5, 7]: + I_computed = _compute_mi( + x, y, x_discrete=False, y_discrete=False, n_neighbors=n_neighbors + ) + assert_allclose(I_computed, I_theory, rtol=1e-1) + + +def test_compute_mi_cd(global_dtype): + # To test define a joint distribution as follows: + # p(x, y) = p(x) p(y | x) + # X ~ Bernoulli(p) + # (Y | x = 0) ~ Uniform(-1, 1) + # (Y | x = 1) ~ Uniform(0, 2) + + # Use the following formula for mutual information: + # I(X; Y) = H(Y) - H(Y | X) + # Two entropies can be computed by hand: + # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2) + # H(Y | X) = ln(2) + + # Now we need to implement sampling from out distribution, which is + # done easily using conditional distribution logic. 
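For concreteness, the closed form above can be evaluated by hand at p = 0.5: H(Y) = -0.5 * (0.5*ln(1/4) + 0.5*ln(1/4) + ln(1/2)) = 1.5 * ln(2) and H(Y | X) = ln(2), so I(X; Y) = 0.5 * ln(2), about 0.347 nats. A small illustrative check of that arithmetic:

    import numpy as np

    p = 0.5
    I_theory = -0.5 * (
        (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5)
    ) - np.log(2)
    assert np.isclose(I_theory, 0.5 * np.log(2))   # approximately 0.3466 nats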
+ + n_samples = 1000 + rng = check_random_state(0) + + for p in [0.3, 0.5, 0.7]: + x = rng.uniform(size=n_samples) > p + + y = np.empty(n_samples, global_dtype) + mask = x == 0 + y[mask] = rng.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = rng.uniform(0, 2, size=np.sum(~mask)) + + I_theory = -0.5 * ( + (1 - p) * np.log(0.5 * (1 - p)) + p * np.log(0.5 * p) + np.log(0.5) + ) - np.log(2) + + # Assert the same tolerance. + for n_neighbors in [3, 5, 7]: + I_computed = _compute_mi( + x, y, x_discrete=True, y_discrete=False, n_neighbors=n_neighbors + ) + assert_allclose(I_computed, I_theory, rtol=1e-1) + + +def test_compute_mi_cd_unique_label(global_dtype): + # Test that adding unique label doesn't change MI. + n_samples = 100 + x = np.random.uniform(size=n_samples) > 0.5 + + y = np.empty(n_samples, global_dtype) + mask = x == 0 + y[mask] = np.random.uniform(-1, 1, size=np.sum(mask)) + y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask)) + + mi_1 = _compute_mi(x, y, x_discrete=True, y_discrete=False) + + x = np.hstack((x, 2)) + y = np.hstack((y, 10)) + mi_2 = _compute_mi(x, y, x_discrete=True, y_discrete=False) + + assert_allclose(mi_1, mi_2) + + +# We are going test that feature ordering by MI matches our expectations. +def test_mutual_info_classif_discrete(global_dtype): + X = np.array( + [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype + ) + y = np.array([0, 1, 2, 2, 1]) + + # Here X[:, 0] is the most informative feature, and X[:, 1] is weakly + # informative. + mi = mutual_info_classif(X, y, discrete_features=True) + assert_array_equal(np.argsort(-mi), np.array([0, 2, 1])) + + +def test_mutual_info_regression(global_dtype): + # We generate sample from multivariate normal distribution, using + # transformation from initially uncorrelated variables. The zero + # variables after transformation is selected as the target vector, + # it has the strongest correlation with the variable 2, and + # the weakest correlation with the variable 1. + T = np.array([[1, 0.5, 2, 1], [0, 1, 0.1, 0.0], [0, 0.1, 1, 0.1], [0, 0.1, 0.1, 1]]) + cov = T.dot(T.T) + mean = np.zeros(4) + + rng = check_random_state(0) + Z = rng.multivariate_normal(mean, cov, size=1000).astype(global_dtype, copy=False) + X = Z[:, 1:] + y = Z[:, 0] + + mi = mutual_info_regression(X, y, random_state=0) + assert_array_equal(np.argsort(-mi), np.array([1, 2, 0])) + # XXX: should mutual_info_regression be fixed to avoid + # up-casting float32 inputs to float64? + assert mi.dtype == np.float64 + + +def test_mutual_info_classif_mixed(global_dtype): + # Here the target is discrete and there are two continuous and one + # discrete feature. The idea of this test is clear from the code. 
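The private helper _compute_mi tested above underlies the public estimators; a usage sketch of the public API with one discrete column, on synthetic data with placeholder names (X_demo, y_demo are assumptions, not from these tests):

    import numpy as np
    from sklearn.feature_selection import mutual_info_classif

    rng = np.random.RandomState(0)
    X_demo = rng.rand(200, 3)
    X_demo[:, 2] = (X_demo[:, 2] > 0.5).astype(float)          # treat column 2 as discrete
    y_demo = (X_demo[:, 0] + X_demo[:, 2] > 1.0).astype(int)
    mi = mutual_info_classif(
        X_demo, y_demo, discrete_features=[2], n_neighbors=3, random_state=0
    )
    mi.argsort()[::-1]   # feature indices ordered from most to least informative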
+ rng = check_random_state(0) + X = rng.rand(1000, 3).astype(global_dtype, copy=False) + X[:, 1] += X[:, 0] + y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int) + X[:, 2] = X[:, 2] > 0.5 + + mi = mutual_info_classif(X, y, discrete_features=[2], n_neighbors=3, random_state=0) + assert_array_equal(np.argsort(-mi), [2, 0, 1]) + for n_neighbors in [5, 7, 9]: + mi_nn = mutual_info_classif( + X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0 + ) + # Check that the continuous values have an higher MI with greater + # n_neighbors + assert mi_nn[0] > mi[0] + assert mi_nn[1] > mi[1] + # The n_neighbors should not have any effect on the discrete value + # The MI should be the same + assert mi_nn[2] == mi[2] + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_mutual_info_options(global_dtype, csr_container): + X = np.array( + [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=global_dtype + ) + y = np.array([0, 1, 2, 2, 1], dtype=global_dtype) + X_csr = csr_container(X) + + for mutual_info in (mutual_info_regression, mutual_info_classif): + with pytest.raises(ValueError): + mutual_info(X_csr, y, discrete_features=False) + with pytest.raises(ValueError): + mutual_info(X, y, discrete_features="manual") + with pytest.raises(ValueError): + mutual_info(X_csr, y, discrete_features=[True, False, True]) + with pytest.raises(IndexError): + mutual_info(X, y, discrete_features=[True, False, True, False]) + with pytest.raises(IndexError): + mutual_info(X, y, discrete_features=[1, 4]) + + mi_1 = mutual_info(X, y, discrete_features="auto", random_state=0) + mi_2 = mutual_info(X, y, discrete_features=False, random_state=0) + mi_3 = mutual_info(X_csr, y, discrete_features="auto", random_state=0) + mi_4 = mutual_info(X_csr, y, discrete_features=True, random_state=0) + mi_5 = mutual_info(X, y, discrete_features=[True, False, True], random_state=0) + mi_6 = mutual_info(X, y, discrete_features=[0, 2], random_state=0) + + assert_allclose(mi_1, mi_2) + assert_allclose(mi_3, mi_4) + assert_allclose(mi_5, mi_6) + + assert not np.allclose(mi_1, mi_3) + + +@pytest.mark.parametrize("correlated", [True, False]) +def test_mutual_information_symmetry_classif_regression(correlated, global_random_seed): + """Check that `mutual_info_classif` and `mutual_info_regression` are + symmetric by switching the target `y` as `feature` in `X` and vice + versa. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/23720 + """ + rng = np.random.RandomState(global_random_seed) + n = 100 + d = rng.randint(10, size=n) + + if correlated: + c = d.astype(np.float64) + else: + c = rng.normal(0, 1, size=n) + + mi_classif = mutual_info_classif( + c[:, None], d, discrete_features=[False], random_state=global_random_seed + ) + + mi_regression = mutual_info_regression( + d[:, None], c, discrete_features=[True], random_state=global_random_seed + ) + + assert mi_classif == pytest.approx(mi_regression) + + +def test_mutual_info_regression_X_int_dtype(global_random_seed): + """Check that results agree when X is integer dtype and float dtype. + + Non-regression test for Issue #26696. 
+ """ + rng = np.random.RandomState(global_random_seed) + X = rng.randint(100, size=(100, 10)) + X_float = X.astype(np.float64, copy=True) + y = rng.randint(100, size=100) + + expected = mutual_info_regression(X_float, y, random_state=global_random_seed) + result = mutual_info_regression(X, y, random_state=global_random_seed) + assert_allclose(result, expected) + + +@pytest.mark.parametrize( + "mutual_info_func, data_generator", + [ + (mutual_info_regression, make_regression), + (mutual_info_classif, make_classification), + ], +) +def test_mutual_info_n_jobs(global_random_seed, mutual_info_func, data_generator): + """Check that results are consistent with different `n_jobs`.""" + X, y = data_generator(random_state=global_random_seed) + single_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=1) + multi_job = mutual_info_func(X, y, random_state=global_random_seed, n_jobs=2) + assert_allclose(single_job, multi_job) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_rfe.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_rfe.py new file mode 100644 index 0000000000000000000000000000000000000000..1f5672545874c057847a5d135f1c29a7211647e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_rfe.py @@ -0,0 +1,755 @@ +""" +Testing Recursive feature elimination +""" + +import re +from operator import attrgetter + +import numpy as np +import pytest +from joblib import parallel_backend +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal + +from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier +from sklearn.compose import TransformedTargetRegressor +from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression +from sklearn.datasets import load_iris, make_classification, make_friedman1 +from sklearn.ensemble import RandomForestClassifier +from sklearn.feature_selection import RFE, RFECV +from sklearn.impute import SimpleImputer +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.metrics import get_scorer, make_scorer, zero_one_loss +from sklearn.model_selection import GroupKFold, cross_val_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR, LinearSVR +from sklearn.utils import check_random_state +from sklearn.utils._testing import ignore_warnings +from sklearn.utils.fixes import CSR_CONTAINERS + + +class MockClassifier(ClassifierMixin, BaseEstimator): + """ + Dummy classifier to test recursive feature elimination + """ + + def __init__(self, foo_param=0): + self.foo_param = foo_param + + def fit(self, X, y): + assert len(X) == len(y) + self.coef_ = np.ones(X.shape[1], dtype=np.float64) + self.classes_ = sorted(set(y)) + return self + + def predict(self, T): + return np.ones(T.shape[0]) + + predict_proba = predict + decision_function = predict + transform = predict + + def score(self, X=None, y=None): + return 0.0 + + def get_params(self, deep=True): + return {"foo_param": self.foo_param} + + def set_params(self, **params): + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +def test_rfe_features_importance(): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. 
+ X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + clf = RandomForestClassifier(n_estimators=20, random_state=generator, max_depth=2) + rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe.fit(X, y) + assert len(rfe.ranking_) == X.shape[1] + + clf_svc = SVC(kernel="linear") + rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1) + rfe_svc.fit(X, y) + + # Check if the supports are equal + assert_array_equal(rfe.get_support(), rfe_svc.get_support()) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_rfe(csr_container): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + X_sparse = csr_container(X) + y = iris.target + + # dense model + clf = SVC(kernel="linear") + rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe.fit(X, y) + X_r = rfe.transform(X) + clf.fit(X_r, y) + assert len(rfe.ranking_) == X.shape[1] + + # sparse model + clf_sparse = SVC(kernel="linear") + rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1) + rfe_sparse.fit(X_sparse, y) + X_r_sparse = rfe_sparse.transform(X_sparse) + + assert X_r.shape == iris.data.shape + assert_array_almost_equal(X_r[:10], iris.data[:10]) + + assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data)) + assert rfe.score(X, y) == clf.score(iris.data, iris.target) + assert_array_almost_equal(X_r, X_r_sparse.toarray()) + + +def test_RFE_fit_score_params(): + # Make sure RFE passes the metadata down to fit and score methods of the + # underlying estimator + class TestEstimator(BaseEstimator, ClassifierMixin): + def fit(self, X, y, prop=None): + if prop is None: + raise ValueError("fit: prop cannot be None") + self.svc_ = SVC(kernel="linear").fit(X, y) + self.coef_ = self.svc_.coef_ + return self + + def score(self, X, y, prop=None): + if prop is None: + raise ValueError("score: prop cannot be None") + return self.svc_.score(X, y) + + X, y = load_iris(return_X_y=True) + with pytest.raises(ValueError, match="fit: prop cannot be None"): + RFE(estimator=TestEstimator()).fit(X, y) + with pytest.raises(ValueError, match="score: prop cannot be None"): + RFE(estimator=TestEstimator()).fit(X, y, prop="foo").score(X, y) + + RFE(estimator=TestEstimator()).fit(X, y, prop="foo").score(X, y, prop="foo") + + +def test_rfe_percent_n_features(): + # test that the results are the same + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + # there are 10 features in the data. We select 40%. + clf = SVC(kernel="linear") + rfe_num = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe_num.fit(X, y) + + rfe_perc = RFE(estimator=clf, n_features_to_select=0.4, step=0.1) + rfe_perc.fit(X, y) + + assert_array_equal(rfe_perc.ranking_, rfe_num.ranking_) + assert_array_equal(rfe_perc.support_, rfe_num.support_) + + +def test_rfe_mockclassifier(): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. 
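test_rfe_percent_n_features above relies on float parameters being read as fractions: n_features_to_select=0.4 keeps 40% of the columns, and a float step removes roughly that fraction of the features (at least one) per elimination round. A short sketch under those assumptions, with placeholder data:

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    X_demo, y_demo = make_classification(n_features=10, random_state=0)
    rfe_frac = RFE(SVC(kernel="linear"), n_features_to_select=0.4, step=0.1).fit(X_demo, y_demo)
    rfe_frac.n_features_   # 4, i.e. 40% of the 10 input features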
+ X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + # dense model + clf = MockClassifier() + rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) + rfe.fit(X, y) + X_r = rfe.transform(X) + clf.fit(X_r, y) + assert len(rfe.ranking_) == X.shape[1] + assert X_r.shape == iris.data.shape + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_rfecv(csr_container): + generator = check_random_state(0) + iris = load_iris() + # Add some irrelevant features. Random seed is set to make sure that + # irrelevant features are always irrelevant. + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) # regression test: list should be supported + + # Test using the score function + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1) + rfecv.fit(X, y) + # non-regression test for missing worst feature: + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == X.shape[1] + + assert len(rfecv.ranking_) == X.shape[1] + X_r = rfecv.transform(X) + + # All the noisy variable were filtered out + assert_array_equal(X_r, iris.data) + + # same in sparse + rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1) + X_sparse = csr_container(X) + rfecv_sparse.fit(X_sparse, y) + X_r_sparse = rfecv_sparse.transform(X_sparse) + assert_array_equal(X_r_sparse.toarray(), iris.data) + + # Test using a customized loss function + scoring = make_scorer(zero_one_loss, greater_is_better=False) + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring) + ignore_warnings(rfecv.fit)(X, y) + X_r = rfecv.transform(X) + assert_array_equal(X_r, iris.data) + + # Test using a scorer + scorer = get_scorer("accuracy") + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer) + rfecv.fit(X, y) + X_r = rfecv.transform(X) + assert_array_equal(X_r, iris.data) + + # Test fix on cv_results_ + def test_scorer(estimator, X, y): + return 1.0 + + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer) + rfecv.fit(X, y) + + # In the event of cross validation score ties, the expected behavior of + # RFECV is to return the FEWEST features that maximize the CV score. + # Because test_scorer always returns 1.0 in this example, RFECV should + # reduce the dimensionality to a single feature (i.e. n_features_ = 1) + assert rfecv.n_features_ == 1 + + # Same as the first two tests, but with step=2 + rfecv = RFECV(estimator=SVC(kernel="linear"), step=2) + rfecv.fit(X, y) + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == 6 + + assert len(rfecv.ranking_) == X.shape[1] + X_r = rfecv.transform(X) + assert_array_equal(X_r, iris.data) + + rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2) + X_sparse = csr_container(X) + rfecv_sparse.fit(X_sparse, y) + X_r_sparse = rfecv_sparse.transform(X_sparse) + assert_array_equal(X_r_sparse.toarray(), iris.data) + + # Verifying that steps < 1 don't blow up. 
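The tie-breaking behaviour checked above (RFECV keeps the fewest features that maximise the cross-validation score) is visible directly in cv_results_; a usage sketch with assumed synthetic data:

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import RFECV
    from sklearn.svm import SVC

    X_demo, y_demo = make_classification(n_features=10, n_informative=3, random_state=0)
    rfecv_demo = RFECV(
        SVC(kernel="linear"), step=1, cv=5, min_features_to_select=1
    ).fit(X_demo, y_demo)
    rfecv_demo.n_features_                      # size of the best-scoring feature subset
    rfecv_demo.cv_results_["n_features"]        # candidate subset sizes, smallest first
    rfecv_demo.cv_results_["mean_test_score"]   # mean CV score for each candidate size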
+ rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=0.2) + X_sparse = csr_container(X) + rfecv_sparse.fit(X_sparse, y) + X_r_sparse = rfecv_sparse.transform(X_sparse) + assert_array_equal(X_r_sparse.toarray(), iris.data) + + +def test_rfecv_mockclassifier(): + generator = check_random_state(0) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) # regression test: list should be supported + + # Test using the score function + rfecv = RFECV(estimator=MockClassifier(), step=1) + rfecv.fit(X, y) + # non-regression test for missing worst feature: + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == X.shape[1] + + assert len(rfecv.ranking_) == X.shape[1] + + +def test_rfecv_verbose_output(): + # Check verbose=1 is producing an output. + import sys + from io import StringIO + + sys.stdout = StringIO() + + generator = check_random_state(0) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) + + rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, verbose=1) + rfecv.fit(X, y) + + verbose_output = sys.stdout + verbose_output.seek(0) + assert len(verbose_output.readline()) > 0 + + +def test_rfecv_cv_results_size(global_random_seed): + generator = check_random_state(global_random_seed) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = list(iris.target) # regression test: list should be supported + + # Non-regression test for varying combinations of step and + # min_features_to_select. + for step, min_features_to_select in [[2, 1], [2, 2], [3, 3]]: + rfecv = RFECV( + estimator=MockClassifier(), + step=step, + min_features_to_select=min_features_to_select, + ) + rfecv.fit(X, y) + + score_len = np.ceil((X.shape[1] - min_features_to_select) / step) + 1 + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == score_len + + assert len(rfecv.ranking_) == X.shape[1] + assert rfecv.n_features_ >= min_features_to_select + + +def test_rfe_estimator_tags(): + rfe = RFE(SVC(kernel="linear")) + assert is_classifier(rfe) + # make sure that cross-validation is stratified + iris = load_iris() + score = cross_val_score(rfe, iris.data, iris.target) + assert score.min() > 0.7 + + +def test_rfe_min_step(global_random_seed): + n_features = 10 + X, y = make_friedman1( + n_samples=50, n_features=n_features, random_state=global_random_seed + ) + n_samples, n_features = X.shape + estimator = SVR(kernel="linear") + + # Test when floor(step * n_features) <= 0 + selector = RFE(estimator, step=0.01) + sel = selector.fit(X, y) + assert sel.support_.sum() == n_features // 2 + + # Test when step is between (0,1) and floor(step * n_features) > 0 + selector = RFE(estimator, step=0.20) + sel = selector.fit(X, y) + assert sel.support_.sum() == n_features // 2 + + # Test when step is an integer + selector = RFE(estimator, step=5) + sel = selector.fit(X, y) + assert sel.support_.sum() == n_features // 2 + + +def test_number_of_subsets_of_features(global_random_seed): + # In RFE, 'number_of_subsets_of_features' + # = the number of iterations in '_fit' + # = max(ranking_) + # = 1 + (n_features + step - n_features_to_select - 1) // step + # After optimization #4534, this number + # = 1 + np.ceil((n_features - n_features_to_select) / float(step)) + # This test case is to test their equivalence, refer to #4534 and #3824 + + def formula1(n_features, n_features_to_select, step): + return 1 + ((n_features + step - 
n_features_to_select - 1) // step) + + def formula2(n_features, n_features_to_select, step): + return 1 + np.ceil((n_features - n_features_to_select) / float(step)) + + # RFE + # Case 1, n_features - n_features_to_select is divisible by step + # Case 2, n_features - n_features_to_select is not divisible by step + n_features_list = [11, 11] + n_features_to_select_list = [3, 3] + step_list = [2, 3] + for n_features, n_features_to_select, step in zip( + n_features_list, n_features_to_select_list, step_list + ): + generator = check_random_state(global_random_seed) + X = generator.normal(size=(100, n_features)) + y = generator.rand(100).round() + rfe = RFE( + estimator=SVC(kernel="linear"), + n_features_to_select=n_features_to_select, + step=step, + ) + rfe.fit(X, y) + # this number also equals to the maximum of ranking_ + assert np.max(rfe.ranking_) == formula1(n_features, n_features_to_select, step) + assert np.max(rfe.ranking_) == formula2(n_features, n_features_to_select, step) + + # In RFECV, 'fit' calls 'RFE._fit' + # 'number_of_subsets_of_features' of RFE + # = the size of each score in 'cv_results_' of RFECV + # = the number of iterations of the for loop before optimization #4534 + + # RFECV, n_features_to_select = 1 + # Case 1, n_features - 1 is divisible by step + # Case 2, n_features - 1 is not divisible by step + + n_features_to_select = 1 + n_features_list = [11, 10] + step_list = [2, 2] + for n_features, step in zip(n_features_list, step_list): + generator = check_random_state(global_random_seed) + X = generator.normal(size=(100, n_features)) + y = generator.rand(100).round() + rfecv = RFECV(estimator=SVC(kernel="linear"), step=step) + rfecv.fit(X, y) + + for key in rfecv.cv_results_.keys(): + assert len(rfecv.cv_results_[key]) == formula1( + n_features, n_features_to_select, step + ) + assert len(rfecv.cv_results_[key]) == formula2( + n_features, n_features_to_select, step + ) + + +def test_rfe_cv_n_jobs(global_random_seed): + generator = check_random_state(global_random_seed) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + rfecv = RFECV(estimator=SVC(kernel="linear")) + rfecv.fit(X, y) + rfecv_ranking = rfecv.ranking_ + + rfecv_cv_results_ = rfecv.cv_results_ + + rfecv.set_params(n_jobs=2) + rfecv.fit(X, y) + assert_array_almost_equal(rfecv.ranking_, rfecv_ranking) + + assert rfecv_cv_results_.keys() == rfecv.cv_results_.keys() + for key in rfecv_cv_results_.keys(): + assert rfecv_cv_results_[key] == pytest.approx(rfecv.cv_results_[key]) + + +def test_rfe_cv_groups(): + generator = check_random_state(0) + iris = load_iris() + number_groups = 4 + groups = np.floor(np.linspace(0, number_groups, len(iris.target))) + X = iris.data + y = (iris.target > 0).astype(int) + + est_groups = RFECV( + estimator=RandomForestClassifier(random_state=generator), + step=1, + scoring="accuracy", + cv=GroupKFold(n_splits=2), + ) + est_groups.fit(X, y, groups=groups) + assert est_groups.n_features_ > 0 + + +@pytest.mark.parametrize( + "importance_getter", [attrgetter("regressor_.coef_"), "regressor_.coef_"] +) +@pytest.mark.parametrize("selector, expected_n_features", [(RFE, 5), (RFECV, 4)]) +def test_rfe_wrapped_estimator(importance_getter, selector, expected_n_features): + # Non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/15312 + X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + estimator = LinearSVR(random_state=0) + + log_estimator = TransformedTargetRegressor( + regressor=estimator, 
func=np.log, inverse_func=np.exp + ) + + selector = selector(log_estimator, importance_getter=importance_getter) + sel = selector.fit(X, y) + assert sel.support_.sum() == expected_n_features + + +@pytest.mark.parametrize( + "importance_getter, err_type", + [ + ("auto", ValueError), + ("random", AttributeError), + (lambda x: x.importance, AttributeError), + ], +) +@pytest.mark.parametrize("Selector", [RFE, RFECV]) +def test_rfe_importance_getter_validation(importance_getter, err_type, Selector): + X, y = make_friedman1(n_samples=50, n_features=10, random_state=42) + estimator = LinearSVR() + log_estimator = TransformedTargetRegressor( + regressor=estimator, func=np.log, inverse_func=np.exp + ) + + with pytest.raises(err_type): + model = Selector(log_estimator, importance_getter=importance_getter) + model.fit(X, y) + + +@pytest.mark.parametrize("cv", [None, 5]) +def test_rfe_allow_nan_inf_in_x(cv): + iris = load_iris() + X = iris.data + y = iris.target + + # add nan and inf value to X + X[0][0] = np.nan + X[0][1] = np.inf + + clf = MockClassifier() + if cv is not None: + rfe = RFECV(estimator=clf, cv=cv) + else: + rfe = RFE(estimator=clf) + rfe.fit(X, y) + rfe.transform(X) + + +def test_w_pipeline_2d_coef_(): + pipeline = make_pipeline(StandardScaler(), LogisticRegression()) + + data, y = load_iris(return_X_y=True) + sfm = RFE( + pipeline, + n_features_to_select=2, + importance_getter="named_steps.logisticregression.coef_", + ) + + sfm.fit(data, y) + assert sfm.transform(data).shape[1] == 2 + + +def test_rfecv_std_and_mean(global_random_seed): + generator = check_random_state(global_random_seed) + iris = load_iris() + X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] + y = iris.target + + rfecv = RFECV(estimator=SVC(kernel="linear")) + rfecv.fit(X, y) + split_keys = [ + key + for key in rfecv.cv_results_.keys() + if re.search(r"split\d+_test_score", key) + ] + cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys]) + expected_mean = np.mean(cv_scores, axis=0) + expected_std = np.std(cv_scores, axis=0) + + assert_allclose(rfecv.cv_results_["mean_test_score"], expected_mean) + assert_allclose(rfecv.cv_results_["std_test_score"], expected_std) + + +@pytest.mark.parametrize( + ["min_features_to_select", "n_features", "step", "cv_results_n_features"], + [ + [1, 4, 1, np.array([1, 2, 3, 4])], + [1, 5, 1, np.array([1, 2, 3, 4, 5])], + [1, 4, 2, np.array([1, 2, 4])], + [1, 5, 2, np.array([1, 3, 5])], + [1, 4, 3, np.array([1, 4])], + [1, 5, 3, np.array([1, 2, 5])], + [1, 4, 4, np.array([1, 4])], + [1, 5, 4, np.array([1, 5])], + [4, 4, 2, np.array([4])], + [4, 5, 1, np.array([4, 5])], + [4, 5, 2, np.array([4, 5])], + ], +) +def test_rfecv_cv_results_n_features( + min_features_to_select, + n_features, + step, + cv_results_n_features, +): + X, y = make_classification( + n_samples=20, n_features=n_features, n_informative=n_features, n_redundant=0 + ) + rfecv = RFECV( + estimator=SVC(kernel="linear"), + step=step, + min_features_to_select=min_features_to_select, + ) + rfecv.fit(X, y) + assert_array_equal(rfecv.cv_results_["n_features"], cv_results_n_features) + assert all( + len(value) == len(rfecv.cv_results_["n_features"]) + for value in rfecv.cv_results_.values() + ) + + +@pytest.mark.parametrize("ClsRFE", [RFE, RFECV]) +def test_multioutput(ClsRFE): + X = np.random.normal(size=(10, 3)) + y = np.random.randint(2, size=(10, 2)) + clf = RandomForestClassifier(n_estimators=5) + rfe_test = ClsRFE(clf) + rfe_test.fit(X, y) + + +@pytest.mark.parametrize("ClsRFE", [RFE, RFECV]) 
+def test_pipeline_with_nans(ClsRFE): + """Check that RFE works with pipeline that accept nans. + + Non-regression test for gh-21743. + """ + X, y = load_iris(return_X_y=True) + X[0, 0] = np.nan + + pipe = make_pipeline( + SimpleImputer(), + StandardScaler(), + LogisticRegression(), + ) + + fs = ClsRFE( + estimator=pipe, + importance_getter="named_steps.logisticregression.coef_", + ) + fs.fit(X, y) + + +@pytest.mark.parametrize("ClsRFE", [RFE, RFECV]) +@pytest.mark.parametrize("PLSEstimator", [CCA, PLSCanonical, PLSRegression]) +def test_rfe_pls(ClsRFE, PLSEstimator): + """Check the behaviour of RFE with PLS estimators. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/12410 + """ + X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) + estimator = PLSEstimator(n_components=1) + selector = ClsRFE(estimator, step=1).fit(X, y) + assert selector.score(X, y) > 0.5 + + +def test_rfe_estimator_attribute_error(): + """Check that we raise the proper AttributeError when the estimator + does not implement the `decision_function` method, which is decorated with + `available_if`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + iris = load_iris() + + # `LinearRegression` does not implement 'decision_function' and should raise an + # AttributeError + rfe = RFE(estimator=LinearRegression()) + + outer_msg = "This 'RFE' has no attribute 'decision_function'" + inner_msg = "'LinearRegression' object has no attribute 'decision_function'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + rfe.fit(iris.data, iris.target).decision_function(iris.data) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +@pytest.mark.parametrize( + "ClsRFE, param", [(RFE, "n_features_to_select"), (RFECV, "min_features_to_select")] +) +def test_rfe_n_features_to_select_warning(ClsRFE, param): + """Check if the correct warning is raised when trying to initialize a RFE + object with a n_features_to_select attribute larger than the number of + features present in the X variable that is passed to the fit method + """ + X, y = make_classification(n_features=20, random_state=0) + + with pytest.warns(UserWarning, match=f"{param}=21 > n_features=20"): + # Create RFE/RFECV with n_features_to_select/min_features_to_select + # larger than the number of features present in the X variable + clsrfe = ClsRFE(estimator=LogisticRegression(), **{param: 21}) + clsrfe.fit(X, y) + + +def test_rfe_with_sample_weight(): + """Test that `RFE` works correctly with sample weights.""" + X, y = make_classification(random_state=0) + n_samples = X.shape[0] + + # Assign the first half of the samples with twice the weight + sample_weight = np.ones_like(y) + sample_weight[: n_samples // 2] = 2 + + # Duplicate the first half of the data samples to replicate the effect + # of sample weights for comparison + X2 = np.concatenate([X, X[: n_samples // 2]], axis=0) + y2 = np.concatenate([y, y[: n_samples // 2]]) + + estimator = SVC(kernel="linear") + + rfe_sw = RFE(estimator=estimator, step=0.1) + rfe_sw.fit(X, y, sample_weight=sample_weight) + + rfe = RFE(estimator=estimator, step=0.1) + rfe.fit(X2, y2) + + assert_array_equal(rfe_sw.ranking_, rfe.ranking_) + + # Also verify that when sample weights are not doubled the results + # are different from the duplicated data + rfe_sw_2 = RFE(estimator=estimator, step=0.1) + sample_weight_2 = np.ones_like(y) + rfe_sw_2.fit(X, y, 
sample_weight=sample_weight_2) + + assert not np.array_equal(rfe_sw_2.ranking_, rfe.ranking_) + + +def test_rfe_with_joblib_threading_backend(global_random_seed): + X, y = make_classification(random_state=global_random_seed) + + clf = LogisticRegression() + rfe = RFECV( + estimator=clf, + n_jobs=2, + ) + + rfe.fit(X, y) + ranking_ref = rfe.ranking_ + + with parallel_backend("threading"): + rfe.fit(X, y) + + assert_array_equal(ranking_ref, rfe.ranking_) + + +def test_results_per_cv_in_rfecv(global_random_seed): + """ + Test that the results of RFECV are consistent across the different folds + in terms of length of the arrays. + """ + X, y = make_classification(random_state=global_random_seed) + + clf = LogisticRegression() + rfecv = RFECV( + estimator=clf, + n_jobs=2, + cv=5, + ) + + rfecv.fit(X, y) + + assert len(rfecv.cv_results_["split1_test_score"]) == len( + rfecv.cv_results_["split2_test_score"] + ) + assert len(rfecv.cv_results_["split1_support"]) == len( + rfecv.cv_results_["split2_support"] + ) + assert len(rfecv.cv_results_["split1_ranking"]) == len( + rfecv.cv_results_["split2_ranking"] + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_sequential.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..b98d5b400b84eaa68440c0dbc3891b99372444a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_sequential.py @@ -0,0 +1,332 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs, make_classification, make_regression +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.feature_selection import SequentialFeatureSelector +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import LeaveOneGroupOut, cross_val_score +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_bad_n_features_to_select(): + n_features = 5 + X, y = make_regression(n_features=n_features) + sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_features) + with pytest.raises(ValueError, match="n_features_to_select must be < n_features"): + sfs.fit(X, y) + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +@pytest.mark.parametrize("n_features_to_select", (1, 5, 9, "auto")) +def test_n_features_to_select(direction, n_features_to_select): + # Make sure n_features_to_select is respected + + n_features = 10 + X, y = make_regression(n_features=n_features, random_state=0) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select=n_features_to_select, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + + if n_features_to_select == "auto": + n_features_to_select = n_features // 2 + + assert sfs.get_support(indices=True).shape[0] == n_features_to_select + assert sfs.n_features_to_select_ == n_features_to_select + assert sfs.transform(X).shape[1] == n_features_to_select + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +def test_n_features_to_select_auto(direction): + """Check the behaviour of `n_features_to_select="auto"` with different + values for the parameter `tol`. 
+ """ + + n_features = 10 + tol = 1e-3 + X, y = make_regression(n_features=n_features, random_state=0) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select="auto", + tol=tol, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + + max_features_to_select = n_features - 1 + + assert sfs.get_support(indices=True).shape[0] <= max_features_to_select + assert sfs.n_features_to_select_ <= max_features_to_select + assert sfs.transform(X).shape[1] <= max_features_to_select + assert sfs.get_support(indices=True).shape[0] == sfs.n_features_to_select_ + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +def test_n_features_to_select_stopping_criterion(direction): + """Check the behaviour stopping criterion for feature selection + depending on the values of `n_features_to_select` and `tol`. + + When `direction` is `'forward'`, select a new features at random + among those not currently selected in selector.support_, + build a new version of the data that includes all the features + in selector.support_ + this newly selected feature. + And check that the cross-validation score of the model trained on + this new dataset variant is lower than the model with + the selected forward selected features or at least does not improve + by more than the tol margin. + + When `direction` is `'backward'`, instead of adding a new feature + to selector.support_, try to remove one of those selected features at random + And check that the cross-validation score is either decreasing or + not improving by more than the tol margin. + """ + + X, y = make_regression(n_features=50, n_informative=10, random_state=0) + + tol = 1e-3 + + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select="auto", + tol=tol, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + selected_X = sfs.transform(X) + + rng = np.random.RandomState(0) + + added_candidates = list(set(range(X.shape[1])) - set(sfs.get_support(indices=True))) + added_X = np.hstack( + [ + selected_X, + (X[:, rng.choice(added_candidates)])[:, np.newaxis], + ] + ) + + removed_candidate = rng.choice(list(range(sfs.n_features_to_select_))) + removed_X = np.delete(selected_X, removed_candidate, axis=1) + + plain_cv_score = cross_val_score(LinearRegression(), X, y, cv=2).mean() + sfs_cv_score = cross_val_score(LinearRegression(), selected_X, y, cv=2).mean() + added_cv_score = cross_val_score(LinearRegression(), added_X, y, cv=2).mean() + removed_cv_score = cross_val_score(LinearRegression(), removed_X, y, cv=2).mean() + + assert sfs_cv_score >= plain_cv_score + + if direction == "forward": + assert (sfs_cv_score - added_cv_score) <= tol + assert (sfs_cv_score - removed_cv_score) >= tol + else: + assert (added_cv_score - sfs_cv_score) <= tol + assert (removed_cv_score - sfs_cv_score) <= tol + + +@pytest.mark.parametrize("direction", ("forward", "backward")) +@pytest.mark.parametrize( + "n_features_to_select, expected", + ( + (0.1, 1), + (1.0, 10), + (0.5, 5), + ), +) +def test_n_features_to_select_float(direction, n_features_to_select, expected): + # Test passing a float as n_features_to_select + X, y = make_regression(n_features=10) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select=n_features_to_select, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + assert sfs.n_features_to_select_ == expected + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("direction", ("forward", "backward")) +@pytest.mark.parametrize( + "n_features_to_select, expected_selected_features", 
+ [ + (2, [0, 2]), # f1 is dropped since it has no predictive power + (1, [2]), # f2 is more predictive than f0 so it's kept + ], +) +def test_sanity(seed, direction, n_features_to_select, expected_selected_features): + # Basic sanity check: 3 features, only f0 and f2 are correlated with the + # target, f2 having a stronger correlation than f0. We expect f1 to be + # dropped, and f2 to always be selected. + + rng = np.random.RandomState(seed) + n_samples = 100 + X = rng.randn(n_samples, 3) + y = 3 * X[:, 0] - 10 * X[:, 2] + + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select=n_features_to_select, + direction=direction, + cv=2, + ) + sfs.fit(X, y) + assert_array_equal(sfs.get_support(indices=True), expected_selected_features) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_support(csr_container): + # Make sure sparse data is supported + + X, y = make_regression(n_features=10) + X = csr_container(X) + sfs = SequentialFeatureSelector( + LinearRegression(), n_features_to_select="auto", cv=2 + ) + sfs.fit(X, y) + sfs.transform(X) + + +def test_nan_support(): + # Make sure nans are OK if the underlying estimator supports nans + + rng = np.random.RandomState(0) + n_samples, n_features = 40, 4 + X, y = make_regression(n_samples, n_features, random_state=0) + nan_mask = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool) + X[nan_mask] = np.nan + sfs = SequentialFeatureSelector( + HistGradientBoostingRegressor(), n_features_to_select="auto", cv=2 + ) + sfs.fit(X, y) + sfs.transform(X) + + with pytest.raises(ValueError, match="Input X contains NaN"): + # LinearRegression does not support nans + SequentialFeatureSelector( + LinearRegression(), n_features_to_select="auto", cv=2 + ).fit(X, y) + + +def test_pipeline_support(): + # Make sure that pipelines can be passed into SFS and that SFS can be + # passed into a pipeline + + n_samples, n_features = 50, 3 + X, y = make_regression(n_samples, n_features, random_state=0) + + # pipeline in SFS + pipe = make_pipeline(StandardScaler(), LinearRegression()) + sfs = SequentialFeatureSelector(pipe, n_features_to_select="auto", cv=2) + sfs.fit(X, y) + sfs.transform(X) + + # SFS in pipeline + sfs = SequentialFeatureSelector( + LinearRegression(), n_features_to_select="auto", cv=2 + ) + pipe = make_pipeline(StandardScaler(), sfs) + pipe.fit(X, y) + pipe.transform(X) + + +@pytest.mark.parametrize("n_features_to_select", (2, 3)) +def test_unsupervised_model_fit(n_features_to_select): + # Make sure that models without classification labels are not being + # validated + + X, y = make_blobs(n_features=4) + sfs = SequentialFeatureSelector( + KMeans(n_init=1), + n_features_to_select=n_features_to_select, + ) + sfs.fit(X) + assert sfs.transform(X).shape[1] == n_features_to_select + + +@pytest.mark.parametrize("y", ("no_validation", 1j, 99.9, np.nan, 3)) +def test_no_y_validation_model_fit(y): + # Make sure that other non-conventional y labels are not accepted + + X, clusters = make_blobs(n_features=6) + sfs = SequentialFeatureSelector( + KMeans(), + n_features_to_select=3, + ) + + with pytest.raises((TypeError, ValueError)): + sfs.fit(X, y) + + +def test_forward_neg_tol_error(): + """Check that we raise an error when tol<0 and direction='forward'""" + X, y = make_regression(n_features=10, random_state=0) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select="auto", + direction="forward", + tol=-1e-3, + ) + + with pytest.raises(ValueError, match="tol must be strictly positive"): + 
sfs.fit(X, y) + + +def test_backward_neg_tol(): + """Check that SequentialFeatureSelector works with negative tol + + non-regression test for #25525 + """ + X, y = make_regression(n_features=10, random_state=0) + lr = LinearRegression() + initial_score = lr.fit(X, y).score(X, y) + + sfs = SequentialFeatureSelector( + lr, + n_features_to_select="auto", + direction="backward", + tol=-1e-3, + ) + Xr = sfs.fit_transform(X, y) + new_score = lr.fit(Xr, y).score(Xr, y) + + assert 0 < sfs.get_support().sum() < X.shape[1] + assert new_score < initial_score + + +def test_cv_generator_support(): + """Check that no exception is raised when cv is a generator + + non-regression test for #25957 + """ + X, y = make_classification(random_state=0) + + groups = np.zeros_like(y, dtype=int) + groups[y.size // 2 :] = 1 + + cv = LeaveOneGroupOut() + splits = cv.split(X, y, groups=groups) + + knc = KNeighborsClassifier(n_neighbors=5) + + sfs = SequentialFeatureSelector(knc, n_features_to_select=5, cv=splits) + sfs.fit(X, y) + + +def test_fit_rejects_params_with_no_routing_enabled(): + X, y = make_classification(random_state=42) + est = LinearRegression() + sfs = SequentialFeatureSelector(estimator=est) + + with pytest.raises(ValueError, match="is only supported if"): + sfs.fit(X, y, sample_weight=np.ones_like(y)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_variance_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_variance_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..45e66cb338a4b7a5a410db669a13f6f9213451dc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/feature_selection/tests/test_variance_threshold.py @@ -0,0 +1,72 @@ +import numpy as np +import pytest + +from sklearn.feature_selection import VarianceThreshold +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import BSR_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +data = [[0, 1, 2, 3, 4], [0, 2, 2, 3, 5], [1, 1, 2, 4, 0]] + +data2 = [[-0.13725701]] * 10 + + +@pytest.mark.parametrize( + "sparse_container", [None] + BSR_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_zero_variance(sparse_container): + # Test VarianceThreshold with default setting, zero variance. + X = data if sparse_container is None else sparse_container(data) + sel = VarianceThreshold().fit(X) + assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True)) + + +def test_zero_variance_value_error(): + # Test VarianceThreshold with default setting, zero variance, error cases. + with pytest.raises(ValueError): + VarianceThreshold().fit([[0, 1, 2, 3]]) + with pytest.raises(ValueError): + VarianceThreshold().fit([[0, 1], [0, 1]]) + + +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +def test_variance_threshold(sparse_container): + # Test VarianceThreshold with custom variance. + X = data if sparse_container is None else sparse_container(data) + X = VarianceThreshold(threshold=0.4).fit_transform(X) + assert (len(data), 1) == X.shape + + +@pytest.mark.skipif( + np.var(data2) == 0, + reason=( + "This test is not valid for this platform, " + "as it relies on numerical instabilities."
+ ), +) +@pytest.mark.parametrize( + "sparse_container", [None] + BSR_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_zero_variance_floating_point_error(sparse_container): + # Test that VarianceThreshold(0.0).fit eliminates features that have + # the same value in every sample, even when floating point errors + # cause np.var not to be 0 for the feature. + # See #13691 + X = data2 if sparse_container is None else sparse_container(data2) + msg = "No feature in X meets the variance threshold 0.00000" + with pytest.raises(ValueError, match=msg): + VarianceThreshold().fit(X) + + +@pytest.mark.parametrize( + "sparse_container", [None] + BSR_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_variance_nan(sparse_container): + arr = np.array(data, dtype=np.float64) + # add single NaN and feature should still be included + arr[0, 0] = np.nan + # make all values in feature NaN and feature should be rejected + arr[:, 1] = np.nan + + X = arr if sparse_container is None else sparse_container(arr) + sel = VarianceThreshold().fit(X) + assert_array_equal([0, 3, 4], sel.get_support(indices=True)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca540b79229c87447f40eed6717fe59202885f0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/frozen/__init__.py @@ -0,0 +1,6 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._frozen import FrozenEstimator + +__all__ = ["FrozenEstimator"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/_frozen.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/_frozen.py new file mode 100644 index 0000000000000000000000000000000000000000..7585ea2597b5995a5e7ffcaf8f7f9b78fd676e6e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/frozen/_frozen.py @@ -0,0 +1,166 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from copy import deepcopy + +from ..base import BaseEstimator +from ..exceptions import NotFittedError +from ..utils import get_tags +from ..utils.metaestimators import available_if +from ..utils.validation import check_is_fitted + + +def _estimator_has(attr): + """Check that final_estimator has `attr`. + + Used together with `available_if`. + """ + + def check(self): + # raise original `AttributeError` if `attr` does not exist + getattr(self.estimator, attr) + return True + + return check + + +class FrozenEstimator(BaseEstimator): + """Estimator that wraps a fitted estimator to prevent re-fitting. + + This meta-estimator takes an estimator and freezes it, in the sense that calling + `fit` on it has no effect. `fit_predict` and `fit_transform` are also disabled. + All other methods are delegated to the original estimator and original estimator's + attributes are accessible as well. + + This is particularly useful when you have a fitted or a pre-trained model as a + transformer in a pipeline, and you'd like `pipeline.fit` to have no effect on this + step. + + Parameters + ---------- + estimator : estimator + The estimator which is to be kept frozen. + + See Also + -------- + None: No similar entry in the scikit-learn documentation. 
+ + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.frozen import FrozenEstimator + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(random_state=0) + >>> clf = LogisticRegression(random_state=0).fit(X, y) + >>> frozen_clf = FrozenEstimator(clf) + >>> frozen_clf.fit(X, y) # No-op + FrozenEstimator(estimator=LogisticRegression(random_state=0)) + >>> frozen_clf.predict(X) # Predictions from `clf.predict` + array(...) + """ + + def __init__(self, estimator): + self.estimator = estimator + + @available_if(_estimator_has("__getitem__")) + def __getitem__(self, *args, **kwargs): + """__getitem__ is defined in :class:`~sklearn.pipeline.Pipeline` and \ + :class:`~sklearn.compose.ColumnTransformer`. + """ + return self.estimator.__getitem__(*args, **kwargs) + + def __getattr__(self, name): + # `estimator`'s attributes are now accessible except `fit_predict` and + # `fit_transform` + if name in ["fit_predict", "fit_transform"]: + raise AttributeError(f"{name} is not available for frozen estimators.") + return getattr(self.estimator, name) + + def __sklearn_clone__(self): + return self + + def __sklearn_is_fitted__(self): + try: + check_is_fitted(self.estimator) + return True + except NotFittedError: + return False + + def fit(self, X, y, *args, **kwargs): + """No-op. + + As a frozen estimator, calling `fit` has no effect. + + Parameters + ---------- + X : object + Ignored. + + y : object + Ignored. + + *args : tuple + Additional positional arguments. Ignored, but present for API compatibility + with `self.estimator`. + + **kwargs : dict + Additional keyword arguments. Ignored, but present for API compatibility + with `self.estimator`. + + Returns + ------- + self : object + Returns the instance itself. + """ + check_is_fitted(self.estimator) + return self + + def set_params(self, **kwargs): + """Set the parameters of this estimator. + + The only valid key here is `estimator`. You cannot set the parameters of the + inner estimator. + + Parameters + ---------- + **kwargs : dict + Estimator parameters. + + Returns + ------- + self : FrozenEstimator + This estimator. + """ + estimator = kwargs.pop("estimator", None) + if estimator is not None: + self.estimator = estimator + if kwargs: + raise ValueError( + "You cannot set parameters of the inner estimator in a frozen " + "estimator since calling `fit` has no effect. You can use " + "`frozenestimator.estimator.set_params` to set parameters of the inner " + "estimator." + ) + + def get_params(self, deep=True): + """Get parameters for this estimator. + + Returns a `{"estimator": estimator}` dict. The parameters of the inner + estimator are not included. + + Parameters + ---------- + deep : bool, default=True + Ignored. + + Returns + ------- + params : dict + Parameter names mapped to their values. 
+ """ + return {"estimator": self.estimator} + + def __sklearn_tags__(self): + tags = deepcopy(get_tags(self.estimator)) + tags._skip_test = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/test_frozen.py b/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/test_frozen.py new file mode 100644 index 0000000000000000000000000000000000000000..b304d3ac0aa2c32d6b494351ef0c0d0209866b71 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/frozen/tests/test_frozen.py @@ -0,0 +1,223 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn import config_context +from sklearn.base import ( + BaseEstimator, + clone, + is_classifier, + is_clusterer, + is_outlier_detector, + is_regressor, +) +from sklearn.cluster import KMeans +from sklearn.compose import make_column_transformer +from sklearn.datasets import make_classification, make_regression +from sklearn.exceptions import NotFittedError, UnsetMetadataPassedError +from sklearn.frozen import FrozenEstimator +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.neighbors import LocalOutlierFactor +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import RobustScaler, StandardScaler +from sklearn.utils._testing import set_random_state +from sklearn.utils.validation import check_is_fitted + + +@pytest.fixture +def regression_dataset(): + return make_regression() + + +@pytest.fixture +def classification_dataset(): + return make_classification() + + +@pytest.mark.parametrize( + "estimator, dataset", + [ + (LinearRegression(), "regression_dataset"), + (LogisticRegression(), "classification_dataset"), + (make_pipeline(StandardScaler(), LinearRegression()), "regression_dataset"), + ( + make_pipeline(StandardScaler(), LogisticRegression()), + "classification_dataset", + ), + (StandardScaler(), "regression_dataset"), + (KMeans(), "regression_dataset"), + (LocalOutlierFactor(), "regression_dataset"), + ( + make_column_transformer( + (StandardScaler(), [0]), + (RobustScaler(), [1]), + ), + "regression_dataset", + ), + ], +) +@pytest.mark.parametrize( + "method", + ["predict", "predict_proba", "predict_log_proba", "decision_function", "transform"], +) +def test_frozen_methods(estimator, dataset, request, method): + """Test that frozen.fit doesn't do anything, and that all other methods are + exposed by the frozen estimator and return the same values as the estimator. 
+ """ + X, y = request.getfixturevalue(dataset) + set_random_state(estimator) + estimator.fit(X, y) + frozen = FrozenEstimator(estimator) + # this should be no-op + frozen.fit([[1]], [1]) + + if hasattr(estimator, method): + assert_array_equal(getattr(estimator, method)(X), getattr(frozen, method)(X)) + + assert is_classifier(estimator) == is_classifier(frozen) + assert is_regressor(estimator) == is_regressor(frozen) + assert is_clusterer(estimator) == is_clusterer(frozen) + assert is_outlier_detector(estimator) == is_outlier_detector(frozen) + + +@config_context(enable_metadata_routing=True) +def test_frozen_metadata_routing(regression_dataset): + """Test that metadata routing works with frozen estimators.""" + + class ConsumesMetadata(BaseEstimator): + def __init__(self, on_fit=None, on_predict=None): + self.on_fit = on_fit + self.on_predict = on_predict + + def fit(self, X, y, metadata=None): + if self.on_fit: + assert metadata is not None + self.fitted_ = True + return self + + def predict(self, X, metadata=None): + if self.on_predict: + assert metadata is not None + return np.ones(len(X)) + + X, y = regression_dataset + pipeline = make_pipeline( + ConsumesMetadata(on_fit=True, on_predict=True) + .set_fit_request(metadata=True) + .set_predict_request(metadata=True) + ) + + pipeline.fit(X, y, metadata="test") + frozen = FrozenEstimator(pipeline) + pipeline.predict(X, metadata="test") + frozen.predict(X, metadata="test") + + frozen["consumesmetadata"].set_predict_request(metadata=False) + with pytest.raises( + TypeError, + match=re.escape( + "Pipeline.predict got unexpected argument(s) {'metadata'}, which are not " + "routed to any object." + ), + ): + frozen.predict(X, metadata="test") + + frozen["consumesmetadata"].set_predict_request(metadata=None) + with pytest.raises(UnsetMetadataPassedError): + frozen.predict(X, metadata="test") + + +def test_composite_fit(classification_dataset): + """Test that calling fit_transform and fit_predict doesn't call fit.""" + + class Estimator(BaseEstimator): + def fit(self, X, y): + try: + self._fit_counter += 1 + except AttributeError: + self._fit_counter = 1 + return self + + def fit_transform(self, X, y=None): + # only here to test that it doesn't get called + ... # pragma: no cover + + def fit_predict(self, X, y=None): + # only here to test that it doesn't get called + ... 
# pragma: no cover + + X, y = classification_dataset + est = Estimator().fit(X, y) + frozen = FrozenEstimator(est) + + with pytest.raises(AttributeError): + frozen.fit_predict(X, y) + with pytest.raises(AttributeError): + frozen.fit_transform(X, y) + + assert frozen._fit_counter == 1 + + +def test_clone_frozen(regression_dataset): + """Test that cloning a frozen estimator keeps the frozen state.""" + X, y = regression_dataset + estimator = LinearRegression().fit(X, y) + frozen = FrozenEstimator(estimator) + cloned = clone(frozen) + assert cloned.estimator is estimator + + +def test_check_is_fitted(regression_dataset): + """Test that check_is_fitted works on frozen estimators.""" + X, y = regression_dataset + + estimator = LinearRegression() + frozen = FrozenEstimator(estimator) + with pytest.raises(NotFittedError): + check_is_fitted(frozen) + + estimator = LinearRegression().fit(X, y) + frozen = FrozenEstimator(estimator) + check_is_fitted(frozen) + + +def test_frozen_tags(): + """Test that frozen estimators have the same tags as the original estimator + except for the skip_test tag.""" + + class Estimator(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.categorical = True + return tags + + estimator = Estimator() + frozen = FrozenEstimator(estimator) + frozen_tags = frozen.__sklearn_tags__() + estimator_tags = estimator.__sklearn_tags__() + + assert frozen_tags._skip_test is True + assert estimator_tags._skip_test is False + + assert estimator_tags.input_tags.categorical is True + assert frozen_tags.input_tags.categorical is True + + +def test_frozen_params(): + """Test that FrozenEstimator only exposes the estimator parameter.""" + est = LogisticRegression() + frozen = FrozenEstimator(est) + + with pytest.raises(ValueError, match="You cannot set parameters of the inner"): + frozen.set_params(estimator__C=1) + + assert frozen.get_params() == {"estimator": est} + + other_est = LocalOutlierFactor() + frozen.set_params(estimator=other_est) + assert frozen.get_params() == {"estimator": other_est} diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9fafaf67e4ed042a95058e294f2395ea0dffb55d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/__init__.py @@ -0,0 +1,10 @@ +"""Gaussian process based regression and classification.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from . 
import kernels +from ._gpc import GaussianProcessClassifier +from ._gpr import GaussianProcessRegressor + +__all__ = ["GaussianProcessClassifier", "GaussianProcessRegressor", "kernels"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpc.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpc.py new file mode 100644 index 0000000000000000000000000000000000000000..0ecceb47de9058643daee84faaab1e9927919c26 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpc.py @@ -0,0 +1,973 @@ +"""Gaussian processes classification.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral +from operator import itemgetter + +import numpy as np +import scipy.optimize +from scipy.linalg import cho_solve, cholesky, solve +from scipy.special import erf, expit + +from ..base import BaseEstimator, ClassifierMixin, _fit_context, clone +from ..multiclass import OneVsOneClassifier, OneVsRestClassifier +from ..preprocessing import LabelEncoder +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.optimize import _check_optimize_result +from ..utils.validation import check_is_fitted, validate_data +from .kernels import RBF, CompoundKernel, Kernel +from .kernels import ConstantKernel as C + +# Values required for approximating the logistic sigmoid by +# error functions. coefs are obtained via: +# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf]) +# b = logistic(x) +# A = (erf(np.dot(x, self.lambdas)) + 1) / 2 +# coefs = lstsq(A, b)[0] +LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis] +COEFS = np.array( + [-1854.8214151, 3516.89893646, 221.29346712, 128.12323805, -2010.49422654] +)[:, np.newaxis] + + +class _BinaryGaussianProcessClassifierLaplace(BaseEstimator): + """Binary Gaussian process classification based on Laplace approximation. + + The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_. + + Internally, the Laplace approximation is used for approximating the + non-Gaussian posterior by a Gaussian. + + Currently, the implementation is restricted to using the logistic link + function. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : kernel instance, default=None + The kernel specifying the covariance function of the GP. If None is + passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that + the kernel's hyperparameters are optimized during fitting. + + optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b' + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func' is the objective function to be maximized, which + # takes the hyperparameters theta as parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize + is used. If None is passed, the kernel's parameters are kept fixed. 
+ Available internal optimizers are:: + + 'fmin_l_bfgs_b' + + n_restarts_optimizer : int, default=0 + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that n_restarts_optimizer=0 implies that one + run is performed. + + max_iter_predict : int, default=100 + The maximum number of iterations in Newton's method for approximating + the posterior during predict. Smaller values will reduce computation + time at the cost of worse results. + + warm_start : bool, default=False + If warm-starts are enabled, the solution of the last Newton iteration + on the Laplace approximation of the posterior mode is used as + initialization for the next call of _posterior_mode(). This can speed + up convergence when _posterior_mode is called several times on similar + problems as in hyperparameter optimization. See :term:`the Glossary + `. + + copy_X_train : bool, default=True + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + X_train_ : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data (also + required for prediction). + + y_train_ : array-like of shape (n_samples,) + Target values in training data (also required for prediction) + + classes_ : array-like of shape (n_classes,) + Unique class labels. + + kernel_ : kernel instance + The kernel used for prediction. The structure of the kernel is the + same as the one passed as parameter but with optimized hyperparameters + + L_ : array-like of shape (n_samples, n_samples) + Lower-triangular Cholesky decomposition of the kernel in X_train_ + + pi_ : array-like of shape (n_samples,) + The probabilities of the positive class for the training points + X_train_ + + W_sr_ : array-like of shape (n_samples,) + Square root of W, the Hessian of log-likelihood of the latent function + values for the observed labels. Since W is diagonal, only the diagonal + of sqrt(W) is stored. + + log_marginal_likelihood_value_ : float + The log-marginal-likelihood of ``self.kernel_.theta`` + + References + ---------- + .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams, + "Gaussian Processes for Machine Learning", + MIT Press 2006 `_ + """ + + def __init__( + self, + kernel=None, + *, + optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, + max_iter_predict=100, + warm_start=False, + copy_X_train=True, + random_state=None, + ): + self.kernel = kernel + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.max_iter_predict = max_iter_predict + self.warm_start = warm_start + self.copy_X_train = copy_X_train + self.random_state = random_state + + def fit(self, X, y): + """Fit Gaussian process classification model.
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data. + + y : array-like of shape (n_samples,) + Target values, must be binary. + + Returns + ------- + self : returns an instance of self. + """ + if self.kernel is None: # Use an RBF kernel as default + self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + else: + self.kernel_ = clone(self.kernel) + + self.rng = check_random_state(self.random_state) + + self.X_train_ = np.copy(X) if self.copy_X_train else X + + # Encode class labels and check that it is a binary classification + # problem + label_encoder = LabelEncoder() + self.y_train_ = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + if self.classes_.size > 2: + raise ValueError( + "%s supports only binary classification. y contains classes %s" + % (self.__class__.__name__, self.classes_) + ) + elif self.classes_.size == 1: + raise ValueError( + "{0:s} requires 2 classes; got {1:d} class".format( + self.__class__.__name__, self.classes_.size + ) + ) + + if self.optimizer is not None and self.kernel_.n_dims > 0: + # Choose hyperparameters based on maximizing the log-marginal + # likelihood (potentially starting from several initial values) + def obj_func(theta, eval_gradient=True): + if eval_gradient: + lml, grad = self.log_marginal_likelihood( + theta, eval_gradient=True, clone_kernel=False + ) + return -lml, -grad + else: + return -self.log_marginal_likelihood(theta, clone_kernel=False) + + # First optimize starting from theta specified in kernel + optima = [ + self._constrained_optimization( + obj_func, self.kernel_.theta, self.kernel_.bounds + ) + ] + + # Additional runs are performed from log-uniform chosen initial + # theta + if self.n_restarts_optimizer > 0: + if not np.isfinite(self.kernel_.bounds).all(): + raise ValueError( + "Multiple optimizer restarts (n_restarts_optimizer>0) " + "requires that all bounds are finite." + ) + bounds = self.kernel_.bounds + for iteration in range(self.n_restarts_optimizer): + theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1])) + optima.append( + self._constrained_optimization(obj_func, theta_initial, bounds) + ) + # Select result from run with minimal (negative) log-marginal + # likelihood + lml_values = list(map(itemgetter(1), optima)) + self.kernel_.theta = optima[np.argmin(lml_values)][0] + self.kernel_._check_bounds_params() + + self.log_marginal_likelihood_value_ = -np.min(lml_values) + else: + self.log_marginal_likelihood_value_ = self.log_marginal_likelihood( + self.kernel_.theta + ) + + # Precompute quantities required for predictions which are independent + # of actual query points + K = self.kernel_(self.X_train_) + + _, (self.pi_, self.W_sr_, self.L_, _, _) = self._posterior_mode( + K, return_temporaries=True + ) + + return self + + def predict(self, X): + """Perform classification on an array of test vectors X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. 
+ + Returns + ------- + C : ndarray of shape (n_samples,) + Predicted target values for X, values are from ``classes_`` + """ + check_is_fitted(self) + + # As discussed on Section 3.4.2 of GPML, for making hard binary + # decisions, it is enough to compute the MAP of the posterior and + # pass it through the link function + K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star) + f_star = K_star.T.dot(self.y_train_ - self.pi_) # Algorithm 3.2,Line 4 + + return np.where(f_star > 0, self.classes_[1], self.classes_[0]) + + def predict_proba(self, X): + """Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + C : array-like of shape (n_samples, n_classes) + Returns the probability of the samples for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute ``classes_``. + """ + check_is_fitted(self) + + # Compute the mean and variance of the latent function + # (Lines 4-6 of Algorithm 3.2 of GPML) + latent_mean, latent_var = self.latent_mean_and_variance(X) + + # Line 7: + # Approximate \int log(z) * N(z | f_star, var_f_star) + # Approximation is due to Williams & Barber, "Bayesian Classification + # with Gaussian Processes", Appendix A: Approximate the logistic + # sigmoid by a linear combination of 5 error functions. + # For information on how this integral can be computed see + # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html + alpha = 1 / (2 * latent_var) + gamma = LAMBDAS * latent_mean + integrals = ( + np.sqrt(np.pi / alpha) + * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) + / (2 * np.sqrt(latent_var * 2 * np.pi)) + ) + pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum() + + return np.vstack((1 - pi_star, pi_star)).T + + def log_marginal_likelihood( + self, theta=None, eval_gradient=False, clone_kernel=True + ): + """Returns log-marginal likelihood of theta for training data. + + Parameters + ---------- + theta : array-like of shape (n_kernel_params,), default=None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. If None, the precomputed log_marginal_likelihood + of ``self.kernel_.theta`` is returned. + + eval_gradient : bool, default=False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. If True, theta must not be None. + + clone_kernel : bool, default=True + If True, the kernel attribute is copied. If False, the kernel + attribute is modified, but may result in a performance improvement. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : ndarray of shape (n_kernel_params,), \ + optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when `eval_gradient` is True. 
+ """ + if theta is None: + if eval_gradient: + raise ValueError("Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + if clone_kernel: + kernel = self.kernel_.clone_with_theta(theta) + else: + kernel = self.kernel_ + kernel.theta = theta + + if eval_gradient: + K, K_gradient = kernel(self.X_train_, eval_gradient=True) + else: + K = kernel(self.X_train_) + + # Compute log-marginal-likelihood Z and also store some temporaries + # which can be reused for computing Z's gradient + Z, (pi, W_sr, L, b, a) = self._posterior_mode(K, return_temporaries=True) + + if not eval_gradient: + return Z + + # Compute gradient based on Algorithm 5.1 of GPML + d_Z = np.empty(theta.shape[0]) + # XXX: Get rid of the np.diag() in the next line + R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr)) # Line 7 + C = solve(L, W_sr[:, np.newaxis] * K) # Line 8 + # Line 9: (use einsum to compute np.diag(C.T.dot(C)))) + s_2 = ( + -0.5 + * (np.diag(K) - np.einsum("ij, ij -> j", C, C)) + * (pi * (1 - pi) * (1 - 2 * pi)) + ) # third derivative + + for j in range(d_Z.shape[0]): + C = K_gradient[:, :, j] # Line 11 + # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C))) + s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel()) + + b = C.dot(self.y_train_ - pi) # Line 13 + s_3 = b - K.dot(R.dot(b)) # Line 14 + + d_Z[j] = s_1 + s_2.T.dot(s_3) # Line 15 + + return Z, d_Z + + def latent_mean_and_variance(self, X): + """Compute the mean and variance of the latent function values. + + Based on algorithm 3.2 of [RW2006]_, this function returns the latent + mean (Line 4) and variance (Line 6) of the Gaussian process + classification model. + + Note that this function is only supported for binary classification. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + latent_mean : array-like of shape (n_samples,) + Mean of the latent function values at the query points. + + latent_var : array-like of shape (n_samples,) + Variance of the latent function values at the query points. + """ + check_is_fitted(self) + + # Based on Algorithm 3.2 of GPML + K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star) + latent_mean = K_star.T.dot(self.y_train_ - self.pi_) # Line 4 + v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star) # Line 5 + # Line 6 (compute np.diag(v.T.dot(v)) via einsum) + latent_var = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v) + + return latent_mean, latent_var + + def _posterior_mode(self, K, return_temporaries=False): + """Mode-finding for binary Laplace GPC and fixed kernel. + + This approximates the posterior of the latent function values for given + inputs and target observations with a Gaussian approximation and uses + Newton's iteration to find the mode of this approximation. 
+ """ + # Based on Algorithm 3.1 of GPML + + # If warm_start are enabled, we reuse the last solution for the + # posterior mode as initialization; otherwise, we initialize with 0 + if ( + self.warm_start + and hasattr(self, "f_cached") + and self.f_cached.shape == self.y_train_.shape + ): + f = self.f_cached + else: + f = np.zeros_like(self.y_train_, dtype=np.float64) + + # Use Newton's iteration method to find mode of Laplace approximation + log_marginal_likelihood = -np.inf + for _ in range(self.max_iter_predict): + # Line 4 + pi = expit(f) + W = pi * (1 - pi) + # Line 5 + W_sr = np.sqrt(W) + W_sr_K = W_sr[:, np.newaxis] * K + B = np.eye(W.shape[0]) + W_sr_K * W_sr + L = cholesky(B, lower=True) + # Line 6 + b = W * f + (self.y_train_ - pi) + # Line 7 + a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b)) + # Line 8 + f = K.dot(a) + + # Line 10: Compute log marginal likelihood in loop and use as + # convergence criterion + lml = ( + -0.5 * a.T.dot(f) + - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum() + - np.log(np.diag(L)).sum() + ) + # Check if we have converged (log marginal likelihood does + # not decrease) + # XXX: more complex convergence criterion + if lml - log_marginal_likelihood < 1e-10: + break + log_marginal_likelihood = lml + + self.f_cached = f # Remember solution for later warm-starts + if return_temporaries: + return log_marginal_likelihood, (pi, W_sr, L, b, a) + else: + return log_marginal_likelihood + + def _constrained_optimization(self, obj_func, initial_theta, bounds): + if self.optimizer == "fmin_l_bfgs_b": + opt_res = scipy.optimize.minimize( + obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds + ) + _check_optimize_result("lbfgs", opt_res) + theta_opt, func_min = opt_res.x, opt_res.fun + elif callable(self.optimizer): + theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds) + else: + raise ValueError("Unknown optimizer %s." % self.optimizer) + + return theta_opt, func_min + + +class GaussianProcessClassifier(ClassifierMixin, BaseEstimator): + """Gaussian process classification (GPC) based on Laplace approximation. + + The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_. + + Internally, the Laplace approximation is used for approximating the + non-Gaussian posterior by a Gaussian. + + Currently, the implementation is restricted to using the logistic link + function. For multi-class classification, several binary one-versus rest + classifiers are fitted. Note that this class thus does not implement + a true multi-class Laplace approximation. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : kernel instance, default=None + The kernel specifying the covariance function of the GP. If None is + passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that + the kernel's hyperparameters are optimized during fitting. Also kernel + cannot be a `CompoundKernel`. + + optimizer : 'fmin_l_bfgs_b', callable or None, default='fmin_l_bfgs_b' + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. 
If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func' is the objective function to be maximized, which + # takes the hyperparameters theta as parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize + is used. If None is passed, the kernel's parameters are kept fixed. + Available internal optimizers are:: + + 'fmin_l_bfgs_b' + + n_restarts_optimizer : int, default=0 + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that n_restarts_optimizer=0 implies that one + run is performed. + + max_iter_predict : int, default=100 + The maximum number of iterations in Newton's method for approximating + the posterior during predict. Smaller values will reduce computation + time at the cost of worse results. + + warm_start : bool, default=False + If warm-starts are enabled, the solution of the last Newton iteration + on the Laplace approximation of the posterior mode is used as + initialization for the next call of _posterior_mode(). This can speed + up convergence when _posterior_mode is called several times on similar + problems as in hyperparameter optimization. See :term:`the Glossary + `. + + copy_X_train : bool, default=True + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest' + Specifies how multi-class classification problems are handled. + Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest', + one binary Gaussian process classifier is fitted for each class, which + is trained to separate this class from the rest. In 'one_vs_one', one + binary Gaussian process classifier is fitted for each pair of classes, + which is trained to separate these two classes. The predictions of + these binary predictors are combined into multi-class predictions. + Note that 'one_vs_one' does not support predicting probability + estimates. + + n_jobs : int, default=None + The number of jobs to use for the computation: the specified + multiclass problems are computed in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + base_estimator_ : ``Estimator`` instance + The estimator instance that defines the likelihood function + using the observed data. 
+ + kernel_ : kernel instance + The kernel used for prediction. In case of binary classification, + the structure of the kernel is the same as the one passed as parameter + but with optimized hyperparameters. In case of multi-class + classification, a CompoundKernel is returned which consists of the + different kernels used in the one-versus-rest classifiers. + + log_marginal_likelihood_value_ : float + The log-marginal-likelihood of ``self.kernel_.theta`` + + classes_ : array-like of shape (n_classes,) + Unique class labels. + + n_classes_ : int + The number of classes in the training data + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GaussianProcessRegressor : Gaussian process regression (GPR). + + References + ---------- + .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams, + "Gaussian Processes for Machine Learning", + MIT Press 2006 `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import RBF + >>> X, y = load_iris(return_X_y=True) + >>> kernel = 1.0 * RBF(1.0) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9866... + >>> gpc.predict_proba(X[:2,:]) + array([[0.83548752, 0.03228706, 0.13222543], + [0.79064206, 0.06525643, 0.14410151]]) + + For a comparison of the GaussianProcessClassifier with other classifiers see: + :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py`. + """ + + _parameter_constraints: dict = { + "kernel": [Kernel, None], + "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None], + "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")], + "max_iter_predict": [Interval(Integral, 1, None, closed="left")], + "warm_start": ["boolean"], + "copy_X_train": ["boolean"], + "random_state": ["random_state"], + "multi_class": [StrOptions({"one_vs_rest", "one_vs_one"})], + "n_jobs": [Integral, None], + } + + def __init__( + self, + kernel=None, + *, + optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, + max_iter_predict=100, + warm_start=False, + copy_X_train=True, + random_state=None, + multi_class="one_vs_rest", + n_jobs=None, + ): + self.kernel = kernel + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.max_iter_predict = max_iter_predict + self.warm_start = warm_start + self.copy_X_train = copy_X_train + self.random_state = random_state + self.multi_class = multi_class + self.n_jobs = n_jobs + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit Gaussian process classification model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data. + + y : array-like of shape (n_samples,) + Target values, must be binary. + + Returns + ------- + self : object + Returns an instance of self. 
+ """ + if isinstance(self.kernel, CompoundKernel): + raise ValueError("kernel cannot be a CompoundKernel") + + if self.kernel is None or self.kernel.requires_vector_input: + X, y = validate_data( + self, X, y, multi_output=False, ensure_2d=True, dtype="numeric" + ) + else: + X, y = validate_data( + self, X, y, multi_output=False, ensure_2d=False, dtype=None + ) + + self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( + kernel=self.kernel, + optimizer=self.optimizer, + n_restarts_optimizer=self.n_restarts_optimizer, + max_iter_predict=self.max_iter_predict, + warm_start=self.warm_start, + copy_X_train=self.copy_X_train, + random_state=self.random_state, + ) + + self.classes_ = np.unique(y) + self.n_classes_ = self.classes_.size + if self.n_classes_ == 1: + raise ValueError( + "GaussianProcessClassifier requires 2 or more " + "distinct classes; got %d class (only class %s " + "is present)" % (self.n_classes_, self.classes_[0]) + ) + if self.n_classes_ > 2: + if self.multi_class == "one_vs_rest": + self.base_estimator_ = OneVsRestClassifier( + self.base_estimator_, n_jobs=self.n_jobs + ) + elif self.multi_class == "one_vs_one": + self.base_estimator_ = OneVsOneClassifier( + self.base_estimator_, n_jobs=self.n_jobs + ) + else: + raise ValueError("Unknown multi-class mode %s" % self.multi_class) + + self.base_estimator_.fit(X, y) + + if self.n_classes_ > 2: + self.log_marginal_likelihood_value_ = np.mean( + [ + estimator.log_marginal_likelihood() + for estimator in self.base_estimator_.estimators_ + ] + ) + else: + self.log_marginal_likelihood_value_ = ( + self.base_estimator_.log_marginal_likelihood() + ) + + return self + + def predict(self, X): + """Perform classification on an array of test vectors X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + C : ndarray of shape (n_samples,) + Predicted target values for X, values are from ``classes_``. + """ + check_is_fitted(self) + + if self.kernel is None or self.kernel.requires_vector_input: + X = validate_data(self, X, ensure_2d=True, dtype="numeric", reset=False) + else: + X = validate_data(self, X, ensure_2d=False, dtype=None, reset=False) + + return self.base_estimator_.predict(X) + + def predict_proba(self, X): + """Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + C : array-like of shape (n_samples, n_classes) + Returns the probability of the samples for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + """ + check_is_fitted(self) + if self.n_classes_ > 2 and self.multi_class == "one_vs_one": + raise ValueError( + "one_vs_one multi-class mode does not support " + "predicting probability estimates. Use " + "one_vs_rest mode instead." 
+ ) + + if self.kernel is None or self.kernel.requires_vector_input: + X = validate_data(self, X, ensure_2d=True, dtype="numeric", reset=False) + else: + X = validate_data(self, X, ensure_2d=False, dtype=None, reset=False) + + return self.base_estimator_.predict_proba(X) + + @property + def kernel_(self): + """Return the kernel of the base estimator.""" + if self.n_classes_ == 2: + return self.base_estimator_.kernel_ + else: + return CompoundKernel( + [estimator.kernel_ for estimator in self.base_estimator_.estimators_] + ) + + def log_marginal_likelihood( + self, theta=None, eval_gradient=False, clone_kernel=True + ): + """Return log-marginal likelihood of theta for training data. + + In the case of multi-class classification, the mean log-marginal + likelihood of the one-versus-rest classifiers are returned. + + Parameters + ---------- + theta : array-like of shape (n_kernel_params,), default=None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. In the case of multi-class classification, theta may + be the hyperparameters of the compound kernel or of an individual + kernel. In the latter case, all individual kernel get assigned the + same theta values. If None, the precomputed log_marginal_likelihood + of ``self.kernel_.theta`` is returned. + + eval_gradient : bool, default=False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. Note that gradient computation is not supported + for non-binary classification. If True, theta must not be None. + + clone_kernel : bool, default=True + If True, the kernel attribute is copied. If False, the kernel + attribute is modified, but may result in a performance improvement. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when `eval_gradient` is True. + """ + check_is_fitted(self) + + if theta is None: + if eval_gradient: + raise ValueError("Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + theta = np.asarray(theta) + if self.n_classes_ == 2: + return self.base_estimator_.log_marginal_likelihood( + theta, eval_gradient, clone_kernel=clone_kernel + ) + else: + if eval_gradient: + raise NotImplementedError( + "Gradient of log-marginal-likelihood not implemented for " + "multi-class GPC." + ) + estimators = self.base_estimator_.estimators_ + n_dims = estimators[0].kernel_.n_dims + if theta.shape[0] == n_dims: # use same theta for all sub-kernels + return np.mean( + [ + estimator.log_marginal_likelihood( + theta, clone_kernel=clone_kernel + ) + for i, estimator in enumerate(estimators) + ] + ) + elif theta.shape[0] == n_dims * self.classes_.shape[0]: + # theta for compound kernel + return np.mean( + [ + estimator.log_marginal_likelihood( + theta[n_dims * i : n_dims * (i + 1)], + clone_kernel=clone_kernel, + ) + for i, estimator in enumerate(estimators) + ] + ) + else: + raise ValueError( + "Shape of theta must be either %d or %d. " + "Obtained theta with shape %d." + % (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]) + ) + + def latent_mean_and_variance(self, X): + """Compute the mean and variance of the latent function. 
+ + Based on algorithm 3.2 of [RW2006]_, this function returns the latent + mean (Line 4) and variance (Line 6) of the Gaussian process + classification model. + + Note that this function is only supported for binary classification. + + .. versionadded:: 1.7 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated for classification. + + Returns + ------- + latent_mean : array-like of shape (n_samples,) + Mean of the latent function values at the query points. + + latent_var : array-like of shape (n_samples,) + Variance of the latent function values at the query points. + """ + if self.n_classes_ > 2: + raise ValueError( + "Returning the mean and variance of the latent function f " + "is only supported for binary classification, received " + f"{self.n_classes_} classes." + ) + check_is_fitted(self) + + if self.kernel is None or self.kernel.requires_vector_input: + X = validate_data(self, X, ensure_2d=True, dtype="numeric", reset=False) + else: + X = validate_data(self, X, ensure_2d=False, dtype=None, reset=False) + + return self.base_estimator_.latent_mean_and_variance(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpr.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpr.py new file mode 100644 index 0000000000000000000000000000000000000000..d56e7735be787eaf2b1aaeaac0fce228651b2eb6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/_gpr.py @@ -0,0 +1,675 @@ +"""Gaussian processes regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real +from operator import itemgetter + +import numpy as np +import scipy.optimize +from scipy.linalg import cho_solve, cholesky, solve_triangular + +from ..base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context, clone +from ..preprocessing._data import _handle_zeros_in_scale +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.optimize import _check_optimize_result +from ..utils.validation import validate_data +from .kernels import RBF, Kernel +from .kernels import ConstantKernel as C + +GPR_CHOLESKY_LOWER = True + + +class GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): + """Gaussian process regression (GPR). + + The implementation is based on Algorithm 2.1 of [RW2006]_. + + In addition to standard scikit-learn estimator API, + :class:`GaussianProcessRegressor`: + + * allows prediction without prior fitting (based on the GP prior) + * provides an additional method `sample_y(X)`, which evaluates samples + drawn from the GPR (prior or posterior) at given inputs + * exposes a method `log_marginal_likelihood(theta)`, which can be used + externally for other ways of selecting hyperparameters, e.g., via + Markov chain Monte Carlo. + + To learn the difference between a point-estimate approach vs. a more + Bayesian modelling approach, refer to the example entitled + :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : kernel instance, default=None + The kernel specifying the covariance function of the GP. If None is + passed, the kernel ``ConstantKernel(1.0, constant_value_bounds="fixed") + * RBF(1.0, length_scale_bounds="fixed")`` is used as default. 
Note that + the kernel hyperparameters are optimized during fitting unless the + bounds are marked as "fixed". + + alpha : float or ndarray of shape (n_samples,), default=1e-10 + Value added to the diagonal of the kernel matrix during fitting. + This can prevent a potential numerical issue during fitting, by + ensuring that the calculated values form a positive definite matrix. + It can also be interpreted as the variance of additional Gaussian + measurement noise on the training observations. Note that this is + different from using a `WhiteKernel`. If an array is passed, it must + have the same number of entries as the data used for fitting and is + used as datapoint-dependent noise level. Allowing to specify the + noise level directly as a parameter is mainly for convenience and + for consistency with :class:`~sklearn.linear_model.Ridge`. + For an example illustrating how the alpha parameter controls + the noise variance in Gaussian Process Regression, see + :ref:`sphx_glr_auto_examples_gaussian_process_plot_gpr_noisy_targets.py`. + + optimizer : "fmin_l_bfgs_b", callable or None, default="fmin_l_bfgs_b" + Can either be one of the internally supported optimizers for optimizing + the kernel's parameters, specified by a string, or an externally + defined optimizer passed as a callable. If a callable is passed, it + must have the signature:: + + def optimizer(obj_func, initial_theta, bounds): + # * 'obj_func': the objective function to be minimized, which + # takes the hyperparameters theta as a parameter and an + # optional flag eval_gradient, which determines if the + # gradient is returned additionally to the function value + # * 'initial_theta': the initial value for theta, which can be + # used by local optimizers + # * 'bounds': the bounds on the values of theta + .... + # Returned are the best found hyperparameters theta and + # the corresponding value of the target function. + return theta_opt, func_min + + Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize` + is used. If None is passed, the kernel's parameters are kept fixed. + Available internal optimizers are: `{'fmin_l_bfgs_b'}`. + + n_restarts_optimizer : int, default=0 + The number of restarts of the optimizer for finding the kernel's + parameters which maximize the log-marginal likelihood. The first run + of the optimizer is performed from the kernel's initial parameters, + the remaining ones (if any) from thetas sampled log-uniform randomly + from the space of allowed theta-values. If greater than 0, all bounds + must be finite. Note that `n_restarts_optimizer == 0` implies that one + run is performed. + + normalize_y : bool, default=False + Whether or not to normalize the target values `y` by removing the mean + and scaling to unit-variance. This is recommended for cases where + zero-mean, unit-variance priors are used. Note that, in this + implementation, the normalisation is reversed before the GP predictions + are reported. + + .. versionchanged:: 0.23 + + copy_X_train : bool, default=True + If True, a persistent copy of the training data is stored in the + object. Otherwise, just a reference to the training data is stored, + which might cause predictions to change if the data is modified + externally. + + n_targets : int, default=None + The number of dimensions of the target values. Used to decide the number + of outputs when sampling from the prior distributions (i.e. calling + :meth:`sample_y` before :meth:`fit`). This parameter is ignored once + :meth:`fit` has been called. + + .. 
versionadded:: 1.3 + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + X_train_ : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data (also + required for prediction). + + y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values in training data (also required for prediction). + + kernel_ : kernel instance + The kernel used for prediction. The structure of the kernel is the + same as the one passed as parameter but with optimized hyperparameters. + + L_ : array-like of shape (n_samples, n_samples) + Lower-triangular Cholesky decomposition of the kernel in ``X_train_``. + + alpha_ : array-like of shape (n_samples,) + Dual coefficients of training data points in kernel space. + + log_marginal_likelihood_value_ : float + The log-marginal-likelihood of ``self.kernel_.theta``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + GaussianProcessClassifier : Gaussian process classification (GPC) + based on Laplace approximation. + + References + ---------- + .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams, + "Gaussian Processes for Machine Learning", + MIT Press 2006 `_ + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = DotProduct() + WhiteKernel() + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3680... + >>> gpr.predict(X[:2,:], return_std=True) + (array([653.0, 592.1]), array([316.6, 316.6])) + """ + + _parameter_constraints: dict = { + "kernel": [None, Kernel], + "alpha": [Interval(Real, 0, None, closed="left"), np.ndarray], + "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None], + "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")], + "normalize_y": ["boolean"], + "copy_X_train": ["boolean"], + "n_targets": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + } + + def __init__( + self, + kernel=None, + *, + alpha=1e-10, + optimizer="fmin_l_bfgs_b", + n_restarts_optimizer=0, + normalize_y=False, + copy_X_train=True, + n_targets=None, + random_state=None, + ): + self.kernel = kernel + self.alpha = alpha + self.optimizer = optimizer + self.n_restarts_optimizer = n_restarts_optimizer + self.normalize_y = normalize_y + self.copy_X_train = copy_X_train + self.n_targets = n_targets + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit Gaussian process regression model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Feature vectors or other representations of training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + Returns + ------- + self : object + GaussianProcessRegressor class instance. 
+ """ + if self.kernel is None: # Use an RBF kernel as default + self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + else: + self.kernel_ = clone(self.kernel) + + self._rng = check_random_state(self.random_state) + + if self.kernel_.requires_vector_input: + dtype, ensure_2d = "numeric", True + else: + dtype, ensure_2d = None, False + X, y = validate_data( + self, + X, + y, + multi_output=True, + y_numeric=True, + ensure_2d=ensure_2d, + dtype=dtype, + ) + + n_targets_seen = y.shape[1] if y.ndim > 1 else 1 + if self.n_targets is not None and n_targets_seen != self.n_targets: + raise ValueError( + "The number of targets seen in `y` is different from the parameter " + f"`n_targets`. Got {n_targets_seen} != {self.n_targets}." + ) + + # Normalize target value + if self.normalize_y: + self._y_train_mean = np.mean(y, axis=0) + self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False) + + # Remove mean and make unit variance + y = (y - self._y_train_mean) / self._y_train_std + + else: + shape_y_stats = (y.shape[1],) if y.ndim == 2 else 1 + self._y_train_mean = np.zeros(shape=shape_y_stats) + self._y_train_std = np.ones(shape=shape_y_stats) + + if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]: + if self.alpha.shape[0] == 1: + self.alpha = self.alpha[0] + else: + raise ValueError( + "alpha must be a scalar or an array with same number of " + f"entries as y. ({self.alpha.shape[0]} != {y.shape[0]})" + ) + + self.X_train_ = np.copy(X) if self.copy_X_train else X + self.y_train_ = np.copy(y) if self.copy_X_train else y + + if self.optimizer is not None and self.kernel_.n_dims > 0: + # Choose hyperparameters based on maximizing the log-marginal + # likelihood (potentially starting from several initial values) + def obj_func(theta, eval_gradient=True): + if eval_gradient: + lml, grad = self.log_marginal_likelihood( + theta, eval_gradient=True, clone_kernel=False + ) + return -lml, -grad + else: + return -self.log_marginal_likelihood(theta, clone_kernel=False) + + # First optimize starting from theta specified in kernel + optima = [ + ( + self._constrained_optimization( + obj_func, self.kernel_.theta, self.kernel_.bounds + ) + ) + ] + + # Additional runs are performed from log-uniform chosen initial + # theta + if self.n_restarts_optimizer > 0: + if not np.isfinite(self.kernel_.bounds).all(): + raise ValueError( + "Multiple optimizer restarts (n_restarts_optimizer>0) " + "requires that all bounds are finite." + ) + bounds = self.kernel_.bounds + for iteration in range(self.n_restarts_optimizer): + theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1]) + optima.append( + self._constrained_optimization(obj_func, theta_initial, bounds) + ) + # Select result from run with minimal (negative) log-marginal + # likelihood + lml_values = list(map(itemgetter(1), optima)) + self.kernel_.theta = optima[np.argmin(lml_values)][0] + self.kernel_._check_bounds_params() + + self.log_marginal_likelihood_value_ = -np.min(lml_values) + else: + self.log_marginal_likelihood_value_ = self.log_marginal_likelihood( + self.kernel_.theta, clone_kernel=False + ) + + # Precompute quantities required for predictions which are independent + # of actual query points + # Alg. 
2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I) + K = self.kernel_(self.X_train_) + K[np.diag_indices_from(K)] += self.alpha + try: + self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False) + except np.linalg.LinAlgError as exc: + exc.args = ( + ( + f"The kernel, {self.kernel_}, is not returning a positive " + "definite matrix. Try gradually increasing the 'alpha' " + "parameter of your GaussianProcessRegressor estimator." + ), + ) + exc.args + raise + # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y) + self.alpha_ = cho_solve( + (self.L_, GPR_CHOLESKY_LOWER), + self.y_train_, + check_finite=False, + ) + return self + + def predict(self, X, return_std=False, return_cov=False): + """Predict using the Gaussian process regression model. + + We can also predict based on an unfitted model by using the GP prior. + In addition to the mean of the predictive distribution, optionally also + returns its standard deviation (`return_std=True`) or covariance + (`return_cov=True`). Note that at most one of the two can be requested. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or list of object + Query points where the GP is evaluated. + + return_std : bool, default=False + If True, the standard-deviation of the predictive distribution at + the query points is returned along with the mean. + + return_cov : bool, default=False + If True, the covariance of the joint predictive distribution at + the query points is returned along with the mean. + + Returns + ------- + y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets) + Mean of predictive distribution at query points. + + y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional + Standard deviation of predictive distribution at query points. + Only returned when `return_std` is True. + + y_cov : ndarray of shape (n_samples, n_samples) or \ + (n_samples, n_samples, n_targets), optional + Covariance of joint predictive distribution at query points. + Only returned when `return_cov` is True. + """ + if return_std and return_cov: + raise RuntimeError( + "At most one of return_std or return_cov can be requested." + ) + + if self.kernel is None or self.kernel.requires_vector_input: + dtype, ensure_2d = "numeric", True + else: + dtype, ensure_2d = None, False + + X = validate_data(self, X, ensure_2d=ensure_2d, dtype=dtype, reset=False) + + if not hasattr(self, "X_train_"): # Unfitted;predict based on GP prior + if self.kernel is None: + kernel = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + else: + kernel = self.kernel + + n_targets = self.n_targets if self.n_targets is not None else 1 + y_mean = np.zeros(shape=(X.shape[0], n_targets)).squeeze() + + if return_cov: + y_cov = kernel(X) + if n_targets > 1: + y_cov = np.repeat( + np.expand_dims(y_cov, -1), repeats=n_targets, axis=-1 + ) + return y_mean, y_cov + elif return_std: + y_var = kernel.diag(X) + if n_targets > 1: + y_var = np.repeat( + np.expand_dims(y_var, -1), repeats=n_targets, axis=-1 + ) + return y_mean, np.sqrt(y_var) + else: + return y_mean + else: # Predict based on GP posterior + # Alg 2.1, page 19, line 4 -> f*_bar = K(X_test, X_train) . 
alpha + K_trans = self.kernel_(X, self.X_train_) + y_mean = K_trans @ self.alpha_ + + # undo normalisation + y_mean = self._y_train_std * y_mean + self._y_train_mean + + # if y_mean has shape (n_samples, 1), reshape to (n_samples,) + if y_mean.ndim > 1 and y_mean.shape[1] == 1: + y_mean = np.squeeze(y_mean, axis=1) + + # Alg 2.1, page 19, line 5 -> v = L \ K(X_test, X_train)^T + V = solve_triangular( + self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False + ) + + if return_cov: + # Alg 2.1, page 19, line 6 -> K(X_test, X_test) - v^T. v + y_cov = self.kernel_(X) - V.T @ V + + # undo normalisation + y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1) + # if y_cov has shape (n_samples, n_samples, 1), reshape to + # (n_samples, n_samples) + if y_cov.shape[2] == 1: + y_cov = np.squeeze(y_cov, axis=2) + + return y_mean, y_cov + elif return_std: + # Compute variance of predictive distribution + # Use einsum to avoid explicitly forming the large matrix + # V^T @ V just to extract its diagonal afterward. + y_var = self.kernel_.diag(X).copy() + y_var -= np.einsum("ij,ji->i", V.T, V) + + # Check if any of the variances is negative because of + # numerical issues. If yes: set the variance to 0. + y_var_negative = y_var < 0 + if np.any(y_var_negative): + warnings.warn( + "Predicted variances smaller than 0. " + "Setting those variances to 0." + ) + y_var[y_var_negative] = 0.0 + + # undo normalisation + y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1) + + # if y_var has shape (n_samples, 1), reshape to (n_samples,) + if y_var.shape[1] == 1: + y_var = np.squeeze(y_var, axis=1) + + return y_mean, np.sqrt(y_var) + else: + return y_mean + + def sample_y(self, X, n_samples=1, random_state=0): + """Draw samples from Gaussian process and evaluate at X. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Query points where the GP is evaluated. + + n_samples : int, default=1 + Number of samples drawn from the Gaussian process per query point. + + random_state : int, RandomState instance or None, default=0 + Determines random number generation to randomly draw samples. + Pass an int for reproducible results across multiple function + calls. + See :term:`Glossary `. + + Returns + ------- + y_samples : ndarray of shape (n_samples_X, n_samples), or \ + (n_samples_X, n_targets, n_samples) + Values of n_samples samples drawn from Gaussian process and + evaluated at query points. + """ + rng = check_random_state(random_state) + + y_mean, y_cov = self.predict(X, return_cov=True) + if y_mean.ndim == 1: + y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T + else: + y_samples = [ + rng.multivariate_normal( + y_mean[:, target], y_cov[..., target], n_samples + ).T[:, np.newaxis] + for target in range(y_mean.shape[1]) + ] + y_samples = np.hstack(y_samples) + return y_samples + + def log_marginal_likelihood( + self, theta=None, eval_gradient=False, clone_kernel=True + ): + """Return log-marginal likelihood of theta for training data. + + Parameters + ---------- + theta : array-like of shape (n_kernel_params,) default=None + Kernel hyperparameters for which the log-marginal likelihood is + evaluated. If None, the precomputed log_marginal_likelihood + of ``self.kernel_.theta`` is returned. + + eval_gradient : bool, default=False + If True, the gradient of the log-marginal likelihood with respect + to the kernel hyperparameters at position theta is returned + additionally. If True, theta must not be None. 
+ + clone_kernel : bool, default=True + If True, the kernel attribute is copied. If False, the kernel + attribute is modified, but may result in a performance improvement. + + Returns + ------- + log_likelihood : float + Log-marginal likelihood of theta for training data. + + log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional + Gradient of the log-marginal likelihood with respect to the kernel + hyperparameters at position theta. + Only returned when eval_gradient is True. + """ + if theta is None: + if eval_gradient: + raise ValueError("Gradient can only be evaluated for theta!=None") + return self.log_marginal_likelihood_value_ + + if clone_kernel: + kernel = self.kernel_.clone_with_theta(theta) + else: + kernel = self.kernel_ + kernel.theta = theta + + if eval_gradient: + K, K_gradient = kernel(self.X_train_, eval_gradient=True) + else: + K = kernel(self.X_train_) + + # Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I) + K[np.diag_indices_from(K)] += self.alpha + try: + L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False) + except np.linalg.LinAlgError: + return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf + + # Support multi-dimensional output of self.y_train_ + y_train = self.y_train_ + if y_train.ndim == 1: + y_train = y_train[:, np.newaxis] + + # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y) + alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False) + + # Alg 2.1, page 19, line 7 + # -0.5 . y^T . alpha - sum(log(diag(L))) - n_samples / 2 log(2*pi) + # y is originally thought to be a (1, n_samples) row vector. However, + # in multioutputs, y is of shape (n_samples, 2) and we need to compute + # y^T . alpha for each output, independently using einsum. Thus, it + # is equivalent to: + # for output_idx in range(n_outputs): + # log_likelihood_dims[output_idx] = ( + # y_train[:, [output_idx]] @ alpha[:, [output_idx]] + # ) + log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha) + log_likelihood_dims -= np.log(np.diag(L)).sum() + log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi) + # the log likelihood is sum-up across the outputs + log_likelihood = log_likelihood_dims.sum(axis=-1) + + if eval_gradient: + # Eq. 5.9, p. 114, and footnote 5 in p. 114 + # 0.5 * trace((alpha . alpha^T - K^-1) . K_gradient) + # alpha is supposed to be a vector of (n_samples,) elements. With + # multioutputs, alpha is a matrix of size (n_samples, n_outputs). + # Therefore, we want to construct a matrix of + # (n_samples, n_samples, n_outputs) equivalent to + # for output_idx in range(n_outputs): + # output_alpha = alpha[:, [output_idx]] + # inner_term[..., output_idx] = output_alpha @ output_alpha.T + inner_term = np.einsum("ik,jk->ijk", alpha, alpha) + # compute K^-1 of shape (n_samples, n_samples) + K_inv = cho_solve( + (L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False + ) + # create a new axis to use broadcasting between inner_term and + # K_inv + inner_term -= K_inv[..., np.newaxis] + # Since we are interested about the trace of + # inner_term @ K_gradient, we don't explicitly compute the + # matrix-by-matrix operation and instead use an einsum. 
Therefore + # it is equivalent to: + # for param_idx in range(n_kernel_params): + # for output_idx in range(n_output): + # log_likehood_gradient_dims[param_idx, output_idx] = ( + # inner_term[..., output_idx] @ + # K_gradient[..., param_idx] + # ) + log_likelihood_gradient_dims = 0.5 * np.einsum( + "ijl,jik->kl", inner_term, K_gradient + ) + # the log likelihood gradient is the sum-up across the outputs + log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1) + + if eval_gradient: + return log_likelihood, log_likelihood_gradient + else: + return log_likelihood + + def _constrained_optimization(self, obj_func, initial_theta, bounds): + if self.optimizer == "fmin_l_bfgs_b": + opt_res = scipy.optimize.minimize( + obj_func, + initial_theta, + method="L-BFGS-B", + jac=True, + bounds=bounds, + ) + _check_optimize_result("lbfgs", opt_res) + theta_opt, func_min = opt_res.x, opt_res.fun + elif callable(self.optimizer): + theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds) + else: + raise ValueError(f"Unknown optimizer {self.optimizer}.") + + return theta_opt, func_min + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/kernels.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..4a0a6ec667be421695e9d5e85d8282887614a2fe --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/kernels.py @@ -0,0 +1,2408 @@ +"""A set of kernels that can be combined by operators and used in Gaussian processes.""" + +# Kernels for Gaussian process regression and classification. +# +# The kernels in this module allow kernel-engineering, i.e., they can be +# combined via the "+" and "*" operators or be exponentiated with a scalar +# via "**". These sum and product expressions can also contain scalar values, +# which are automatically converted to a constant kernel. +# +# All kernels allow (analytic) gradient-based hyperparameter optimization. +# The space of hyperparameters can be specified by giving lower und upper +# boundaries for the value of each hyperparameter (the search space is thus +# rectangular). Instead of specifying bounds, hyperparameters can also be +# declared to be "fixed", which causes these hyperparameters to be excluded from +# optimization. + + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# Note: this module is strongly inspired by the kernel module of the george +# package. 
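The two modules added above describe, respectively, the regressor (fit via Cholesky factorization of K + alpha*I, L-BFGS-B maximization of the log-marginal likelihood, predictive mean/std) and the kernel algebra ("+", "*", "**" build Sum/Product/Exponentiation kernels, with log-transformed hyperparameters and optional "fixed" bounds). The following is an illustrative sketch, not part of the diff or of the upstream files: it only uses public scikit-learn API shown in the docstrings above, and the toy data is invented for the example.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, size=(40, 1))
y = np.sin(X).ravel() + rng.normal(scale=0.1, size=40)

# Kernel engineering: "*" and "+" build Product/Sum kernels; scalars are
# promoted to ConstantKernel. Bounds constrain the hyperparameter search
# and can be set to "fixed" to exclude a parameter from optimization.
kernel = (
    ConstantKernel(1.0, constant_value_bounds=(1e-3, 1e3)) * RBF(length_scale=1.0)
    + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-6, 1e1))
)

gpr = GaussianProcessRegressor(
    kernel=kernel,
    alpha=1e-10,              # extra jitter added to the kernel diagonal
    n_restarts_optimizer=3,   # restarts from log-uniform samples within the bounds
    normalize_y=True,
    random_state=0,
).fit(X, y)

X_test = np.linspace(0, 5, 100).reshape(-1, 1)
y_mean, y_std = gpr.predict(X_test, return_std=True)
print(gpr.kernel_)                          # kernel with optimized hyperparameters
print(gpr.log_marginal_likelihood_value_)   # LML of the selected theta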
+ +import math +import warnings +from abc import ABCMeta, abstractmethod +from collections import namedtuple +from inspect import signature + +import numpy as np +from scipy.spatial.distance import cdist, pdist, squareform +from scipy.special import gamma, kv + +from ..base import clone +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import pairwise_kernels +from ..utils.validation import _num_samples + + +def _check_length_scale(X, length_scale): + length_scale = np.squeeze(length_scale).astype(float) + if np.ndim(length_scale) > 1: + raise ValueError("length_scale cannot be of dimension greater than 1") + if np.ndim(length_scale) == 1 and X.shape[1] != length_scale.shape[0]: + raise ValueError( + "Anisotropic kernel must have the same number of " + "dimensions as data (%d!=%d)" % (length_scale.shape[0], X.shape[1]) + ) + return length_scale + + +class Hyperparameter( + namedtuple( + "Hyperparameter", ("name", "value_type", "bounds", "n_elements", "fixed") + ) +): + """A kernel hyperparameter's specification in form of a namedtuple. + + .. versionadded:: 0.18 + + Attributes + ---------- + name : str + The name of the hyperparameter. Note that a kernel using a + hyperparameter with name "x" must have the attributes self.x and + self.x_bounds + + value_type : str + The type of the hyperparameter. Currently, only "numeric" + hyperparameters are supported. + + bounds : pair of floats >= 0 or "fixed" + The lower and upper bound on the parameter. If n_elements>1, a pair + of 1d array with n_elements each may be given alternatively. If + the string "fixed" is passed as bounds, the hyperparameter's value + cannot be changed. + + n_elements : int, default=1 + The number of elements of the hyperparameter value. Defaults to 1, + which corresponds to a scalar hyperparameter. n_elements > 1 + corresponds to a hyperparameter which is vector-valued, + such as, e.g., anisotropic length-scales. + + fixed : bool, default=None + Whether the value of this hyperparameter is fixed, i.e., cannot be + changed during hyperparameter tuning. If None is passed, the "fixed" is + derived based on the given bounds. + + Examples + -------- + >>> from sklearn.gaussian_process.kernels import ConstantKernel + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import Hyperparameter + >>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0) + >>> kernel = ConstantKernel(constant_value=1.0, + ... constant_value_bounds=(0.0, 10.0)) + + We can access each hyperparameter: + + >>> for hyperparameter in kernel.hyperparameters: + ... print(hyperparameter) + Hyperparameter(name='constant_value', value_type='numeric', + bounds=array([[ 0., 10.]]), n_elements=1, fixed=False) + + >>> params = kernel.get_params() + >>> for key in sorted(params): print(f"{key} : {params[key]}") + constant_value : 1.0 + constant_value_bounds : (0.0, 10.0) + """ + + # A raw namedtuple is very memory efficient as it packs the attributes + # in a struct to get rid of the __dict__ of attributes in particular it + # does not copy the string for the keys on each instance. + # By deriving a namedtuple class just to introduce the __init__ method we + # would also reintroduce the __dict__ on the instance. By telling the + # Python interpreter that this subclass uses static __slots__ instead of + # dynamic attributes. Furthermore we don't need any additional slot in the + # subclass so we set __slots__ to the empty tuple. 
+ __slots__ = () + + def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None): + if not isinstance(bounds, str) or bounds != "fixed": + bounds = np.atleast_2d(bounds) + if n_elements > 1: # vector-valued parameter + if bounds.shape[0] == 1: + bounds = np.repeat(bounds, n_elements, 0) + elif bounds.shape[0] != n_elements: + raise ValueError( + "Bounds on %s should have either 1 or " + "%d dimensions. Given are %d" + % (name, n_elements, bounds.shape[0]) + ) + + if fixed is None: + fixed = isinstance(bounds, str) and bounds == "fixed" + return super().__new__(cls, name, value_type, bounds, n_elements, fixed) + + # This is mainly a testing utility to check that two hyperparameters + # are equal. + def __eq__(self, other): + return ( + self.name == other.name + and self.value_type == other.value_type + and np.all(self.bounds == other.bounds) + and self.n_elements == other.n_elements + and self.fixed == other.fixed + ) + + +class Kernel(metaclass=ABCMeta): + """Base class for all kernels. + + .. versionadded:: 0.18 + + Examples + -------- + >>> from sklearn.gaussian_process.kernels import Kernel, RBF + >>> import numpy as np + >>> class CustomKernel(Kernel): + ... def __init__(self, length_scale=1.0): + ... self.length_scale = length_scale + ... def __call__(self, X, Y=None): + ... if Y is None: + ... Y = X + ... return np.inner(X, X if Y is None else Y) ** 2 + ... def diag(self, X): + ... return np.ones(X.shape[0]) + ... def is_stationary(self): + ... return True + >>> kernel = CustomKernel(length_scale=2.0) + >>> X = np.array([[1, 2], [3, 4]]) + >>> print(kernel(X)) + [[ 25 121] + [121 625]] + """ + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + params = dict() + + # introspect the constructor arguments to find the model parameters + # to represent + cls = self.__class__ + init = getattr(cls.__init__, "deprecated_original", cls.__init__) + init_sign = signature(init) + args, varargs = [], [] + for parameter in init_sign.parameters.values(): + if parameter.kind != parameter.VAR_KEYWORD and parameter.name != "self": + args.append(parameter.name) + if parameter.kind == parameter.VAR_POSITIONAL: + varargs.append(parameter.name) + + if len(varargs) != 0: + raise RuntimeError( + "scikit-learn kernels should always " + "specify their parameters in the signature" + " of their __init__ (no varargs)." + " %s doesn't follow this convention." % (cls,) + ) + for arg in args: + params[arg] = getattr(self, arg) + + return params + + def set_params(self, **params): + """Set the parameters of this kernel. + + The method works on simple kernels as well as on nested kernels. + The latter have parameters of the form ``__`` + so that it's possible to update each component of a nested object. + + Returns + ------- + self + """ + if not params: + # Simple optimisation to gain speed (inspect is slow) + return self + valid_params = self.get_params(deep=True) + for key, value in params.items(): + split = key.split("__", 1) + if len(split) > 1: + # nested objects case + name, sub_name = split + if name not in valid_params: + raise ValueError( + "Invalid parameter %s for kernel %s. " + "Check the list of available parameters " + "with `kernel.get_params().keys()`." 
% (name, self) + ) + sub_object = valid_params[name] + sub_object.set_params(**{sub_name: value}) + else: + # simple objects case + if key not in valid_params: + raise ValueError( + "Invalid parameter %s for kernel %s. " + "Check the list of available parameters " + "with `kernel.get_params().keys()`." + % (key, self.__class__.__name__) + ) + setattr(self, key, value) + return self + + def clone_with_theta(self, theta): + """Returns a clone of self with given hyperparameters theta. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The hyperparameters + """ + cloned = clone(self) + cloned.theta = theta + return cloned + + @property + def n_dims(self): + """Returns the number of non-fixed hyperparameters of the kernel.""" + return self.theta.shape[0] + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter specifications.""" + r = [ + getattr(self, attr) + for attr in dir(self) + if attr.startswith("hyperparameter_") + ] + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + theta = [] + params = self.get_params() + for hyperparameter in self.hyperparameters: + if not hyperparameter.fixed: + theta.append(params[hyperparameter.name]) + if len(theta) > 0: + return np.log(np.hstack(theta)) + else: + return np.array([]) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + params = self.get_params() + i = 0 + for hyperparameter in self.hyperparameters: + if hyperparameter.fixed: + continue + if hyperparameter.n_elements > 1: + # vector-valued parameter + params[hyperparameter.name] = np.exp( + theta[i : i + hyperparameter.n_elements] + ) + i += hyperparameter.n_elements + else: + params[hyperparameter.name] = np.exp(theta[i]) + i += 1 + + if i != len(theta): + raise ValueError( + "theta has not the correct number of entries." + " Should be %d; given are %d" % (i, len(theta)) + ) + self.set_params(**params) + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. 
+ + Returns + ------- + bounds : ndarray of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + bounds = [ + hyperparameter.bounds + for hyperparameter in self.hyperparameters + if not hyperparameter.fixed + ] + if len(bounds) > 0: + return np.log(np.vstack(bounds)) + else: + return np.array([]) + + def __add__(self, b): + if not isinstance(b, Kernel): + return Sum(self, ConstantKernel(b)) + return Sum(self, b) + + def __radd__(self, b): + if not isinstance(b, Kernel): + return Sum(ConstantKernel(b), self) + return Sum(b, self) + + def __mul__(self, b): + if not isinstance(b, Kernel): + return Product(self, ConstantKernel(b)) + return Product(self, b) + + def __rmul__(self, b): + if not isinstance(b, Kernel): + return Product(ConstantKernel(b), self) + return Product(b, self) + + def __pow__(self, b): + return Exponentiation(self, b) + + def __eq__(self, b): + if type(self) != type(b): + return False + params_a = self.get_params() + params_b = b.get_params() + for key in set(list(params_a.keys()) + list(params_b.keys())): + if np.any(params_a.get(key, None) != params_b.get(key, None)): + return False + return True + + def __repr__(self): + return "{0}({1})".format( + self.__class__.__name__, ", ".join(map("{0:.3g}".format, self.theta)) + ) + + @abstractmethod + def __call__(self, X, Y=None, eval_gradient=False): + """Evaluate the kernel.""" + + @abstractmethod + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples,) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + + @abstractmethod + def is_stationary(self): + """Returns whether the kernel is stationary.""" + + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on fixed-length feature + vectors or generic objects. Defaults to True for backward + compatibility.""" + return True + + def _check_bounds_params(self): + """Called after fitting to warn if bounds may have been too tight.""" + list_close = np.isclose(self.bounds, np.atleast_2d(self.theta).T) + idx = 0 + for hyp in self.hyperparameters: + if hyp.fixed: + continue + for dim in range(hyp.n_elements): + if list_close[idx, 0]: + warnings.warn( + "The optimal value found for " + "dimension %s of parameter %s is " + "close to the specified lower " + "bound %s. Decreasing the bound and" + " calling fit again may find a " + "better value." % (dim, hyp.name, hyp.bounds[dim][0]), + ConvergenceWarning, + ) + elif list_close[idx, 1]: + warnings.warn( + "The optimal value found for " + "dimension %s of parameter %s is " + "close to the specified upper " + "bound %s. Increasing the bound and" + " calling fit again may find a " + "better value." % (dim, hyp.name, hyp.bounds[dim][1]), + ConvergenceWarning, + ) + idx += 1 + + +class NormalizedKernelMixin: + """Mixin for kernels which are normalized: k(X, X)=1. + + .. versionadded:: 0.18 + """ + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.ones(X.shape[0]) + + +class StationaryKernelMixin: + """Mixin for kernels which are stationary: k(X, Y)= f(X-Y). + + .. versionadded:: 0.18 + """ + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return True + + +class GenericKernelMixin: + """Mixin for kernels which operate on generic objects such as variable- + length sequences, trees, and graphs. + + .. versionadded:: 0.22 + """ + + @property + def requires_vector_input(self): + """Whether the kernel works only on fixed-length feature vectors.""" + return False + + +class CompoundKernel(Kernel): + """Kernel which is composed of a set of other kernels. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernels : list of Kernels + The other kernels + + Examples + -------- + >>> from sklearn.gaussian_process.kernels import WhiteKernel + >>> from sklearn.gaussian_process.kernels import RBF + >>> from sklearn.gaussian_process.kernels import CompoundKernel + >>> kernel = CompoundKernel( + ... [WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)]) + >>> print(kernel.bounds) + [[-11.51292546 11.51292546] + [-11.51292546 11.51292546]] + >>> print(kernel.n_dims) + 2 + >>> print(kernel.theta) + [1.09861229 0.69314718] + """ + + def __init__(self, kernels): + self.kernels = kernels + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + return dict(kernels=self.kernels) + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return np.hstack([kernel.theta for kernel in self.kernels]) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : array of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + k_dims = self.k1.n_dims + for i, kernel in enumerate(self.kernels): + kernel.theta = theta[i * k_dims : (i + 1) * k_dims] + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : array of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + return np.vstack([kernel.bounds for kernel in self.kernels]) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Note that this compound kernel returns the results of all simple kernel + stacked along an additional axis. 
+ + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object, \ + default=None + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object, \ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of the + kernel hyperparameter is computed. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y, n_kernels) + Kernel k(X, Y) + + K_gradient : ndarray of shape \ + (n_samples_X, n_samples_X, n_dims, n_kernels), optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K = [] + K_grad = [] + for kernel in self.kernels: + K_single, K_grad_single = kernel(X, Y, eval_gradient) + K.append(K_single) + K_grad.append(K_grad_single[..., np.newaxis]) + return np.dstack(K), np.concatenate(K_grad, 3) + else: + return np.dstack([kernel(X, Y, eval_gradient) for kernel in self.kernels]) + + def __eq__(self, b): + if type(self) != type(b) or len(self.kernels) != len(b.kernels): + return False + return np.all( + [self.kernels[i] == b.kernels[i] for i in range(len(self.kernels))] + ) + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return np.all([kernel.is_stationary() for kernel in self.kernels]) + + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on discrete structures.""" + return np.any([kernel.requires_vector_input for kernel in self.kernels]) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to `np.diag(self(X))`; however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X, n_kernels) + Diagonal of kernel k(X, X) + """ + return np.vstack([kernel.diag(X) for kernel in self.kernels]).T + + +class KernelOperator(Kernel): + """Base class for all kernel operators. + + .. versionadded:: 0.18 + """ + + def __init__(self, k1, k2): + self.k1 = k1 + self.k2 = k2 + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. 
+ """ + params = dict(k1=self.k1, k2=self.k2) + if deep: + deep_items = self.k1.get_params().items() + params.update(("k1__" + k, val) for k, val in deep_items) + deep_items = self.k2.get_params().items() + params.update(("k2__" + k, val) for k, val in deep_items) + + return params + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter.""" + r = [ + Hyperparameter( + "k1__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements, + ) + for hyperparameter in self.k1.hyperparameters + ] + + for hyperparameter in self.k2.hyperparameters: + r.append( + Hyperparameter( + "k2__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements, + ) + ) + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. + + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return np.append(self.k1.theta, self.k2.theta) + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + k1_dims = self.k1.n_dims + self.k1.theta = theta[:k1_dims] + self.k2.theta = theta[k1_dims:] + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : ndarray of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + if self.k1.bounds.size == 0: + return self.k2.bounds + if self.k2.bounds.size == 0: + return self.k1.bounds + return np.vstack((self.k1.bounds, self.k2.bounds)) + + def __eq__(self, b): + if type(self) != type(b): + return False + return (self.k1 == b.k1 and self.k2 == b.k2) or ( + self.k1 == b.k2 and self.k2 == b.k1 + ) + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return self.k1.is_stationary() and self.k2.is_stationary() + + @property + def requires_vector_input(self): + """Returns whether the kernel is stationary.""" + return self.k1.requires_vector_input or self.k2.requires_vector_input + + +class Sum(KernelOperator): + """The `Sum` kernel takes two kernels :math:`k_1` and :math:`k_2` + and combines them via + + .. math:: + k_{sum}(X, Y) = k_1(X, Y) + k_2(X, Y) + + Note that the `__add__` magic method is overridden, so + `Sum(RBF(), RBF())` is equivalent to using the + operator + with `RBF() + RBF()`. + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + k1 : Kernel + The first base-kernel of the sum-kernel + + k2 : Kernel + The second base-kernel of the sum-kernel + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import RBF, Sum, ConstantKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = Sum(ConstantKernel(2), RBF()) + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... 
random_state=0).fit(X, y) + >>> gpr.score(X, y) + 1.0 + >>> kernel + 1.41**2 + RBF(length_scale=1) + """ + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K1, K1_gradient = self.k1(X, Y, eval_gradient=True) + K2, K2_gradient = self.k2(X, Y, eval_gradient=True) + return K1 + K2, np.dstack((K1_gradient, K2_gradient)) + else: + return self.k1(X, Y) + self.k2(X, Y) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to `np.diag(self(X))`; however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.k1.diag(X) + self.k2.diag(X) + + def __repr__(self): + return "{0} + {1}".format(self.k1, self.k2) + + +class Product(KernelOperator): + """The `Product` kernel takes two kernels :math:`k_1` and :math:`k_2` + and combines them via + + .. math:: + k_{prod}(X, Y) = k_1(X, Y) * k_2(X, Y) + + Note that the `__mul__` magic method is overridden, so + `Product(RBF(), RBF())` is equivalent to using the * operator + with `RBF() * RBF()`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + k1 : Kernel + The first base-kernel of the product-kernel + + k2 : Kernel + The second base-kernel of the product-kernel + + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import (RBF, Product, + ... ConstantKernel) + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = Product(ConstantKernel(2), RBF()) + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 1.0 + >>> kernel + 1.41**2 * RBF(length_scale=1) + """ + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_Y, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. 
+ + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K1, K1_gradient = self.k1(X, Y, eval_gradient=True) + K2, K2_gradient = self.k2(X, Y, eval_gradient=True) + return K1 * K2, np.dstack( + (K1_gradient * K2[:, :, np.newaxis], K2_gradient * K1[:, :, np.newaxis]) + ) + else: + return self.k1(X, Y) * self.k2(X, Y) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.k1.diag(X) * self.k2.diag(X) + + def __repr__(self): + return "{0} * {1}".format(self.k1, self.k2) + + +class Exponentiation(Kernel): + """The Exponentiation kernel takes one base kernel and a scalar parameter + :math:`p` and combines them via + + .. math:: + k_{exp}(X, Y) = k(X, Y) ^p + + Note that the `__pow__` magic method is overridden, so + `Exponentiation(RBF(), 2)` is equivalent to using the ** operator + with `RBF() ** 2`. + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + kernel : Kernel + The base kernel + + exponent : float + The exponent for the base kernel + + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import (RationalQuadratic, + ... Exponentiation) + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = Exponentiation(RationalQuadratic(), exponent=2) + >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.419 + >>> gpr.predict(X[:1,:], return_std=True) + (array([635.5]), array([0.559])) + """ + + def __init__(self, kernel, exponent): + self.kernel = kernel + self.exponent = exponent + + def get_params(self, deep=True): + """Get parameters of this kernel. + + Parameters + ---------- + deep : bool, default=True + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. + """ + params = dict(kernel=self.kernel, exponent=self.exponent) + if deep: + deep_items = self.kernel.get_params().items() + params.update(("kernel__" + k, val) for k, val in deep_items) + return params + + @property + def hyperparameters(self): + """Returns a list of all hyperparameter.""" + r = [] + for hyperparameter in self.kernel.hyperparameters: + r.append( + Hyperparameter( + "kernel__" + hyperparameter.name, + hyperparameter.value_type, + hyperparameter.bounds, + hyperparameter.n_elements, + ) + ) + return r + + @property + def theta(self): + """Returns the (flattened, log-transformed) non-fixed hyperparameters. 
+ + Note that theta are typically the log-transformed values of the + kernel's hyperparameters as this representation of the search space + is more amenable for hyperparameter search, as hyperparameters like + length-scales naturally live on a log-scale. + + Returns + ------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + return self.kernel.theta + + @theta.setter + def theta(self, theta): + """Sets the (flattened, log-transformed) non-fixed hyperparameters. + + Parameters + ---------- + theta : ndarray of shape (n_dims,) + The non-fixed, log-transformed hyperparameters of the kernel + """ + self.kernel.theta = theta + + @property + def bounds(self): + """Returns the log-transformed bounds on the theta. + + Returns + ------- + bounds : ndarray of shape (n_dims, 2) + The log-transformed bounds on the kernel's hyperparameters theta + """ + return self.kernel.bounds + + def __eq__(self, b): + if type(self) != type(b): + return False + return self.kernel == b.kernel and self.exponent == b.exponent + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_Y, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + if eval_gradient: + K, K_gradient = self.kernel(X, Y, eval_gradient=True) + K_gradient *= self.exponent * K[:, :, np.newaxis] ** (self.exponent - 1) + return K**self.exponent, K_gradient + else: + K = self.kernel(X, Y, eval_gradient=False) + return K**self.exponent + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return self.kernel.diag(X) ** self.exponent + + def __repr__(self): + return "{0} ** {1}".format(self.kernel, self.exponent) + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return self.kernel.is_stationary() + + @property + def requires_vector_input(self): + """Returns whether the kernel is defined on discrete structures.""" + return self.kernel.requires_vector_input + + +class ConstantKernel(StationaryKernelMixin, GenericKernelMixin, Kernel): + """Constant kernel. + + Can be used as part of a product-kernel where it scales the magnitude of + the other factor (kernel) or as part of a sum-kernel, where it modifies + the mean of the Gaussian process. + + .. 
math:: + k(x_1, x_2) = constant\\_value \\;\\forall\\; x_1, x_2 + + Adding a constant kernel is equivalent to adding a constant:: + + kernel = RBF() + ConstantKernel(constant_value=2) + + is the same as:: + + kernel = RBF() + 2 + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + constant_value : float, default=1.0 + The constant value which defines the covariance: + k(x_1, x_2) = constant_value + + constant_value_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on `constant_value`. + If set to "fixed", `constant_value` cannot be changed during + hyperparameter tuning. + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import RBF, ConstantKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = RBF() + ConstantKernel(constant_value=2) + >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3696 + >>> gpr.predict(X[:1,:], return_std=True) + (array([606.1]), array([0.248])) + """ + + def __init__(self, constant_value=1.0, constant_value_bounds=(1e-5, 1e5)): + self.constant_value = constant_value + self.constant_value_bounds = constant_value_bounds + + @property + def hyperparameter_constant_value(self): + return Hyperparameter("constant_value", "numeric", self.constant_value_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object, \ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + if Y is None: + Y = X + elif eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + + K = np.full( + (_num_samples(X), _num_samples(Y)), + self.constant_value, + dtype=np.array(self.constant_value).dtype, + ) + if eval_gradient: + if not self.hyperparameter_constant_value.fixed: + return ( + K, + np.full( + (_num_samples(X), _num_samples(X), 1), + self.constant_value, + dtype=np.array(self.constant_value).dtype, + ), + ) + else: + return K, np.empty((_num_samples(X), _num_samples(X), 0)) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. 
+ + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.full( + _num_samples(X), + self.constant_value, + dtype=np.array(self.constant_value).dtype, + ) + + def __repr__(self): + return "{0:.3g}**2".format(np.sqrt(self.constant_value)) + + +class WhiteKernel(StationaryKernelMixin, GenericKernelMixin, Kernel): + """White kernel. + + The main use-case of this kernel is as part of a sum-kernel where it + explains the noise of the signal as independently and identically + normally-distributed. The parameter noise_level equals the variance of this + noise. + + .. math:: + k(x_1, x_2) = noise\\_level \\text{ if } x_i == x_j \\text{ else } 0 + + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + noise_level : float, default=1.0 + Parameter controlling the noise level (variance) + + noise_level_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'noise_level'. + If set to "fixed", 'noise_level' cannot be changed during + hyperparameter tuning. + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = DotProduct() + WhiteKernel(noise_level=0.5) + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3680 + >>> gpr.predict(X[:2,:], return_std=True) + (array([653.0, 592.1 ]), array([316.6, 316.6])) + """ + + def __init__(self, noise_level=1.0, noise_level_bounds=(1e-5, 1e5)): + self.noise_level = noise_level + self.noise_level_bounds = noise_level_bounds + + @property + def hyperparameter_noise_level(self): + return Hyperparameter("noise_level", "numeric", self.noise_level_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Left argument of the returned kernel k(X, Y) + + Y : array-like of shape (n_samples_X, n_features) or list of object,\ + default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + is evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when eval_gradient + is True. + """ + if Y is not None and eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + + if Y is None: + K = self.noise_level * np.eye(_num_samples(X)) + if eval_gradient: + if not self.hyperparameter_noise_level.fixed: + return ( + K, + self.noise_level * np.eye(_num_samples(X))[:, :, np.newaxis], + ) + else: + return K, np.empty((_num_samples(X), _num_samples(X), 0)) + else: + return K + else: + return np.zeros((_num_samples(X), _num_samples(Y))) + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). 
+ + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) or list of object + Argument to the kernel. + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + return np.full( + _num_samples(X), self.noise_level, dtype=np.array(self.noise_level).dtype + ) + + def __repr__(self): + return "{0}(noise_level={1:.3g})".format( + self.__class__.__name__, self.noise_level + ) + + +class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + """Radial basis function kernel (aka squared-exponential kernel). + + The RBF kernel is a stationary kernel. It is also known as the + "squared exponential" kernel. It is parameterized by a length scale + parameter :math:`l>0`, which can either be a scalar (isotropic variant + of the kernel) or a vector with the same number of dimensions as the inputs + X (anisotropic variant of the kernel). The kernel is given by: + + .. math:: + k(x_i, x_j) = \\exp\\left(- \\frac{d(x_i, x_j)^2}{2l^2} \\right) + + where :math:`l` is the length scale of the kernel and + :math:`d(\\cdot,\\cdot)` is the Euclidean distance. + For advice on how to set the length scale parameter, see e.g. [1]_. + + This kernel is infinitely differentiable, which implies that GPs with this + kernel as covariance function have mean square derivatives of all orders, + and are thus very smooth. + See [2]_, Chapter 4, Section 4.2, for further details of the RBF kernel. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + length_scale : float or ndarray of shape (n_features,), default=1.0 + The length scale of the kernel. If a float, an isotropic kernel is + used. If an array, an anisotropic kernel is used where each dimension + of l defines the length-scale of the respective feature dimension. + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + References + ---------- + .. [1] `David Duvenaud (2014). "The Kernel Cookbook: + Advice on Covariance functions". + `_ + + .. [2] `Carl Edward Rasmussen, Christopher K. I. Williams (2006). + "Gaussian Processes for Machine Learning". The MIT Press. + `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import RBF + >>> X, y = load_iris(return_X_y=True) + >>> kernel = 1.0 * RBF(1.0) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... 
random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9866 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8354, 0.03228, 0.1322], + [0.7906, 0.0652, 0.1441]]) + """ + + def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5)): + self.length_scale = length_scale + self.length_scale_bounds = length_scale_bounds + + @property + def anisotropic(self): + return np.iterable(self.length_scale) and len(self.length_scale) > 1 + + @property + def hyperparameter_length_scale(self): + if self.anisotropic: + return Hyperparameter( + "length_scale", + "numeric", + self.length_scale_bounds, + len(self.length_scale), + ) + return Hyperparameter("length_scale", "numeric", self.length_scale_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + length_scale = _check_length_scale(X, self.length_scale) + if Y is None: + dists = pdist(X / length_scale, metric="sqeuclidean") + K = np.exp(-0.5 * dists) + # convert from upper-triangular matrix to square matrix + K = squareform(K) + np.fill_diagonal(K, 1) + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X / length_scale, Y / length_scale, metric="sqeuclidean") + K = np.exp(-0.5 * dists) + + if eval_gradient: + if self.hyperparameter_length_scale.fixed: + # Hyperparameter l kept fixed + return K, np.empty((X.shape[0], X.shape[0], 0)) + elif not self.anisotropic or length_scale.shape[0] == 1: + K_gradient = (K * squareform(dists))[:, :, np.newaxis] + return K, K_gradient + elif self.anisotropic: + # We need to recompute the pairwise dimension-wise distances + K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / ( + length_scale**2 + ) + K_gradient *= K[..., np.newaxis] + return K, K_gradient + else: + return K + + def __repr__(self): + if self.anisotropic: + return "{0}(length_scale=[{1}])".format( + self.__class__.__name__, + ", ".join(map("{0:.3g}".format, self.length_scale)), + ) + else: # isotropic + return "{0}(length_scale={1:.3g})".format( + self.__class__.__name__, np.ravel(self.length_scale)[0] + ) + + +class Matern(RBF): + """Matern kernel. + + The class of Matern kernels is a generalization of the :class:`RBF`. + It has an additional parameter :math:`\\nu` which controls the + smoothness of the resulting function. The smaller :math:`\\nu`, + the less smooth the approximated function is. + As :math:`\\nu\\rightarrow\\infty`, the kernel becomes equivalent to + the :class:`RBF` kernel. When :math:`\\nu = 1/2`, the Matérn kernel + becomes identical to the absolute exponential kernel. 
+ Important intermediate values are + :math:`\\nu=1.5` (once differentiable functions) + and :math:`\\nu=2.5` (twice differentiable functions). + + The kernel is given by: + + .. math:: + k(x_i, x_j) = \\frac{1}{\\Gamma(\\nu)2^{\\nu-1}}\\Bigg( + \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j ) + \\Bigg)^\\nu K_\\nu\\Bigg( + \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\\Bigg) + + + + where :math:`d(\\cdot,\\cdot)` is the Euclidean distance, + :math:`K_{\\nu}(\\cdot)` is a modified Bessel function and + :math:`\\Gamma(\\cdot)` is the gamma function. + See [1]_, Chapter 4, Section 4.2, for details regarding the different + variants of the Matern kernel. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + length_scale : float or ndarray of shape (n_features,), default=1.0 + The length scale of the kernel. If a float, an isotropic kernel is + used. If an array, an anisotropic kernel is used where each dimension + of l defines the length-scale of the respective feature dimension. + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + nu : float, default=1.5 + The parameter nu controlling the smoothness of the learned function. + The smaller nu, the less smooth the approximated function is. + For nu=inf, the kernel becomes equivalent to the RBF kernel and for + nu=0.5 to the absolute exponential kernel. Important intermediate + values are nu=1.5 (once differentiable functions) and nu=2.5 + (twice differentiable functions). Note that values of nu not in + [0.5, 1.5, 2.5, inf] incur a considerably higher computational cost + (appr. 10 times higher) since they require to evaluate the modified + Bessel function. Furthermore, in contrast to l, nu is kept fixed to + its initial value and not optimized. + + References + ---------- + .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006). + "Gaussian Processes for Machine Learning". The MIT Press. + `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import Matern + >>> X, y = load_iris(return_X_y=True) + >>> kernel = 1.0 * Matern(length_scale=1.0, nu=1.5) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9866 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8513, 0.0368, 0.1117], + [0.8086, 0.0693, 0.1220]]) + """ + + def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5), nu=1.5): + super().__init__(length_scale, length_scale_bounds) + self.nu = nu + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. 
+ + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + length_scale = _check_length_scale(X, self.length_scale) + if Y is None: + dists = pdist(X / length_scale, metric="euclidean") + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X / length_scale, Y / length_scale, metric="euclidean") + + if self.nu == 0.5: + K = np.exp(-dists) + elif self.nu == 1.5: + K = dists * math.sqrt(3) + K = (1.0 + K) * np.exp(-K) + elif self.nu == 2.5: + K = dists * math.sqrt(5) + K = (1.0 + K + K**2 / 3.0) * np.exp(-K) + elif self.nu == np.inf: + K = np.exp(-(dists**2) / 2.0) + else: # general case; expensive to evaluate + K = dists + K[K == 0.0] += np.finfo(float).eps # strict zeros result in nan + tmp = math.sqrt(2 * self.nu) * K + K.fill((2 ** (1.0 - self.nu)) / gamma(self.nu)) + K *= tmp**self.nu + K *= kv(self.nu, tmp) + + if Y is None: + # convert from upper-triangular matrix to square matrix + K = squareform(K) + np.fill_diagonal(K, 1) + + if eval_gradient: + if self.hyperparameter_length_scale.fixed: + # Hyperparameter l kept fixed + K_gradient = np.empty((X.shape[0], X.shape[0], 0)) + return K, K_gradient + + # We need to recompute the pairwise dimension-wise distances + if self.anisotropic: + D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / (length_scale**2) + else: + D = squareform(dists**2)[:, :, np.newaxis] + + if self.nu == 0.5: + denominator = np.sqrt(D.sum(axis=2))[:, :, np.newaxis] + divide_result = np.zeros_like(D) + np.divide( + D, + denominator, + out=divide_result, + where=denominator != 0, + ) + K_gradient = K[..., np.newaxis] * divide_result + elif self.nu == 1.5: + K_gradient = 3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis] + elif self.nu == 2.5: + tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis] + K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp) + elif self.nu == np.inf: + K_gradient = D * K[..., np.newaxis] + else: + # approximate gradient numerically + def f(theta): # helper function + return self.clone_with_theta(theta)(X, Y) + + return K, _approx_fprime(self.theta, f, 1e-10) + + if not self.anisotropic: + return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis] + else: + return K, K_gradient + else: + return K + + def __repr__(self): + if self.anisotropic: + return "{0}(length_scale=[{1}], nu={2:.3g})".format( + self.__class__.__name__, + ", ".join(map("{0:.3g}".format, self.length_scale)), + self.nu, + ) + else: + return "{0}(length_scale={1:.3g}, nu={2:.3g})".format( + self.__class__.__name__, np.ravel(self.length_scale)[0], self.nu + ) + + +class RationalQuadratic(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + """Rational Quadratic kernel. + + The RationalQuadratic kernel can be seen as a scale mixture (an infinite + sum) of RBF kernels with different characteristic length scales. It is + parameterized by a length scale parameter :math:`l>0` and a scale + mixture parameter :math:`\\alpha>0`. Only the isotropic variant + where length_scale :math:`l` is a scalar is supported at the moment. + The kernel is given by: + + .. 
math:: + k(x_i, x_j) = \\left( + 1 + \\frac{d(x_i, x_j)^2 }{ 2\\alpha l^2}\\right)^{-\\alpha} + + where :math:`\\alpha` is the scale mixture parameter, :math:`l` is + the length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the + Euclidean distance. + For advice on how to set the parameters, see e.g. [1]_. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + length_scale : float > 0, default=1.0 + The length scale of the kernel. + + alpha : float > 0, default=1.0 + Scale mixture parameter + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + alpha_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'alpha'. + If set to "fixed", 'alpha' cannot be changed during + hyperparameter tuning. + + References + ---------- + .. [1] `David Duvenaud (2014). "The Kernel Cookbook: + Advice on Covariance functions". + `_ + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import RationalQuadratic + >>> X, y = load_iris(return_X_y=True) + >>> kernel = RationalQuadratic(length_scale=1.0, alpha=1.5) + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9733 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8881, 0.0566, 0.05518], + [0.8678, 0.0707 , 0.0614]]) + """ + + def __init__( + self, + length_scale=1.0, + alpha=1.0, + length_scale_bounds=(1e-5, 1e5), + alpha_bounds=(1e-5, 1e5), + ): + self.length_scale = length_scale + self.alpha = alpha + self.length_scale_bounds = length_scale_bounds + self.alpha_bounds = alpha_bounds + + @property + def hyperparameter_length_scale(self): + return Hyperparameter("length_scale", "numeric", self.length_scale_bounds) + + @property + def hyperparameter_alpha(self): + return Hyperparameter("alpha", "numeric", self.alpha_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims) + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when eval_gradient + is True. 
+ """ + if len(np.atleast_1d(self.length_scale)) > 1: + raise AttributeError( + "RationalQuadratic kernel only supports isotropic version, " + "please use a single scalar for length_scale" + ) + X = np.atleast_2d(X) + if Y is None: + dists = squareform(pdist(X, metric="sqeuclidean")) + tmp = dists / (2 * self.alpha * self.length_scale**2) + base = 1 + tmp + K = base**-self.alpha + np.fill_diagonal(K, 1) + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X, Y, metric="sqeuclidean") + K = (1 + dists / (2 * self.alpha * self.length_scale**2)) ** -self.alpha + + if eval_gradient: + # gradient with respect to length_scale + if not self.hyperparameter_length_scale.fixed: + length_scale_gradient = dists * K / (self.length_scale**2 * base) + length_scale_gradient = length_scale_gradient[:, :, np.newaxis] + else: # l is kept fixed + length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + # gradient with respect to alpha + if not self.hyperparameter_alpha.fixed: + alpha_gradient = K * ( + -self.alpha * np.log(base) + + dists / (2 * self.length_scale**2 * base) + ) + alpha_gradient = alpha_gradient[:, :, np.newaxis] + else: # alpha is kept fixed + alpha_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + return K, np.dstack((alpha_gradient, length_scale_gradient)) + else: + return K + + def __repr__(self): + return "{0}(alpha={1:.3g}, length_scale={2:.3g})".format( + self.__class__.__name__, self.alpha, self.length_scale + ) + + +class ExpSineSquared(StationaryKernelMixin, NormalizedKernelMixin, Kernel): + r"""Exp-Sine-Squared kernel (aka periodic kernel). + + The ExpSineSquared kernel allows one to model functions which repeat + themselves exactly. It is parameterized by a length scale + parameter :math:`l>0` and a periodicity parameter :math:`p>0`. + Only the isotropic variant where :math:`l` is a scalar is + supported at the moment. The kernel is given by: + + .. math:: + k(x_i, x_j) = \text{exp}\left(- + \frac{ 2\sin^2(\pi d(x_i, x_j)/p) }{ l^ 2} \right) + + where :math:`l` is the length scale of the kernel, :math:`p` the + periodicity of the kernel and :math:`d(\cdot,\cdot)` is the + Euclidean distance. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + + length_scale : float > 0, default=1.0 + The length scale of the kernel. + + periodicity : float > 0, default=1.0 + The periodicity of the kernel. + + length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'length_scale'. + If set to "fixed", 'length_scale' cannot be changed during + hyperparameter tuning. + + periodicity_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'periodicity'. + If set to "fixed", 'periodicity' cannot be changed during + hyperparameter tuning. + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import ExpSineSquared + >>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0) + >>> kernel = ExpSineSquared(length_scale=1, periodicity=1) + >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5, + ... 
random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.0144 + >>> gpr.predict(X[:2,:], return_std=True) + (array([425.6, 457.5]), array([0.3894, 0.3467])) + """ + + def __init__( + self, + length_scale=1.0, + periodicity=1.0, + length_scale_bounds=(1e-5, 1e5), + periodicity_bounds=(1e-5, 1e5), + ): + self.length_scale = length_scale + self.periodicity = periodicity + self.length_scale_bounds = length_scale_bounds + self.periodicity_bounds = periodicity_bounds + + @property + def hyperparameter_length_scale(self): + """Returns the length scale""" + return Hyperparameter("length_scale", "numeric", self.length_scale_bounds) + + @property + def hyperparameter_periodicity(self): + return Hyperparameter("periodicity", "numeric", self.periodicity_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + if Y is None: + dists = squareform(pdist(X, metric="euclidean")) + arg = np.pi * dists / self.periodicity + sin_of_arg = np.sin(arg) + K = np.exp(-2 * (sin_of_arg / self.length_scale) ** 2) + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + dists = cdist(X, Y, metric="euclidean") + K = np.exp( + -2 * (np.sin(np.pi / self.periodicity * dists) / self.length_scale) ** 2 + ) + + if eval_gradient: + cos_of_arg = np.cos(arg) + # gradient with respect to length_scale + if not self.hyperparameter_length_scale.fixed: + length_scale_gradient = 4 / self.length_scale**2 * sin_of_arg**2 * K + length_scale_gradient = length_scale_gradient[:, :, np.newaxis] + else: # length_scale is kept fixed + length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0)) + # gradient with respect to p + if not self.hyperparameter_periodicity.fixed: + periodicity_gradient = ( + 4 * arg / self.length_scale**2 * cos_of_arg * sin_of_arg * K + ) + periodicity_gradient = periodicity_gradient[:, :, np.newaxis] + else: # p is kept fixed + periodicity_gradient = np.empty((K.shape[0], K.shape[1], 0)) + + return K, np.dstack((length_scale_gradient, periodicity_gradient)) + else: + return K + + def __repr__(self): + return "{0}(length_scale={1:.3g}, periodicity={2:.3g})".format( + self.__class__.__name__, self.length_scale, self.periodicity + ) + + +class DotProduct(Kernel): + r"""Dot-Product kernel. + + The DotProduct kernel is non-stationary and can be obtained from linear + regression by putting :math:`N(0, 1)` priors on the coefficients + of :math:`x_d (d = 1, . . . , D)` and a prior of :math:`N(0, \sigma_0^2)` + on the bias. The DotProduct kernel is invariant to a rotation of + the coordinates about the origin, but not translations. 
+ It is parameterized by a parameter sigma_0 :math:`\sigma` + which controls the inhomogenity of the kernel. For :math:`\sigma_0^2 =0`, + the kernel is called the homogeneous linear kernel, otherwise + it is inhomogeneous. The kernel is given by + + .. math:: + k(x_i, x_j) = \sigma_0 ^ 2 + x_i \cdot x_j + + The DotProduct kernel is commonly combined with exponentiation. + + See [1]_, Chapter 4, Section 4.2, for further details regarding the + DotProduct kernel. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + sigma_0 : float >= 0, default=1.0 + Parameter controlling the inhomogenity of the kernel. If sigma_0=0, + the kernel is homogeneous. + + sigma_0_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'sigma_0'. + If set to "fixed", 'sigma_0' cannot be changed during + hyperparameter tuning. + + References + ---------- + .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006). + "Gaussian Processes for Machine Learning". The MIT Press. + `_ + + Examples + -------- + >>> from sklearn.datasets import make_friedman2 + >>> from sklearn.gaussian_process import GaussianProcessRegressor + >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel + >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0) + >>> kernel = DotProduct() + WhiteKernel() + >>> gpr = GaussianProcessRegressor(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpr.score(X, y) + 0.3680 + >>> gpr.predict(X[:2,:], return_std=True) + (array([653.0, 592.1]), array([316.6, 316.6])) + """ + + def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, 1e5)): + self.sigma_0 = sigma_0 + self.sigma_0_bounds = sigma_0_bounds + + @property + def hyperparameter_sigma_0(self): + return Hyperparameter("sigma_0", "numeric", self.sigma_0_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + if Y is None: + K = np.inner(X, X) + self.sigma_0**2 + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + K = np.inner(X, Y) + self.sigma_0**2 + + if eval_gradient: + if not self.hyperparameter_sigma_0.fixed: + K_gradient = np.empty((K.shape[0], K.shape[1], 1)) + K_gradient[..., 0] = 2 * self.sigma_0**2 + return K, K_gradient + else: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y). + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X). + """ + return np.einsum("ij,ij->i", X, X) + self.sigma_0**2 + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return False + + def __repr__(self): + return "{0}(sigma_0={1:.3g})".format(self.__class__.__name__, self.sigma_0) + + +# adapted from scipy/optimize/optimize.py for functions with 2d output +def _approx_fprime(xk, f, epsilon, args=()): + f0 = f(*((xk,) + args)) + grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float) + ei = np.zeros((len(xk),), float) + for k in range(len(xk)): + ei[k] = 1.0 + d = epsilon * ei + grad[:, :, k] = (f(*((xk + d,) + args)) - f0) / d[k] + ei[k] = 0.0 + return grad + + +class PairwiseKernel(Kernel): + """Wrapper for kernels in sklearn.metrics.pairwise. + + A thin wrapper around the functionality of the kernels in + sklearn.metrics.pairwise. + + Note: Evaluation of eval_gradient is not analytic but numeric and all + kernels support only isotropic distances. The parameter gamma is + considered to be a hyperparameter and may be optimized. The other + kernel parameters are set directly at initialization and are kept + fixed. + + .. versionadded:: 0.18 + + Parameters + ---------- + gamma : float, default=1.0 + Parameter gamma of the pairwise kernel specified by metric. It should + be positive. + + gamma_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5) + The lower and upper bound on 'gamma'. + If set to "fixed", 'gamma' cannot be changed during + hyperparameter tuning. + + metric : {"linear", "additive_chi2", "chi2", "poly", "polynomial", \ + "rbf", "laplacian", "sigmoid", "cosine"} or callable, \ + default="linear" + The metric to use when calculating kernel between instances in a + feature array. If metric is a string, it must be one of the metrics + in pairwise.PAIRWISE_KERNEL_FUNCTIONS. + If metric is "precomputed", X is assumed to be a kernel matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. + + pairwise_kernels_kwargs : dict, default=None + All entries of this dict (if any) are passed as keyword arguments to + the pairwise kernel function. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.gaussian_process import GaussianProcessClassifier + >>> from sklearn.gaussian_process.kernels import PairwiseKernel + >>> X, y = load_iris(return_X_y=True) + >>> kernel = PairwiseKernel(metric='rbf') + >>> gpc = GaussianProcessClassifier(kernel=kernel, + ... random_state=0).fit(X, y) + >>> gpc.score(X, y) + 0.9733 + >>> gpc.predict_proba(X[:2,:]) + array([[0.8880, 0.05663, 0.05532], + [0.8676, 0.07073, 0.06165]]) + """ + + def __init__( + self, + gamma=1.0, + gamma_bounds=(1e-5, 1e5), + metric="linear", + pairwise_kernels_kwargs=None, + ): + self.gamma = gamma + self.gamma_bounds = gamma_bounds + self.metric = metric + self.pairwise_kernels_kwargs = pairwise_kernels_kwargs + + @property + def hyperparameter_gamma(self): + return Hyperparameter("gamma", "numeric", self.gamma_bounds) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + pairwise_kernels_kwargs = self.pairwise_kernels_kwargs + if self.pairwise_kernels_kwargs is None: + pairwise_kernels_kwargs = {} + + X = np.atleast_2d(X) + K = pairwise_kernels( + X, + Y, + metric=self.metric, + gamma=self.gamma, + filter_params=True, + **pairwise_kernels_kwargs, + ) + if eval_gradient: + if self.hyperparameter_gamma.fixed: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + # approximate gradient numerically + def f(gamma): # helper function + return pairwise_kernels( + X, + Y, + metric=self.metric, + gamma=np.exp(gamma), + filter_params=True, + **pairwise_kernels_kwargs, + ) + + return K, _approx_fprime(self.theta, f, 1e-10) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X) + """ + # We have to fall back to slow way of computing diagonal + return np.apply_along_axis(self, 1, X).ravel() + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return self.metric in ["rbf"] + + def __repr__(self): + return "{0}(gamma={1}, metric={2})".format( + self.__class__.__name__, self.gamma, self.metric + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/_mini_sequence_kernel.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/_mini_sequence_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..4667329aff9b8dbeffa90bb0c40c98a708fcc205 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/_mini_sequence_kernel.py @@ -0,0 +1,54 @@ +import numpy as np + +from sklearn.base import clone +from sklearn.gaussian_process.kernels import ( + GenericKernelMixin, + Hyperparameter, + Kernel, + StationaryKernelMixin, +) + + +class MiniSeqKernel(GenericKernelMixin, StationaryKernelMixin, Kernel): + """ + A minimal (but valid) convolutional kernel for sequences of variable + length. 
+ """ + + def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)): + self.baseline_similarity = baseline_similarity + self.baseline_similarity_bounds = baseline_similarity_bounds + + @property + def hyperparameter_baseline_similarity(self): + return Hyperparameter( + "baseline_similarity", "numeric", self.baseline_similarity_bounds + ) + + def _f(self, s1, s2): + return sum( + [1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2] + ) + + def _g(self, s1, s2): + return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2]) + + def __call__(self, X, Y=None, eval_gradient=False): + if Y is None: + Y = X + + if eval_gradient: + return ( + np.array([[self._f(x, y) for y in Y] for x in X]), + np.array([[[self._g(x, y)] for y in Y] for x in X]), + ) + else: + return np.array([[self._f(x, y) for y in Y] for x in X]) + + def diag(self, X): + return np.array([self._f(x, x) for x in X]) + + def clone_with_theta(self, theta): + cloned = clone(self) + cloned.theta = theta + return cloned diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpc.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpc.py new file mode 100644 index 0000000000000000000000000000000000000000..365b8f5a114417fdd2ab9979341ba95489c2b1d2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpc.py @@ -0,0 +1,320 @@ +"""Testing for Gaussian process classification""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np +import pytest +from scipy.optimize import approx_fprime + +from sklearn.exceptions import ConvergenceWarning +from sklearn.gaussian_process import GaussianProcessClassifier +from sklearn.gaussian_process.kernels import ( + RBF, + CompoundKernel, + WhiteKernel, +) +from sklearn.gaussian_process.kernels import ( + ConstantKernel as C, +) +from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel +from sklearn.utils._testing import assert_almost_equal, assert_array_equal + + +def f(x): + return np.sin(x) + + +X = np.atleast_2d(np.linspace(0, 10, 30)).T +X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T +y = np.array(f(X).ravel() > 0, dtype=int) +fX = f(X).ravel() +y_mc = np.empty(y.shape, dtype=int) # multi-class +y_mc[fX < -0.35] = 0 +y_mc[(fX >= -0.35) & (fX < 0.35)] = 1 +y_mc[fX > 0.35] = 2 + + +fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed") +kernels = [ + RBF(length_scale=0.1), + fixed_kernel, + RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), +] +non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel] + + +@pytest.mark.parametrize("kernel", kernels) +def test_predict_consistent(kernel): + # Check binary predict decision has also predicted probability above 0.5. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5) + + +def test_predict_consistent_structured(): + # Check binary predict decision has also predicted probability above 0.5. 
+ X = ["A", "AB", "B"] + y = np.array([True, False, True]) + kernel = MiniSeqKernel(baseline_similarity_bounds="fixed") + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_lml_improving(kernel): + # Test that hyperparameter-tuning improves log-marginal likelihood. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert gpc.log_marginal_likelihood(gpc.kernel_.theta) > gpc.log_marginal_likelihood( + kernel.theta + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_precomputed(kernel): + # Test that lml of optimized kernel is stored correctly. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + assert_almost_equal( + gpc.log_marginal_likelihood(gpc.kernel_.theta), gpc.log_marginal_likelihood(), 7 + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_without_cloning_kernel(kernel): + # Test that clone_kernel=False has side-effects of kernel.theta. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + input_theta = np.ones(gpc.kernel_.theta.shape, dtype=np.float64) + + gpc.log_marginal_likelihood(input_theta, clone_kernel=False) + assert_almost_equal(gpc.kernel_.theta, input_theta, 7) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_converged_to_local_maximum(kernel): + # Test that we are in local maximum after hyperparameter-optimization. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpc.log_marginal_likelihood(gpc.kernel_.theta, True) + + assert np.all( + (np.abs(lml_gradient) < 1e-4) + | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 0]) + | (gpc.kernel_.theta == gpc.kernel_.bounds[:, 1]) + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_gradient(kernel): + # Compare analytic and numeric gradient of log marginal likelihood. + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpc.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = approx_fprime( + kernel.theta, lambda theta: gpc.log_marginal_likelihood(theta, False), 1e-10 + ) + + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + + +def test_random_starts(global_random_seed): + # Test that an increasing number of random-starts of GP fitting only + # increases the log marginal likelihood of the chosen theta. + n_samples, n_features = 25, 2 + rng = np.random.RandomState(global_random_seed) + X = rng.randn(n_samples, n_features) * 2 - 1 + y = (np.sin(X).sum(axis=1) + np.sin(3 * X).sum(axis=1)) > 0 + + kernel = C(1.0, (1e-2, 1e2)) * RBF( + length_scale=[1e-3] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features + ) + last_lml = -np.inf + for n_restarts_optimizer in range(5): + gp = GaussianProcessClassifier( + kernel=kernel, + n_restarts_optimizer=n_restarts_optimizer, + random_state=global_random_seed, + ).fit(X, y) + lml = gp.log_marginal_likelihood(gp.kernel_.theta) + assert lml > last_lml - np.finfo(np.float32).eps + last_lml = lml + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_custom_optimizer(kernel, global_random_seed): + # Test that GPC can use externally defined optimizers. 
+ # Define a dummy optimizer that simply tests 10 random hyperparameters + def optimizer(obj_func, initial_theta, bounds): + rng = np.random.RandomState(global_random_seed) + theta_opt, func_min = ( + initial_theta, + obj_func(initial_theta, eval_gradient=False), + ) + for _ in range(10): + theta = np.atleast_1d( + rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1])) + ) + f = obj_func(theta, eval_gradient=False) + if f < func_min: + theta_opt, func_min = theta, f + return theta_opt, func_min + + gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer) + gpc.fit(X, y_mc) + # Checks that optimizer improved marginal likelihood + assert gpc.log_marginal_likelihood( + gpc.kernel_.theta + ) >= gpc.log_marginal_likelihood(kernel.theta) + + +@pytest.mark.parametrize("kernel", kernels) +def test_multi_class(kernel): + # Test GPC for multi-class classification problems. + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) + + y_prob = gpc.predict_proba(X2) + assert_almost_equal(y_prob.sum(1), 1) + + y_pred = gpc.predict(X2) + assert_array_equal(np.argmax(y_prob, 1), y_pred) + + +@pytest.mark.parametrize("kernel", kernels) +def test_multi_class_n_jobs(kernel): + # Test that multi-class GPC produces identical results with n_jobs>1. + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y_mc) + + gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2) + gpc_2.fit(X, y_mc) + + y_prob = gpc.predict_proba(X2) + y_prob_2 = gpc_2.predict_proba(X2) + assert_almost_equal(y_prob, y_prob_2) + + +def test_warning_bounds(): + kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) + gpc = GaussianProcessClassifier(kernel=kernel) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpc.fit(X, y) + + kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF( + length_scale_bounds=[1e3, 1e5] + ) + gpc_sum = GaussianProcessClassifier(kernel=kernel_sum) + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpc_sum.fit(X, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k1__noise_level is close to the " + "specified upper bound 0.001. " + "Increasing the bound and calling " + "fit again may find a better value." + ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k2__length_scale is close to the " + "specified lower bound 1000.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + X_tile = np.tile(X, 2) + kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2]) + gpc_dims = GaussianProcessClassifier(kernel=kernel_dims) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpc_dims.fit(X_tile, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "length_scale is close to the " + "specified upper bound 100.0. " + "Increasing the bound and calling " + "fit again may find a better value." 
+ ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 1 of parameter " + "length_scale is close to the " + "specified upper bound 100.0. " + "Increasing the bound and calling " + "fit again may find a better value." + ) + + +@pytest.mark.parametrize( + "params, error_type, err_msg", + [ + ( + {"kernel": CompoundKernel(0)}, + ValueError, + "kernel cannot be a CompoundKernel", + ) + ], +) +def test_gpc_fit_error(params, error_type, err_msg): + """Check that expected error are raised during fit.""" + gpc = GaussianProcessClassifier(**params) + with pytest.raises(error_type, match=err_msg): + gpc.fit(X, y) + + +@pytest.mark.parametrize("kernel", kernels) +def test_gpc_latent_mean_and_variance_shape(kernel): + """Checks that the latent mean and variance have the right shape.""" + gpc = GaussianProcessClassifier(kernel=kernel) + gpc.fit(X, y) + + # Check that the latent mean and variance have the right shape + latent_mean, latent_variance = gpc.latent_mean_and_variance(X) + assert latent_mean.shape == (X.shape[0],) + assert latent_variance.shape == (X.shape[0],) + + +def test_gpc_latent_mean_and_variance_complain_on_more_than_2_classes(): + """Checks that the latent mean and variance have the right shape.""" + gpc = GaussianProcessClassifier(kernel=RBF()) + gpc.fit(X, y_mc) + + # Check that the latent mean and variance have the right shape + with pytest.raises( + ValueError, + match="Returning the mean and variance of the latent function f " + "is only supported for binary classification", + ): + gpc.latent_mean_and_variance(X) + + +def test_latent_mean_and_variance_works_on_structured_kernels(): + X = ["A", "AB", "B"] + y = np.array([True, False, True]) + kernel = MiniSeqKernel(baseline_similarity_bounds="fixed") + gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) + + gpc.latent_mean_and_variance(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpr.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpr.py new file mode 100644 index 0000000000000000000000000000000000000000..f43cc3613b3ff7669aba9b73526fd774bfd8452e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_gpr.py @@ -0,0 +1,849 @@ +"""Testing for Gaussian process regression""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import sys +import warnings + +import numpy as np +import pytest +from scipy.optimize import approx_fprime + +from sklearn.exceptions import ConvergenceWarning +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import ( + RBF, + DotProduct, + ExpSineSquared, + WhiteKernel, +) +from sklearn.gaussian_process.kernels import ( + ConstantKernel as C, +) +from sklearn.gaussian_process.tests._mini_sequence_kernel import MiniSeqKernel +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_less, +) + + +def f(x): + return x * np.sin(x) + + +X = np.atleast_2d([1.0, 3.0, 5.0, 6.0, 7.0, 8.0]).T +X2 = np.atleast_2d([2.0, 4.0, 5.5, 6.5, 7.5]).T +y = f(X).ravel() + +fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed") +kernels = [ + RBF(length_scale=1.0), + fixed_kernel, + RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)), + C(1.0, (1e-2, 1e2)) * 
RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + + C(1e-5, (1e-5, 1e2)), + C(0.1, (1e-2, 1e2)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + + C(1e-5, (1e-5, 1e2)), +] +non_fixed_kernels = [kernel for kernel in kernels if kernel != fixed_kernel] + + +@pytest.mark.parametrize("kernel", kernels) +def test_gpr_interpolation(kernel): + if sys.maxsize <= 2**32: + pytest.xfail("This test may fail on 32 bit Python") + + # Test the interpolating property for different kernels. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_pred, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal(y_pred, y) + assert_almost_equal(np.diag(y_cov), 0.0) + + +def test_gpr_interpolation_structured(): + # Test the interpolating property for different kernels. + kernel = MiniSeqKernel(baseline_similarity_bounds="fixed") + X = ["A", "B", "C"] + y = np.array([1, 2, 3]) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_pred, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal( + kernel(X, eval_gradient=True)[1].ravel(), (1 - np.eye(len(X))).ravel() + ) + assert_almost_equal(y_pred, y) + assert_almost_equal(np.diag(y_cov), 0.0) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_lml_improving(kernel): + if sys.maxsize <= 2**32: + pytest.xfail("This test may fail on 32 bit Python") + + # Test that hyperparameter-tuning improves log-marginal likelihood. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood( + kernel.theta + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_precomputed(kernel): + # Test that lml of optimized kernel is stored correctly. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert gpr.log_marginal_likelihood(gpr.kernel_.theta) == pytest.approx( + gpr.log_marginal_likelihood() + ) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_without_cloning_kernel(kernel): + # Test that lml of optimized kernel is stored correctly. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + input_theta = np.ones(gpr.kernel_.theta.shape, dtype=np.float64) + + gpr.log_marginal_likelihood(input_theta, clone_kernel=False) + assert_almost_equal(gpr.kernel_.theta, input_theta, 7) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_converged_to_local_maximum(kernel): + # Test that we are in local maximum after hyperparameter-optimization. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpr.log_marginal_likelihood(gpr.kernel_.theta, True) + + assert np.all( + (np.abs(lml_gradient) < 1e-4) + | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 0]) + | (gpr.kernel_.theta == gpr.kernel_.bounds[:, 1]) + ) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_solution_inside_bounds(kernel): + # Test that hyperparameter-optimization remains in bounds# + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + bounds = gpr.kernel_.bounds + max_ = np.finfo(gpr.kernel_.theta.dtype).max + tiny = 1e-10 + bounds[~np.isfinite(bounds[:, 1]), 1] = max_ + + assert_array_less(bounds[:, 0], gpr.kernel_.theta + tiny) + assert_array_less(gpr.kernel_.theta, bounds[:, 1] + tiny) + + +@pytest.mark.parametrize("kernel", kernels) +def test_lml_gradient(kernel): + # Compare analytic and numeric gradient of log marginal likelihood. 
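+    # approx_fprime perturbs each entry of theta (the log-transformed
+    # hyperparameters) by epsilon=1e-10 and takes finite-difference slopes of the
+    # log-marginal likelihood; these should agree with the analytic gradient
+    # returned by log_marginal_likelihood(theta, eval_gradient=True).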
+ gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + lml, lml_gradient = gpr.log_marginal_likelihood(kernel.theta, True) + lml_gradient_approx = approx_fprime( + kernel.theta, lambda theta: gpr.log_marginal_likelihood(theta, False), 1e-10 + ) + + assert_almost_equal(lml_gradient, lml_gradient_approx, 3) + + +@pytest.mark.parametrize("kernel", kernels) +def test_prior(kernel): + # Test that GP prior has mean 0 and identical variances. + gpr = GaussianProcessRegressor(kernel=kernel) + + y_mean, y_cov = gpr.predict(X, return_cov=True) + + assert_almost_equal(y_mean, 0, 5) + if len(gpr.kernel.theta) > 1: + # XXX: quite hacky, works only for current kernels + assert_almost_equal(np.diag(y_cov), np.exp(kernel.theta[0]), 5) + else: + assert_almost_equal(np.diag(y_cov), 1, 5) + + +@pytest.mark.parametrize("kernel", kernels) +def test_sample_statistics(kernel): + # Test that statistics of samples drawn from GP are correct. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + + y_mean, y_cov = gpr.predict(X2, return_cov=True) + + samples = gpr.sample_y(X2, 300000) + + # More digits accuracy would require many more samples + assert_almost_equal(y_mean, np.mean(samples, 1), 1) + assert_almost_equal( + np.diag(y_cov) / np.diag(y_cov).max(), + np.var(samples, 1) / np.diag(y_cov).max(), + 1, + ) + + +def test_no_optimizer(): + # Test that kernel parameters are unmodified when optimizer is None. + kernel = RBF(1.0) + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y) + assert np.exp(gpr.kernel_.theta) == 1.0 + + +@pytest.mark.parametrize("kernel", kernels) +@pytest.mark.parametrize("target", [y, np.ones(X.shape[0], dtype=np.float64)]) +def test_predict_cov_vs_std(kernel, target): + if sys.maxsize <= 2**32: + pytest.xfail("This test may fail on 32 bit Python") + + # Test that predicted std.-dev. is consistent with cov's diagonal. + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + y_mean, y_cov = gpr.predict(X2, return_cov=True) + y_mean, y_std = gpr.predict(X2, return_std=True) + assert_almost_equal(np.sqrt(np.diag(y_cov)), y_std) + + +def test_anisotropic_kernel(): + # Test that GPR can identify meaningful anisotropic length-scales. + # We learn a function which varies in one dimension ten-times slower + # than in the other. The corresponding length-scales should differ by at + # least a factor 5 + rng = np.random.RandomState(0) + X = rng.uniform(-1, 1, (50, 2)) + y = X[:, 0] + 0.1 * X[:, 1] + + kernel = RBF([1.0, 1.0]) + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert np.exp(gpr.kernel_.theta[1]) > np.exp(gpr.kernel_.theta[0]) * 5 + + +def test_random_starts(): + # Test that an increasing number of random-starts of GP fitting only + # increases the log marginal likelihood of the chosen theta. 
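+    # The first optimizer run starts from the kernel's initial theta; each
+    # additional restart draws its starting point log-uniformly from the
+    # hyperparameter bounds, and the theta with the highest log-marginal
+    # likelihood is kept.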
+ n_samples, n_features = 25, 2 + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) * 2 - 1 + y = ( + np.sin(X).sum(axis=1) + + np.sin(3 * X).sum(axis=1) + + rng.normal(scale=0.1, size=n_samples) + ) + + kernel = C(1.0, (1e-2, 1e2)) * RBF( + length_scale=[1.0] * n_features, length_scale_bounds=[(1e-4, 1e2)] * n_features + ) + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-5, 1e1)) + last_lml = -np.inf + for n_restarts_optimizer in range(5): + gp = GaussianProcessRegressor( + kernel=kernel, + n_restarts_optimizer=n_restarts_optimizer, + random_state=0, + ).fit(X, y) + lml = gp.log_marginal_likelihood(gp.kernel_.theta) + assert lml > last_lml - np.finfo(np.float32).eps + last_lml = lml + + +@pytest.mark.parametrize("kernel", kernels) +def test_y_normalization(kernel): + """ + Test normalization of the target values in GP + + Fitting non-normalizing GP on normalized y and fitting normalizing GP + on unnormalized y should yield identical results. Note that, here, + 'normalized y' refers to y that has been made zero mean and unit + variance. + + """ + + y_mean = np.mean(y) + y_std = np.std(y) + y_norm = (y - y_mean) / y_std + + # Fit non-normalizing GP on normalized y + gpr = GaussianProcessRegressor(kernel=kernel) + gpr.fit(X, y_norm) + + # Fit normalizing GP on unnormalized y + gpr_norm = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr_norm.fit(X, y) + + # Compare predicted mean, std-devs and covariances + y_pred, y_pred_std = gpr.predict(X2, return_std=True) + y_pred = y_pred * y_std + y_mean + y_pred_std = y_pred_std * y_std + y_pred_norm, y_pred_std_norm = gpr_norm.predict(X2, return_std=True) + + assert_almost_equal(y_pred, y_pred_norm) + assert_almost_equal(y_pred_std, y_pred_std_norm) + + _, y_cov = gpr.predict(X2, return_cov=True) + y_cov = y_cov * y_std**2 + _, y_cov_norm = gpr_norm.predict(X2, return_cov=True) + + assert_almost_equal(y_cov, y_cov_norm) + + +def test_large_variance_y(): + """ + Here we test that, when noramlize_y=True, our GP can produce a + sensible fit to training data whose variance is significantly + larger than unity. This test was made in response to issue #15612. + + GP predictions are verified against predictions that were made + using GPy which, here, is treated as the 'gold standard'. Note that we + only investigate the RBF kernel here, as that is what was used in the + GPy implementation. + + The following code can be used to recreate the GPy data: + + -------------------------------------------------------------------------- + import GPy + + kernel_gpy = GPy.kern.RBF(input_dim=1, lengthscale=1.) 
+ gpy = GPy.models.GPRegression(X, np.vstack(y_large), kernel_gpy) + gpy.optimize() + y_pred_gpy, y_var_gpy = gpy.predict(X2) + y_pred_std_gpy = np.sqrt(y_var_gpy) + -------------------------------------------------------------------------- + """ + + # Here we utilise a larger variance version of the training data + y_large = 10 * y + + # Standard GP with normalize_y=True + RBF_params = {"length_scale": 1.0} + kernel = RBF(**RBF_params) + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y_large) + y_pred, y_pred_std = gpr.predict(X2, return_std=True) + + # 'Gold standard' mean predictions from GPy + y_pred_gpy = np.array( + [15.16918303, -27.98707845, -39.31636019, 14.52605515, 69.18503589] + ) + + # 'Gold standard' std predictions from GPy + y_pred_std_gpy = np.array( + [7.78860962, 3.83179178, 0.63149951, 0.52745188, 0.86170042] + ) + + # Based on numerical experiments, it's reasonable to expect our + # GP's mean predictions to get within 7% of predictions of those + # made by GPy. + assert_allclose(y_pred, y_pred_gpy, rtol=0.07, atol=0) + + # Based on numerical experiments, it's reasonable to expect our + # GP's std predictions to get within 15% of predictions of those + # made by GPy. + assert_allclose(y_pred_std, y_pred_std_gpy, rtol=0.15, atol=0) + + +def test_y_multioutput(): + # Test that GPR can deal with multi-dimensional target values + y_2d = np.vstack((y, y * 2)).T + + # Test for fixed kernel that first dimension of 2d GP equals the output + # of 1d GP and that second dimension is twice as large + kernel = RBF(length_scale=1.0) + + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False) + gpr.fit(X, y) + + gpr_2d = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False) + gpr_2d.fit(X, y_2d) + + y_pred_1d, y_std_1d = gpr.predict(X2, return_std=True) + y_pred_2d, y_std_2d = gpr_2d.predict(X2, return_std=True) + _, y_cov_1d = gpr.predict(X2, return_cov=True) + _, y_cov_2d = gpr_2d.predict(X2, return_cov=True) + + assert_almost_equal(y_pred_1d, y_pred_2d[:, 0]) + assert_almost_equal(y_pred_1d, y_pred_2d[:, 1] / 2) + + # Standard deviation and covariance do not depend on output + for target in range(y_2d.shape[1]): + assert_almost_equal(y_std_1d, y_std_2d[..., target]) + assert_almost_equal(y_cov_1d, y_cov_2d[..., target]) + + y_sample_1d = gpr.sample_y(X2, n_samples=10) + y_sample_2d = gpr_2d.sample_y(X2, n_samples=10) + + assert y_sample_1d.shape == (5, 10) + assert y_sample_2d.shape == (5, 2, 10) + # Only the first target will be equal + assert_almost_equal(y_sample_1d, y_sample_2d[:, 0, :]) + + # Test hyperparameter optimization + for kernel in kernels: + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y) + + gpr_2d = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr_2d.fit(X, np.vstack((y, y)).T) + + assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4) + + +@pytest.mark.parametrize("kernel", non_fixed_kernels) +def test_custom_optimizer(kernel): + # Test that GPR can use externally defined optimizers. 
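+    # A minimal sketch of a gradient-based alternative (illustrative only, not
+    # used by this test): obj_func returns the objective and its gradient by
+    # default (eval_gradient=True), which is what scipy.optimize.minimize
+    # expects with jac=True.
+    def lbfgs_optimizer(obj_func, initial_theta, bounds):
+        from scipy.optimize import minimize
+
+        res = minimize(
+            obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
+        )
+        return res.x, res.fun
+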
+ # Define a dummy optimizer that simply tests 50 random hyperparameters + def optimizer(obj_func, initial_theta, bounds): + rng = np.random.RandomState(0) + theta_opt, func_min = ( + initial_theta, + obj_func(initial_theta, eval_gradient=False), + ) + for _ in range(50): + theta = np.atleast_1d( + rng.uniform(np.maximum(-2, bounds[:, 0]), np.minimum(1, bounds[:, 1])) + ) + f = obj_func(theta, eval_gradient=False) + if f < func_min: + theta_opt, func_min = theta, f + return theta_opt, func_min + + gpr = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer) + gpr.fit(X, y) + # Checks that optimizer improved marginal likelihood + assert gpr.log_marginal_likelihood(gpr.kernel_.theta) > gpr.log_marginal_likelihood( + gpr.kernel.theta + ) + + +def test_gpr_correct_error_message(): + X = np.arange(12).reshape(6, -1) + y = np.ones(6) + kernel = DotProduct() + gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0) + message = ( + "The kernel, %s, is not returning a " + "positive definite matrix. Try gradually increasing " + "the 'alpha' parameter of your " + "GaussianProcessRegressor estimator." % kernel + ) + with pytest.raises(np.linalg.LinAlgError, match=re.escape(message)): + gpr.fit(X, y) + + +@pytest.mark.parametrize("kernel", kernels) +def test_duplicate_input(kernel): + # Test GPR can handle two different output-values for the same input. + gpr_equal_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + gpr_similar_inputs = GaussianProcessRegressor(kernel=kernel, alpha=1e-2) + + X_ = np.vstack((X, X[0])) + y_ = np.hstack((y, y[0] + 1)) + gpr_equal_inputs.fit(X_, y_) + + X_ = np.vstack((X, X[0] + 1e-15)) + y_ = np.hstack((y, y[0] + 1)) + gpr_similar_inputs.fit(X_, y_) + + X_test = np.linspace(0, 10, 100)[:, None] + y_pred_equal, y_std_equal = gpr_equal_inputs.predict(X_test, return_std=True) + y_pred_similar, y_std_similar = gpr_similar_inputs.predict(X_test, return_std=True) + + assert_almost_equal(y_pred_equal, y_pred_similar) + assert_almost_equal(y_std_equal, y_std_similar) + + +def test_no_fit_default_predict(): + # Test that GPR predictions without fit does not break by default. + default_kernel = C(1.0, constant_value_bounds="fixed") * RBF( + 1.0, length_scale_bounds="fixed" + ) + gpr1 = GaussianProcessRegressor() + _, y_std1 = gpr1.predict(X, return_std=True) + _, y_cov1 = gpr1.predict(X, return_cov=True) + + gpr2 = GaussianProcessRegressor(kernel=default_kernel) + _, y_std2 = gpr2.predict(X, return_std=True) + _, y_cov2 = gpr2.predict(X, return_cov=True) + + assert_array_almost_equal(y_std1, y_std2) + assert_array_almost_equal(y_cov1, y_cov2) + + +def test_warning_bounds(): + kernel = RBF(length_scale_bounds=[1e-5, 1e-3]) + gpr = GaussianProcessRegressor(kernel=kernel) + warning_message = ( + "The optimal value found for dimension 0 of parameter " + "length_scale is close to the specified upper bound " + "0.001. Increasing the bound and calling fit again may " + "find a better value." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + gpr.fit(X, y) + + kernel_sum = WhiteKernel(noise_level_bounds=[1e-5, 1e-3]) + RBF( + length_scale_bounds=[1e3, 1e5] + ) + gpr_sum = GaussianProcessRegressor(kernel=kernel_sum) + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpr_sum.fit(X, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k1__noise_level is close to the " + "specified upper bound 0.001. " + "Increasing the bound and calling " + "fit again may find a better value." + ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "k2__length_scale is close to the " + "specified lower bound 1000.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + X_tile = np.tile(X, 2) + kernel_dims = RBF(length_scale=[1.0, 2.0], length_scale_bounds=[1e1, 1e2]) + gpr_dims = GaussianProcessRegressor(kernel=kernel_dims) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + gpr_dims.fit(X_tile, y) + + assert len(record) == 2 + + assert issubclass(record[0].category, ConvergenceWarning) + assert ( + record[0].message.args[0] == "The optimal value found for " + "dimension 0 of parameter " + "length_scale is close to the " + "specified lower bound 10.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + assert issubclass(record[1].category, ConvergenceWarning) + assert ( + record[1].message.args[0] == "The optimal value found for " + "dimension 1 of parameter " + "length_scale is close to the " + "specified lower bound 10.0. " + "Decreasing the bound and calling " + "fit again may find a better value." + ) + + +def test_bound_check_fixed_hyperparameter(): + # Regression test for issue #17943 + # Check that having a hyperparameter with fixed bounds doesn't cause an + # error + k1 = 50.0**2 * RBF(length_scale=50.0) # long term smooth rising trend + k2 = ExpSineSquared( + length_scale=1.0, periodicity=1.0, periodicity_bounds="fixed" + ) # seasonal component + kernel = k1 + k2 + GaussianProcessRegressor(kernel=kernel).fit(X, y) + + +@pytest.mark.parametrize("kernel", kernels) +def test_constant_target(kernel): + """Check that the std. dev. is set to 1 when normalizing a constant + feature. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18318 + NaN values were previously assigned to the target when scaling, due to the + null std. dev. of a constant target.
+ """ + y_constant = np.ones(X.shape[0], dtype=np.float64) + + gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) + gpr.fit(X, y_constant) + assert gpr._y_train_std == pytest.approx(1.0) + + y_pred, y_cov = gpr.predict(X, return_cov=True) + assert_allclose(y_pred, y_constant) + # set atol because we compare to zero + assert_allclose(np.diag(y_cov), 0.0, atol=1e-9) + + # Test multi-target data + n_samples, n_targets = X.shape[0], 2 + rng = np.random.RandomState(0) + y = np.concatenate( + [ + rng.normal(size=(n_samples, 1)), # non-constant target + np.full(shape=(n_samples, 1), fill_value=2), # constant target + ], + axis=1, + ) + + gpr.fit(X, y) + Y_pred, Y_cov = gpr.predict(X, return_cov=True) + + assert_allclose(Y_pred[:, 1], 2) + assert_allclose(np.diag(Y_cov[..., 1]), 0.0, atol=1e-9) + + assert Y_pred.shape == (n_samples, n_targets) + assert Y_cov.shape == (n_samples, n_samples, n_targets) + + +def test_gpr_consistency_std_cov_non_invertible_kernel(): + """Check the consistency between the returned std. dev. and the covariance. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19936 + Inconsistencies were observed when the kernel cannot be inverted (or + numerically stable). + """ + kernel = C(8.98576054e05, (1e-12, 1e12)) * RBF( + [5.91326520e02, 1.32584051e03], (1e-12, 1e12) + ) + WhiteKernel(noise_level=1e-5) + gpr = GaussianProcessRegressor(kernel=kernel, alpha=0, optimizer=None) + X_train = np.array( + [ + [0.0, 0.0], + [1.54919334, -0.77459667], + [-1.54919334, 0.0], + [0.0, -1.54919334], + [0.77459667, 0.77459667], + [-0.77459667, 1.54919334], + ] + ) + y_train = np.array( + [ + [-2.14882017e-10], + [-4.66975823e00], + [4.01823986e00], + [-1.30303674e00], + [-1.35760156e00], + [3.31215668e00], + ] + ) + gpr.fit(X_train, y_train) + X_test = np.array( + [ + [-1.93649167, -1.93649167], + [1.93649167, -1.93649167], + [-1.93649167, 1.93649167], + [1.93649167, 1.93649167], + ] + ) + pred1, std = gpr.predict(X_test, return_std=True) + pred2, cov = gpr.predict(X_test, return_cov=True) + assert_allclose(std, np.sqrt(np.diagonal(cov)), rtol=1e-5) + + +@pytest.mark.parametrize( + "params, TypeError, err_msg", + [ + ( + {"alpha": np.zeros(100)}, + ValueError, + "alpha must be a scalar or an array with same number of entries as y", + ), + ( + { + "kernel": WhiteKernel(noise_level_bounds=(-np.inf, np.inf)), + "n_restarts_optimizer": 2, + }, + ValueError, + "requires that all bounds are finite", + ), + ], +) +def test_gpr_fit_error(params, TypeError, err_msg): + """Check that expected error are raised during fit.""" + gpr = GaussianProcessRegressor(**params) + with pytest.raises(TypeError, match=err_msg): + gpr.fit(X, y) + + +def test_gpr_lml_error(): + """Check that we raise the proper error in the LML method.""" + gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y) + + err_msg = "Gradient can only be evaluated for theta!=None" + with pytest.raises(ValueError, match=err_msg): + gpr.log_marginal_likelihood(eval_gradient=True) + + +def test_gpr_predict_error(): + """Check that we raise the proper error during predict.""" + gpr = GaussianProcessRegressor(kernel=RBF()).fit(X, y) + + err_msg = "At most one of return_std or return_cov can be requested." 
+ with pytest.raises(RuntimeError, match=err_msg): + gpr.predict(X, return_cov=True, return_std=True) + + +@pytest.mark.parametrize("normalize_y", [True, False]) +@pytest.mark.parametrize("n_targets", [None, 1, 10]) +def test_predict_shapes(normalize_y, n_targets): + """Check the shapes of y_mean, y_std, and y_cov in single-output + (n_targets=None) and multi-output settings, including the edge case when + n_targets=1, where the sklearn convention is to squeeze the predictions. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/17394 + https://github.com/scikit-learn/scikit-learn/issues/18065 + https://github.com/scikit-learn/scikit-learn/issues/22174 + """ + rng = np.random.RandomState(1234) + + n_features, n_samples_train, n_samples_test = 6, 9, 7 + + y_train_shape = (n_samples_train,) + if n_targets is not None: + y_train_shape = y_train_shape + (n_targets,) + + # By convention single-output data is squeezed upon prediction + y_test_shape = (n_samples_test,) + if n_targets is not None and n_targets > 1: + y_test_shape = y_test_shape + (n_targets,) + + X_train = rng.randn(n_samples_train, n_features) + X_test = rng.randn(n_samples_test, n_features) + y_train = rng.randn(*y_train_shape) + + model = GaussianProcessRegressor(normalize_y=normalize_y) + model.fit(X_train, y_train) + + y_pred, y_std = model.predict(X_test, return_std=True) + _, y_cov = model.predict(X_test, return_cov=True) + + assert y_pred.shape == y_test_shape + assert y_std.shape == y_test_shape + assert y_cov.shape == (n_samples_test,) + y_test_shape + + +@pytest.mark.parametrize("normalize_y", [True, False]) +@pytest.mark.parametrize("n_targets", [None, 1, 10]) +def test_sample_y_shapes(normalize_y, n_targets): + """Check the shapes of y_samples in single-output (n_targets=None) and + multi-output settings, including the edge case when n_targets=1, where the + sklearn convention is to squeeze the predictions. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/22175 + """ + rng = np.random.RandomState(1234) + + n_features, n_samples_train = 6, 9 + # Number of spatial locations to predict at + n_samples_X_test = 7 + # Number of sample predictions per test point + n_samples_y_test = 5 + + y_train_shape = (n_samples_train,) + if n_targets is not None: + y_train_shape = y_train_shape + (n_targets,) + + # By convention single-output data is squeezed upon prediction + if n_targets is not None and n_targets > 1: + y_test_shape = (n_samples_X_test, n_targets, n_samples_y_test) + else: + y_test_shape = (n_samples_X_test, n_samples_y_test) + + X_train = rng.randn(n_samples_train, n_features) + X_test = rng.randn(n_samples_X_test, n_features) + y_train = rng.randn(*y_train_shape) + + model = GaussianProcessRegressor(normalize_y=normalize_y) + + # FIXME: before fitting, the estimator does not have information regarding + # the number of targets and defaults to 1. This is inconsistent with the shape + # provided after `fit`.
This assert should be made once the following issue + # is fixed: + # https://github.com/scikit-learn/scikit-learn/issues/22430 + # y_samples = model.sample_y(X_test, n_samples=n_samples_y_test) + # assert y_samples.shape == y_test_shape + + model.fit(X_train, y_train) + + y_samples = model.sample_y(X_test, n_samples=n_samples_y_test) + assert y_samples.shape == y_test_shape + + +@pytest.mark.parametrize("n_targets", [None, 1, 2, 3]) +@pytest.mark.parametrize("n_samples", [1, 5]) +def test_sample_y_shape_with_prior(n_targets, n_samples): + """Check the output shape of `sample_y` is consistent before and after `fit`.""" + rng = np.random.RandomState(1024) + + X = rng.randn(10, 3) + y = rng.randn(10, n_targets if n_targets is not None else 1) + + model = GaussianProcessRegressor(n_targets=n_targets) + shape_before_fit = model.sample_y(X, n_samples=n_samples).shape + model.fit(X, y) + shape_after_fit = model.sample_y(X, n_samples=n_samples).shape + assert shape_before_fit == shape_after_fit + + +@pytest.mark.parametrize("n_targets", [None, 1, 2, 3]) +def test_predict_shape_with_prior(n_targets): + """Check the output shape of `predict` with prior distribution.""" + rng = np.random.RandomState(1024) + + n_sample = 10 + X = rng.randn(n_sample, 3) + y = rng.randn(n_sample, n_targets if n_targets is not None else 1) + + model = GaussianProcessRegressor(n_targets=n_targets) + mean_prior, cov_prior = model.predict(X, return_cov=True) + _, std_prior = model.predict(X, return_std=True) + + model.fit(X, y) + mean_post, cov_post = model.predict(X, return_cov=True) + _, std_post = model.predict(X, return_std=True) + + assert mean_prior.shape == mean_post.shape + assert cov_prior.shape == cov_post.shape + assert std_prior.shape == std_post.shape + + +def test_n_targets_error(): + """Check that an error is raised when the number of targets seen at fit is + inconsistent with n_targets. + """ + rng = np.random.RandomState(0) + X = rng.randn(10, 3) + y = rng.randn(10, 2) + + model = GaussianProcessRegressor(n_targets=1) + with pytest.raises(ValueError, match="The number of targets seen in `y`"): + model.fit(X, y) + + +class CustomKernel(C): + """ + A custom kernel that has a diag method that returns the first column of the + input matrix X. This is a helper for the test to check that the input + matrix X is not mutated. + """ + + def diag(self, X): + return X[:, 0] + + +def test_gpr_predict_input_not_modified(): + """ + Check that the input X is not modified by the predict method of the + GaussianProcessRegressor when setting return_std=True. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24340 + """ + gpr = GaussianProcessRegressor(kernel=CustomKernel()).fit(X, y) + + X2_copy = np.copy(X2) + _, _ = gpr.predict(X2, return_std=True) + + assert_allclose(X2, X2_copy) diff --git a/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_kernels.py b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..5174d50b7df9210fbf67677ed5f18eaedf209ecc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/gaussian_process/tests/test_kernels.py @@ -0,0 +1,403 @@ +"""Testing for kernels for Gaussian processes.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from inspect import signature + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.gaussian_process.kernels import ( + RBF, + CompoundKernel, + ConstantKernel, + DotProduct, + Exponentiation, + ExpSineSquared, + KernelOperator, + Matern, + PairwiseKernel, + RationalQuadratic, + WhiteKernel, + _approx_fprime, +) +from sklearn.metrics.pairwise import ( + PAIRWISE_KERNEL_FUNCTIONS, + euclidean_distances, + pairwise_kernels, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +X = np.random.RandomState(0).normal(0, 1, (5, 2)) +Y = np.random.RandomState(0).normal(0, 1, (6, 2)) +# Set shared test data as read-only to avoid unintentional in-place +# modifications that would introduce side-effects between tests. +X.flags.writeable = False +Y.flags.writeable = False + +kernel_rbf_plus_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0) +kernels = [ + RBF(length_scale=2.0), + RBF(length_scale_bounds=(0.5, 2.0)), + ConstantKernel(constant_value=10.0), + 2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"), + 2.0 * RBF(length_scale=0.5), + kernel_rbf_plus_white, + 2.0 * RBF(length_scale=[0.5, 2.0]), + 2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"), + 2.0 * Matern(length_scale=0.5, nu=0.5), + 2.0 * Matern(length_scale=1.5, nu=1.5), + 2.0 * Matern(length_scale=2.5, nu=2.5), + 2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5), + 3.0 * Matern(length_scale=[2.0, 0.5], nu=1.5), + 4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5), + RationalQuadratic(length_scale=0.5, alpha=1.5), + ExpSineSquared(length_scale=0.5, periodicity=1.5), + DotProduct(sigma_0=2.0), + DotProduct(sigma_0=2.0) ** 2, + RBF(length_scale=[2.0]), + Matern(length_scale=[2.0]), +] +for metric in PAIRWISE_KERNEL_FUNCTIONS: + if metric in ["additive_chi2", "chi2"]: + continue + kernels.append(PairwiseKernel(gamma=1.0, metric=metric)) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_gradient(kernel): + # Compare analytic and numeric gradient of kernels. 
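+    # The analytic gradient returned by kernel(X, eval_gradient=True) is checked
+    # entry-wise against a finite-difference approximation from _approx_fprime.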
+ kernel = clone(kernel) # make tests independent of one-another + K, K_gradient = kernel(X, eval_gradient=True) + + assert K_gradient.shape[0] == X.shape[0] + assert K_gradient.shape[1] == X.shape[0] + assert K_gradient.shape[2] == kernel.theta.shape[0] + + def eval_kernel_for_theta(theta): + kernel_clone = kernel.clone_with_theta(theta) + K = kernel_clone(X, eval_gradient=False) + return K + + K_gradient_approx = _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10) + + assert_almost_equal(K_gradient, K_gradient_approx, 4) + + +@pytest.mark.parametrize( + "kernel", + [ + kernel + for kernel in kernels + # skip non-basic kernels + if not (isinstance(kernel, (KernelOperator, Exponentiation))) + ], +) +def test_kernel_theta(kernel): + # Check that parameter vector theta of kernel is set correctly. + kernel = clone(kernel) # make tests independent of one-another + theta = kernel.theta + _, K_gradient = kernel(X, eval_gradient=True) + + # Determine kernel parameters that contribute to theta + init_sign = signature(kernel.__class__.__init__).parameters.values() + args = [p.name for p in init_sign if p.name != "self"] + theta_vars = map( + lambda s: s[0 : -len("_bounds")], filter(lambda s: s.endswith("_bounds"), args) + ) + assert set(hyperparameter.name for hyperparameter in kernel.hyperparameters) == set( + theta_vars + ) + + # Check that values returned in theta are consistent with + # hyperparameter values (being their logarithms) + for i, hyperparameter in enumerate(kernel.hyperparameters): + assert theta[i] == np.log(getattr(kernel, hyperparameter.name)) + + # Fixed kernel parameters must be excluded from theta and gradient. + for i, hyperparameter in enumerate(kernel.hyperparameters): + # create copy with certain hyperparameter fixed + params = kernel.get_params() + params[hyperparameter.name + "_bounds"] = "fixed" + kernel_class = kernel.__class__ + new_kernel = kernel_class(**params) + # Check that theta and K_gradient are identical with the fixed + # dimension left out + _, K_gradient_new = new_kernel(X, eval_gradient=True) + assert theta.shape[0] == new_kernel.theta.shape[0] + 1 + assert K_gradient.shape[2] == K_gradient_new.shape[2] + 1 + if i > 0: + assert theta[:i] == new_kernel.theta[:i] + assert_array_equal(K_gradient[..., :i], K_gradient_new[..., :i]) + if i + 1 < len(kernel.hyperparameters): + assert theta[i + 1 :] == new_kernel.theta[i:] + assert_array_equal(K_gradient[..., i + 1 :], K_gradient_new[..., i:]) + + # Check that values of theta are modified correctly + for i, hyperparameter in enumerate(kernel.hyperparameters): + theta[i] = np.log(42) + kernel.theta = theta + assert_almost_equal(getattr(kernel, hyperparameter.name), 42) + + setattr(kernel, hyperparameter.name, 43) + assert_almost_equal(kernel.theta[i], np.log(43)) + + +@pytest.mark.parametrize( + "kernel", + [ + kernel + for kernel in kernels + # Identity is not satisfied on diagonal + if kernel != kernel_rbf_plus_white + ], +) +def test_auto_vs_cross(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Auto-correlation and cross-correlation should be consistent. + K_auto = kernel(X) + K_cross = kernel(X, X) + assert_almost_equal(K_auto, K_cross, 5) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_diag(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test that diag method of kernel returns consistent results. 
+ K_call_diag = np.diag(kernel(X)) + K_diag = kernel.diag(X) + assert_almost_equal(K_call_diag, K_diag, 5) + + +def test_kernel_operator_commutative(): + # Adding kernels and multiplying kernels should be commutative. + # Check addition + assert_almost_equal((RBF(2.0) + 1.0)(X), (1.0 + RBF(2.0))(X)) + + # Check multiplication + assert_almost_equal((3.0 * RBF(2.0))(X), (RBF(2.0) * 3.0)(X)) + + +def test_kernel_anisotropic(): + # Anisotropic kernel should be consistent with isotropic kernels. + kernel = 3.0 * RBF([0.5, 2.0]) + + K = kernel(X) + X1 = X.copy() + X1[:, 0] *= 4 + K1 = 3.0 * RBF(2.0)(X1) + assert_almost_equal(K, K1) + + X2 = X.copy() + X2[:, 1] /= 4 + K2 = 3.0 * RBF(0.5)(X2) + assert_almost_equal(K, K2) + + # Check getting and setting via theta + kernel.theta = kernel.theta + np.log(2) + assert_array_equal(kernel.theta, np.log([6.0, 1.0, 4.0])) + assert_array_equal(kernel.k2.length_scale, [1.0, 4.0]) + + +@pytest.mark.parametrize( + "kernel", [kernel for kernel in kernels if kernel.is_stationary()] +) +def test_kernel_stationary(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test stationarity of kernels. + K = kernel(X, X + 1) + assert_almost_equal(K[0, 0], np.diag(K)) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_input_type(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test whether kernels is for vectors or structured data + if isinstance(kernel, Exponentiation): + assert kernel.requires_vector_input == kernel.kernel.requires_vector_input + if isinstance(kernel, KernelOperator): + assert kernel.requires_vector_input == ( + kernel.k1.requires_vector_input or kernel.k2.requires_vector_input + ) + + +def test_compound_kernel_input_type(): + kernel = CompoundKernel([WhiteKernel(noise_level=3.0)]) + assert not kernel.requires_vector_input + + kernel = CompoundKernel([WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)]) + assert kernel.requires_vector_input + + +def check_hyperparameters_equal(kernel1, kernel2): + # Check that hyperparameters of two kernels are equal + for attr in set(dir(kernel1) + dir(kernel2)): + if attr.startswith("hyperparameter_"): + attr_value1 = getattr(kernel1, attr) + attr_value2 = getattr(kernel2, attr) + assert attr_value1 == attr_value2 + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_clone(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Test that sklearn's clone works correctly on kernels. + kernel_cloned = clone(kernel) + + # XXX: Should this be fixed? + # This differs from the sklearn's estimators equality check. + assert kernel == kernel_cloned + assert id(kernel) != id(kernel_cloned) + + # Check that all constructor parameters are equal. + assert kernel.get_params() == kernel_cloned.get_params() + + # Check that all hyperparameters are equal. + check_hyperparameters_equal(kernel, kernel_cloned) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_clone_after_set_params(kernel): + kernel = clone(kernel) # make tests independent of one-another + # This test is to verify that using set_params does not + # break clone on kernels. + # This used to break because in kernels such as the RBF, non-trivial + # logic that modified the length scale used to be in the constructor + # See https://github.com/scikit-learn/scikit-learn/issues/6961 + # for more details. + bounds = (1e-5, 1e5) + kernel_cloned = clone(kernel) + params = kernel.get_params() + # RationalQuadratic kernel is isotropic. 
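+    # ExpSineSquared and RationalQuadratic accept only a scalar length_scale,
+    # so they are excluded from the anisotropic length_scale handling below.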
+ isotropic_kernels = (ExpSineSquared, RationalQuadratic) + if "length_scale" in params and not isinstance(kernel, isotropic_kernels): + length_scale = params["length_scale"] + if np.iterable(length_scale): + # XXX unreached code as of v0.22 + params["length_scale"] = length_scale[0] + params["length_scale_bounds"] = bounds + else: + params["length_scale"] = [length_scale] * 2 + params["length_scale_bounds"] = bounds * 2 + kernel_cloned.set_params(**params) + kernel_cloned_clone = clone(kernel_cloned) + assert kernel_cloned_clone.get_params() == kernel_cloned.get_params() + assert id(kernel_cloned_clone) != id(kernel_cloned) + check_hyperparameters_equal(kernel_cloned, kernel_cloned_clone) + + +def test_matern_kernel(): + # Test consistency of Matern kernel for special values of nu. + K = Matern(nu=1.5, length_scale=1.0)(X) + # the diagonal elements of a matern kernel are 1 + assert_array_almost_equal(np.diag(K), np.ones(X.shape[0])) + # matern kernel for coef0==0.5 is equal to absolute exponential kernel + K_absexp = np.exp(-euclidean_distances(X, X, squared=False)) + K = Matern(nu=0.5, length_scale=1.0)(X) + assert_array_almost_equal(K, K_absexp) + # matern kernel with coef0==inf is equal to RBF kernel + K_rbf = RBF(length_scale=1.0)(X) + K = Matern(nu=np.inf, length_scale=1.0)(X) + assert_array_almost_equal(K, K_rbf) + assert_allclose(K, K_rbf) + # test that special cases of matern kernel (coef0 in [0.5, 1.5, 2.5]) + # result in nearly identical results as the general case for coef0 in + # [0.5 + tiny, 1.5 + tiny, 2.5 + tiny] + tiny = 1e-10 + for nu in [0.5, 1.5, 2.5]: + K1 = Matern(nu=nu, length_scale=1.0)(X) + K2 = Matern(nu=nu + tiny, length_scale=1.0)(X) + assert_array_almost_equal(K1, K2) + # test that coef0==large is close to RBF + large = 100 + K1 = Matern(nu=large, length_scale=1.0)(X) + K2 = RBF(length_scale=1.0)(X) + assert_array_almost_equal(K1, K2, decimal=2) + + +@pytest.mark.parametrize("kernel", kernels) +def test_kernel_versus_pairwise(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Check that GP kernels can also be used as pairwise kernels. + + # Test auto-kernel + if kernel != kernel_rbf_plus_white: + # For WhiteKernel: k(X) != k(X,X). This is assumed by + # pairwise_kernels + K1 = kernel(X) + K2 = pairwise_kernels(X, metric=kernel) + assert_array_almost_equal(K1, K2) + + # Test cross-kernel + K1 = kernel(X, Y) + K2 = pairwise_kernels(X, Y, metric=kernel) + assert_array_almost_equal(K1, K2) + + +@pytest.mark.parametrize("kernel", kernels) +def test_set_get_params(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Check that set_params()/get_params() is consistent with kernel.theta. 
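+    # Reminder: kernel.theta holds the log-transformed hyperparameter values,
+    # hence the np.exp calls below when comparing against get_params().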
+ + # Test get_params() + index = 0 + params = kernel.get_params() + for hyperparameter in kernel.hyperparameters: + if isinstance("string", type(hyperparameter.bounds)): + if hyperparameter.bounds == "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + assert_almost_equal( + np.exp(kernel.theta[index : index + size]), params[hyperparameter.name] + ) + index += size + else: + assert_almost_equal( + np.exp(kernel.theta[index]), params[hyperparameter.name] + ) + index += 1 + # Test set_params() + index = 0 + value = 10 # arbitrary value + for hyperparameter in kernel.hyperparameters: + if isinstance("string", type(hyperparameter.bounds)): + if hyperparameter.bounds == "fixed": + continue + size = hyperparameter.n_elements + if size > 1: # anisotropic kernels + kernel.set_params(**{hyperparameter.name: [value] * size}) + assert_almost_equal( + np.exp(kernel.theta[index : index + size]), [value] * size + ) + index += size + else: + kernel.set_params(**{hyperparameter.name: value}) + assert_almost_equal(np.exp(kernel.theta[index]), value) + index += 1 + + +@pytest.mark.parametrize("kernel", kernels) +def test_repr_kernels(kernel): + kernel = clone(kernel) # make tests independent of one-another + # Smoke-test for repr in kernels. + + repr(kernel) + + +def test_rational_quadratic_kernel(): + kernel = RationalQuadratic(length_scale=[1.0, 1.0]) + message = ( + "RationalQuadratic kernel only supports isotropic " + "version, please use a single " + "scalar for length_scale" + ) + with pytest.raises(AttributeError, match=message): + kernel(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/impute/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aaa81d73c34a19004645733c05ea362aae8dcb01 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/__init__.py @@ -0,0 +1,28 @@ +"""Transformers for missing value imputation.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import typing + +from ._base import MissingIndicator, SimpleImputer +from ._knn import KNNImputer + +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._iterative import IterativeImputer # noqa: F401 + +__all__ = ["KNNImputer", "MissingIndicator", "SimpleImputer"] + + +# TODO: remove this check once the estimator is no longer experimental. +def __getattr__(name): + if name == "IterativeImputer": + raise ImportError( + f"{name} is experimental and the API might change without any " + "deprecation cycle. 
To use it, you need to explicitly import " + "enable_iterative_imputer:\n" + "from sklearn.experimental import enable_iterative_imputer" + ) + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/_base.py b/.venv/lib/python3.12/site-packages/sklearn/impute/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..ae74068145678bd362296a367007371a5a353a95 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/_base.py @@ -0,0 +1,1155 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from collections import Counter +from functools import partial +from typing import Callable + +import numpy as np +import numpy.ma as ma +from scipy import sparse as sp + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._mask import _get_mask +from ..utils._missing import is_pandas_na, is_scalar_nan +from ..utils._param_validation import MissingValues, StrOptions +from ..utils.fixes import _mode +from ..utils.sparsefuncs import _get_median +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _check_n_features, + check_is_fitted, + validate_data, +) + + +def _check_inputs_dtype(X, missing_values): + if is_pandas_na(missing_values): + # Allow using `pd.NA` as missing values to impute numerical arrays. + return + if X.dtype.kind in ("f", "i", "u") and not isinstance(missing_values, numbers.Real): + raise ValueError( + "'X' and 'missing_values' types are expected to be" + " both numerical. Got X.dtype={} and " + " type(missing_values)={}.".format(X.dtype, type(missing_values)) + ) + + +def _safe_min(items): + """Compute the minimum of a list of potentially non-comparable values. + + If values cannot be directly compared due to type incompatibility, the object with + the lowest string representation is returned. + """ + try: + return min(items) + except TypeError as e: + if "'<' not supported between" in str(e): + return min(items, key=lambda x: (str(type(x)), str(x))) + raise # pragma: no cover + + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + # Compute the most frequent value in array only + if array.size > 0: + if array.dtype == object: + # scipy.stats.mode is slow with object dtype array. + # Python Counter is more efficient + counter = Counter(array) + most_frequent_count = counter.most_common(1)[0][1] + # tie breaking similarly to scipy.stats.mode + most_frequent_value = _safe_min( + [ + value + for value, count in counter.items() + if count == most_frequent_count + ] + ) + else: + mode = _mode(array) + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # tie breaking similarly to scipy.stats.mode + return _safe_min([most_frequent_value, extra_value]) + + +class _BaseImputer(TransformerMixin, BaseEstimator): + """Base class for all imputers. + + It adds automatically support for `add_indicator`. 
+ """ + + _parameter_constraints: dict = { + "missing_values": [MissingValues()], + "add_indicator": ["boolean"], + "keep_empty_features": ["boolean"], + } + + def __init__( + self, *, missing_values=np.nan, add_indicator=False, keep_empty_features=False + ): + self.missing_values = missing_values + self.add_indicator = add_indicator + self.keep_empty_features = keep_empty_features + + def _fit_indicator(self, X): + """Fit a MissingIndicator.""" + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values, error_on_new=False + ) + self.indicator_._fit(X, precomputed=True) + else: + self.indicator_ = None + + def _transform_indicator(self, X): + """Compute the indicator mask.' + + Note that X must be the original data as passed to the imputer before + any imputation, since imputation may be done inplace in some cases. + """ + if self.add_indicator: + if not hasattr(self, "indicator_"): + raise ValueError( + "Make sure to call _fit_indicator before _transform_indicator" + ) + return self.indicator_.transform(X) + + def _concatenate_indicator(self, X_imputed, X_indicator): + """Concatenate indicator mask with the imputed data.""" + if not self.add_indicator: + return X_imputed + + if sp.issparse(X_imputed): + # sp.hstack may result in different formats between sparse arrays and + # matrices; specify the format to keep consistent behavior + hstack = partial(sp.hstack, format=X_imputed.format) + else: + hstack = np.hstack + + if X_indicator is None: + raise ValueError( + "Data from the missing indicator are not provided. Call " + "_fit_indicator and _transform_indicator in the imputer " + "implementation." + ) + + return hstack((X_imputed, X_indicator)) + + def _concatenate_indicator_feature_names_out(self, names, input_features): + if not self.add_indicator: + return names + + indicator_names = self.indicator_.get_feature_names_out(input_features) + return np.concatenate([names, indicator_names]) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = is_scalar_nan(self.missing_values) + return tags + + +class SimpleImputer(_BaseImputer): + """Univariate imputer for completing missing values with simple strategies. + + Replace missing values using a descriptive statistic (e.g. mean, median, or + most frequent) along each column, or using a constant value. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer` + estimator which is now removed. + + Parameters + ---------- + missing_values : int, float, str, np.nan, None or pandas.NA, default=np.nan + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + can be set to either `np.nan` or `pd.NA`. + + strategy : str or Callable, default='mean' + The imputation strategy. + + - If "mean", then replace missing values using the mean along + each column. Can only be used with numeric data. + - If "median", then replace missing values using the median along + each column. Can only be used with numeric data. + - If "most_frequent", then replace missing using the most frequent + value along each column. Can be used with strings or numeric data. + If there is more than one such value, only the smallest is returned. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. 
+ - If an instance of Callable, then replace missing values using the + scalar statistic returned by running the callable over a dense 1d + array containing non-missing values of each column. + + .. versionadded:: 0.20 + strategy="constant" for fixed value imputation. + + .. versionadded:: 1.5 + strategy=callable for custom value imputation. + + fill_value : str or numerical value, default=None + When strategy == "constant", `fill_value` is used to replace all + occurrences of missing_values. For string or object data types, + `fill_value` must be a string. + If `None`, `fill_value` will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + copy : bool, default=True + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, in the following cases, + a new copy will always be made, even if `copy=False`: + + - If `X` is not an array of floating values; + - If `X` is encoded as a CSR matrix; + - If `add_indicator=True`. + + add_indicator : bool, default=False + If True, a :class:`MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + keep_empty_features : bool, default=False + If True, features that consist exclusively of missing values when + `fit` is called are returned in results when `transform` is called. + The imputed value is always `0` except when `strategy="constant"` + in which case `fill_value` will be used instead. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.6 + Currently, when `keep_empty_feature=False` and `strategy="constant"`, + empty features are not dropped. This behaviour will change in version + 1.8. Set `keep_empty_feature=True` to preserve this behaviour. + + Attributes + ---------- + statistics_ : array of shape (n_features,) + The imputation fill value for each feature. + Computing statistics can result in `np.nan` values. + During :meth:`transform`, features corresponding to `np.nan` + statistics will be discarded. + + indicator_ : :class:`~sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + `None` if `add_indicator=False`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + IterativeImputer : Multivariate imputer that estimates values to impute for + each feature with missing values from all the others. + KNNImputer : Multivariate imputer that estimates missing features using + nearest samples. + + Notes + ----- + Columns which only contained missing values at :meth:`fit` are discarded + upon :meth:`transform` if strategy is not `"constant"`. + + In a prediction context, simple imputation usually performs poorly when + associated with a weak learner. However, with a powerful learner, it can + lead to as good or better performance than complex imputation such as + :class:`~sklearn.impute.IterativeImputer` or :class:`~sklearn.impute.KNNImputer`. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + SimpleImputer() + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + """ + + _parameter_constraints: dict = { + **_BaseImputer._parameter_constraints, + "strategy": [ + StrOptions({"mean", "median", "most_frequent", "constant"}), + callable, + ], + "fill_value": "no_validation", # any object is valid + "copy": ["boolean"], + } + + def __init__( + self, + *, + missing_values=np.nan, + strategy="mean", + fill_value=None, + copy=True, + add_indicator=False, + keep_empty_features=False, + ): + super().__init__( + missing_values=missing_values, + add_indicator=add_indicator, + keep_empty_features=keep_empty_features, + ) + self.strategy = strategy + self.fill_value = fill_value + self.copy = copy + + def _validate_input(self, X, in_fit): + if self.strategy in ("most_frequent", "constant"): + # If input is a list of strings, dtype = object. + # Otherwise ValueError is raised in SimpleImputer + # with strategy='most_frequent' or 'constant' + # because the list is converted to Unicode numpy array + if isinstance(X, list) and any( + isinstance(elem, str) for row in X for elem in row + ): + dtype = object + else: + dtype = None + else: + dtype = FLOAT_DTYPES + + if not in_fit and self._fit_dtype.kind == "O": + # Use object dtype if fitted on object dtypes + dtype = self._fit_dtype + + if is_pandas_na(self.missing_values) or is_scalar_nan(self.missing_values): + ensure_all_finite = "allow-nan" + else: + ensure_all_finite = True + + try: + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse="csc", + dtype=dtype, + force_writeable=True if not in_fit else None, + ensure_all_finite=ensure_all_finite, + copy=self.copy, + ) + except ValueError as ve: + if "could not convert" in str(ve): + new_ve = ValueError( + "Cannot use {} strategy with non-numeric data:\n{}".format( + self.strategy, ve + ) + ) + raise new_ve from None + else: + raise ve + + if in_fit: + # Use the dtype seen in `fit` for non-`fit` conversion + self._fit_dtype = X.dtype + + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError( + "SimpleImputer does not support data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype) + ) + + if sp.issparse(X) and self.missing_values == 0: + # missing_values = 0 not allowed with sparse data as it would + # force densification + raise ValueError( + "Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead." + ) + + if self.strategy == "constant": + if in_fit and self.fill_value is not None: + fill_value_dtype = type(self.fill_value) + err_msg = ( + f"fill_value={self.fill_value!r} (of type {fill_value_dtype!r}) " + f"cannot be cast to the input data that is {X.dtype!r}. " + "If fill_value is a Python scalar, instead pass a numpy scalar " + "(e.g. fill_value=np.uint8(0) if your data is of type np.uint8). " + "Make sure that both dtypes are of the same kind." 
+ ) + elif not in_fit: + fill_value_dtype = self.statistics_.dtype + err_msg = ( + f"The dtype of the filling value (i.e. {fill_value_dtype!r}) " + f"cannot be cast to the input data that is {X.dtype!r}. " + "Make sure that the dtypes of the input data are of the same kind " + "between fit and transform." + ) + else: + # By default, fill_value=None, and the replacement is always + # compatible with the input data + fill_value_dtype = X.dtype + + # Make sure we can safely cast fill_value dtype to the input data dtype + if not np.can_cast(fill_value_dtype, X.dtype, casting="same_kind"): + raise ValueError(err_msg) + + return X + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the imputer on `X`. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = self._validate_input(X, in_fit=True) + + # default fill_value is 0 for numerical input and "missing_value" + # otherwise + if self.fill_value is None: + if X.dtype.kind in ("i", "u", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value + + if sp.issparse(X): + self.statistics_ = self._sparse_fit( + X, self.strategy, self.missing_values, fill_value + ) + else: + self.statistics_ = self._dense_fit( + X, self.strategy, self.missing_values, fill_value + ) + + return self + + def _sparse_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on sparse data.""" + missing_mask = _get_mask(X, missing_values) + mask_data = missing_mask.data + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + + statistics = np.empty(X.shape[1]) + + if strategy == "constant": + # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic + # for empty features to drop them later. + if not self.keep_empty_features and any( + [all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])] + ): + warnings.warn( + "Currently, when `keep_empty_feature=False` and " + '`strategy="constant"`, empty features are not dropped. ' + "This behaviour will change in version 1.8. Set " + "`keep_empty_feature=True` to preserve this behaviour.", + FutureWarning, + ) + + # for constant strategy, self.statistics_ is used to store + # fill_value in each column + statistics.fill(fill_value) + else: + for i in range(X.shape[1]): + column = X.data[X.indptr[i] : X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i] : X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if len(column) == 0 and self.keep_empty_features: + # in case we want to keep columns with only missing values. 
+ statistics[i] = 0 + else: + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, 0, n_zeros) + + elif isinstance(strategy, Callable): + statistics[i] = self.strategy(column) + + super()._fit_indicator(missing_mask) + + return statistics + + def _dense_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on dense data.""" + missing_mask = _get_mask(X, missing_values) + masked_X = ma.masked_array(X, mask=missing_mask) + + super()._fit_indicator(missing_mask) + + # Mean + if strategy == "mean": + mean_masked = np.ma.mean(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + mean = np.ma.getdata(mean_masked) + mean[np.ma.getmask(mean_masked)] = 0 if self.keep_empty_features else np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = np.ma.median(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + median = np.ma.getdata(median_masked) + median[np.ma.getmaskarray(median_masked)] = ( + 0 if self.keep_empty_features else np.nan + ) + + return median + + # Most frequent + elif strategy == "most_frequent": + # Avoid use of scipy.stats.mstats.mode due to the required + # additional overhead and slow benchmarking performance. + # See Issue 14325 and PR 14399 for full discussion. + + # To be able access the elements by columns + X = X.transpose() + mask = missing_mask.transpose() + + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = np.logical_not(row_mask).astype(bool) + row = row[row_mask] + if len(row) == 0 and self.keep_empty_features: + most_frequent[i] = 0 + else: + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + # Constant + elif strategy == "constant": + # TODO(1.8): Remove FutureWarning and add `np.nan` as a statistic + # for empty features to drop them later. + if not self.keep_empty_features and ma.getmask(masked_X).all(axis=0).any(): + warnings.warn( + "Currently, when `keep_empty_feature=False` and " + '`strategy="constant"`, empty features are not dropped. ' + "This behaviour will change in version 1.8. Set " + "`keep_empty_feature=True` to preserve this behaviour.", + FutureWarning, + ) + + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + return np.full(X.shape[1], fill_value, dtype=X.dtype) + + # Custom + elif isinstance(strategy, Callable): + statistics = np.empty(masked_X.shape[1]) + for i in range(masked_X.shape[1]): + statistics[i] = self.strategy(masked_X[:, i].compressed()) + return statistics + + def transform(self, X): + """Impute all missing values in `X`. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + X_imputed : {ndarray, sparse matrix} of shape \ + (n_samples, n_features_out) + `X` with imputed values. 
+ """ + check_is_fitted(self) + + X = self._validate_input(X, in_fit=False) + statistics = self.statistics_ + + if X.shape[1] != statistics.shape[0]: + raise ValueError( + "X has %d features per sample, expected %d" + % (X.shape[1], self.statistics_.shape[0]) + ) + + # compute mask before eliminating invalid features + missing_mask = _get_mask(X, self.missing_values) + + # Decide whether to keep missing features + if self.strategy == "constant" or self.keep_empty_features: + valid_statistics = statistics + valid_statistics_indexes = None + else: + # same as np.isnan but also works for object dtypes + invalid_mask = _get_mask(statistics, np.nan) + valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) + + if invalid_mask.any(): + invalid_features = np.arange(X.shape[1])[invalid_mask] + # use feature names warning if features are provided + if hasattr(self, "feature_names_in_"): + invalid_features = self.feature_names_in_[invalid_features] + warnings.warn( + "Skipping features without any observed values:" + f" {invalid_features}. At least one non-missing value is needed" + f" for imputation with strategy='{self.strategy}'." + ) + X = X[:, valid_statistics_indexes] + + # Do actual imputation + if sp.issparse(X): + if self.missing_values == 0: + raise ValueError( + "Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead." + ) + else: + # if no invalid statistics are found, use the mask computed + # before, else recompute mask + if valid_statistics_indexes is None: + mask = missing_mask.data + else: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat( + np.arange(len(X.indptr) - 1, dtype=int), np.diff(X.indptr) + )[mask] + + X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False) + else: + # use mask computed before eliminating invalid mask + if valid_statistics_indexes is None: + mask_valid_features = missing_mask + else: + mask_valid_features = missing_mask[:, valid_statistics_indexes] + n_missing = np.sum(mask_valid_features, axis=0) + values = np.repeat(valid_statistics, n_missing) + coordinates = np.where(mask_valid_features.transpose())[::-1] + + X[coordinates] = values + + X_indicator = super()._transform_indicator(missing_mask) + + return super()._concatenate_indicator(X, X_indicator) + + def inverse_transform(self, X): + """Convert the data back to the original representation. + + Inverts the `transform` operation performed on an array. + This operation can only be performed after :class:`SimpleImputer` is + instantiated with `add_indicator=True`. + + Note that `inverse_transform` can only invert the transform in + features that have binary indicators for missing values. If a feature + has no missing values at `fit` time, the feature won't have a binary + indicator, and the imputation done at `transform` time won't be + inverted. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : array-like of shape \ + (n_samples, n_features + n_features_missing_indicator) + The imputed data to be reverted to original data. It has to be + an augmented array of imputed data and the missing indicator mask. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + The original `X` with missing values as it was prior + to imputation. 
+ """ + check_is_fitted(self) + + if not self.add_indicator: + raise ValueError( + "'inverse_transform' works only when " + "'SimpleImputer' is instantiated with " + "'add_indicator=True'. " + f"Got 'add_indicator={self.add_indicator}' " + "instead." + ) + + n_features_missing = len(self.indicator_.features_) + non_empty_feature_count = X.shape[1] - n_features_missing + array_imputed = X[:, :non_empty_feature_count].copy() + missing_mask = X[:, non_empty_feature_count:].astype(bool) + + n_features_original = len(self.statistics_) + shape_original = (X.shape[0], n_features_original) + X_original = np.zeros(shape_original) + X_original[:, self.indicator_.features_] = missing_mask + full_mask = X_original.astype(bool) + + imputed_idx, original_idx = 0, 0 + while imputed_idx < len(array_imputed.T): + if not np.all(X_original[:, original_idx]): + X_original[:, original_idx] = array_imputed.T[imputed_idx] + imputed_idx += 1 + original_idx += 1 + else: + original_idx += 1 + + X_original[full_mask] = self.missing_values + return X_original + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.allow_nan = is_pandas_na(self.missing_values) or is_scalar_nan( + self.missing_values + ) + return tags + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + non_missing_mask = np.logical_not(_get_mask(self.statistics_, np.nan)) + names = input_features[non_missing_mask] + return self._concatenate_indicator_feature_names_out(names, input_features) + + +class MissingIndicator(TransformerMixin, BaseEstimator): + """Binary indicators for missing values. + + Note that this component typically should not be used in a vanilla + :class:`~sklearn.pipeline.Pipeline` consisting of transformers and a + classifier, but rather could be added using a + :class:`~sklearn.pipeline.FeatureUnion` or + :class:`~sklearn.compose.ColumnTransformer`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + missing_values : int, float, str, np.nan or None, default=np.nan + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. + + features : {'missing-only', 'all'}, default='missing-only' + Whether the imputer mask should represent all or a subset of + features. + + - If `'missing-only'` (default), the imputer mask will only represent + features containing missing values during fit time. + - If `'all'`, the imputer mask will represent all features. + + sparse : bool or 'auto', default='auto' + Whether the imputer mask format should be sparse or dense. 
+ + - If `'auto'` (default), the imputer mask will be of same type as + input. + - If `True`, the imputer mask will be a sparse matrix. + - If `False`, the imputer mask will be a numpy array. + + error_on_new : bool, default=True + If `True`, :meth:`transform` will raise an error when there are + features with missing values that have no missing values in + :meth:`fit`. This is applicable only when `features='missing-only'`. + + Attributes + ---------- + features_ : ndarray of shape (n_missing_features,) or (n_features,) + The features indices which will be returned when calling + :meth:`transform`. They are computed during :meth:`fit`. If + `features='all'`, `features_` is equal to `range(n_features)`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SimpleImputer : Univariate imputation of missing values. + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import MissingIndicator + >>> X1 = np.array([[np.nan, 1, 3], + ... [4, 0, np.nan], + ... [8, 1, 0]]) + >>> X2 = np.array([[5, 1, np.nan], + ... [np.nan, 2, 3], + ... [2, 4, 0]]) + >>> indicator = MissingIndicator() + >>> indicator.fit(X1) + MissingIndicator() + >>> X2_tr = indicator.transform(X2) + >>> X2_tr + array([[False, True], + [ True, False], + [False, False]]) + """ + + _parameter_constraints: dict = { + "missing_values": [MissingValues()], + "features": [StrOptions({"missing-only", "all"})], + "sparse": ["boolean", StrOptions({"auto"})], + "error_on_new": ["boolean"], + } + + def __init__( + self, + *, + missing_values=np.nan, + features="missing-only", + sparse="auto", + error_on_new=True, + ): + self.missing_values = missing_values + self.features = features + self.sparse = sparse + self.error_on_new = error_on_new + + def _get_missing_features_info(self, X): + """Compute the imputer mask and the indices of the features + containing missing values. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input data with missing values. Note that `X` has been + checked in :meth:`fit` and :meth:`transform` before to call this + function. + + Returns + ------- + imputer_mask : {ndarray, sparse matrix} of shape \ + (n_samples, n_features) + The imputer mask of the original data. + + features_with_missing : ndarray of shape (n_features_with_missing) + The features containing missing values. + """ + if not self._precomputed: + imputer_mask = _get_mask(X, self.missing_values) + else: + imputer_mask = X + + if sp.issparse(X): + imputer_mask.eliminate_zeros() + + if self.features == "missing-only": + # count number of True values in each row. 
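+            # i.e. the number of missing entries per feature (axis=0 sums over samples)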
+ n_missing = imputer_mask.sum(axis=0) + + if self.sparse is False: + imputer_mask = imputer_mask.toarray() + elif imputer_mask.format == "csr": + imputer_mask = imputer_mask.tocsc() + else: + if not self._precomputed: + imputer_mask = _get_mask(X, self.missing_values) + else: + imputer_mask = X + + if self.features == "missing-only": + n_missing = imputer_mask.sum(axis=0) + + if self.sparse is True: + imputer_mask = sp.csc_matrix(imputer_mask) + + if self.features == "all": + features_indices = np.arange(X.shape[1]) + else: + features_indices = np.flatnonzero(n_missing) + + return imputer_mask, features_indices + + def _validate_input(self, X, in_fit): + if not is_scalar_nan(self.missing_values): + ensure_all_finite = True + else: + ensure_all_finite = "allow-nan" + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse=("csc", "csr"), + dtype=None, + ensure_all_finite=ensure_all_finite, + ) + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError( + "MissingIndicator does not support data with " + "dtype {0}. Please provide either a numeric array" + " (with a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype) + ) + + if sp.issparse(X) and self.missing_values == 0: + # missing_values = 0 not allowed with sparse data as it would + # force densification + raise ValueError( + "Sparse input with missing_values=0 is " + "not supported. Provide a dense " + "array instead." + ) + + return X + + def _fit(self, X, y=None, precomputed=False): + """Fit the transformer on `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + If `precomputed=True`, then `X` is a mask of the input data. + + precomputed : bool + Whether the input data is a mask. + + Returns + ------- + imputer_mask : {ndarray, sparse matrix} of shape (n_samples, \ + n_features) + The imputer mask of the original data. + """ + if precomputed: + if not (hasattr(X, "dtype") and X.dtype.kind == "b"): + raise ValueError("precomputed is True but the input data is not a mask") + self._precomputed = True + else: + self._precomputed = False + + # Need not validate X again as it would have already been validated + # in the Imputer calling MissingIndicator + if not self._precomputed: + X = self._validate_input(X, in_fit=True) + else: + # only create `n_features_in_` in the precomputed case + _check_n_features(self, X, reset=True) + + self._n_features = X.shape[1] + + missing_features_info = self._get_missing_features_info(X) + self.features_ = missing_features_info[1] + + return missing_features_info[0] + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the transformer on `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + self._fit(X, y) + + return self + + def transform(self, X): + """Generate missing values indicator for `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data to complete. 
+ + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_features_with_missing) + The missing indicator for input data. The data type of `Xt` + will be boolean. + """ + check_is_fitted(self) + + # Need not validate X again as it would have already been validated + # in the Imputer calling MissingIndicator + if not self._precomputed: + X = self._validate_input(X, in_fit=False) + else: + if not (hasattr(X, "dtype") and X.dtype.kind == "b"): + raise ValueError("precomputed is True but the input data is not a mask") + + imputer_mask, features = self._get_missing_features_info(X) + + if self.features == "missing-only": + features_diff_fit_trans = np.setdiff1d(features, self.features_) + if self.error_on_new and features_diff_fit_trans.size > 0: + raise ValueError( + "The features {} have missing values " + "in transform but have no missing values " + "in fit.".format(features_diff_fit_trans) + ) + + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] + + return imputer_mask + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Generate missing values indicator for `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data to complete. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_features_with_missing) + The missing indicator for input data. The data type of `Xt` + will be boolean. + """ + imputer_mask = self._fit(X, y) + + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] + + return imputer_mask + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + prefix = self.__class__.__name__.lower() + return np.asarray( + [ + f"{prefix}_{feature_name}" + for feature_name in input_features[self.features_] + ], + dtype=object, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.string = True + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = [] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py b/.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py new file mode 100644 index 0000000000000000000000000000000000000000..ddae5373c5460891467d50fd7105473031d957b4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py @@ -0,0 +1,1030 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections import namedtuple +from numbers import Integral, Real +from time import time + +import numpy as np +from scipy import stats + +from ..base import _fit_context, clone +from ..exceptions import ConvergenceWarning +from ..preprocessing import normalize +from ..utils import _safe_indexing, check_array, check_random_state +from ..utils._indexing import _safe_assign +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + process_routing, +) +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import SimpleImputer, _BaseImputer, _check_inputs_dtype + +_ImputerTriplet = namedtuple( + "_ImputerTriplet", ["feat_idx", "neighbor_feat_idx", "estimator"] +) + + +def _assign_where(X1, X2, cond): + """Assign X2 to X1 where cond is True. + + Parameters + ---------- + X1 : ndarray or dataframe of shape (n_samples, n_features) + Data. + + X2 : ndarray of shape (n_samples, n_features) + Data to be assigned. + + cond : ndarray of shape (n_samples, n_features) + Boolean mask to assign data. + """ + if hasattr(X1, "mask"): # pandas dataframes + X1.mask(cond=cond, other=X2, inplace=True) + else: # ndarrays + X1[cond] = X2[cond] + + +class IterativeImputer(_BaseImputer): + """Multivariate imputer that estimates each feature from all the others. + + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.21 + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import `enable_iterative_imputer`:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from sklearn.impute + >>> from sklearn.impute import IterativeImputer + + Parameters + ---------- + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If `sample_posterior=True`, the estimator must support + `return_std` in its `predict` method. + + missing_values : int or np.nan, default=np.nan + The placeholder for the missing values. 
All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. + + sample_posterior : bool, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted estimator for each imputation. Estimator must support + `return_std` in its `predict` method if set to `True`. Set to + `True` if using `IterativeImputer` for multiple imputations. + + max_iter : int, default=10 + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. The stopping criterion + is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`, + where `X_t` is `X` at iteration `t`. Note that early stopping is only + applied if `sample_posterior=False`. + + tol : float, default=1e-3 + Tolerance of the stopping condition. + + n_nearest_features : int, default=None + Number of other features to use to estimate the missing values of + each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If `None`, all features will be used. + + initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \ + default='mean' + Which strategy to use to initialize the missing values. Same as the + `strategy` parameter in :class:`~sklearn.impute.SimpleImputer`. + + fill_value : str or numerical value, default=None + When `strategy="constant"`, `fill_value` is used to replace all + occurrences of missing_values. For string or object data types, + `fill_value` must be a string. + If `None`, `fill_value` will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + .. versionadded:: 1.3 + + imputation_order : {'ascending', 'descending', 'roman', 'arabic', \ + 'random'}, default='ascending' + The order in which the features will be imputed. Possible values: + + - `'ascending'`: From features with fewest missing values to most. + - `'descending'`: From features with most missing values to fewest. + - `'roman'`: Left to right. + - `'arabic'`: Right to left. + - `'random'`: A random order for each round. + + skip_complete : bool, default=False + If `True` then features with missing values during :meth:`transform` + which did not have any missing values during :meth:`fit` will be + imputed with the initial imputation method only. Set to `True` if you + have many features with no missing values at both :meth:`fit` and + :meth:`transform` time to save compute. + + min_value : float or array-like of shape (n_features,), default=-np.inf + Minimum possible imputed value. Broadcast to shape `(n_features,)` if + scalar. If array-like, expects shape `(n_features,)`, one min value for + each feature. The default is `-np.inf`. + + .. versionchanged:: 0.23 + Added support for array-like. + + max_value : float or array-like of shape (n_features,), default=np.inf + Maximum possible imputed value. Broadcast to shape `(n_features,)` if + scalar. If array-like, expects shape `(n_features,)`, one max value for + each feature. 
The default is `np.inf`. + + .. versionchanged:: 0.23 + Added support for array-like. + + verbose : int, default=0 + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. + + random_state : int, RandomState instance or None, default=None + The seed of the pseudo random number generator to use. Randomizes + selection of estimator features if `n_nearest_features` is not `None`, + the `imputation_order` if `random`, and the sampling from posterior if + `sample_posterior=True`. Use an integer for determinism. + See :term:`the Glossary `. + + add_indicator : bool, default=False + If `True`, a :class:`MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + keep_empty_features : bool, default=False + If True, features that consist exclusively of missing values when + `fit` is called are returned in results when `transform` is called. + The imputed value is always `0` except when + `initial_strategy="constant"` in which case `fill_value` will be + used instead. + + .. versionadded:: 1.2 + + Attributes + ---------- + initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where + `feat_idx` is the current feature to be imputed, + `neighbor_feat_idx` is the array of other features used to impute the + current feature, and `estimator` is the trained estimator used for + the imputation. Length is `self.n_features_with_missing_ * + self.n_iter_`. + + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + `self.max_iter` if early stopping criterion was reached. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_with_missing_ : int + Number of features with missing values. + + indicator_ : :class:`~sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + `None` if `add_indicator=False`. + + random_state_ : RandomState instance + RandomState instance that is generated either from a seed, the random + number generator or by `np.random`. + + See Also + -------- + SimpleImputer : Univariate imputer for completing missing values + with simple strategies. + KNNImputer : Multivariate imputer that estimates missing features using + nearest samples. + + Notes + ----- + To support imputation in inductive mode we store each feature's estimator + during the :meth:`fit` phase, and predict without refitting (in order) + during the :meth:`transform` phase. + + Features which contain all missing values at :meth:`fit` are discarded upon + :meth:`transform`. + + Using defaults, the imputer scales in :math:`\\mathcal{O}(knp^3\\min(n,p))` + where :math:`k` = `max_iter`, :math:`n` the number of samples and + :math:`p` the number of features. It thus becomes prohibitively costly when + the number of features increases. 
Setting + `n_nearest_features << n_features`, `skip_complete=True` or increasing `tol` + can help to reduce its computational cost. + + Depending on the nature of missing values, simple imputers can be + preferable in a prediction context. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.experimental import enable_iterative_imputer + >>> from sklearn.impute import IterativeImputer + >>> imp_mean = IterativeImputer(random_state=0) + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + IterativeImputer(random_state=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> imp_mean.transform(X) + array([[ 6.9584, 2. , 3. ], + [ 4. , 2.6000, 6. ], + [10. , 4.9999, 9. ]]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py` or + :ref:`sphx_glr_auto_examples_impute_plot_iterative_imputer_variants_comparison.py`. + """ + + _parameter_constraints: dict = { + **_BaseImputer._parameter_constraints, + "estimator": [None, HasMethods(["fit", "predict"])], + "sample_posterior": ["boolean"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "n_nearest_features": [None, Interval(Integral, 1, None, closed="left")], + "initial_strategy": [ + StrOptions({"mean", "median", "most_frequent", "constant"}) + ], + "fill_value": "no_validation", # any object is valid + "imputation_order": [ + StrOptions({"ascending", "descending", "roman", "arabic", "random"}) + ], + "skip_complete": ["boolean"], + "min_value": [None, Interval(Real, None, None, closed="both"), "array-like"], + "max_value": [None, Interval(Real, None, None, closed="both"), "array-like"], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator=None, + *, + missing_values=np.nan, + sample_posterior=False, + max_iter=10, + tol=1e-3, + n_nearest_features=None, + initial_strategy="mean", + fill_value=None, + imputation_order="ascending", + skip_complete=False, + min_value=-np.inf, + max_value=np.inf, + verbose=0, + random_state=None, + add_indicator=False, + keep_empty_features=False, + ): + super().__init__( + missing_values=missing_values, + add_indicator=add_indicator, + keep_empty_features=keep_empty_features, + ) + + self.estimator = estimator + self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.fill_value = fill_value + self.imputation_order = imputation_order + self.skip_complete = skip_complete + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + + def _impute_one_feature( + self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True, + params=None, + ): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. 
The `estimator` must + support `return_std=True` in its `predict` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing `feat_idx`. + + estimator : object + The estimator to use at this step of the round-robin imputation. + If `sample_posterior=True`, the estimator must support + `return_std` in its `predict` method. + If None, it will be cloned from self._estimator. + + fit_mode : boolean, default=True + Whether to fit and predict with the estimator or just predict. + + params : dict + Additional params routed to the individual estimator. + + Returns + ------- + X_filled : ndarray + Input data with `X_filled[missing_row_mask, feat_idx]` updated. + + estimator : estimator with sklearn API + The fitted estimator used to impute + `X_filled[missing_row_mask, feat_idx]`. + """ + if estimator is None and fit_mode is False: + raise ValueError( + "If fit_mode is False, then an already-fitted " + "estimator should be passed in." + ) + + if estimator is None: + estimator = clone(self._estimator) + + missing_row_mask = mask_missing_values[:, feat_idx] + if fit_mode: + X_train = _safe_indexing( + _safe_indexing(X_filled, neighbor_feat_idx, axis=1), + ~missing_row_mask, + axis=0, + ) + y_train = _safe_indexing( + _safe_indexing(X_filled, feat_idx, axis=1), + ~missing_row_mask, + axis=0, + ) + estimator.fit(X_train, y_train, **params) + + # if no missing values, don't predict + if np.sum(missing_row_mask) == 0: + return X_filled, estimator + + # get posterior samples if there is at least one missing value + X_test = _safe_indexing( + _safe_indexing(X_filled, neighbor_feat_idx, axis=1), + missing_row_mask, + axis=0, + ) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas + # (2) mus outside legal range of min_value and max_value + # (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value[feat_idx] + imputed_values[mus_too_low] = self._min_value[feat_idx] + mus_too_high = mus > self._max_value[feat_idx] + imputed_values[mus_too_high] = self._max_value[feat_idx] + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value[feat_idx] - mus) / sigmas + b = (self._max_value[feat_idx] - mus) / sigmas + + truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_ + ) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip( + imputed_values, self._min_value[feat_idx], self._max_value[feat_idx] + ) + + # update the feature + _safe_assign( + X_filled, + imputed_values, + row_indexer=missing_row_mask, + column_indexer=feat_idx, + ) + return X_filled, estimator + + def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat): + """Get a list of other features to predict `feat_idx`. 
+ + If `self.n_nearest_features` is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between `feat_idx` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in `X`. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of `X`. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute `feat_idx`. + """ + if self.n_nearest_features is not None and self.n_nearest_features < n_features: + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, p=p + ) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where `n_samples` is the + number of samples and `n_features` is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + if self.skip_complete: + missing_values_idx = np.flatnonzero(frac_of_missing_values) + else: + missing_values_idx = np.arange(np.shape(frac_of_missing_values)[0]) + if self.imputation_order == "roman": + ordered_idx = missing_values_idx + elif self.imputation_order == "arabic": + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == "ascending": + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:] + elif self.imputation_order == "descending": + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, kind="mergesort")[n:][::-1] + elif self.imputation_order == "random": + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, default=1e-6 + `abs_corr_mat` can have nans, which will be replaced + with `tolerance`. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of `X` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. 
+ """ + n_features = X_filled.shape[1] + if self.n_nearest_features is None or self.n_nearest_features >= n_features: + return None + with np.errstate(invalid="ignore"): + # if a feature in the neighborhood has only a single value + # (e.g., categorical feature), the std. dev. will be null and + # np.corrcoef will raise a warning due to a division by zero + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm="l1", axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X, in_fit=False): + """Perform initial imputation for input `X`. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + in_fit : bool, default=False + Whether function is called in :meth:`fit`. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + X_filled : ndarray of shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray of shape (n_samples, n_features) + Input data's missing indicator matrix, where `n_samples` is the + number of samples and `n_features` is the number of features, + masked by non-missing features. + + X_missing_mask : ndarray, shape (n_samples, n_features) + Input data's mask matrix indicating missing datapoints, where + `n_samples` is the number of samples and `n_features` is the + number of features. + """ + if is_scalar_nan(self.missing_values): + ensure_all_finite = "allow-nan" + else: + ensure_all_finite = True + + X = validate_data( + self, + X, + dtype=FLOAT_DTYPES, + order="F", + reset=in_fit, + ensure_all_finite=ensure_all_finite, + ) + _check_inputs_dtype(X, self.missing_values) + + X_missing_mask = _get_mask(X, self.missing_values) + mask_missing_values = X_missing_mask.copy() + + # TODO (1.8): remove this once the deprecation is removed. In the meantime, + # we need to catch the warning to avoid false positives. + catch_warning = ( + self.initial_strategy == "constant" and not self.keep_empty_features + ) + + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy, + fill_value=self.fill_value, + keep_empty_features=self.keep_empty_features, + ).set_output(transform="default") + + # TODO (1.8): remove this once the deprecation is removed to keep only + # the code in the else case. + if catch_warning: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.fit_transform(X) + else: + # TODO (1.8): remove this once the deprecation is removed to keep only + # the code in the else case. 
+ if catch_warning: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + X_filled = self.initial_imputer_.transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + if in_fit: + self._is_empty_feature = np.all(mask_missing_values, axis=0) + + if not self.keep_empty_features: + # drop empty features + Xt = X[:, ~self._is_empty_feature] + mask_missing_values = mask_missing_values[:, ~self._is_empty_feature] + + if self.initial_imputer_.get_params()["strategy"] == "constant": + # The constant strategy has a specific behavior and preserve empty + # features even with ``keep_empty_features=False``. We need to drop + # the column for consistency. + # TODO (1.8): remove this `if` branch once the following issue is + # addressed: + # https://github.com/scikit-learn/scikit-learn/issues/29827 + X_filled = X_filled[:, ~self._is_empty_feature] + + else: + # mark empty features as not missing and keep the original + # imputation + mask_missing_values[:, self._is_empty_feature] = False + Xt = X + Xt[:, self._is_empty_feature] = X_filled[:, self._is_empty_feature] + + return Xt, X_filled, mask_missing_values, X_missing_mask + + @staticmethod + def _validate_limit( + limit, limit_type, n_features, is_empty_feature, keep_empty_feature + ): + """Validate the limits (min/max) of the feature values. + + Converts scalar min/max limits to vectors of shape `(n_features,)`. + + Parameters + ---------- + limit: scalar or array-like + The user-specified limit (i.e, min_value or max_value). + limit_type: {'max', 'min'} + Type of limit to validate. + n_features: int + Number of features in the dataset. + is_empty_feature: ndarray, shape (n_features, ) + Mask array indicating empty feature imputer has seen during fit. + keep_empty_feature: bool + If False, remove empty-feature indices from the limit. + + Returns + ------- + limit: ndarray, shape(n_features,) + Array of limits, one for each feature. + """ + n_features_in = _num_samples(is_empty_feature) + if ( + limit is not None + and not np.isscalar(limit) + and _num_samples(limit) != n_features_in + ): + raise ValueError( + f"'{limit_type}_value' should be of shape ({n_features_in},) when an" + f" array-like is provided. Got {len(limit)}, instead." + ) + + limit_bound = np.inf if limit_type == "max" else -np.inf + limit = limit_bound if limit is None else limit + if np.isscalar(limit): + limit = np.full(n_features, limit) + limit = check_array(limit, ensure_all_finite=False, copy=False, ensure_2d=False) + + # Make sure to remove the empty feature elements from the bounds + if not keep_empty_feature and len(limit) == len(is_empty_feature): + limit = limit[~is_empty_feature] + + return limit + + @_fit_context( + # IterativeImputer.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None, **params): + """Fit the imputer on `X` and return the transformed `X`. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + **params : dict + Parameters routed to the `fit` method of the sub-estimator via the + metadata routing API. + + .. versionadded:: 1.5 + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. 
+ + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + _raise_for_params(params, self, "fit") + + routed_params = process_routing( + self, + "fit", + **params, + ) + + self.random_state_ = getattr( + self, "random_state_", check_random_state(self.random_state) + ) + + if self.estimator is None: + from ..linear_model import BayesianRidge + + self._estimator = BayesianRidge() + else: + self._estimator = clone(self.estimator) + + self.imputation_sequence_ = [] + + self.initial_imputer_ = None + + X, Xt, mask_missing_values, complete_mask = self._initial_imputation( + X, in_fit=True + ) + + super()._fit_indicator(complete_mask) + X_indicator = super()._transform_indicator(complete_mask) + + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 + return super()._concatenate_indicator(Xt, X_indicator) + + # Edge case: a single feature, we return the initial imputation. + if Xt.shape[1] == 1: + self.n_iter_ = 0 + return super()._concatenate_indicator(Xt, X_indicator) + + self._min_value = self._validate_limit( + self.min_value, + "min", + X.shape[1], + self._is_empty_feature, + self.keep_empty_features, + ) + self._max_value = self._validate_limit( + self.max_value, + "max", + X.shape[1], + self._is_empty_feature, + self.keep_empty_features, + ) + + if not np.all(np.greater(self._max_value, self._min_value)): + raise ValueError("One (or more) features have min_value >= max_value.") + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. + # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) + + abs_corr_mat = self._get_abs_corr_mat(Xt) + + n_samples, n_features = Xt.shape + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) + start_t = time() + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): + if self.imputation_order == "random": + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx( + n_features, feat_idx, abs_corr_mat + ) + Xt, estimator = self._impute_one_feature( + Xt, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True, + params=routed_params.estimator.fit, + ) + estimator_triplet = _ImputerTriplet( + feat_idx, neighbor_feat_idx, estimator + ) + self.imputation_sequence_.append(estimator_triplet) + + if self.verbose > 1: + print( + "[IterativeImputer] Ending imputation round " + "%d/%d, elapsed time %0.2f" + % (self.n_iter_, self.max_iter, time() - start_t) + ) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None) + if self.verbose > 0: + print( + "[IterativeImputer] Change: {}, scaled tolerance: {} ".format( + inf_norm, normalized_tol + ) + ) + if inf_norm < normalized_tol: + if self.verbose > 0: + print("[IterativeImputer] Early stopping criterion reached.") + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn( + "[IterativeImputer] Early stopping criterion not reached.", + ConvergenceWarning, + ) + _assign_where(Xt, X, cond=~mask_missing_values) + + return super()._concatenate_indicator(Xt, X_indicator) + + def transform(self, X): + """Impute all 
missing values in `X`. + + Note that this is stochastic, and that if `random_state` is not fixed, + repeated calls, or permuted input, results will differ. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self) + + X, Xt, mask_missing_values, complete_mask = self._initial_imputation( + X, in_fit=False + ) + + X_indicator = super()._transform_indicator(complete_mask) + + if self.n_iter_ == 0 or np.all(mask_missing_values): + return super()._concatenate_indicator(Xt, X_indicator) + + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ + i_rnd = 0 + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,)) + start_t = time() + for it, estimator_triplet in enumerate(self.imputation_sequence_): + Xt, _ = self._impute_one_feature( + Xt, + mask_missing_values, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, + fit_mode=False, + ) + if not (it + 1) % imputations_per_round: + if self.verbose > 1: + print( + "[IterativeImputer] Ending imputation round " + "%d/%d, elapsed time %0.2f" + % (i_rnd + 1, self.n_iter_, time() - start_t) + ) + i_rnd += 1 + + _assign_where(Xt, X, cond=~mask_missing_values) + + return super()._concatenate_indicator(Xt, X_indicator) + + def fit(self, X, y=None, **fit_params): + """Fit the imputer on `X` and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + **fit_params : dict + Parameters routed to the `fit` method of the sub-estimator via the + metadata routing API. + + .. versionadded:: 1.5 + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + self : object + Fitted estimator. + """ + self.fit_transform(X, **fit_params) + return self + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + names = self.initial_imputer_.get_feature_names_out(input_features) + return self._concatenate_indicator_feature_names_out(names, input_features) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py b/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py new file mode 100644 index 0000000000000000000000000000000000000000..1b7ef06edc256372890cbd9cb85123239b37e3e9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py @@ -0,0 +1,411 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral + +import numpy as np + +from ..base import _fit_context +from ..metrics import pairwise_distances_chunked +from ..metrics.pairwise import _NAN_METRICS +from ..neighbors._base import _get_weights +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + check_is_fitted, + validate_data, +) +from ._base import _BaseImputer + + +class KNNImputer(_BaseImputer): + """Imputation for completing missing values using k-Nearest Neighbors. + + Each sample's missing values are imputed using the mean value from + `n_neighbors` nearest neighbors found in the training set. Two samples are + close if the features that neither is missing are close. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + missing_values : int, float, str, np.nan or None, default=np.nan + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to np.nan, since `pd.NA` will be converted to np.nan. + + n_neighbors : int, default=5 + Number of neighboring samples to use for imputation. + + weights : {'uniform', 'distance'} or callable, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood are + weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - callable : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + metric : {'nan_euclidean'} or callable, default='nan_euclidean' + Distance metric for searching neighbors. Possible values: + + - 'nan_euclidean' + - callable : a user-defined function which conforms to the definition + of ``func_metric(x, y, *, missing_values=np.nan)``. `x` and `y` + corresponds to a row (i.e. 1-D arrays) of `X` and `Y`, respectively. + The callable should returns a scalar distance value. + + copy : bool, default=True + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. + + add_indicator : bool, default=False + If True, a :class:`MissingIndicator` transform will stack onto the + output of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on the + missing indicator even if there are missing values at transform/test + time. 
+ + keep_empty_features : bool, default=False + If True, features that consist exclusively of missing values when + `fit` is called are returned in results when `transform` is called. + The imputed value is always `0`. + + .. versionadded:: 1.2 + + Attributes + ---------- + indicator_ : :class:`~sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + SimpleImputer : Univariate imputer for completing missing values + with simple strategies. + IterativeImputer : Multivariate imputer that estimates values to impute for + each feature with missing values from all the others. + + References + ---------- + * `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor + Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing + value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 + no. 6, 2001 Pages 520-525. + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import KNNImputer + >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]] + >>> imputer = KNNImputer(n_neighbors=2) + >>> imputer.fit_transform(X) + array([[1. , 2. , 4. ], + [3. , 4. , 3. ], + [5.5, 6. , 5. ], + [8. , 8. , 7. ]]) + + For a more detailed example see + :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`. + """ + + _parameter_constraints: dict = { + **_BaseImputer._parameter_constraints, + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "weights": [StrOptions({"uniform", "distance"}), callable, Hidden(None)], + "metric": [StrOptions(set(_NAN_METRICS)), callable], + "copy": ["boolean"], + } + + def __init__( + self, + *, + missing_values=np.nan, + n_neighbors=5, + weights="uniform", + metric="nan_euclidean", + copy=True, + add_indicator=False, + keep_empty_features=False, + ): + super().__init__( + missing_values=missing_values, + add_indicator=add_indicator, + keep_empty_features=keep_empty_features, + ) + self.n_neighbors = n_neighbors + self.weights = weights + self.metric = metric + self.copy = copy + + def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): + """Helper function to impute a single column. + + Parameters + ---------- + dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors) + Distance matrix between the receivers and potential donors from + training set. There must be at least one non-nan distance between + a receiver and a potential donor. + + n_neighbors : int + Number of neighbors to consider. + + fit_X_col : ndarray of shape (n_potential_donors,) + Column of potential donors from training set. + + mask_fit_X_col : ndarray of shape (n_potential_donors,) + Missing mask for fit_X_col. + + Returns + ------- + imputed_values: ndarray of shape (n_receivers,) + Imputed values for receiver. 
+ """ + # Get donors + donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[ + :, :n_neighbors + ] + + # Get weight matrix from distance matrix + donors_dist = dist_pot_donors[ + np.arange(donors_idx.shape[0])[:, None], donors_idx + ] + + weight_matrix = _get_weights(donors_dist, self.weights) + + # fill nans with zeros + if weight_matrix is not None: + weight_matrix[np.isnan(weight_matrix)] = 0.0 + else: + weight_matrix = np.ones_like(donors_dist) + weight_matrix[np.isnan(donors_dist)] = 0.0 + + # Retrieve donor values and calculate kNN average + donors = fit_X_col.take(donors_idx) + donors_mask = mask_fit_X_col.take(donors_idx) + donors = np.ma.array(donors, mask=donors_mask) + + return np.ma.average(donors, axis=1, weights=weight_matrix).data + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : array-like shape of (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + The fitted `KNNImputer` class instance. + """ + # Check data integrity and calling arguments + if not is_scalar_nan(self.missing_values): + ensure_all_finite = True + else: + ensure_all_finite = "allow-nan" + + X = validate_data( + self, + X, + accept_sparse=False, + dtype=FLOAT_DTYPES, + ensure_all_finite=ensure_all_finite, + copy=self.copy, + ) + + self._fit_X = X + self._mask_fit_X = _get_mask(self._fit_X, self.missing_values) + self._valid_mask = ~np.all(self._mask_fit_X, axis=0) + + super()._fit_indicator(self._mask_fit_X) + + return self + + def transform(self, X): + """Impute all missing values in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + X : array-like of shape (n_samples, n_output_features) + The imputed dataset. `n_output_features` is the number of features + that is not always missing during `fit`. + """ + + check_is_fitted(self) + if not is_scalar_nan(self.missing_values): + ensure_all_finite = True + else: + ensure_all_finite = "allow-nan" + X = validate_data( + self, + X, + accept_sparse=False, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite=ensure_all_finite, + copy=self.copy, + reset=False, + ) + + mask = _get_mask(X, self.missing_values) + mask_fit_X = self._mask_fit_X + valid_mask = self._valid_mask + + X_indicator = super()._transform_indicator(mask) + + # Removes columns where the training data is all nan + if not np.any(mask[:, valid_mask]): + # No missing values in X + if self.keep_empty_features: + Xc = X + Xc[:, ~valid_mask] = 0 + else: + Xc = X[:, valid_mask] + + # Even if there are no missing values in X, we still concatenate Xc + # with the missing value indicator matrix, X_indicator. + # This is to ensure that the output maintains consistency in terms + # of columns, regardless of whether missing values exist in X or not. 
+ return super()._concatenate_indicator(Xc, X_indicator) + + row_missing_idx = np.flatnonzero(mask[:, valid_mask].any(axis=1)) + + non_missing_fix_X = np.logical_not(mask_fit_X) + + # Maps from indices from X to indices in dist matrix + dist_idx_map = np.zeros(X.shape[0], dtype=int) + dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0]) + + def process_chunk(dist_chunk, start): + row_missing_chunk = row_missing_idx[start : start + len(dist_chunk)] + + # Find and impute missing by column + for col in range(X.shape[1]): + if not valid_mask[col]: + # column was all missing during training + continue + + col_mask = mask[row_missing_chunk, col] + if not np.any(col_mask): + # column has no missing values + continue + + (potential_donors_idx,) = np.nonzero(non_missing_fix_X[:, col]) + + # receivers_idx are indices in X + receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)] + + # distances for samples that needed imputation for column + dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][ + :, potential_donors_idx + ] + + # receivers with all nan distances impute with mean + all_nan_dist_mask = np.isnan(dist_subset).all(axis=1) + all_nan_receivers_idx = receivers_idx[all_nan_dist_mask] + + if all_nan_receivers_idx.size: + col_mean = np.ma.array( + self._fit_X[:, col], mask=mask_fit_X[:, col] + ).mean() + X[all_nan_receivers_idx, col] = col_mean + + if len(all_nan_receivers_idx) == len(receivers_idx): + # all receivers imputed with mean + continue + + # receivers with at least one defined distance + receivers_idx = receivers_idx[~all_nan_dist_mask] + dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][ + :, potential_donors_idx + ] + + n_neighbors = min(self.n_neighbors, len(potential_donors_idx)) + value = self._calc_impute( + dist_subset, + n_neighbors, + self._fit_X[potential_donors_idx, col], + mask_fit_X[potential_donors_idx, col], + ) + X[receivers_idx, col] = value + + # process in fixed-memory chunks + gen = pairwise_distances_chunked( + X[row_missing_idx, :], + self._fit_X, + metric=self.metric, + missing_values=self.missing_values, + ensure_all_finite=ensure_all_finite, + reduce_func=process_chunk, + ) + for chunk in gen: + # process_chunk modifies X in place. No return value. + pass + + if self.keep_empty_features: + Xc = X + Xc[:, ~valid_mask] = 0 + else: + Xc = X[:, valid_mask] + + return super()._concatenate_indicator(Xc, X_indicator) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + names = input_features[self._valid_mask] + return self._concatenate_indicator_feature_names_out(names, input_features) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1bd83f7ca9ea8adde76940e2f7fdd86d89ea5c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_base.py @@ -0,0 +1,107 @@ +import numpy as np +import pytest + +from sklearn.impute._base import _BaseImputer +from sklearn.impute._iterative import _assign_where +from sklearn.utils._mask import _get_mask +from sklearn.utils._testing import _convert_container, assert_allclose + + +@pytest.fixture +def data(): + X = np.random.randn(10, 2) + X[::2] = np.nan + return X + + +class NoFitIndicatorImputer(_BaseImputer): + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + return self._concatenate_indicator(X, self._transform_indicator(X)) + + +class NoTransformIndicatorImputer(_BaseImputer): + def fit(self, X, y=None): + mask = _get_mask(X, value_to_mask=np.nan) + super()._fit_indicator(mask) + return self + + def transform(self, X, y=None): + return self._concatenate_indicator(X, None) + + +class NoPrecomputedMaskFit(_BaseImputer): + def fit(self, X, y=None): + self._fit_indicator(X) + return self + + def transform(self, X): + return self._concatenate_indicator(X, self._transform_indicator(X)) + + +class NoPrecomputedMaskTransform(_BaseImputer): + def fit(self, X, y=None): + mask = _get_mask(X, value_to_mask=np.nan) + self._fit_indicator(mask) + return self + + def transform(self, X): + return self._concatenate_indicator(X, self._transform_indicator(X)) + + +def test_base_imputer_not_fit(data): + imputer = NoFitIndicatorImputer(add_indicator=True) + err_msg = "Make sure to call _fit_indicator before _transform_indicator" + with pytest.raises(ValueError, match=err_msg): + imputer.fit(data).transform(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +def test_base_imputer_not_transform(data): + imputer = NoTransformIndicatorImputer(add_indicator=True) + err_msg = ( + "Call _fit_indicator and _transform_indicator in the imputer implementation" + ) + with pytest.raises(ValueError, match=err_msg): + imputer.fit(data).transform(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +def test_base_no_precomputed_mask_fit(data): + imputer = NoPrecomputedMaskFit(add_indicator=True) + err_msg = "precomputed is True but the input data is not a mask" + with pytest.raises(ValueError, match=err_msg): + imputer.fit(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +def test_base_no_precomputed_mask_transform(data): + imputer = NoPrecomputedMaskTransform(add_indicator=True) + err_msg = "precomputed is True but the input data is not a mask" + imputer.fit(data) + with pytest.raises(ValueError, match=err_msg): + imputer.transform(data) + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(data) + + +@pytest.mark.parametrize("X1_type", 
["array", "dataframe"]) +def test_assign_where(X1_type): + """Check the behaviour of the private helpers `_assign_where`.""" + rng = np.random.RandomState(0) + + n_samples, n_features = 10, 5 + X1 = _convert_container(rng.randn(n_samples, n_features), constructor_name=X1_type) + X2 = rng.randn(n_samples, n_features) + mask = rng.randint(0, 2, size=(n_samples, n_features)).astype(bool) + + _assign_where(X1, X2, mask) + + if X1_type == "dataframe": + X1 = X1.to_numpy() + assert_allclose(X1[mask], X2[mask]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..afebc96ac035c4945c4084dc968323a602072066 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_common.py @@ -0,0 +1,220 @@ +import numpy as np +import pytest + +from sklearn.experimental import enable_iterative_imputer # noqa: F401 +from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def imputers(): + return [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()] + + +def sparse_imputers(): + return [SimpleImputer()] + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +def test_imputation_missing_value_in_test_array(imputer): + # [Non Regression Test for issue #13968] Missing value in test set should + # not throw an error and return a finite dataset + train = [[1], [2]] + test = [[3], [np.nan]] + imputer.set_params(add_indicator=True) + imputer.fit(train).transform(test) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("marker", [np.nan, -1, 0]) +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +def test_imputers_add_indicator(marker, imputer): + X = np.array( + [ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4], + ] + ) + X_true_indicator = np.array( + [ + [1.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 1.0], + [0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ] + ) + imputer.set_params(missing_values=marker, add_indicator=True) + + X_trans = imputer.fit_transform(X) + assert_allclose(X_trans[:, -4:], X_true_indicator) + assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3])) + + imputer.set_params(add_indicator=False) + X_trans_no_indicator = imputer.fit_transform(X) + assert_allclose(X_trans[:, :-4], X_trans_no_indicator) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("marker", [np.nan, -1]) +@pytest.mark.parametrize( + "imputer", sparse_imputers(), ids=lambda x: x.__class__.__name__ +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_imputers_add_indicator_sparse(imputer, marker, csr_container): + X = csr_container( + [ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4], + ] + ) + X_true_indicator = csr_container( + [ + [1.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 
1.0], + [0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ] + ) + imputer.set_params(missing_values=marker, add_indicator=True) + + X_trans = imputer.fit_transform(X) + assert_allclose_dense_sparse(X_trans[:, -4:], X_true_indicator) + assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3])) + + imputer.set_params(add_indicator=False) + X_trans_no_indicator = imputer.fit_transform(X) + assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip("pandas") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker) + + X = np.array( + [ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4], + ] + ) + # fit on numpy array + X_trans_expected = imputer.fit_transform(X) + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"]) + + # fit on pandas dataframe with IntegerArrays + X_trans = imputer.fit_transform(X_df) + + assert_allclose(X_trans_expected, X_trans) + + +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_feature_names_out_pandas(imputer, add_indicator): + """Check feature names out for imputers.""" + pd = pytest.importorskip("pandas") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker) + + X = np.array( + [ + [marker, 1, 5, 3, marker, 1], + [2, marker, 1, 4, marker, 2], + [6, 3, 7, marker, marker, 3], + [1, 2, 9, 8, marker, 4], + ] + ) + X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"]) + imputer.fit(X_df) + + names = imputer.get_feature_names_out() + + if add_indicator: + expected_names = [ + "a", + "b", + "c", + "d", + "f", + "missingindicator_a", + "missingindicator_b", + "missingindicator_d", + "missingindicator_e", + ] + assert_array_equal(expected_names, names) + else: + expected_names = ["a", "b", "c", "d", "f"] + assert_array_equal(expected_names, names) + + +@pytest.mark.parametrize("keep_empty_features", [True, False]) +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +def test_keep_empty_features(imputer, keep_empty_features): + """Check that the imputer keeps features with only missing values.""" + X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]]) + imputer = imputer.set_params( + add_indicator=False, keep_empty_features=keep_empty_features + ) + + for method in ["fit_transform", "transform"]: + X_imputed = getattr(imputer, method)(X) + if keep_empty_features: + assert X_imputed.shape == X.shape + else: + assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("missing_value_test", [np.nan, 1]) +def test_imputation_adds_missing_indicator_if_add_indicator_is_true( + imputer, missing_value_test +): + """Check that missing indicator always exists when add_indicator=True. + + Non-regression test for gh-26590. 
+ """ + X_train = np.array([[0, np.nan], [1, 2]]) + + # Test data where missing_value_test variable can be set to np.nan or 1. + X_test = np.array([[0, missing_value_test], [1, 2]]) + + imputer.set_params(add_indicator=True) + imputer.fit(X_train) + + X_test_imputed_with_indicator = imputer.transform(X_test) + assert X_test_imputed_with_indicator.shape == (2, 3) + + imputer.set_params(add_indicator=False) + imputer.fit(X_train) + X_test_imputed_without_indicator = imputer.transform(X_test) + assert X_test_imputed_without_indicator.shape == (2, 2) + + assert_allclose( + X_test_imputed_with_indicator[:, :-1], X_test_imputed_without_indicator + ) + if np.isnan(missing_value_test): + expected_missing_indicator = [1, 0] + else: + expected_missing_indicator = [0, 0] + + assert_allclose(X_test_imputed_with_indicator[:, -1], expected_missing_indicator) diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_impute.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_impute.py new file mode 100644 index 0000000000000000000000000000000000000000..4116964c49a7a3be21771593e60e24515a7b475c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_impute.py @@ -0,0 +1,1955 @@ +import io +import re +import warnings +from itertools import product + +import numpy as np +import pytest +from scipy import sparse +from scipy.stats import kstest + +from sklearn import tree +from sklearn.datasets import load_diabetes +from sklearn.dummy import DummyRegressor +from sklearn.exceptions import ConvergenceWarning + +# make IterativeImputer available +from sklearn.experimental import enable_iterative_imputer # noqa: F401 +from sklearn.impute import IterativeImputer, KNNImputer, MissingIndicator, SimpleImputer +from sklearn.impute._base import _most_frequent +from sklearn.linear_model import ARDRegression, BayesianRidge, RidgeCV +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline, make_union +from sklearn.random_projection import _sparse_random_matrix +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_allclose_dense_sparse, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + LIL_CONTAINERS, +) + + +def _assert_array_equal_and_same_dtype(x, y): + assert_array_equal(x, y) + assert x.dtype == y.dtype + + +def _assert_allclose_and_same_dtype(x, y): + assert_allclose(x, y) + assert x.dtype == y.dtype + + +def _check_statistics( + X, X_true, strategy, statistics, missing_values, sparse_container +): + """Utility function for testing imputation for a given strategy. 
+ + Test with dense and sparse arrays + + Check that: + - the statistics (mean, median, mode) are correct + - the missing values are imputed correctly""" + + err_msg = "Parameters: strategy = %s, missing_values = %s, sparse = {0}" % ( + strategy, + missing_values, + ) + + assert_ae = assert_array_equal + + if X.dtype.kind == "f" or X_true.dtype.kind == "f": + assert_ae = assert_array_almost_equal + + # Normal matrix + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) + X_trans = imputer.fit(X).transform(X.copy()) + assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) + + # Sparse matrix + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) + imputer.fit(sparse_container(X)) + X_trans = imputer.transform(sparse_container(X.copy())) + + if sparse.issparse(X_trans): + X_trans = X_trans.toarray() + + assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(True)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(True)) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_imputation_shape(strategy, csr_container): + # Verify the shapes of the imputed matrix for different strategies. + X = np.random.randn(10, 2) + X[::2] = np.nan + + imputer = SimpleImputer(strategy=strategy) + X_imputed = imputer.fit_transform(csr_container(X)) + assert X_imputed.shape == (10, 2) + X_imputed = imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + + iterative_imputer = IterativeImputer(initial_strategy=strategy) + X_imputed = iterative_imputer.fit_transform(X) + assert X_imputed.shape == (10, 2) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_imputation_deletion_warning(strategy): + X = np.ones((3, 5)) + X[:, 0] = np.nan + imputer = SimpleImputer(strategy=strategy).fit(X) + + with pytest.warns(UserWarning, match="Skipping"): + imputer.transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_imputation_deletion_warning_feature_names(strategy): + pd = pytest.importorskip("pandas") + + missing_values = np.nan + feature_names = np.array(["a", "b", "c", "d"], dtype=object) + X = pd.DataFrame( + [ + [missing_values, missing_values, 1, missing_values], + [4, missing_values, 2, 10], + ], + columns=feature_names, + ) + + imputer = SimpleImputer(strategy=strategy).fit(X) + + # check SimpleImputer returning feature name attribute correctly + assert_array_equal(imputer.feature_names_in_, feature_names) + + # ensure that skipped feature warning includes feature name + with pytest.warns( + UserWarning, match=r"Skipping features without any observed values: \['b'\]" + ): + imputer.transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_error_sparse_0(strategy, csc_container): + # check that error are raised when missing_values = 0 and input is sparse + X = np.ones((3, 5)) + X[0] = 0 + X = csc_container(X) + + imputer = SimpleImputer(strategy=strategy, missing_values=0) + with pytest.raises(ValueError, match="Provide a dense array"): + imputer.fit(X) + + imputer.fit(X.toarray()) + with pytest.raises(ValueError, match="Provide a dense array"): + imputer.transform(X) + + +def safe_median(arr, *args, **kwargs): + # np.median([]) raises a TypeError for numpy >= 1.10.1 + 
length = arr.size if hasattr(arr, "size") else len(arr) + return np.nan if length == 0 else np.median(arr, *args, **kwargs) + + +def safe_mean(arr, *args, **kwargs): + # np.mean([]) raises a RuntimeWarning for numpy >= 1.10.1 + length = arr.size if hasattr(arr, "size") else len(arr) + return np.nan if length == 0 else np.mean(arr, *args, **kwargs) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_mean_median(csc_container): + # Test imputation using the mean and median strategies, when + # missing_values != 0. + rng = np.random.RandomState(0) + + dim = 10 + dec = 10 + shape = (dim * dim, dim + dec) + + zeros = np.zeros(shape[0]) + values = np.arange(1, shape[0] + 1) + values[4::2] = -values[4::2] + + tests = [ + ("mean", np.nan, lambda z, v, p: safe_mean(np.hstack((z, v)))), + ("median", np.nan, lambda z, v, p: safe_median(np.hstack((z, v)))), + ] + + for strategy, test_missing_values, true_value_fun in tests: + X = np.empty(shape) + X_true = np.empty(shape) + true_statistics = np.empty(shape[1]) + + # Create a matrix X with columns + # - with only zeros, + # - with only missing values + # - with zeros, missing values and values + # And a matrix X_true containing all true values + for j in range(shape[1]): + nb_zeros = (j - dec + 1 > 0) * (j - dec + 1) * (j - dec + 1) + nb_missing_values = max(shape[0] + dec * dec - (j + dec) * (j + dec), 0) + nb_values = shape[0] - nb_zeros - nb_missing_values + + z = zeros[:nb_zeros] + p = np.repeat(test_missing_values, nb_missing_values) + v = values[rng.permutation(len(values))[:nb_values]] + + true_statistics[j] = true_value_fun(z, v, p) + + # Create the columns + X[:, j] = np.hstack((v, z, p)) + + if 0 == test_missing_values: + # XXX unreached code as of v0.22 + X_true[:, j] = np.hstack( + (v, np.repeat(true_statistics[j], nb_missing_values + nb_zeros)) + ) + else: + X_true[:, j] = np.hstack( + (v, z, np.repeat(true_statistics[j], nb_missing_values)) + ) + + # Shuffle them the same way + np.random.RandomState(j).shuffle(X[:, j]) + np.random.RandomState(j).shuffle(X_true[:, j]) + + # Mean doesn't support columns containing NaNs, median does + if strategy == "median": + cols_to_keep = ~np.isnan(X_true).any(axis=0) + else: + cols_to_keep = ~np.isnan(X_true).all(axis=0) + + X_true = X_true[:, cols_to_keep] + + _check_statistics( + X, X_true, strategy, true_statistics, test_missing_values, csc_container + ) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_median_special_cases(csc_container): + # Test median imputation with sparse boundary cases + X = np.array( + [ + [0, np.nan, np.nan], # odd: implicit zero + [5, np.nan, np.nan], # odd: explicit nonzero + [0, 0, np.nan], # even: average two zeros + [-5, 0, np.nan], # even: avg zero and neg + [0, 5, np.nan], # even: avg zero and pos + [4, 5, np.nan], # even: avg nonzeros + [-4, -5, np.nan], # even: avg negatives + [-1, 2, np.nan], # even: crossing neg and pos + ] + ).transpose() + + X_imputed_median = np.array( + [ + [0, 0, 0], + [5, 5, 5], + [0, 0, 0], + [-5, 0, -2.5], + [0, 5, 2.5], + [4, 5, 4.5], + [-4, -5, -4.5], + [-1, 2, 0.5], + ] + ).transpose() + statistics_median = [0, 5, 0, -2.5, 2.5, 4.5, -4.5, 0.5] + + _check_statistics( + X, X_imputed_median, "median", statistics_median, np.nan, csc_container + ) + + +@pytest.mark.parametrize("strategy", ["mean", "median"]) +@pytest.mark.parametrize("dtype", [None, object, str]) +def test_imputation_mean_median_error_invalid_type(strategy, dtype): + X = np.array([["a", "b", 3], [4, "e", 6], 
["g", "h", 9]], dtype=dtype) + msg = "non-numeric data:\ncould not convert string to float:" + with pytest.raises(ValueError, match=msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median"]) +@pytest.mark.parametrize("type", ["list", "dataframe"]) +def test_imputation_mean_median_error_invalid_type_list_pandas(strategy, type): + X = [["a", "b", 3], [4, "e", 6], ["g", "h", 9]] + if type == "dataframe": + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X) + msg = "non-numeric data:\ncould not convert string to float:" + with pytest.raises(ValueError, match=msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["constant", "most_frequent"]) +@pytest.mark.parametrize("dtype", [str, np.dtype("U"), np.dtype("S")]) +def test_imputation_const_mostf_error_invalid_types(strategy, dtype): + # Test imputation on non-numeric data using "most_frequent" and "constant" + # strategy + X = np.array( + [ + [np.nan, np.nan, "a", "f"], + [np.nan, "c", np.nan, "d"], + [np.nan, "b", "d", np.nan], + [np.nan, "c", "d", "h"], + ], + dtype=dtype, + ) + + err_msg = "SimpleImputer does not support data" + with pytest.raises(ValueError, match=err_msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit(X).transform(X) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_most_frequent(csc_container): + # Test imputation using the most-frequent strategy. + X = np.array( + [ + [-1, -1, 0, 5], + [-1, 2, -1, 3], + [-1, 1, 3, -1], + [-1, 2, 3, 7], + ] + ) + + X_true = np.array( + [ + [2, 0, 5], + [2, 3, 3], + [1, 3, 3], + [2, 3, 7], + ] + ) + + # scipy.stats.mode, used in SimpleImputer, doesn't return the first most + # frequent as promised in the doc but the lowest most frequent. When this + # test will fail after an update of scipy, SimpleImputer will need to be + # updated to be consistent with the new (correct) behaviour + _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1, csc_container) + + +@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_most_frequent_objects(marker): + # Test imputation using the most-frequent strategy. 
+ X = np.array( + [ + [marker, marker, "a", "f"], + [marker, "c", marker, "d"], + [marker, "b", "d", marker], + [marker, "c", "d", "h"], + ], + dtype=object, + ) + + X_true = np.array( + [ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], + dtype=object, + ) + + imputer = SimpleImputer(missing_values=marker, strategy="most_frequent") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_most_frequent_pandas(dtype): + # Test imputation using the most frequent strategy on pandas df + pd = pytest.importorskip("pandas") + + f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n,i,x,\na,,y,\na,j,,\nb,j,x,") + + df = pd.read_csv(f, dtype=dtype) + + X_true = np.array( + [["a", "i", "x"], ["a", "j", "y"], ["a", "j", "x"], ["b", "j", "x"]], + dtype=object, + ) + + imputer = SimpleImputer(strategy="most_frequent") + X_trans = imputer.fit_transform(df) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize("X_data, missing_value", [(1, 0), (1.0, np.nan)]) +def test_imputation_constant_error_invalid_type(X_data, missing_value): + # Verify that exceptions are raised on invalid fill_value type + X = np.full((3, 5), X_data, dtype=float) + X[0, 0] = missing_value + + fill_value = "x" + err_msg = f"fill_value={fill_value!r} (of type {type(fill_value)!r}) cannot be cast" + with pytest.raises(ValueError, match=re.escape(err_msg)): + imputer = SimpleImputer( + missing_values=missing_value, strategy="constant", fill_value=fill_value + ) + imputer.fit_transform(X) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +def test_imputation_constant_integer(): + # Test imputation using the constant strategy on integers + X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]]) + + X_true = np.array([[0, 2, 3, 0], [4, 0, 5, 0], [6, 7, 0, 0], [8, 9, 0, 0]]) + + imputer = SimpleImputer( + missing_values=-1, strategy="constant", fill_value=0, keep_empty_features=True + ) + X_trans = imputer.fit_transform(X) + + assert_array_equal(X_trans, X_true) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +@pytest.mark.parametrize("array_constructor", CSR_CONTAINERS + [np.asarray]) +def test_imputation_constant_float(array_constructor): + # Test imputation using the constant strategy on floats + X = np.array( + [ + [np.nan, 1.1, 0, np.nan], + [1.2, np.nan, 1.3, np.nan], + [0, 0, np.nan, np.nan], + [1.4, 1.5, 0, np.nan], + ] + ) + + X_true = np.array( + [[-1, 1.1, 0, -1], [1.2, -1, 1.3, -1], [0, 0, -1, -1], [1.4, 1.5, 0, -1]] + ) + + X = array_constructor(X) + + X_true = array_constructor(X_true) + + imputer = SimpleImputer( + strategy="constant", fill_value=-1, keep_empty_features=True + ) + X_trans = imputer.fit_transform(X) + + assert_allclose_dense_sparse(X_trans, X_true) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. 
+@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_constant_object(marker): + # Test imputation using the constant strategy on objects + X = np.array( + [ + [marker, "a", "b", marker], + ["c", marker, "d", marker], + ["e", "f", marker, marker], + ["g", "h", "i", marker], + ], + dtype=object, + ) + + X_true = np.array( + [ + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"], + ], + dtype=object, + ) + + imputer = SimpleImputer( + missing_values=marker, + strategy="constant", + fill_value="missing", + keep_empty_features=True, + ) + X_trans = imputer.fit_transform(X) + + assert_array_equal(X_trans, X_true) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_constant_pandas(dtype): + # Test imputation using the constant strategy on pandas df + pd = pytest.importorskip("pandas") + + f = io.StringIO("Cat1,Cat2,Cat3,Cat4\n,i,x,\na,,y,\na,j,,\nb,j,x,") + + df = pd.read_csv(f, dtype=dtype) + + X_true = np.array( + [ + ["missing_value", "i", "x", "missing_value"], + ["a", "missing_value", "y", "missing_value"], + ["a", "j", "missing_value", "missing_value"], + ["b", "j", "x", "missing_value"], + ], + dtype=object, + ) + + imputer = SimpleImputer(strategy="constant", keep_empty_features=True) + X_trans = imputer.fit_transform(df) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize("X", [[[1], [2]], [[1], [np.nan]]]) +def test_iterative_imputer_one_feature(X): + # check we exit early when there is a single feature + imputer = IterativeImputer().fit(X) + assert imputer.n_iter_ == 0 + imputer = IterativeImputer() + imputer.fit([[1], [2]]) + assert imputer.n_iter_ == 0 + imputer.fit([[1], [np.nan]]) + assert imputer.n_iter_ == 0 + + +def test_imputation_pipeline_grid_search(): + # Test imputation within a pipeline + gridsearch. 
+ X = _sparse_random_matrix(100, 100, density=0.10) + missing_values = X.data[0] + + pipeline = Pipeline( + [ + ("imputer", SimpleImputer(missing_values=missing_values)), + ("tree", tree.DecisionTreeRegressor(random_state=0)), + ] + ) + + parameters = {"imputer__strategy": ["mean", "median", "most_frequent"]} + + Y = _sparse_random_matrix(100, 1, density=0.10).toarray() + gs = GridSearchCV(pipeline, parameters) + gs.fit(X, Y) + + +def test_imputation_copy(): + # Test imputation with copy + X_orig = _sparse_random_matrix(5, 5, density=0.75, random_state=0) + + # copy=True, dense => copy + X = X_orig.copy().toarray() + imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert not np.all(X == Xt) + + # copy=True, sparse csr => copy + X = X_orig.copy() + imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=True) + Xt = imputer.fit(X).transform(X) + Xt.data[0] = -1 + assert not np.all(X.data == Xt.data) + + # copy=False, dense => no copy + X = X_orig.copy().toarray() + imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert_array_almost_equal(X, Xt) + + # copy=False, sparse csc => no copy + X = X_orig.copy().tocsc() + imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) + Xt = imputer.fit(X).transform(X) + Xt.data[0] = -1 + assert_array_almost_equal(X.data, Xt.data) + + # copy=False, sparse csr => copy + X = X_orig.copy() + imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) + Xt = imputer.fit(X).transform(X) + Xt.data[0] = -1 + assert not np.all(X.data == Xt.data) + + # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is + # made, even if copy=False. 
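A minimal standalone sketch of the copy semantics exercised just above (an illustrative addition, not part of the vendored test file; it assumes a dense float64 input so that no dtype conversion forces an extra copy — SimpleImputer and its copy parameter are the real scikit-learn API, the toy data is made up):

import numpy as np
from sklearn.impute import SimpleImputer

# Dense float input where 0 marks a missing entry.
X = np.array([[0.0, 1.0], [2.0, 0.0], [4.0, 5.0]])

# With copy=False and no dtype conversion, transform can fill the missing
# entries directly into the buffer of the array that was passed in.
imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
Xt = imputer.fit(X).transform(X)

print(np.shares_memory(X, Xt))  # expected: True, Xt reuses X's memory
print(X)                        # the zeros are now the per-column means (3.0)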
+ + +def test_iterative_imputer_zero_iters(): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + missing_flag = X == 0 + X[missing_flag] = np.nan + + imputer = IterativeImputer(max_iter=0) + X_imputed = imputer.fit_transform(X) + # with max_iter=0, only initial imputation is performed + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + # repeat but force n_iter_ to 0 + imputer = IterativeImputer(max_iter=5).fit(X) + # transformed should not be equal to initial imputation + assert not np.all(imputer.transform(X) == imputer.initial_imputer_.transform(X)) + + imputer.n_iter_ = 0 + # now they should be equal as only initial imputation is done + assert_allclose(imputer.transform(X), imputer.initial_imputer_.transform(X)) + + +def test_iterative_imputer_verbose(): + rng = np.random.RandomState(0) + + n = 100 + d = 3 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1) + imputer.fit(X) + imputer.transform(X) + imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2) + imputer.fit(X) + imputer.transform(X) + + +def test_iterative_imputer_all_missing(): + n = 100 + d = 3 + X = np.zeros((n, d)) + imputer = IterativeImputer(missing_values=0, max_iter=1) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed, imputer.initial_imputer_.transform(X)) + + +@pytest.mark.parametrize( + "imputation_order", ["random", "roman", "ascending", "descending", "arabic"] +) +def test_iterative_imputer_imputation_order(imputation_order): + rng = np.random.RandomState(0) + n = 100 + d = 10 + max_iter = 2 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 # this column should not be discarded by IterativeImputer + + imputer = IterativeImputer( + missing_values=0, + max_iter=max_iter, + n_nearest_features=5, + sample_posterior=False, + skip_complete=True, + min_value=0, + max_value=1, + verbose=1, + imputation_order=imputation_order, + random_state=rng, + ) + imputer.fit_transform(X) + ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] + + assert len(ordered_idx) // imputer.n_iter_ == imputer.n_features_with_missing_ + + if imputation_order == "roman": + assert np.all(ordered_idx[: d - 1] == np.arange(1, d)) + elif imputation_order == "arabic": + assert np.all(ordered_idx[: d - 1] == np.arange(d - 1, 0, -1)) + elif imputation_order == "random": + ordered_idx_round_1 = ordered_idx[: d - 1] + ordered_idx_round_2 = ordered_idx[d - 1 :] + assert ordered_idx_round_1 != ordered_idx_round_2 + elif "ending" in imputation_order: + assert len(ordered_idx) == max_iter * (d - 1) + + +@pytest.mark.parametrize( + "estimator", [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()] +) +def test_iterative_imputer_estimators(estimator): + rng = np.random.RandomState(0) + + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = IterativeImputer( + missing_values=0, max_iter=1, estimator=estimator, random_state=rng + ) + imputer.fit_transform(X) + + # check that types are correct for estimators + hashes = [] + for triplet in imputer.imputation_sequence_: + expected_type = ( + type(estimator) if estimator is not None else type(BayesianRidge()) + ) + assert isinstance(triplet.estimator, expected_type) + hashes.append(id(triplet.estimator)) + + # check that each estimator is unique + assert len(set(hashes)) == 
len(hashes) + + +def test_iterative_imputer_clip(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + + imputer = IterativeImputer( + missing_values=0, max_iter=1, min_value=0.1, max_value=0.2, random_state=rng + ) + + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_clip_truncnorm(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() + X[:, 0] = 1 + + imputer = IterativeImputer( + missing_values=0, + max_iter=2, + n_nearest_features=5, + sample_posterior=True, + min_value=0.1, + max_value=0.2, + verbose=1, + imputation_order="random", + random_state=rng, + ) + Xt = imputer.fit_transform(X) + assert_allclose(np.min(Xt[X == 0]), 0.1) + assert_allclose(np.max(Xt[X == 0]), 0.2) + assert_allclose(Xt[X != 0], X[X != 0]) + + +def test_iterative_imputer_truncated_normal_posterior(): + # test that the values that are imputed using `sample_posterior=True` + # with boundaries (`min_value` and `max_value` are not None) are drawn + # from a distribution that looks gaussian via the Kolmogorov Smirnov test. + # note that starting from the wrong random seed will make this test fail + # because random sampling doesn't occur at all when the imputation + # is outside of the (min_value, max_value) range + rng = np.random.RandomState(42) + + X = rng.normal(size=(5, 5)) + X[0][0] = np.nan + + imputer = IterativeImputer( + min_value=0, max_value=0.5, sample_posterior=True, random_state=rng + ) + + imputer.fit_transform(X) + # generate multiple imputations for the single missing value + imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)]) + + assert all(imputations >= 0) + assert all(imputations <= 0.5) + + mu, sigma = imputations.mean(), imputations.std() + ks_statistic, p_value = kstest((imputations - mu) / sigma, "norm") + if sigma == 0: + sigma += 1e-12 + ks_statistic, p_value = kstest((imputations - mu) / sigma, "norm") + # we want to fail to reject null hypothesis + # null hypothesis: distributions are the same + assert ks_statistic < 0.2 or p_value > 0.1, "The posterior does appear to be normal" + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_iterative_imputer_missing_at_transform(strategy): + rng = np.random.RandomState(0) + n = 100 + d = 10 + X_train = rng.randint(low=0, high=3, size=(n, d)) + X_test = rng.randint(low=0, high=3, size=(n, d)) + + X_train[:, 0] = 1 # definitely no missing values in 0th column + X_test[0, 0] = 0 # definitely missing value in 0th column + + imputer = IterativeImputer( + missing_values=0, max_iter=1, initial_strategy=strategy, random_state=rng + ).fit(X_train) + initial_imputer = SimpleImputer(missing_values=0, strategy=strategy).fit(X_train) + + # if there were no missing values at time of fit, then imputer will + # only use the initial imputer for that feature at transform + assert_allclose( + imputer.transform(X_test)[:, 0], initial_imputer.transform(X_test)[:, 0] + ) + + +def test_iterative_imputer_transform_stochasticity(): + rng1 = np.random.RandomState(0) + rng2 = np.random.RandomState(1) + n = 100 + d = 10 + X = _sparse_random_matrix(n, d, density=0.10, random_state=rng1).toarray() + + # when sample_posterior=True, two transforms shouldn't be equal + imputer = IterativeImputer( + missing_values=0, max_iter=1, 
sample_posterior=True, random_state=rng1 + ) + imputer.fit(X) + + X_fitted_1 = imputer.transform(X) + X_fitted_2 = imputer.transform(X) + + # sufficient to assert that the means are not the same + assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2)) + + # when sample_posterior=False, and n_nearest_features=None + # and imputation_order is not random + # the two transforms should be identical even if rng are different + imputer1 = IterativeImputer( + missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order="ascending", + random_state=rng1, + ) + + imputer2 = IterativeImputer( + missing_values=0, + max_iter=1, + sample_posterior=False, + n_nearest_features=None, + imputation_order="ascending", + random_state=rng2, + ) + imputer1.fit(X) + imputer2.fit(X) + + X_fitted_1a = imputer1.transform(X) + X_fitted_1b = imputer1.transform(X) + X_fitted_2 = imputer2.transform(X) + + assert_allclose(X_fitted_1a, X_fitted_1b) + assert_allclose(X_fitted_1a, X_fitted_2) + + +def test_iterative_imputer_no_missing(): + rng = np.random.RandomState(0) + X = rng.rand(100, 100) + X[:, 0] = np.nan + m1 = IterativeImputer(max_iter=10, random_state=rng) + m2 = IterativeImputer(max_iter=10, random_state=rng) + pred1 = m1.fit(X).transform(X) + pred2 = m2.fit_transform(X) + # should exclude the first column entirely + assert_allclose(X[:, 1:], pred1) + # fit and fit_transform should both be identical + assert_allclose(pred1, pred2) + + +def test_iterative_imputer_rank_one(): + rng = np.random.RandomState(0) + d = 50 + A = rng.rand(d, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(d, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer(max_iter=5, verbose=1, random_state=rng) + X_filled = imputer.fit_transform(X_missing) + assert_allclose(X_filled, X, atol=0.02) + + +@pytest.mark.parametrize("rank", [3, 5]) +def test_iterative_imputer_transform_recovery(rank): + rng = np.random.RandomState(0) + n = 70 + d = 70 + A = rng.rand(n, rank) + B = rng.rand(rank, d) + X_filled = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data in half + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer( + max_iter=5, imputation_order="descending", verbose=1, random_state=rng + ).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, atol=0.1) + + +def test_iterative_imputer_additive_matrix(): + rng = np.random.RandomState(0) + n = 100 + d = 10 + A = rng.randn(n, d) + B = rng.randn(n, d) + X_filled = np.zeros(A.shape) + for i in range(d): + for j in range(d): + X_filled[:, (i + j) % d] += (A[:, i] + B[:, j]) / 2 + # a quarter is randomly missing + nan_mask = rng.rand(n, d) < 0.25 + X_missing = X_filled.copy() + X_missing[nan_mask] = np.nan + + # split up data + n = n // 2 + X_train = X_missing[:n] + X_test_filled = X_filled[n:] + X_test = X_missing[n:] + + imputer = IterativeImputer(max_iter=10, verbose=1, random_state=rng).fit(X_train) + X_test_est = imputer.transform(X_test) + assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01) + + +def test_iterative_imputer_early_stopping(): + rng = np.random.RandomState(0) + n = 50 + d = 5 + A = rng.rand(n, 1) + B = rng.rand(1, d) + X = np.dot(A, B) + nan_mask = rng.rand(n, d) < 0.5 + X_missing = X.copy() + X_missing[nan_mask] = np.nan + + imputer = IterativeImputer( + max_iter=100, 
tol=1e-2, sample_posterior=False, verbose=1, random_state=rng + ) + X_filled_100 = imputer.fit_transform(X_missing) + assert len(imputer.imputation_sequence_) == d * imputer.n_iter_ + + imputer = IterativeImputer( + max_iter=imputer.n_iter_, sample_posterior=False, verbose=1, random_state=rng + ) + X_filled_early = imputer.fit_transform(X_missing) + assert_allclose(X_filled_100, X_filled_early, atol=1e-7) + + imputer = IterativeImputer( + max_iter=100, tol=0, sample_posterior=False, verbose=1, random_state=rng + ) + imputer.fit(X_missing) + assert imputer.n_iter_ == imputer.max_iter + + +def test_iterative_imputer_catch_warning(): + # check that we catch a RuntimeWarning due to a division by zero when a + # feature is constant in the dataset + X, y = load_diabetes(return_X_y=True) + n_samples, n_features = X.shape + + # simulate that a feature only contain one category during fit + X[:, 3] = 1 + + # add some missing values + rng = np.random.RandomState(0) + missing_rate = 0.15 + for feat in range(n_features): + sample_idx = rng.choice( + np.arange(n_samples), size=int(n_samples * missing_rate), replace=False + ) + X[sample_idx, feat] = np.nan + + imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + X_fill = imputer.fit_transform(X, y) + assert not np.any(np.isnan(X_fill)) + + +@pytest.mark.parametrize( + "min_value, max_value, correct_output", + [ + (0, 100, np.array([[0] * 3, [100] * 3])), + (None, None, np.array([[-np.inf] * 3, [np.inf] * 3])), + (-np.inf, np.inf, np.array([[-np.inf] * 3, [np.inf] * 3])), + ([-5, 5, 10], [100, 200, 300], np.array([[-5, 5, 10], [100, 200, 300]])), + ( + [-5, -np.inf, 10], + [100, 200, np.inf], + np.array([[-5, -np.inf, 10], [100, 200, np.inf]]), + ), + ], + ids=["scalars", "None-default", "inf", "lists", "lists-with-inf"], +) +def test_iterative_imputer_min_max_array_like(min_value, max_value, correct_output): + # check that passing scalar or array-like + # for min_value and max_value in IterativeImputer works + X = np.random.RandomState(0).randn(10, 3) + imputer = IterativeImputer(min_value=min_value, max_value=max_value) + imputer.fit(X) + + assert isinstance(imputer._min_value, np.ndarray) and isinstance( + imputer._max_value, np.ndarray + ) + assert (imputer._min_value.shape[0] == X.shape[1]) and ( + imputer._max_value.shape[0] == X.shape[1] + ) + + assert_allclose(correct_output[0, :], imputer._min_value) + assert_allclose(correct_output[1, :], imputer._max_value) + + +@pytest.mark.parametrize( + "min_value, max_value, err_msg", + [ + (100, 0, "min_value >= max_value."), + (np.inf, -np.inf, "min_value >= max_value."), + ([-5, 5], [100, 200, 0], "_value' should be of shape"), + ([-5, 5, 5], [100, 200], "_value' should be of shape"), + ], +) +def test_iterative_imputer_catch_min_max_error(min_value, max_value, err_msg): + # check that passing scalar or array-like + # for min_value and max_value in IterativeImputer works + X = np.random.random((10, 3)) + imputer = IterativeImputer(min_value=min_value, max_value=max_value) + with pytest.raises(ValueError, match=err_msg): + imputer.fit(X) + + +@pytest.mark.parametrize( + "min_max_1, min_max_2", + [([None, None], [-np.inf, np.inf]), ([-10, 10], [[-10] * 4, [10] * 4])], + ids=["None-vs-inf", "Scalar-vs-vector"], +) +def test_iterative_imputer_min_max_array_like_imputation(min_max_1, min_max_2): + # Test that None/inf and scalar/vector give the same imputation + X_train = np.array( + [ + [np.nan, 2, 2, 1], 
+ [10, np.nan, np.nan, 7], + [3, 1, np.nan, 1], + [np.nan, 4, 2, np.nan], + ] + ) + X_test = np.array( + [[np.nan, 2, np.nan, 5], [2, 4, np.nan, np.nan], [np.nan, 1, 10, 1]] + ) + imputer1 = IterativeImputer( + min_value=min_max_1[0], max_value=min_max_1[1], random_state=0 + ) + imputer2 = IterativeImputer( + min_value=min_max_2[0], max_value=min_max_2[1], random_state=0 + ) + X_test_imputed1 = imputer1.fit(X_train).transform(X_test) + X_test_imputed2 = imputer2.fit(X_train).transform(X_test) + assert_allclose(X_test_imputed1[:, 0], X_test_imputed2[:, 0]) + + +@pytest.mark.parametrize("skip_complete", [True, False]) +def test_iterative_imputer_skip_non_missing(skip_complete): + # check the imputing strategy when missing data are present in the + # testing set only. + # taken from: https://github.com/scikit-learn/scikit-learn/issues/14383 + rng = np.random.RandomState(0) + X_train = np.array([[5, 2, 2, 1], [10, 1, 2, 7], [3, 1, 1, 1], [8, 4, 2, 2]]) + X_test = np.array([[np.nan, 2, 4, 5], [np.nan, 4, 1, 2], [np.nan, 1, 10, 1]]) + imputer = IterativeImputer( + initial_strategy="mean", skip_complete=skip_complete, random_state=rng + ) + X_test_est = imputer.fit(X_train).transform(X_test) + if skip_complete: + # impute with the initial strategy: 'mean' + assert_allclose(X_test_est[:, 0], np.mean(X_train[:, 0])) + else: + assert_allclose(X_test_est[:, 0], [11, 7, 12], rtol=1e-4) + + +@pytest.mark.parametrize("rs_imputer", [None, 1, np.random.RandomState(seed=1)]) +@pytest.mark.parametrize("rs_estimator", [None, 1, np.random.RandomState(seed=1)]) +def test_iterative_imputer_dont_set_random_state(rs_imputer, rs_estimator): + class ZeroEstimator: + def __init__(self, random_state): + self.random_state = random_state + + def fit(self, *args, **kgards): + return self + + def predict(self, X): + return np.zeros(X.shape[0]) + + estimator = ZeroEstimator(random_state=rs_estimator) + imputer = IterativeImputer(random_state=rs_imputer) + X_train = np.zeros((10, 3)) + imputer.fit(X_train) + assert estimator.random_state == rs_estimator + + +@pytest.mark.parametrize( + "X_fit, X_trans, params, msg_err", + [ + ( + np.array([[-1, 1], [1, 2]]), + np.array([[-1, 1], [1, -1]]), + {"features": "missing-only", "sparse": "auto"}, + "have missing values in transform but have no missing values in fit", + ), + ( + np.array([["a", "b"], ["c", "a"]], dtype=str), + np.array([["a", "b"], ["c", "a"]], dtype=str), + {}, + "MissingIndicator does not support data with dtype", + ), + ], +) +def test_missing_indicator_error(X_fit, X_trans, params, msg_err): + indicator = MissingIndicator(missing_values=-1) + indicator.set_params(**params) + with pytest.raises(ValueError, match=msg_err): + indicator.fit(X_fit).transform(X_trans) + + +def _generate_missing_indicator_cases(): + missing_values_dtypes = [(0, np.int32), (np.nan, np.float64), (-1, np.int32)] + arr_types = ( + [np.array] + + CSC_CONTAINERS + + CSR_CONTAINERS + + COO_CONTAINERS + + LIL_CONTAINERS + + BSR_CONTAINERS + ) + return [ + (arr_type, missing_values, dtype) + for arr_type, (missing_values, dtype) in product( + arr_types, missing_values_dtypes + ) + if not (missing_values == 0 and arr_type is not np.array) + ] + + +@pytest.mark.parametrize( + "arr_type, missing_values, dtype", _generate_missing_indicator_cases() +) +@pytest.mark.parametrize( + "param_features, n_features, features_indices", + [("missing-only", 3, np.array([0, 1, 2])), ("all", 3, np.array([0, 1, 2]))], +) +def test_missing_indicator_new( + missing_values, arr_type, dtype, param_features, 
n_features, features_indices +): + X_fit = np.array([[missing_values, missing_values, 1], [4, 2, missing_values]]) + X_trans = np.array([[missing_values, missing_values, 1], [4, 12, 10]]) + X_fit_expected = np.array([[1, 1, 0], [0, 0, 1]]) + X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]]) + + # convert the input to the right array format and right dtype + X_fit = arr_type(X_fit).astype(dtype) + X_trans = arr_type(X_trans).astype(dtype) + X_fit_expected = X_fit_expected.astype(dtype) + X_trans_expected = X_trans_expected.astype(dtype) + + indicator = MissingIndicator( + missing_values=missing_values, features=param_features, sparse=False + ) + X_fit_mask = indicator.fit_transform(X_fit) + X_trans_mask = indicator.transform(X_trans) + + assert X_fit_mask.shape[1] == n_features + assert X_trans_mask.shape[1] == n_features + + assert_array_equal(indicator.features_, features_indices) + assert_allclose(X_fit_mask, X_fit_expected[:, features_indices]) + assert_allclose(X_trans_mask, X_trans_expected[:, features_indices]) + + assert X_fit_mask.dtype == bool + assert X_trans_mask.dtype == bool + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + + indicator.set_params(sparse=True) + X_fit_mask_sparse = indicator.fit_transform(X_fit) + X_trans_mask_sparse = indicator.transform(X_trans) + + assert X_fit_mask_sparse.dtype == bool + assert X_trans_mask_sparse.dtype == bool + assert X_fit_mask_sparse.format == "csc" + assert X_trans_mask_sparse.format == "csc" + assert_allclose(X_fit_mask_sparse.toarray(), X_fit_mask) + assert_allclose(X_trans_mask_sparse.toarray(), X_trans_mask) + + +@pytest.mark.parametrize( + "arr_type", + CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS + LIL_CONTAINERS + BSR_CONTAINERS, +) +def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type): + # test for sparse input and missing_value == 0 + + missing_values = 0 + X_fit = np.array([[missing_values, missing_values, 1], [4, missing_values, 2]]) + X_trans = np.array([[missing_values, missing_values, 1], [4, 12, 10]]) + + # convert the input to the right array format + X_fit_sparse = arr_type(X_fit) + X_trans_sparse = arr_type(X_trans) + + indicator = MissingIndicator(missing_values=missing_values) + + with pytest.raises(ValueError, match="Sparse input with missing_values=0"): + indicator.fit_transform(X_fit_sparse) + + indicator.fit_transform(X_fit) + with pytest.raises(ValueError, match="Sparse input with missing_values=0"): + indicator.transform(X_trans_sparse) + + +@pytest.mark.parametrize("param_sparse", [True, False, "auto"]) +@pytest.mark.parametrize( + "arr_type, missing_values", + [(np.array, 0)] + + list( + product( + CSC_CONTAINERS + + CSR_CONTAINERS + + COO_CONTAINERS + + LIL_CONTAINERS + + BSR_CONTAINERS, + [np.nan], + ) + ), +) +def test_missing_indicator_sparse_param(arr_type, missing_values, param_sparse): + # check the format of the output with different sparse parameter + X_fit = np.array([[missing_values, missing_values, 1], [4, missing_values, 2]]) + X_trans = np.array([[missing_values, missing_values, 1], [4, 12, 10]]) + X_fit = arr_type(X_fit).astype(np.float64) + X_trans = arr_type(X_trans).astype(np.float64) + + indicator = MissingIndicator(missing_values=missing_values, sparse=param_sparse) + X_fit_mask = indicator.fit_transform(X_fit) + X_trans_mask = indicator.transform(X_trans) + + if param_sparse is True: + assert X_fit_mask.format == "csc" + assert X_trans_mask.format == "csc" + elif param_sparse == "auto" and missing_values == 0: + assert 
isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + elif param_sparse is False: + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + else: + if sparse.issparse(X_fit): + assert X_fit_mask.format == "csc" + assert X_trans_mask.format == "csc" + else: + assert isinstance(X_fit_mask, np.ndarray) + assert isinstance(X_trans_mask, np.ndarray) + + +def test_missing_indicator_string(): + X = np.array([["a", "b", "c"], ["b", "c", "a"]], dtype=object) + indicator = MissingIndicator(missing_values="a", features="all") + X_trans = indicator.fit_transform(X) + assert_array_equal(X_trans, np.array([[True, False, False], [False, False, True]])) + + +@pytest.mark.parametrize( + "X, missing_values, X_trans_exp", + [ + ( + np.array([["a", "b"], ["b", "a"]], dtype=object), + "a", + np.array([["b", "b", True, False], ["b", "b", False, True]], dtype=object), + ), + ( + np.array([[np.nan, 1.0], [1.0, np.nan]]), + np.nan, + np.array([[1.0, 1.0, True, False], [1.0, 1.0, False, True]]), + ), + ( + np.array([[np.nan, "b"], ["b", np.nan]], dtype=object), + np.nan, + np.array([["b", "b", True, False], ["b", "b", False, True]], dtype=object), + ), + ( + np.array([[None, "b"], ["b", None]], dtype=object), + None, + np.array([["b", "b", True, False], ["b", "b", False, True]], dtype=object), + ), + ], +) +def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp): + trans = make_union( + SimpleImputer(missing_values=missing_values, strategy="most_frequent"), + MissingIndicator(missing_values=missing_values), + ) + X_trans = trans.fit_transform(X) + assert_array_equal(X_trans, X_trans_exp) + + +@pytest.mark.parametrize("imputer_constructor", [SimpleImputer, IterativeImputer]) +@pytest.mark.parametrize( + "imputer_missing_values, missing_value, err_msg", + [ + ("NaN", np.nan, "Input X contains NaN"), + ("-1", -1, "types are expected to be both numerical."), + ], +) +def test_inconsistent_dtype_X_missing_values( + imputer_constructor, imputer_missing_values, missing_value, err_msg +): + # regression test for issue #11390. Comparison between incoherent dtype + # for X and missing_values was not raising a proper error. + rng = np.random.RandomState(42) + X = rng.randn(10, 10) + X[0, 0] = missing_value + + imputer = imputer_constructor(missing_values=imputer_missing_values) + + with pytest.raises(ValueError, match=err_msg): + imputer.fit_transform(X) + + +def test_missing_indicator_no_missing(): + # check that all features are dropped if there are no missing values when + # features='missing-only' (#13491) + X = np.array([[1, 1], [1, 1]]) + + mi = MissingIndicator(features="missing-only", missing_values=-1) + Xt = mi.fit_transform(X) + + assert Xt.shape[1] == 0 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_missing_indicator_sparse_no_explicit_zeros(csr_container): + # Check that non missing values don't become explicit zeros in the mask + # generated by missing indicator when X is sparse. 
(#13491) + X = csr_container([[0, 1, 2], [1, 2, 0], [2, 0, 1]]) + + mi = MissingIndicator(features="all", missing_values=1) + Xt = mi.fit_transform(X) + + assert Xt.nnz == Xt.sum() + + +@pytest.mark.parametrize("imputer_constructor", [SimpleImputer, IterativeImputer]) +def test_imputer_without_indicator(imputer_constructor): + X = np.array([[1, 1], [1, 1]]) + imputer = imputer_constructor() + imputer.fit(X) + + assert imputer.indicator_ is None + + +@pytest.mark.parametrize( + "arr_type", + CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS + LIL_CONTAINERS + BSR_CONTAINERS, +) +def test_simple_imputation_add_indicator_sparse_matrix(arr_type): + X_sparse = arr_type([[np.nan, 1, 5], [2, np.nan, 1], [6, 3, np.nan], [1, 2, 9]]) + X_true = np.array( + [ + [3.0, 1.0, 5.0, 1.0, 0.0, 0.0], + [2.0, 2.0, 1.0, 0.0, 1.0, 0.0], + [6.0, 3.0, 5.0, 0.0, 0.0, 1.0], + [1.0, 2.0, 9.0, 0.0, 0.0, 0.0], + ] + ) + + imputer = SimpleImputer(missing_values=np.nan, add_indicator=True) + X_trans = imputer.fit_transform(X_sparse) + + assert sparse.issparse(X_trans) + assert X_trans.shape == X_true.shape + assert_allclose(X_trans.toarray(), X_true) + + +@pytest.mark.parametrize( + "strategy, expected", [("most_frequent", "b"), ("constant", "missing_value")] +) +def test_simple_imputation_string_list(strategy, expected): + X = [["a", "b"], ["c", np.nan]] + + X_true = np.array([["a", "b"], ["c", expected]], dtype=object) + + imputer = SimpleImputer(strategy=strategy) + X_trans = imputer.fit_transform(X) + + assert_array_equal(X_trans, X_true) + + +@pytest.mark.parametrize( + "order, idx_order", + [("ascending", [3, 4, 2, 0, 1]), ("descending", [1, 0, 2, 4, 3])], +) +def test_imputation_order(order, idx_order): + # regression test for #15393 + rng = np.random.RandomState(42) + X = rng.rand(100, 5) + X[:50, 1] = np.nan + X[:30, 0] = np.nan + X[:20, 2] = np.nan + X[:10, 4] = np.nan + + with pytest.warns(ConvergenceWarning): + trs = IterativeImputer(max_iter=1, imputation_order=order, random_state=0).fit( + X + ) + idx = [x.feat_idx for x in trs.imputation_sequence_] + assert idx == idx_order + + +@pytest.mark.parametrize("missing_value", [-1, np.nan]) +def test_simple_imputation_inverse_transform(missing_value): + # Test inverse_transform feature for np.nan + X_1 = np.array( + [ + [9, missing_value, 3, -1], + [4, -1, 5, 4], + [6, 7, missing_value, -1], + [8, 9, 0, missing_value], + ] + ) + + X_2 = np.array( + [ + [5, 4, 2, 1], + [2, 1, missing_value, 3], + [9, missing_value, 7, 1], + [6, 4, 2, missing_value], + ] + ) + + X_3 = np.array( + [ + [1, missing_value, 5, 9], + [missing_value, 4, missing_value, missing_value], + [2, missing_value, 7, missing_value], + [missing_value, 3, missing_value, 8], + ] + ) + + X_4 = np.array( + [ + [1, 1, 1, 3], + [missing_value, 2, missing_value, 1], + [2, 3, 3, 4], + [missing_value, 4, missing_value, 2], + ] + ) + + imputer = SimpleImputer( + missing_values=missing_value, strategy="mean", add_indicator=True + ) + + X_1_trans = imputer.fit_transform(X_1) + X_1_inv_trans = imputer.inverse_transform(X_1_trans) + + X_2_trans = imputer.transform(X_2) # test on new data + X_2_inv_trans = imputer.inverse_transform(X_2_trans) + + assert_array_equal(X_1_inv_trans, X_1) + assert_array_equal(X_2_inv_trans, X_2) + + for X in [X_3, X_4]: + X_trans = imputer.fit_transform(X) + X_inv_trans = imputer.inverse_transform(X_trans) + assert_array_equal(X_inv_trans, X) + + +@pytest.mark.parametrize("missing_value", [-1, np.nan]) +def test_simple_imputation_inverse_transform_exceptions(missing_value): + X_1 = 
np.array( + [ + [9, missing_value, 3, -1], + [4, -1, 5, 4], + [6, 7, missing_value, -1], + [8, 9, 0, missing_value], + ] + ) + + imputer = SimpleImputer(missing_values=missing_value, strategy="mean") + X_1_trans = imputer.fit_transform(X_1) + with pytest.raises( + ValueError, match=f"Got 'add_indicator={imputer.add_indicator}'" + ): + imputer.inverse_transform(X_1_trans) + + +@pytest.mark.parametrize( + "expected,array,dtype,extra_value,n_repeat", + [ + # array of object dtype + ("extra_value", ["a", "b", "c"], object, "extra_value", 2), + ( + "most_frequent_value", + ["most_frequent_value", "most_frequent_value", "value"], + object, + "extra_value", + 1, + ), + ("a", ["min_value", "min_valuevalue"], object, "a", 2), + ("min_value", ["min_value", "min_value", "value"], object, "z", 2), + # array of numeric dtype + (10, [1, 2, 3], int, 10, 2), + (1, [1, 1, 2], int, 10, 1), + (10, [20, 20, 1], int, 10, 2), + (1, [1, 1, 20], int, 10, 2), + ], +) +def test_most_frequent(expected, array, dtype, extra_value, n_repeat): + assert expected == _most_frequent( + np.array(array, dtype=dtype), extra_value, n_repeat + ) + + +@pytest.mark.parametrize( + "expected,array", + [ + ("a", ["a", "b"]), + (1, [1, 2]), + (None, [None, "a"]), + (None, [None, 1]), + (None, [None, "a", 1]), + (1, [1, "1"]), + (1, ["1", 1]), + ], +) +def test_most_frequent_tie_object(expected, array): + """Check the tie breaking behavior of the most frequent strategy. + + Non-regression test for issue #31717. + """ + assert expected == _most_frequent(np.array(array, dtype=object), None, 0) + + +@pytest.mark.parametrize( + "initial_strategy", ["mean", "median", "most_frequent", "constant"] +) +def test_iterative_imputer_keep_empty_features(initial_strategy): + """Check the behaviour of the iterative imputer with different initial strategy + and keeping empty features (i.e. features containing only missing values). + """ + X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]]) + + imputer = IterativeImputer( + initial_strategy=initial_strategy, keep_empty_features=True + ) + X_imputed = imputer.fit_transform(X) + assert_allclose(X_imputed[:, 1], 0) + X_imputed = imputer.transform(X) + assert_allclose(X_imputed[:, 1], 0) + + +# TODO (1.8): check that `keep_empty_features=False` drop the +# empty features due to the behaviour change. +def test_iterative_imputer_constant_fill_value(): + """Check that we propagate properly the parameter `fill_value`.""" + X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]]) + + fill_value = 100 + imputer = IterativeImputer( + missing_values=-1, + initial_strategy="constant", + fill_value=fill_value, + max_iter=0, + keep_empty_features=True, + ) + imputer.fit_transform(X) + assert_array_equal(imputer.initial_imputer_.statistics_, fill_value) + + +def test_iterative_imputer_min_max_value_remove_empty(): + """Check that we properly apply the empty feature mask to `min_value` and + `max_value`. 
+ + Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29355 + """ + # Intentionally make column 2 as a missing column, then the bound of the imputed + # value of column 3 should be (4, 5) + X = np.array( + [ + [1, 2, np.nan, np.nan], + [4, 5, np.nan, 6], + [7, 8, np.nan, np.nan], + [10, 11, np.nan, 12], + ] + ) + min_value = [-np.inf, -np.inf, -np.inf, 4] + max_value = [np.inf, np.inf, np.inf, 5] + + X_imputed = IterativeImputer( + min_value=min_value, + max_value=max_value, + keep_empty_features=False, + ).fit_transform(X) + + X_without_missing_column = np.delete(X, 2, axis=1) + assert X_imputed.shape == X_without_missing_column.shape + assert np.min(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(4) + assert np.max(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(5) + + # Intentionally make column 3 as a missing column, then the bound of the imputed + # value of column 2 should be (3.5, 6) + X = np.array( + [ + [1, 2, np.nan, np.nan], + [4, 5, 6, np.nan], + [7, 8, np.nan, np.nan], + [10, 11, 12, np.nan], + ] + ) + min_value = [-np.inf, -np.inf, 3.5, -np.inf] + max_value = [np.inf, np.inf, 6, np.inf] + + X_imputed = IterativeImputer( + min_value=min_value, + max_value=max_value, + keep_empty_features=False, + ).fit_transform(X) + + X_without_missing_column = X[:, :3] + assert X_imputed.shape == X_without_missing_column.shape + assert np.min(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(3.5) + assert np.max(X_imputed[np.isnan(X_without_missing_column)]) == pytest.approx(6) + + +@pytest.mark.parametrize("keep_empty_features", [True, False]) +def test_knn_imputer_keep_empty_features(keep_empty_features): + """Check the behaviour of `keep_empty_features` for `KNNImputer`.""" + X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]]) + + imputer = KNNImputer(keep_empty_features=keep_empty_features) + + for method in ["fit_transform", "transform"]: + X_imputed = getattr(imputer, method)(X) + if keep_empty_features: + assert X_imputed.shape == X.shape + assert_array_equal(X_imputed[:, 1], 0) + else: + assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +def test_simple_impute_pd_na(): + pd = pytest.importorskip("pandas") + + # Impute pandas array of string types. + df = pd.DataFrame({"feature": pd.Series(["abc", None, "de"], dtype="string")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="na") + _assert_array_equal_and_same_dtype( + imputer.fit_transform(df), np.array([["abc"], ["na"], ["de"]], dtype=object) + ) + + # Impute pandas array of string types without any missing values. + df = pd.DataFrame({"feature": pd.Series(["abc", "de", "fgh"], dtype="string")}) + imputer = SimpleImputer(fill_value="ok", strategy="constant") + _assert_array_equal_and_same_dtype( + imputer.fit_transform(df), np.array([["abc"], ["de"], ["fgh"]], dtype=object) + ) + + # Impute pandas array of integer types. + df = pd.DataFrame({"feature": pd.Series([1, None, 3], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value=-1) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [-1], [3]], dtype="float64") + ) + + # Use `np.nan` also works. + imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [-1], [3]], dtype="float64") + ) + + # Impute pandas array of integer types with 'median' strategy. 
+ df = pd.DataFrame({"feature": pd.Series([1, None, 2, 3], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="median") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [2], [2], [3]], dtype="float64") + ) + + # Impute pandas array of integer types with 'mean' strategy. + df = pd.DataFrame({"feature": pd.Series([1, None, 2], dtype="Int64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="mean") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1], [1.5], [2]], dtype="float64") + ) + + # Impute pandas array of float types. + df = pd.DataFrame({"feature": pd.Series([1.0, None, 3.0], dtype="float64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value=-2.0) + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), np.array([[1.0], [-2.0], [3.0]], dtype="float64") + ) + + # Impute pandas array of float types with 'median' strategy. + df = pd.DataFrame({"feature": pd.Series([1.0, None, 2.0, 3.0], dtype="float64")}) + imputer = SimpleImputer(missing_values=pd.NA, strategy="median") + _assert_allclose_and_same_dtype( + imputer.fit_transform(df), + np.array([[1.0], [2.0], [2.0], [3.0]], dtype="float64"), + ) + + +def test_missing_indicator_feature_names_out(): + """Check that missing indicator return the feature names with a prefix.""" + pd = pytest.importorskip("pandas") + + missing_values = np.nan + X = pd.DataFrame( + [ + [missing_values, missing_values, 1, missing_values], + [4, missing_values, 2, 10], + ], + columns=["a", "b", "c", "d"], + ) + + indicator = MissingIndicator(missing_values=missing_values).fit(X) + feature_names = indicator.get_feature_names_out() + expected_names = ["missingindicator_a", "missingindicator_b", "missingindicator_d"] + assert_array_equal(expected_names, feature_names) + + +def test_imputer_lists_fit_transform(): + """Check transform uses object dtype when fitted on an object dtype. + + Non-regression test for #19572. + """ + + X = [["a", "b"], ["c", "b"], ["a", "a"]] + imp_frequent = SimpleImputer(strategy="most_frequent").fit(X) + X_trans = imp_frequent.transform([[np.nan, np.nan]]) + assert X_trans.dtype == object + assert_array_equal(X_trans, [["a", "b"]]) + + +@pytest.mark.parametrize("dtype_test", [np.float32, np.float64]) +def test_imputer_transform_preserves_numeric_dtype(dtype_test): + """Check transform preserves numeric dtype independent of fit dtype.""" + X = np.asarray( + [[1.2, 3.4, np.nan], [np.nan, 1.2, 1.3], [4.2, 2, 1]], dtype=np.float64 + ) + imp = SimpleImputer().fit(X) + + X_test = np.asarray([[np.nan, np.nan, np.nan]], dtype=dtype_test) + X_trans = imp.transform(X_test) + assert X_trans.dtype == dtype_test + + +@pytest.mark.parametrize("array_type", ["array", "sparse"]) +@pytest.mark.parametrize("keep_empty_features", [True, False]) +def test_simple_imputer_constant_keep_empty_features(array_type, keep_empty_features): + """Check the behaviour of `keep_empty_features` with `strategy='constant'. + For backward compatibility, a column full of missing values will always be + fill and never dropped. 
+ """ + X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]]) + X = _convert_container(X, array_type) + fill_value = 10 + imputer = SimpleImputer( + strategy="constant", + fill_value=fill_value, + keep_empty_features=keep_empty_features, + ) + + for method in ["fit_transform", "transform"]: + # TODO(1.8): Remove the condition and still call getattr(imputer, method)(X) + if method.startswith("fit") and not keep_empty_features: + warn_msg = '`strategy="constant"`, empty features are not dropped. ' + with pytest.warns(FutureWarning, match=warn_msg): + X_imputed = getattr(imputer, method)(X) + else: + X_imputed = getattr(imputer, method)(X) + assert X_imputed.shape == X.shape + constant_feature = ( + X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0] + ) + assert_array_equal(constant_feature, fill_value) + + +@pytest.mark.parametrize("array_type", ["array", "sparse"]) +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +@pytest.mark.parametrize("keep_empty_features", [True, False]) +def test_simple_imputer_keep_empty_features(strategy, array_type, keep_empty_features): + """Check the behaviour of `keep_empty_features` with all strategies but + 'constant'. + """ + X = np.array([[np.nan, 2], [np.nan, 3], [np.nan, 6]]) + X = _convert_container(X, array_type) + imputer = SimpleImputer(strategy=strategy, keep_empty_features=keep_empty_features) + + for method in ["fit_transform", "transform"]: + X_imputed = getattr(imputer, method)(X) + if keep_empty_features: + assert X_imputed.shape == X.shape + constant_feature = ( + X_imputed[:, 0].toarray() if array_type == "sparse" else X_imputed[:, 0] + ) + assert_array_equal(constant_feature, 0) + else: + assert X_imputed.shape == (X.shape[0], X.shape[1] - 1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_imputation_custom(csc_container): + X = np.array( + [ + [1.1, 1.1, 1.1], + [3.9, 1.2, np.nan], + [np.nan, 1.3, np.nan], + [0.1, 1.4, 1.4], + [4.9, 1.5, 1.5], + [np.nan, 1.6, 1.6], + ] + ) + + X_true = np.array( + [ + [1.1, 1.1, 1.1], + [3.9, 1.2, 1.1], + [0.1, 1.3, 1.1], + [0.1, 1.4, 1.4], + [4.9, 1.5, 1.5], + [0.1, 1.6, 1.6], + ] + ) + + imputer = SimpleImputer(missing_values=np.nan, strategy=np.min) + X_trans = imputer.fit_transform(X) + assert_array_equal(X_trans, X_true) + + # Sparse matrix + imputer = SimpleImputer(missing_values=np.nan, strategy=np.min) + X_trans = imputer.fit_transform(csc_container(X)) + assert_array_equal(X_trans.toarray(), X_true) + + +def test_simple_imputer_constant_fill_value_casting(): + """Check that we raise a proper error message when we cannot cast the fill value + to the input data type. Otherwise, check that the casting is done properly. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28309 + """ + # cannot cast fill_value at fit + fill_value = 1.5 + X_int64 = np.array([[1, 2, 3], [2, 3, 4]], dtype=np.int64) + imputer = SimpleImputer( + strategy="constant", fill_value=fill_value, missing_values=2 + ) + err_msg = f"fill_value={fill_value!r} (of type {type(fill_value)!r}) cannot be cast" + with pytest.raises(ValueError, match=re.escape(err_msg)): + imputer.fit(X_int64) + + # cannot cast fill_value at transform + X_float64 = np.array([[1, 2, 3], [2, 3, 4]], dtype=np.float64) + imputer.fit(X_float64) + err_msg = ( + f"The dtype of the filling value (i.e. 
{imputer.statistics_.dtype!r}) " + "cannot be cast" + ) + with pytest.raises(ValueError, match=re.escape(err_msg)): + imputer.transform(X_int64) + + # check that no error is raised when having the same kind of dtype + fill_value_list = [np.float64(1.5), 1.5, 1] + X_float32 = X_float64.astype(np.float32) + + for fill_value in fill_value_list: + imputer = SimpleImputer( + strategy="constant", fill_value=fill_value, missing_values=2 + ) + X_trans = imputer.fit_transform(X_float32) + assert X_trans.dtype == X_float32.dtype + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +def test_iterative_imputer_no_empty_features(strategy): + """Check the behaviour of `keep_empty_features` with no empty features. + + With no-empty features, we should get the same imputation whatever the + parameter `keep_empty_features`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29375 + """ + X = np.array([[np.nan, 0, 1], [2, np.nan, 3], [4, 5, np.nan]]) + + imputer_drop_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=1, keep_empty_features=False + ) + + imputer_keep_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=1, keep_empty_features=True + ) + + assert_allclose( + imputer_drop_empty_features.fit_transform(X), + imputer_keep_empty_features.fit_transform(X), + ) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"]) +@pytest.mark.parametrize( + "X_test", + [ + np.array([[1, 2, 3, 4], [5, 6, 7, 8]]), # without empty feature + np.array([[np.nan, 2, 3, 4], [np.nan, 6, 7, 8]]), # empty feature at column 0 + np.array([[1, 2, 3, np.nan], [5, 6, 7, np.nan]]), # empty feature at column 3 + ], +) +def test_iterative_imputer_with_empty_features(strategy, X_test): + """Check the behaviour of `keep_empty_features` in the presence of empty features. + + With `keep_empty_features=True`, the empty feature will be imputed with the value + defined by the initial imputation. 
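+
+    A rough sketch of the expected outcome (illustrative data; the assertions
+    below are the authoritative expectations)::
+
+        import numpy as np
+        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
+        from sklearn.impute import IterativeImputer
+
+        X_demo = np.array([[np.nan, 1.0], [np.nan, 2.0], [np.nan, 3.0]])
+        imp = IterativeImputer(initial_strategy="constant", fill_value=0,
+                               keep_empty_features=True)
+        imp.fit_transform(X_demo)[:, 0]  # array([0., 0., 0.]): initial fill kept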
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29375 + """ + X_train = np.array( + [[np.nan, np.nan, 0, 1], [np.nan, 2, np.nan, 3], [np.nan, 4, 5, np.nan]] + ) + + imputer_drop_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=0, keep_empty_features=False + ) + X_train_drop_empty_features = imputer_drop_empty_features.fit_transform(X_train) + X_test_drop_empty_features = imputer_drop_empty_features.transform(X_test) + + imputer_keep_empty_features = IterativeImputer( + initial_strategy=strategy, fill_value=0, keep_empty_features=True + ) + X_train_keep_empty_features = imputer_keep_empty_features.fit_transform(X_train) + X_test_keep_empty_features = imputer_keep_empty_features.transform(X_test) + + assert_allclose(X_train_drop_empty_features, X_train_keep_empty_features[:, 1:]) + assert_allclose(X_train_keep_empty_features[:, 0], 0) + + assert X_train_drop_empty_features.shape[1] == X_test_drop_empty_features.shape[1] + assert X_train_keep_empty_features.shape[1] == X_test_keep_empty_features.shape[1] diff --git a/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_knn.py b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_knn.py new file mode 100644 index 0000000000000000000000000000000000000000..34244d628600fc29ae2af1e620f34c83eafc6d81 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/impute/tests/test_knn.py @@ -0,0 +1,570 @@ +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.impute import KNNImputer +from sklearn.metrics.pairwise import nan_euclidean_distances, pairwise_distances +from sklearn.neighbors import KNeighborsRegressor +from sklearn.utils._testing import assert_allclose + + +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +@pytest.mark.parametrize("n_neighbors", range(1, 6)) +def test_knn_imputer_shape(weights, n_neighbors): + # Verify the shapes of the imputed matrix for different weights and + # number of neighbors. 
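+    # Illustrative sketch (not part of this test): with the default
+    # nan_euclidean metric and uniform weights, a missing entry is replaced by
+    # the column mean over the n_neighbors nearest donor rows, e.g.
+    #
+    #     X_demo = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, 6.0]])
+    #     KNNImputer(n_neighbors=2).fit_transform(X_demo)
+    #     # -> [[1., 2.], [3., 4.], [2., 6.]]  (mean of donors 1.0 and 3.0)
+    #
+    # The output shape always matches the input shape, which is what is
+    # asserted here.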
+ n_rows = 10 + n_cols = 2 + X = np.random.rand(n_rows, n_cols) + X[0, 0] = np.nan + + imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) + X_imputed = imputer.fit_transform(X) + assert X_imputed.shape == (n_rows, n_cols) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_default_with_invalid_input(na): + # Test imputation with default values and invalid input + + # Test with inf present + X = np.array( + [ + [np.inf, 1, 1, 2, na], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [na, 6, 0, 5, 13], + [na, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ] + ) + with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"): + KNNImputer(missing_values=na).fit(X) + + # Test with inf present in matrix passed in transform() + X = np.array( + [ + [np.inf, 1, 1, 2, na], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [na, 6, 0, 5, 13], + [na, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ] + ) + + X_fit = np.array( + [ + [0, 1, 1, 2, na], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [na, 6, 0, 5, 13], + [na, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ] + ) + imputer = KNNImputer(missing_values=na).fit(X_fit) + with pytest.raises(ValueError, match="Input X contains (infinity|NaN)"): + imputer.transform(X) + + # Test with missing_values=0 when NaN present + imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform") + X = np.array( + [ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + ] + ) + msg = "Input X contains NaN" + with pytest.raises(ValueError, match=msg): + imputer.fit(X) + + X = np.array( + [ + [0, 0], + [np.nan, 2], + ] + ) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_removes_all_na_features(na): + X = np.array( + [ + [1, 1, na, 1, 1, 1.0], + [2, 3, na, 2, 2, 2], + [3, 4, na, 3, 3, na], + [6, 4, na, na, 6, 6], + ] + ) + knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X) + + X_transform = knn.transform(X) + assert not np.isnan(X_transform).any() + assert X_transform.shape == (4, 5) + + X_test = np.arange(0, 12).reshape(2, 6) + X_transform = knn.transform(X_test) + assert_allclose(X_test[:, [0, 1, 3, 4, 5]], X_transform) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_zero_nan_imputes_the_same(na): + # Test with an imputable matrix and compare with different missing_values + X_zero = np.array( + [ + [1, 0, 1, 1, 1.0], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 0], + [6, 6, 0, 6, 6], + ] + ) + + X_nan = np.array( + [ + [1, na, 1, 1, 1.0], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, na], + [6, 6, na, 6, 6], + ] + ) + + X_imputed = np.array( + [ + [1, 2.5, 1, 1, 1.0], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 1.5], + [6, 6, 2.5, 6, 6], + ] + ) + + imputer_zero = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform") + + imputer_nan = KNNImputer(missing_values=na, n_neighbors=2, weights="uniform") + + assert_allclose(imputer_zero.fit_transform(X_zero), X_imputed) + assert_allclose( + imputer_zero.fit_transform(X_zero), imputer_nan.fit_transform(X_nan) + ) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_verify(na): + # Test with an imputable matrix + X = np.array( + [ + [1, 0, 0, 1], + [2, 1, 2, na], + [3, 2, 3, na], + [na, 4, 5, 5], + [6, na, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ] + ) + + X_imputed = np.array( + [ + [1, 0, 0, 1], + [2, 1, 2, 8], + [3, 2, 3, 8], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ] + ) + + imputer = KNNImputer(missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + # Test when there is not enough 
neighbors + X = np.array( + [ + [1, 0, 0, na], + [2, 1, 2, na], + [3, 2, 3, na], + [4, 4, 5, na], + [6, 7, 6, na], + [8, 8, 8, na], + [20, 20, 20, 20], + [22, 22, 22, 22], + ] + ) + + # Not enough neighbors, use column mean from training + X_impute_value = (20 + 22) / 2 + X_imputed = np.array( + [ + [1, 0, 0, X_impute_value], + [2, 1, 2, X_impute_value], + [3, 2, 3, X_impute_value], + [4, 4, 5, X_impute_value], + [6, 7, 6, X_impute_value], + [8, 8, 8, X_impute_value], + [20, 20, 20, 20], + [22, 22, 22, 22], + ] + ) + + imputer = KNNImputer(missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + # Test when data in fit() and transform() are different + X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 16]]) + + X1 = np.array([[1, 0], [3, 2], [4, na]]) + + X_2_1 = (0 + 3 + 6 + 7 + 8) / 5 + X1_imputed = np.array([[1, 0], [3, 2], [4, X_2_1]]) + + imputer = KNNImputer(missing_values=na) + assert_allclose(imputer.fit(X).transform(X1), X1_imputed) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_one_n_neighbors(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]]) + + X_imputed = np.array([[0, 0], [4, 2], [4, 3], [5, 3], [7, 7], [7, 8], [14, 13]]) + + imputer = KNNImputer(n_neighbors=1, missing_values=na) + + assert_allclose(imputer.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_all_samples_are_neighbors(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]]) + + X_imputed = np.array( + [[0, 0], [6.25, 2], [4, 3], [5, 5.75], [7, 7], [6.25, 8], [14, 13]] + ) + + n_neighbors = X.shape[0] - 1 + imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na) + + assert_allclose(imputer.fit_transform(X), X_imputed) + + n_neighbors = X.shape[0] + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors, missing_values=na) + assert_allclose(imputer_plus1.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_weight_uniform(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]) + + # Test with "uniform" weight (or unweighted) + X_imputed_uniform = np.array( + [[0, 0], [5, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] + ) + + imputer = KNNImputer(weights="uniform", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "callable" weight + def no_weight(dist): + return None + + imputer = KNNImputer(weights=no_weight, missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "callable" uniform weight + def uniform_weight(dist): + return np.ones_like(dist) + + imputer = KNNImputer(weights=uniform_weight, missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_uniform) + + +@pytest.mark.parametrize("na", [np.nan, -1]) +def test_knn_imputer_weight_distance(na): + X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]) + + # Test with "distance" weight + nn = KNeighborsRegressor(metric="euclidean", weights="distance") + X_rows_idx = [0, 2, 3, 4, 5, 6] + nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0]) + knn_imputed_value = nn.predict(X[1:2, 1:])[0] + + # Manual calculation + X_neighbors_idx = [0, 2, 3, 4, 5] + dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na) + weights = 1 / dist[:, X_neighbors_idx].ravel() + manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights) + + X_imputed_distance1 = np.array( + [[0, 0], [manual_imputed_value, 
2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] + ) + + # NearestNeighbor calculation + X_imputed_distance2 = np.array( + [[0, 0], [knn_imputed_value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]] + ) + + imputer = KNNImputer(weights="distance", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed_distance1) + assert_allclose(imputer.fit_transform(X), X_imputed_distance2) + + # Test with weights = "distance" and n_neighbors=2 + X = np.array( + [ + [na, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ] + ) + + # neighbors are rows 1, 2, the nan_euclidean_distances are: + dist_0_1 = np.sqrt((3 / 2) * ((1 - 0) ** 2 + (2 - 0) ** 2)) + dist_0_2 = np.sqrt((3 / 2) * ((2 - 0) ** 2 + (3 - 0) ** 2)) + imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2]) + + X_imputed = np.array( + [ + [imputed_value, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ] + ) + + imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + # Test with varying missingness patterns + X = np.array( + [ + [1, 0, 0, 1], + [0, na, 1, na], + [1, 1, 1, na], + [0, 1, 0, 0], + [0, 0, 0, 0], + [1, 0, 1, 1], + [10, 10, 10, 10], + ] + ) + + # Get weights of donor neighbors + dist = nan_euclidean_distances(X, missing_values=na) + r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]] + r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]] + r1c1_nbor_wt = 1 / r1c1_nbor_dists + r1c3_nbor_wt = 1 / r1c3_nbor_dists + + r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]] + r2c3_nbor_wt = 1 / r2c3_nbor_dists + + # Collect donor values + col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy() + col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy() + + # Final imputed values + r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt) + r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt) + r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt) + + X_imputed = np.array( + [ + [1, 0, 0, 1], + [0, r1c1_imp, 1, r1c3_imp], + [1, 1, 1, r2c3_imp], + [0, 1, 0, 0], + [0, 0, 0, 0], + [1, 0, 1, 1], + [10, 10, 10, 10], + ] + ) + + imputer = KNNImputer(weights="distance", missing_values=na) + assert_allclose(imputer.fit_transform(X), X_imputed) + + X = np.array( + [ + [0, 0, 0, na], + [1, 1, 1, na], + [2, 2, na, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [na, 7, 7, 7], + ] + ) + + dist = pairwise_distances( + X, metric="nan_euclidean", squared=False, missing_values=na + ) + + # Calculate weights + r0c3_w = 1.0 / dist[0, 2:-1] + r1c3_w = 1.0 / dist[1, 2:-1] + r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)] + r7c0_w = 1.0 / dist[7, 2:7] + + # Calculate weighted averages + r0c3 = np.average(X[2:-1, -1], weights=r0c3_w) + r1c3 = np.average(X[2:-1, -1], weights=r1c3_w) + r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w) + r7c0 = np.average(X[2:7, 0], weights=r7c0_w) + + X_imputed = np.array( + [ + [0, 0, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7], + ] + ) + + imputer_comp_wt = KNNImputer(missing_values=na, weights="distance") + assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed) + + +def test_knn_imputer_callable_metric(): + # Define callable metric that returns the l1 norm: + def custom_callable(x, y, missing_values=np.nan, squared=False): + x = np.ma.array(x, mask=np.isnan(x)) + y = np.ma.array(y, mask=np.isnan(y)) + dist = np.nansum(np.abs(x - y)) + return dist + + X = np.array([[4, 3, 3, np.nan], 
[6, 9, 6, 9], [4, 8, 6, 9], [np.nan, 9, 11, 10.0]]) + + X_0_3 = (9 + 9) / 2 + X_3_0 = (6 + 4) / 2 + X_imputed = np.array( + [[4, 3, 3, X_0_3], [6, 9, 6, 9], [4, 8, 6, 9], [X_3_0, 9, 11, 10.0]] + ) + + imputer = KNNImputer(n_neighbors=2, metric=custom_callable) + assert_allclose(imputer.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("working_memory", [None, 0]) +@pytest.mark.parametrize("na", [-1, np.nan]) +# Note that we use working_memory=0 to ensure that chunking is tested, even +# for a small dataset. However, it should raise a UserWarning that we ignore. +@pytest.mark.filterwarnings("ignore:adhere to working_memory") +def test_knn_imputer_with_simple_example(na, working_memory): + X = np.array( + [ + [0, na, 0, na], + [1, 1, 1, na], + [2, 2, na, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [na, 7, 7, 7], + ] + ) + + r0c1 = np.mean(X[1:6, 1]) + r0c3 = np.mean(X[2:-1, -1]) + r1c3 = np.mean(X[2:-1, -1]) + r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2]) + r7c0 = np.mean(X[2:-1, 0]) + + X_imputed = np.array( + [ + [0, r0c1, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7], + ] + ) + + with config_context(working_memory=working_memory): + imputer_comp = KNNImputer(missing_values=na) + assert_allclose(imputer_comp.fit_transform(X), X_imputed) + + +@pytest.mark.parametrize("na", [-1, np.nan]) +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +def test_knn_imputer_not_enough_valid_distances(na, weights): + # Samples with needed feature has nan distance + X1 = np.array([[na, 11], [na, 1], [3, na]]) + X1_imputed = np.array([[3, 11], [3, 1], [3, 6]]) + + knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights) + assert_allclose(knn.fit_transform(X1), X1_imputed) + + X2 = np.array([[4, na]]) + X2_imputed = np.array([[4, 6]]) + assert_allclose(knn.transform(X2), X2_imputed) + + +@pytest.mark.parametrize("na", [-1, np.nan]) +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +def test_knn_imputer_nan_distance(na, weights): + # Samples with nan distance should be excluded from the mean computation + X1_train = np.array([[1, 1], [na, 2]]) + X1_test = np.array([[0, na]]) + X1_test_expected = np.array([[0, 1]]) + + knn1 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights) + knn1.fit(X1_train) + assert_allclose(knn1.transform(X1_test), X1_test_expected) + + X2_train = np.array([[na, 1, 1], [2, na, 2], [3, 3, na]]) + X2_test = np.array([[na, 0, na], [0, na, na], [na, na, 0]]) + X2_test_expected = np.array([[3, 0, 1], [0, 3, 2], [2, 1, 0]]) + + knn2 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights) + knn2.fit(X2_train) + assert_allclose(knn2.transform(X2_test), X2_test_expected) + + +@pytest.mark.parametrize("na", [-1, np.nan]) +def test_knn_imputer_drops_all_nan_features(na): + X1 = np.array([[na, 1], [na, 2]]) + knn = KNNImputer(missing_values=na, n_neighbors=1) + X1_expected = np.array([[1], [2]]) + assert_allclose(knn.fit_transform(X1), X1_expected) + + X2 = np.array([[1, 2], [3, na]]) + X2_expected = np.array([[2], [1.5]]) + assert_allclose(knn.transform(X2), X2_expected) + + +@pytest.mark.parametrize("working_memory", [None, 0]) +@pytest.mark.parametrize("na", [-1, np.nan]) +def test_knn_imputer_distance_weighted_not_enough_neighbors(na, working_memory): + X = np.array([[3, na], [2, na], [na, 4], [5, 6], [6, 8], [na, 5]]) + + dist = pairwise_distances( + X, metric="nan_euclidean", squared=False, missing_values=na + ) + + X_01 = 
np.average(X[3:5, 1], weights=1 / dist[0, 3:5]) + X_11 = np.average(X[3:5, 1], weights=1 / dist[1, 3:5]) + X_20 = np.average(X[3:5, 0], weights=1 / dist[2, 3:5]) + X_50 = np.average(X[3:5, 0], weights=1 / dist[5, 3:5]) + + X_expected = np.array([[3, X_01], [2, X_11], [X_20, 4], [5, 6], [6, 8], [X_50, 5]]) + + with config_context(working_memory=working_memory): + knn_3 = KNNImputer(missing_values=na, n_neighbors=3, weights="distance") + assert_allclose(knn_3.fit_transform(X), X_expected) + + knn_4 = KNNImputer(missing_values=na, n_neighbors=4, weights="distance") + assert_allclose(knn_4.fit_transform(X), X_expected) + + +@pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)]) +def test_knn_tags(na, allow_nan): + knn = KNNImputer(missing_values=na) + assert knn.__sklearn_tags__().input_tags.allow_nan == allow_nan diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8e0a1125ef04198083da041736a7ebc2ffeafe6a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/__init__.py @@ -0,0 +1,16 @@ +"""Tools for model inspection.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._partial_dependence import partial_dependence +from ._permutation_importance import permutation_importance +from ._plot.decision_boundary import DecisionBoundaryDisplay +from ._plot.partial_dependence import PartialDependenceDisplay + +__all__ = [ + "DecisionBoundaryDisplay", + "PartialDependenceDisplay", + "partial_dependence", + "permutation_importance", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..ad352c45cc03bd6018617c4ccaa6247fd68718b5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_partial_dependence.py @@ -0,0 +1,775 @@ +"""Partial dependence plots for regression and classification models.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from collections.abc import Iterable + +import numpy as np +from scipy import sparse +from scipy.stats.mstats import mquantiles + +from ..base import is_classifier, is_regressor +from ..ensemble import RandomForestRegressor +from ..ensemble._gb import BaseGradientBoosting +from ..ensemble._hist_gradient_boosting.gradient_boosting import ( + BaseHistGradientBoosting, +) +from ..tree import DecisionTreeRegressor +from ..utils import Bunch, _safe_indexing, check_array +from ..utils._indexing import _determine_key_type, _get_column_indices, _safe_assign +from ..utils._optional_dependencies import check_matplotlib_support # noqa: F401 +from ..utils._param_validation import ( + HasMethods, + Integral, + Interval, + StrOptions, + validate_params, +) +from ..utils._response import _get_response_values +from ..utils.extmath import cartesian +from ..utils.validation import _check_sample_weight, check_is_fitted +from ._pd_utils import _check_feature_names, _get_feature_index + +__all__ = [ + "partial_dependence", +] + + +def _grid_from_X(X, percentiles, is_categorical, grid_resolution, custom_values): + """Generate a grid of points based on the percentiles of X. + + The grid is a cartesian product between the columns of ``values``. 
The + ith column of ``values`` consists in ``grid_resolution`` equally-spaced + points between the percentiles of the jth column of X. + + If ``grid_resolution`` is bigger than the number of unique values in the + j-th column of X or if the feature is a categorical feature (by inspecting + `is_categorical`) , then those unique values will be used instead. + + Parameters + ---------- + X : array-like of shape (n_samples, n_target_features) + The data. + + percentiles : tuple of float + The percentiles which are used to construct the extreme values of + the grid. Must be in [0, 1]. + + is_categorical : list of bool + For each feature, tells whether it is categorical or not. If a feature + is categorical, then the values used will be the unique ones + (i.e. categories) instead of the percentiles. + + grid_resolution : int + The number of equally spaced points to be placed on the grid for each + feature. + + custom_values: dict + Mapping from column index of X to an array-like of values where + the partial dependence should be calculated for that feature + + Returns + ------- + grid : ndarray of shape (n_points, n_target_features) + A value for each feature at each point in the grid. ``n_points`` is + always ``<= grid_resolution ** X.shape[1]``. + + values : list of 1d ndarrays + The values with which the grid has been created. The size of each + array ``values[j]`` is either ``grid_resolution``, the number of + unique values in ``X[:, j]``, if j is not in ``custom_range``. + If j is in ``custom_range``, then it is the length of ``custom_range[j]``. + """ + if not isinstance(percentiles, Iterable) or len(percentiles) != 2: + raise ValueError("'percentiles' must be a sequence of 2 elements.") + if not all(0 <= x <= 1 for x in percentiles): + raise ValueError("'percentiles' values must be in [0, 1].") + if percentiles[0] >= percentiles[1]: + raise ValueError("percentiles[0] must be strictly less than percentiles[1].") + + if grid_resolution <= 1: + raise ValueError("'grid_resolution' must be strictly greater than 1.") + + def _convert_custom_values(values): + # Convert custom types such that object types are always used for string arrays + dtype = object if any(isinstance(v, str) for v in values) else None + return np.asarray(values, dtype=dtype) + + custom_values = {k: _convert_custom_values(v) for k, v in custom_values.items()} + if any(v.ndim != 1 for v in custom_values.values()): + error_string = ", ".join( + f"Feature {k}: {v.ndim} dimensions" + for k, v in custom_values.items() + if v.ndim != 1 + ) + + raise ValueError( + "The custom grid for some features is not a one-dimensional array. " + f"{error_string}" + ) + + values = [] + # TODO: we should handle missing values (i.e. `np.nan`) specifically and store them + # in a different Bunch attribute. + for feature, is_cat in enumerate(is_categorical): + if feature in custom_values: + # Use values in the custom range + axis = custom_values[feature] + else: + try: + uniques = np.unique(_safe_indexing(X, feature, axis=1)) + except TypeError as exc: + # `np.unique` will fail in the presence of `np.nan` and `str` categories + # due to sorting. Temporary, we reraise an error explaining the problem. + raise ValueError( + f"The column #{feature} contains mixed data types. Finding unique " + "categories fail due to sorting. It usually means that the column " + "contains `np.nan` values together with `str` categories. Such use " + "case is not yet supported in scikit-learn." 
+ ) from exc + + if is_cat or uniques.shape[0] < grid_resolution: + # Use the unique values either because: + # - feature has low resolution use unique values + # - feature is categorical + axis = uniques + else: + # create axis based on percentiles and grid resolution + emp_percentiles = mquantiles( + _safe_indexing(X, feature, axis=1), prob=percentiles, axis=0 + ) + if np.allclose(emp_percentiles[0], emp_percentiles[1]): + raise ValueError( + "percentiles are too close to each other, " + "unable to build the grid. Please choose percentiles " + "that are further apart." + ) + axis = np.linspace( + emp_percentiles[0], + emp_percentiles[1], + num=grid_resolution, + endpoint=True, + ) + values.append(axis) + + return cartesian(values), values + + +def _partial_dependence_recursion(est, grid, features): + """Calculate partial dependence via the recursion method. + + The recursion method is in particular enabled for tree-based estimators. + + For each `grid` value, a weighted tree traversal is performed: if a split node + involves an input feature of interest, the corresponding left or right branch + is followed; otherwise both branches are followed, each branch being weighted + by the fraction of training samples that entered that branch. Finally, the + partial dependence is given by a weighted average of all the visited leaves + values. + + This method is more efficient in terms of speed than the `'brute'` method + (:func:`~sklearn.inspection._partial_dependence._partial_dependence_brute`). + However, here, the partial dependence computation is done explicitly with the + `X` used during training of `est`. + + Parameters + ---------- + est : BaseEstimator + A fitted estimator object implementing :term:`predict` or + :term:`decision_function`. Multioutput-multiclass classifiers are not + supported. Note that `'recursion'` is only supported for some tree-based + estimators (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`, + :class:`~sklearn.ensemble.RandomForestRegressor`, + ). + + grid : array-like of shape (n_points, n_target_features) + The grid of feature values for which the partial dependence is calculated. + Note that `n_points` is the number of points in the grid and `n_target_features` + is the number of features you are doing partial dependence at. + + features : array-like of {int, str} + The feature (e.g. `[0]`) or pair of interacting features + (e.g. `[(0, 1)]`) for which the partial dependency should be computed. + + Returns + ------- + averaged_predictions : array-like of shape (n_targets, n_points) + The averaged predictions for the given `grid` of features values. + Note that `n_targets` is the number of targets (e.g. 1 for binary + classification, `n_tasks` for multi-output regression, and `n_classes` for + multiclass classification) and `n_points` is the number of points in the `grid`. + """ + averaged_predictions = est._compute_partial_dependence_recursion(grid, features) + if averaged_predictions.ndim == 1: + # reshape to (1, n_points) for consistency with + # _partial_dependence_brute + averaged_predictions = averaged_predictions.reshape(1, -1) + + return averaged_predictions + + +def _partial_dependence_brute( + est, grid, features, X, response_method, sample_weight=None +): + """Calculate partial dependence via the brute force method. 
+ + The brute method explicitly averages the predictions of an estimator over a + grid of feature values. + + For each `grid` value, all the samples from `X` have their variables of + interest replaced by that specific `grid` value. The predictions are then made + and averaged across the samples. + + This method is slower than the `'recursion'` + (:func:`~sklearn.inspection._partial_dependence._partial_dependence_recursion`) + version for estimators with this second option. However, with the `'brute'` + force method, the average will be done with the given `X` and not the `X` + used during training, as it is done in the `'recursion'` version. Therefore + the average can always accept `sample_weight` (even when the estimator was + fitted without). + + Parameters + ---------- + est : BaseEstimator + A fitted estimator object implementing :term:`predict`, + :term:`predict_proba`, or :term:`decision_function`. + Multioutput-multiclass classifiers are not supported. + + grid : array-like of shape (n_points, n_target_features) + The grid of feature values for which the partial dependence is calculated. + Note that `n_points` is the number of points in the grid and `n_target_features` + is the number of features you are doing partial dependence at. + + features : array-like of {int, str} + The feature (e.g. `[0]`) or pair of interacting features + (e.g. `[(0, 1)]`) for which the partial dependency should be computed. + + X : array-like of shape (n_samples, n_features) + `X` is used to generate values for the complement features. That is, for + each value in `grid`, the method will average the prediction of each + sample from `X` having that grid value for `features`. + + response_method : {'auto', 'predict_proba', 'decision_function'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. For regressors + this parameter is ignored and the response is always the output of + :term:`predict`. By default, :term:`predict_proba` is tried first + and we revert to :term:`decision_function` if it doesn't exist. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights are used to calculate weighted means when averaging the + model output. If `None`, then samples are equally weighted. Note that + `sample_weight` does not change the individual predictions. + + Returns + ------- + averaged_predictions : array-like of shape (n_targets, n_points) + The averaged predictions for the given `grid` of features values. + Note that `n_targets` is the number of targets (e.g. 1 for binary + classification, `n_tasks` for multi-output regression, and `n_classes` for + multiclass classification) and `n_points` is the number of points in the `grid`. + + predictions : array-like + The predictions for the given `grid` of features values over the samples + from `X`. For non-multioutput regression and binary classification the + shape is `(n_instances, n_points)` and for multi-output regression and + multiclass classification the shape is `(n_targets, n_instances, n_points)`, + where `n_targets` is the number of targets (`n_tasks` for multi-output + regression, and `n_classes` for multiclass classification), `n_instances` + is the number of instances in `X`, and `n_points` is the number of points + in the `grid`. 
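+
+    A simplified, self-contained sketch of the brute-force idea for a single
+    feature of a regressor (no sample weights; names are illustrative)::
+
+        preds = []
+        for value in grid[:, 0]:
+            X_eval = X.copy()
+            X_eval[:, feature_idx] = value            # overwrite the target feature
+            preds.append(est.predict(X_eval).mean())  # average over the samples
+        # ``preds`` is then the partial dependence curve for that feature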
+ """ + predictions = [] + averaged_predictions = [] + + if response_method == "auto": + response_method = ( + "predict" if is_regressor(est) else ["predict_proba", "decision_function"] + ) + + X_eval = X.copy() + for new_values in grid: + for i, variable in enumerate(features): + _safe_assign(X_eval, new_values[i], column_indexer=variable) + + # Note: predictions is of shape + # (n_points,) for non-multioutput regressors + # (n_points, n_tasks) for multioutput regressors + # (n_points, 1) for the regressors in cross_decomposition (I think) + # (n_points, 1) for binary classification (positive class already selected) + # (n_points, n_classes) for multiclass classification + pred, _ = _get_response_values(est, X_eval, response_method=response_method) + + predictions.append(pred) + # average over samples + averaged_predictions.append(np.average(pred, axis=0, weights=sample_weight)) + + n_samples = X.shape[0] + + # reshape to (n_targets, n_instances, n_points) where n_targets is: + # - 1 for non-multioutput regression and binary classification (shape is + # already correct in those cases) + # - n_tasks for multi-output regression + # - n_classes for multiclass classification. + predictions = np.array(predictions).T + if is_regressor(est) and predictions.ndim == 2: + # non-multioutput regression, shape is (n_instances, n_points,) + predictions = predictions.reshape(n_samples, -1) + elif is_classifier(est) and predictions.shape[0] == 2: + # Binary classification, shape is (2, n_instances, n_points). + # we output the effect of **positive** class + predictions = predictions[1] + predictions = predictions.reshape(n_samples, -1) + + # reshape averaged_predictions to (n_targets, n_points) where n_targets is: + # - 1 for non-multioutput regression and binary classification (shape is + # already correct in those cases) + # - n_tasks for multi-output regression + # - n_classes for multiclass classification. + averaged_predictions = np.array(averaged_predictions).T + if averaged_predictions.ndim == 1: + # reshape to (1, n_points) for consistency with + # _partial_dependence_recursion + averaged_predictions = averaged_predictions.reshape(1, -1) + + return averaged_predictions, predictions + + +@validate_params( + { + "estimator": [ + HasMethods(["fit", "predict"]), + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "X": ["array-like", "sparse matrix"], + "features": ["array-like", Integral, str], + "sample_weight": ["array-like", None], + "categorical_features": ["array-like", None], + "feature_names": ["array-like", None], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + "percentiles": [tuple], + "grid_resolution": [Interval(Integral, 1, None, closed="left")], + "method": [StrOptions({"auto", "recursion", "brute"})], + "kind": [StrOptions({"average", "individual", "both"})], + "custom_values": [dict, None], + }, + prefer_skip_nested_validation=True, +) +def partial_dependence( + estimator, + X, + features, + *, + sample_weight=None, + categorical_features=None, + feature_names=None, + response_method="auto", + percentiles=(0.05, 0.95), + grid_resolution=100, + custom_values=None, + method="auto", + kind="average", +): + """Partial dependence of ``features``. + + Partial dependence of a feature (or a set of features) corresponds to + the average response of an estimator for each possible value of the + feature. + + Read more in + :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py` + and the :ref:`User Guide `. 
+ + .. warning:: + + For :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, the + `'recursion'` method (used by default) will not account for the `init` + predictor of the boosting process. In practice, this will produce + the same values as `'brute'` up to a constant offset in the target + response, provided that `init` is a constant estimator (which is the + default). However, if `init` is not a constant estimator, the + partial dependence values are incorrect for `'recursion'` because the + offset will be sample-dependent. It is preferable to use the `'brute'` + method. Note that this only applies to + :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. + + Parameters + ---------- + estimator : BaseEstimator + A fitted estimator object implementing :term:`predict`, + :term:`predict_proba`, or :term:`decision_function`. + Multioutput-multiclass classifiers are not supported. + + X : {array-like, sparse matrix or dataframe} of shape (n_samples, n_features) + ``X`` is used to generate a grid of values for the target + ``features`` (where the partial dependence will be evaluated), and + also to generate values for the complement features when the + `method` is 'brute'. + + features : array-like of {int, str, bool} or int or str + The feature (e.g. `[0]`) or pair of interacting features + (e.g. `[(0, 1)]`) for which the partial dependency should be computed. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights are used to calculate weighted means when averaging the + model output. If `None`, then samples are equally weighted. If + `sample_weight` is not `None`, then `method` will be set to `'brute'`. + Note that `sample_weight` is ignored for `kind='individual'`. + + .. versionadded:: 1.3 + + categorical_features : array-like of shape (n_features,) or shape \ + (n_categorical_features,), dtype={bool, int, str}, default=None + Indicates the categorical features. + + - `None`: no feature will be considered categorical; + - boolean array-like: boolean mask of shape `(n_features,)` + indicating which features are categorical. Thus, this array has + the same shape has `X.shape[1]`; + - integer or string array-like: integer indices or strings + indicating categorical features. + + .. versionadded:: 1.2 + + feature_names : array-like of shape (n_features,), dtype=str, default=None + Name of each feature; `feature_names[i]` holds the name of the feature + with index `i`. + By default, the name of the feature corresponds to their numerical + index for NumPy array and their column name for pandas dataframe. + + .. versionadded:: 1.2 + + response_method : {'auto', 'predict_proba', 'decision_function'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. For regressors + this parameter is ignored and the response is always the output of + :term:`predict`. By default, :term:`predict_proba` is tried first + and we revert to :term:`decision_function` if it doesn't exist. If + ``method`` is 'recursion', the response is always the output of + :term:`decision_function`. + + percentiles : tuple of float, default=(0.05, 0.95) + The lower and upper percentile used to create the extreme values + for the grid. Must be in [0, 1]. 
+ This parameter is overridden by `custom_values` if that parameter is set. + + grid_resolution : int, default=100 + The number of equally spaced points on the grid, for each target + feature. + This parameter is overridden by `custom_values` if that parameter is set. + + custom_values : dict + A dictionary mapping the index of an element of `features` to an array + of values where the partial dependence should be calculated + for that feature. Setting a range of values for a feature overrides + `grid_resolution` and `percentiles`. + + See :ref:`how to use partial_dependence + ` for an example of how this parameter can + be used. + + .. versionadded:: 1.7 + + method : {'auto', 'recursion', 'brute'}, default='auto' + The method used to calculate the averaged predictions: + + - `'recursion'` is only supported for some tree-based estimators + (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`, + :class:`~sklearn.ensemble.RandomForestRegressor`, + ) when `kind='average'`. + This is more efficient in terms of speed. + With this method, the target response of a + classifier is always the decision function, not the predicted + probabilities. Since the `'recursion'` method implicitly computes + the average of the Individual Conditional Expectation (ICE) by + design, it is not compatible with ICE and thus `kind` must be + `'average'`. + + - `'brute'` is supported for any estimator, but is more + computationally intensive. + + - `'auto'`: the `'recursion'` is used for estimators that support it, + and `'brute'` is used otherwise. If `sample_weight` is not `None`, + then `'brute'` is used regardless of the estimator. + + Please see :ref:`this note ` for + differences between the `'brute'` and `'recursion'` method. + + kind : {'average', 'individual', 'both'}, default='average' + Whether to return the partial dependence averaged across all the + samples in the dataset or one value per sample or both. + See Returns below. + + Note that the fast `method='recursion'` option is only available for + `kind='average'` and `sample_weights=None`. Computing individual + dependencies and doing weighted averages requires using the slower + `method='brute'`. + + .. versionadded:: 0.24 + + Returns + ------- + predictions : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + individual : ndarray of shape (n_outputs, n_instances, \ + len(values[0]), len(values[1]), ...) + The predictions for all the points in the grid for all + samples in X. This is also known as Individual + Conditional Expectation (ICE). + Only available when `kind='individual'` or `kind='both'`. + + average : ndarray of shape (n_outputs, len(values[0]), \ + len(values[1]), ...) + The predictions for all the points in the grid, averaged + over all samples in X (or over the training data if + `method` is 'recursion'). + Only available when `kind='average'` or `kind='both'`. + + grid_values : seq of 1d ndarrays + The values with which the grid has been created. The generated + grid is a cartesian product of the arrays in `grid_values` where + `len(grid_values) == len(features)`. The size of each array + `grid_values[j]` is either `grid_resolution`, or the number of + unique values in `X[:, j]`, whichever is smaller. + + .. 
versionadded:: 1.3 + + `n_outputs` corresponds to the number of classes in a multi-class + setting, or to the number of tasks for multi-output regression. + For classical regression and binary classification `n_outputs==1`. + `n_values_feature_j` corresponds to the size `grid_values[j]`. + + See Also + -------- + PartialDependenceDisplay.from_estimator : Plot Partial Dependence. + PartialDependenceDisplay : Partial Dependence visualization. + + Examples + -------- + >>> X = [[0, 0, 2], [1, 0, 0]] + >>> y = [0, 1] + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y) + >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1), + ... grid_resolution=2) # doctest: +SKIP + (array([[-4.52, 4.52]]), [array([ 0., 1.])]) + """ + check_is_fitted(estimator) + + if not (is_classifier(estimator) or is_regressor(estimator)): + raise ValueError("'estimator' must be a fitted regressor or classifier.") + + if is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray): + raise ValueError("Multiclass-multioutput estimators are not supported") + + # Use check_array only on lists and other non-array-likes / sparse. Do not + # convert DataFrame into a NumPy array. + if not (hasattr(X, "__array__") or sparse.issparse(X)): + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) + + if is_regressor(estimator) and response_method != "auto": + raise ValueError( + "The response_method parameter is ignored for regressors and " + "must be 'auto'." + ) + + if kind != "average": + if method == "recursion": + raise ValueError( + "The 'recursion' method only applies when 'kind' is set to 'average'" + ) + method = "brute" + + if method == "recursion" and sample_weight is not None: + raise ValueError( + "The 'recursion' method can only be applied when sample_weight is None." + ) + + if method == "auto": + if sample_weight is not None: + method = "brute" + elif isinstance(estimator, BaseGradientBoosting) and estimator.init is None: + method = "recursion" + elif isinstance( + estimator, + (BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor), + ): + method = "recursion" + else: + method = "brute" + + if method == "recursion": + if not isinstance( + estimator, + ( + BaseGradientBoosting, + BaseHistGradientBoosting, + DecisionTreeRegressor, + RandomForestRegressor, + ), + ): + supported_classes_recursion = ( + "GradientBoostingClassifier", + "GradientBoostingRegressor", + "HistGradientBoostingClassifier", + "HistGradientBoostingRegressor", + "HistGradientBoostingRegressor", + "DecisionTreeRegressor", + "RandomForestRegressor", + ) + raise ValueError( + "Only the following estimators support the 'recursion' " + "method: {}. Try using method='brute'.".format( + ", ".join(supported_classes_recursion) + ) + ) + if response_method == "auto": + response_method = "decision_function" + + if response_method != "decision_function": + raise ValueError( + "With the 'recursion' method, the response_method must be " + "'decision_function'. Got {}.".format(response_method) + ) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if _determine_key_type(features, accept_slice=False) == "int": + # _get_column_indices() supports negative indexing. Here, we limit + # the indexing to be positive. 
The upper bound will be checked + # by _get_column_indices() + if np.any(np.less(features, 0)): + raise ValueError("all features must be in [0, {}]".format(X.shape[1] - 1)) + + features_indices = np.asarray( + _get_column_indices(X, features), dtype=np.intp, order="C" + ).ravel() + + feature_names = _check_feature_names(X, feature_names) + + n_features = X.shape[1] + if categorical_features is None: + is_categorical = [False] * len(features_indices) + else: + categorical_features = np.asarray(categorical_features) + if categorical_features.size == 0: + raise ValueError( + "Passing an empty list (`[]`) to `categorical_features` is not " + "supported. Use `None` instead to indicate that there are no " + "categorical features." + ) + if categorical_features.dtype.kind == "b": + # categorical features provided as a list of boolean + if categorical_features.size != n_features: + raise ValueError( + "When `categorical_features` is a boolean array-like, " + "the array should be of shape (n_features,). Got " + f"{categorical_features.size} elements while `X` contains " + f"{n_features} features." + ) + is_categorical = [categorical_features[idx] for idx in features_indices] + elif categorical_features.dtype.kind in ("i", "O", "U"): + # categorical features provided as a list of indices or feature names + categorical_features_idx = [ + _get_feature_index(cat, feature_names=feature_names) + for cat in categorical_features + ] + is_categorical = [ + idx in categorical_features_idx for idx in features_indices + ] + else: + raise ValueError( + "Expected `categorical_features` to be an array-like of boolean," + f" integer, or string. Got {categorical_features.dtype} instead." + ) + + custom_values = custom_values or {} + if isinstance(features, (str, int)): + features = [features] + + for feature_idx, feature, is_cat in zip(features_indices, features, is_categorical): + if is_cat: + continue + + if _safe_indexing(X, feature_idx, axis=1).dtype.kind in "iu": + # TODO(1.9): raise a ValueError instead. + warnings.warn( + f"The column {feature!r} contains integer data. Partial " + "dependence plots are not supported for integer data: this " + "can lead to implicit rounding with NumPy arrays or even errors " + "with newer pandas versions. Please convert numerical features" + "to floating point dtypes ahead of time to avoid problems. " + "This will raise ValueError in scikit-learn 1.9.", + FutureWarning, + ) + # Do not warn again for other features to avoid spamming the caller. + break + + X_subset = _safe_indexing(X, features_indices, axis=1) + + custom_values_for_X_subset = { + index: custom_values.get(feature) + for index, feature in enumerate(features) + if feature in custom_values + } + + grid, values = _grid_from_X( + X_subset, + percentiles, + is_categorical, + grid_resolution, + custom_values_for_X_subset, + ) + + if method == "brute": + averaged_predictions, predictions = _partial_dependence_brute( + estimator, grid, features_indices, X, response_method, sample_weight + ) + + # reshape predictions to + # (n_outputs, n_instances, n_values_feature_0, n_values_feature_1, ...) + predictions = predictions.reshape( + -1, X.shape[0], *[val.shape[0] for val in values] + ) + else: + averaged_predictions = _partial_dependence_recursion( + estimator, grid, features_indices + ) + + # reshape averaged_predictions to + # (n_outputs, n_values_feature_0, n_values_feature_1, ...) 
+ averaged_predictions = averaged_predictions.reshape( + -1, *[val.shape[0] for val in values] + ) + pdp_results = Bunch(grid_values=values) + + if kind == "average": + pdp_results["average"] = averaged_predictions + elif kind == "individual": + pdp_results["individual"] = predictions + else: # kind='both' + pdp_results["average"] = averaged_predictions + pdp_results["individual"] = predictions + + return pdp_results diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_pd_utils.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_pd_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a48ba4d9a4490df59b8503f0b8768c7a986537a9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_pd_utils.py @@ -0,0 +1,68 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +def _check_feature_names(X, feature_names=None): + """Check feature names. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data. + + feature_names : None or array-like of shape (n_names,), dtype=str + Feature names to check or `None`. + + Returns + ------- + feature_names : list of str + Feature names validated. If `feature_names` is `None`, then a list of + feature names is provided, i.e. the column names of a pandas dataframe + or a generic list of feature names (e.g. `["x0", "x1", ...]`) for a + NumPy array. + """ + if feature_names is None: + if hasattr(X, "columns") and hasattr(X.columns, "tolist"): + # get the column names for a pandas dataframe + feature_names = X.columns.tolist() + else: + # define a list of numbered indices for a numpy array + feature_names = [f"x{i}" for i in range(X.shape[1])] + elif hasattr(feature_names, "tolist"): + # convert numpy array or pandas index to a list + feature_names = feature_names.tolist() + if len(set(feature_names)) != len(feature_names): + raise ValueError("feature_names should not contain duplicates.") + + return feature_names + + +def _get_feature_index(fx, feature_names=None): + """Get feature index. + + Parameters + ---------- + fx : int or str + Feature index or name. + + feature_names : list of str, default=None + All feature names from which to search the indices. + + Returns + ------- + idx : int + Feature index. + """ + if isinstance(fx, str): + if feature_names is None: + raise ValueError( + f"Cannot plot partial dependence for feature {fx!r} since " + "the list of feature names was not provided, neither as " + "column names of a pandas data-frame nor via the feature_names " + "parameter." 
+ ) + try: + return feature_names.index(fx) + except ValueError as e: + raise ValueError(f"Feature {fx!r} not in feature_names") from e + return fx diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_permutation_importance.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_permutation_importance.py new file mode 100644 index 0000000000000000000000000000000000000000..451062fbe272e066350b8b5307d23f9180ed6760 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_permutation_importance.py @@ -0,0 +1,313 @@ +"""Permutation importance for estimators.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers + +import numpy as np + +from ..ensemble._bagging import _generate_indices +from ..metrics import check_scoring, get_scorer_names +from ..model_selection._validation import _aggregate_score_dicts +from ..utils import Bunch, _safe_indexing, check_array, check_random_state +from ..utils._param_validation import ( + HasMethods, + Integral, + Interval, + RealNotInt, + StrOptions, + validate_params, +) +from ..utils.parallel import Parallel, delayed + + +def _weights_scorer(scorer, estimator, X, y, sample_weight): + if sample_weight is not None: + return scorer(estimator, X, y, sample_weight=sample_weight) + return scorer(estimator, X, y) + + +def _calculate_permutation_scores( + estimator, + X, + y, + sample_weight, + col_idx, + random_state, + n_repeats, + scorer, + max_samples, +): + """Calculate score when `col_idx` is permuted.""" + random_state = check_random_state(random_state) + + # Work on a copy of X to ensure thread-safety in case of threading based + # parallelism. Furthermore, making a copy is also useful when the joblib + # backend is 'loky' (default) or the old 'multiprocessing': in those cases, + # if X is large it will be automatically be backed by a readonly memory map + # (memmap). X.copy() on the other hand is always guaranteed to return a + # writable data-structure whose columns can be shuffled inplace. + if max_samples < X.shape[0]: + row_indices = _generate_indices( + random_state=random_state, + bootstrap=False, + n_population=X.shape[0], + n_samples=max_samples, + ) + X_permuted = _safe_indexing(X, row_indices, axis=0) + y = _safe_indexing(y, row_indices, axis=0) + if sample_weight is not None: + sample_weight = _safe_indexing(sample_weight, row_indices, axis=0) + else: + X_permuted = X.copy() + + scores = [] + shuffling_idx = np.arange(X_permuted.shape[0]) + for _ in range(n_repeats): + random_state.shuffle(shuffling_idx) + if hasattr(X_permuted, "iloc"): + col = X_permuted.iloc[shuffling_idx, col_idx] + col.index = X_permuted.index + X_permuted[X_permuted.columns[col_idx]] = col + else: + X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx] + scores.append(_weights_scorer(scorer, estimator, X_permuted, y, sample_weight)) + + if isinstance(scores[0], dict): + scores = _aggregate_score_dicts(scores) + else: + scores = np.array(scores) + + return scores + + +def _create_importances_bunch(baseline_score, permuted_score): + """Compute the importances as the decrease in score. + + Parameters + ---------- + baseline_score : ndarray of shape (n_features,) + The baseline score without permutation. + permuted_score : ndarray of shape (n_features, n_repeats) + The permuted scores for the `n` repetitions. + + Returns + ------- + importances : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. 
+ importances_mean : ndarray, shape (n_features, ) + Mean of feature importance over `n_repeats`. + importances_std : ndarray, shape (n_features, ) + Standard deviation over `n_repeats`. + importances : ndarray, shape (n_features, n_repeats) + Raw permutation importance scores. + """ + importances = baseline_score - permuted_score + return Bunch( + importances_mean=np.mean(importances, axis=1), + importances_std=np.std(importances, axis=1), + importances=importances, + ) + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like"], + "y": ["array-like", None], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "n_repeats": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "sample_weight": ["array-like", None], + "max_samples": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="right"), + ], + }, + prefer_skip_nested_validation=True, +) +def permutation_importance( + estimator, + X, + y, + *, + scoring=None, + n_repeats=5, + n_jobs=None, + random_state=None, + sample_weight=None, + max_samples=1.0, +): + """Permutation importance for feature evaluation [BRE]_. + + The :term:`estimator` is required to be a fitted estimator. `X` can be the + data set used to train the estimator or a hold-out set. The permutation + importance of a feature is calculated as follows. First, a baseline metric, + defined by :term:`scoring`, is evaluated on a (potentially different) + dataset defined by the `X`. Next, a feature column from the validation set + is permuted and the metric is evaluated again. The permutation importance + is defined to be the difference between the baseline metric and metric from + permutating the feature column. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object + An estimator that has already been :term:`fitted` and is compatible + with :term:`scorer`. + + X : ndarray or DataFrame, shape (n_samples, n_features) + Data on which permutation importance will be computed. + + y : array-like or None, shape (n_samples, ) or (n_samples, n_classes) + Targets for supervised or `None` for unsupervised. + + scoring : str, callable, list, tuple, or dict, default=None + Scorer to use. + If `scoring` represents a single score, one can use: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables a values. + + Passing multiple scores to `scoring` is more efficient than calling + `permutation_importance` for each of the scores as it reuses + predictions to avoid redundant computation. + + n_repeats : int, default=5 + Number of times to permute a feature. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. The computation is done by computing + permutation score for each columns and parallelized over the columns. + `None` means 1 unless in a :obj:`joblib.parallel_backend` context. + `-1` means using all processors. See :term:`Glossary ` + for more details. 
+ + random_state : int, RandomState instance, default=None + Pseudo-random number generator to control the permutations of each + feature. + Pass an int to get reproducible results across function calls. + See :term:`Glossary `. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights used in scoring. + + .. versionadded:: 0.24 + + max_samples : int or float, default=1.0 + The number of samples to draw from X to compute feature importance + in each repeat (without replacement). + + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If `max_samples` is equal to `1.0` or `X.shape[0]`, all samples + will be used. + + While using this option may provide less accurate importance estimates, + it keeps the method tractable when evaluating feature importance on + large datasets. In combination with `n_repeats`, this allows to control + the computational speed vs statistical accuracy trade-off of this method. + + .. versionadded:: 1.0 + + Returns + ------- + result : :class:`~sklearn.utils.Bunch` or dict of such instances + Dictionary-like object, with the following attributes. + + importances_mean : ndarray of shape (n_features, ) + Mean of feature importance over `n_repeats`. + importances_std : ndarray of shape (n_features, ) + Standard deviation over `n_repeats`. + importances : ndarray of shape (n_features, n_repeats) + Raw permutation importance scores. + + If there are multiple scoring metrics in the scoring parameter + `result` is a dict with scorer names as keys (e.g. 'roc_auc') and + `Bunch` objects like above as values. + + References + ---------- + .. [BRE] :doi:`L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, + 2001. <10.1023/A:1010933404324>` + + Examples + -------- + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.inspection import permutation_importance + >>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9], + ... [0, 9, 9],[0, 9, 9],[0, 9, 9]] + >>> y = [1, 1, 1, 0, 0, 0] + >>> clf = LogisticRegression().fit(X, y) + >>> result = permutation_importance(clf, X, y, n_repeats=10, + ... random_state=0) + >>> result.importances_mean + array([0.4666, 0. , 0. ]) + >>> result.importances_std + array([0.2211, 0. , 0. ]) + """ + if not hasattr(X, "iloc"): + X = check_array(X, ensure_all_finite="allow-nan", dtype=None) + + # Precompute random seed from the random state to be used + # to get a fresh independent RandomState instance for each + # parallel call to _calculate_permutation_scores, irrespective of + # the fact that variables are shared or not depending on the active + # joblib backend (sequential, thread-based or process-based). 
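As an editorial aside to the seeding comment above (each parallel per-column task receives an independent seed derived from `random_state`), here is a minimal usage sketch of the public `permutation_importance` function defined in this vendored module. It is not part of the file; the synthetic dataset, estimator, and parameter values are illustrative assumptions. It exercises the multi-metric `scoring` form documented above, where the return value becomes a dict of `Bunch` objects keyed by scorer name.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Assumed toy problem: a small binary classification task.
X, y = make_classification(n_samples=300, n_features=5, random_state=0)
clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

# Several scorers reuse the same permutations, so this is cheaper than calling
# permutation_importance once per metric; columns are permuted in parallel (n_jobs).
result = permutation_importance(
    clf, X, y,
    scoring=["accuracy", "roc_auc"],
    n_repeats=10,
    n_jobs=2,
    random_state=0,
)
for name, bunch in result.items():
    print(name, bunch.importances_mean.round(3))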
+ random_state = check_random_state(random_state) + random_seed = random_state.randint(np.iinfo(np.int32).max + 1) + + if not isinstance(max_samples, numbers.Integral): + max_samples = int(max_samples * X.shape[0]) + elif max_samples > X.shape[0]: + raise ValueError("max_samples must be <= n_samples") + + scorer = check_scoring(estimator, scoring=scoring) + baseline_score = _weights_scorer(scorer, estimator, X, y, sample_weight) + + scores = Parallel(n_jobs=n_jobs)( + delayed(_calculate_permutation_scores)( + estimator, + X, + y, + sample_weight, + col_idx, + random_seed, + n_repeats, + scorer, + max_samples, + ) + for col_idx in range(X.shape[1]) + ) + + if isinstance(baseline_score, dict): + return { + name: _create_importances_bunch( + baseline_score[name], + # unpack the permuted scores + np.array([scores[col_idx][name] for col_idx in range(X.shape[1])]), + ) + for name in baseline_score + } + else: + return _create_importances_bunch(baseline_score, np.array(scores)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67dd18fb94b593f0a3125c1f5833f3b9597614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/__init__.py @@ -0,0 +1,2 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/decision_boundary.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/decision_boundary.py new file mode 100644 index 0000000000000000000000000000000000000000..2ef85380583937f564891e8705b7ac91eff0f321 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/decision_boundary.py @@ -0,0 +1,564 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ...base import is_regressor +from ...preprocessing import LabelEncoder +from ...utils import _safe_indexing +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._response import _get_response_values +from ...utils._set_output import _get_adapter_from_container +from ...utils.validation import ( + _is_arraylike_not_scalar, + _is_pandas_df, + _is_polars_df, + _num_features, + check_is_fitted, +) + + +def _check_boundary_response_method(estimator, response_method, class_of_interest): + """Validate the response methods to be used with the fitted estimator. + + Parameters + ---------- + estimator : object + Fitted estimator to check. + + response_method : {'auto', 'decision_function', 'predict_proba', 'predict'} + Specifies whether to use :term:`decision_function`, :term:`predict_proba`, + :term:`predict` as the target response. If set to 'auto', the response method is + tried in the before mentioned order. + + class_of_interest : int, float, bool, str or None + The class considered when plotting the decision. Cannot be None if + multiclass and `response_method` is 'predict_proba' or 'decision_function'. + + .. versionadded:: 1.4 + + Returns + ------- + prediction_method : list of str or str + The name or list of names of the response methods to use. 
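For orientation, a tiny editorial illustration of what the private helper documented above returns; it is not part of the vendored file, the two estimators are arbitrary assumptions, and since `_check_boundary_response_method` is a private API it may change without notice.

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.inspection._plot.decision_boundary import _check_boundary_response_method

# Regressors resolve 'auto' directly to 'predict'.
reg = LinearRegression().fit([[0.0], [1.0]], [0.0, 1.0])
print(_check_boundary_response_method(reg, "auto", None))   # 'predict'

# Classifiers keep the full preference order; the caller later picks the first
# method the estimator actually implements.
clf = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
print(_check_boundary_response_method(clf, "auto", None))
# ['decision_function', 'predict_proba', 'predict']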
+ """ + has_classes = hasattr(estimator, "classes_") + if has_classes and _is_arraylike_not_scalar(estimator.classes_[0]): + msg = "Multi-label and multi-output multi-class classifiers are not supported" + raise ValueError(msg) + + if response_method == "auto": + if is_regressor(estimator): + prediction_method = "predict" + else: + prediction_method = ["decision_function", "predict_proba", "predict"] + else: + prediction_method = response_method + + return prediction_method + + +class DecisionBoundaryDisplay: + """Decisions boundary visualization. + + It is recommended to use + :func:`~sklearn.inspection.DecisionBoundaryDisplay.from_estimator` + to create a :class:`DecisionBoundaryDisplay`. All parameters are stored as + attributes. + + Read more in the :ref:`User Guide `. + + For a detailed example comparing the decision boundaries of multinomial and + one-vs-rest logistic regression, please see + :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_multinomial.py`. + + .. versionadded:: 1.1 + + Parameters + ---------- + xx0 : ndarray of shape (grid_resolution, grid_resolution) + First output of :func:`meshgrid `. + + xx1 : ndarray of shape (grid_resolution, grid_resolution) + Second output of :func:`meshgrid `. + + response : ndarray of shape (grid_resolution, grid_resolution) or \ + (grid_resolution, grid_resolution, n_classes) + Values of the response function. + + multiclass_colors : list of str or str, default=None + Specifies how to color each class when plotting all classes of multiclass + problem. Ignored for binary problems and multiclass problems when plotting a + single prediction value per point. + Possible inputs are: + + * list: list of Matplotlib + `color `_ + strings, of length `n_classes` + * str: name of :class:`matplotlib.colors.Colormap` + * None: 'viridis' colormap is used to sample colors + + Single color colormaps will be generated from the colors in the list or + colors taken from the colormap and passed to the `cmap` parameter of + the `plot_method`. + + .. versionadded:: 1.7 + + xlabel : str, default=None + Default label to place on x axis. + + ylabel : str, default=None + Default label to place on y axis. + + Attributes + ---------- + surface_ : matplotlib `QuadContourSet` or `QuadMesh` or list of such objects + If `plot_method` is 'contour' or 'contourf', `surface_` is + :class:`QuadContourSet `. If + `plot_method` is 'pcolormesh', `surface_` is + :class:`QuadMesh `. + + multiclass_colors_ : array of shape (n_classes, 4) + Colors used to plot each class in multiclass problems. + Only defined when `color_of_interest` is None. + + .. versionadded:: 1.7 + + ax_ : matplotlib Axes + Axes with decision boundary. + + figure_ : matplotlib Figure + Figure containing the decision boundary. + + See Also + -------- + DecisionBoundaryDisplay.from_estimator : Plot decision boundary given an estimator. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> from sklearn.inspection import DecisionBoundaryDisplay + >>> from sklearn.tree import DecisionTreeClassifier + >>> iris = load_iris() + >>> feature_1, feature_2 = np.meshgrid( + ... np.linspace(iris.data[:, 0].min(), iris.data[:, 0].max()), + ... np.linspace(iris.data[:, 1].min(), iris.data[:, 1].max()) + ... 
) + >>> grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T + >>> tree = DecisionTreeClassifier().fit(iris.data[:, :2], iris.target) + >>> y_pred = np.reshape(tree.predict(grid), feature_1.shape) + >>> display = DecisionBoundaryDisplay( + ... xx0=feature_1, xx1=feature_2, response=y_pred + ... ) + >>> display.plot() + <...> + >>> display.ax_.scatter( + ... iris.data[:, 0], iris.data[:, 1], c=iris.target, edgecolor="black" + ... ) + <...> + >>> plt.show() + """ + + def __init__( + self, *, xx0, xx1, response, multiclass_colors=None, xlabel=None, ylabel=None + ): + self.xx0 = xx0 + self.xx1 = xx1 + self.response = response + self.multiclass_colors = multiclass_colors + self.xlabel = xlabel + self.ylabel = ylabel + + def plot(self, plot_method="contourf", ax=None, xlabel=None, ylabel=None, **kwargs): + """Plot visualization. + + Parameters + ---------- + plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' + Plotting method to call when plotting the response. Please refer + to the following matplotlib documentation for details: + :func:`contourf `, + :func:`contour `, + :func:`pcolormesh `. + + ax : Matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + xlabel : str, default=None + Overwrite the x-axis label. + + ylabel : str, default=None + Overwrite the y-axis label. + + **kwargs : dict + Additional keyword arguments to be passed to the `plot_method`. + + Returns + ------- + display: :class:`~sklearn.inspection.DecisionBoundaryDisplay` + Object that stores computed values. + """ + check_matplotlib_support("DecisionBoundaryDisplay.plot") + import matplotlib as mpl + import matplotlib.pyplot as plt + + if plot_method not in ("contourf", "contour", "pcolormesh"): + raise ValueError( + "plot_method must be 'contourf', 'contour', or 'pcolormesh'. " + f"Got {plot_method} instead." + ) + + if ax is None: + _, ax = plt.subplots() + + plot_func = getattr(ax, plot_method) + if self.response.ndim == 2: + self.surface_ = plot_func(self.xx0, self.xx1, self.response, **kwargs) + else: # self.response.ndim == 3 + n_responses = self.response.shape[-1] + for kwarg in ("cmap", "colors"): + if kwarg in kwargs: + warnings.warn( + f"'{kwarg}' is ignored in favor of 'multiclass_colors' " + "in the multiclass case when the response method is " + "'decision_function' or 'predict_proba'." + ) + del kwargs[kwarg] + + if self.multiclass_colors is None or isinstance( + self.multiclass_colors, str + ): + if self.multiclass_colors is None: + cmap = "tab10" if n_responses <= 10 else "gist_rainbow" + else: + cmap = self.multiclass_colors + + # Special case for the tab10 and tab20 colormaps that encode a + # discrete set of colors that are easily distinguishable + # contrary to other colormaps that are continuous. 
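As an editorial aside to the multiclass color handling above, a minimal usage sketch of the path where every class gets its own single-color colormap. It is not part of the vendored file; the iris data, estimator, and the three colors passed to `multiclass_colors` are illustrative assumptions.

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay

iris = load_iris()
X = iris.data[:, :2]                      # from_estimator expects exactly 2 features
clf = LogisticRegression(max_iter=1000).fit(X, iris.target)

disp = DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    response_method="predict_proba",      # 3 classes -> one shaded surface per class
    multiclass_colors=["tab:red", "tab:green", "tab:blue"],
    alpha=0.5,
    xlabel=iris.feature_names[0],
    ylabel=iris.feature_names[1],
)
disp.ax_.scatter(X[:, 0], X[:, 1], c=iris.target, edgecolor="k")
plt.show()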
+ if cmap == "tab10" and n_responses <= 10: + colors = plt.get_cmap("tab10", 10).colors[:n_responses] + elif cmap == "tab20" and n_responses <= 20: + colors = plt.get_cmap("tab20", 20).colors[:n_responses] + else: + cmap = plt.get_cmap(cmap, n_responses) + if not hasattr(cmap, "colors"): + # For LinearSegmentedColormap + colors = cmap(np.linspace(0, 1, n_responses)) + else: + colors = cmap.colors + elif isinstance(self.multiclass_colors, list): + colors = [mpl.colors.to_rgba(color) for color in self.multiclass_colors] + else: + raise ValueError("'multiclass_colors' must be a list or a str.") + + self.multiclass_colors_ = colors + if plot_method == "contour": + # Plot only argmax map for contour + class_map = self.response.argmax(axis=2) + self.surface_ = plot_func( + self.xx0, self.xx1, class_map, colors=colors, **kwargs + ) + else: + multiclass_cmaps = [ + mpl.colors.LinearSegmentedColormap.from_list( + f"colormap_{class_idx}", [(1.0, 1.0, 1.0, 1.0), (r, g, b, 1.0)] + ) + for class_idx, (r, g, b, _) in enumerate(colors) + ] + + self.surface_ = [] + for class_idx, cmap in enumerate(multiclass_cmaps): + response = np.ma.array( + self.response[:, :, class_idx], + mask=~(self.response.argmax(axis=2) == class_idx), + ) + self.surface_.append( + plot_func(self.xx0, self.xx1, response, cmap=cmap, **kwargs) + ) + + if xlabel is not None or not ax.get_xlabel(): + xlabel = self.xlabel if xlabel is None else xlabel + ax.set_xlabel(xlabel) + if ylabel is not None or not ax.get_ylabel(): + ylabel = self.ylabel if ylabel is None else ylabel + ax.set_ylabel(ylabel) + + self.ax_ = ax + self.figure_ = ax.figure + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + *, + grid_resolution=100, + eps=1.0, + plot_method="contourf", + response_method="auto", + class_of_interest=None, + multiclass_colors=None, + xlabel=None, + ylabel=None, + ax=None, + **kwargs, + ): + """Plot decision boundary given an estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object + Trained estimator used to plot the decision boundary. + + X : {array-like, sparse matrix, dataframe} of shape (n_samples, 2) + Input data that should be only 2-dimensional. + + grid_resolution : int, default=100 + Number of grid points to use for plotting decision boundary. + Higher values will make the plot look nicer but be slower to + render. + + eps : float, default=1.0 + Extends the minimum and maximum values of X for evaluating the + response function. + + plot_method : {'contourf', 'contour', 'pcolormesh'}, default='contourf' + Plotting method to call when plotting the response. Please refer + to the following matplotlib documentation for details: + :func:`contourf `, + :func:`contour `, + :func:`pcolormesh `. + + response_method : {'auto', 'decision_function', 'predict_proba', \ + 'predict'}, default='auto' + Specifies whether to use :term:`decision_function`, + :term:`predict_proba` or :term:`predict` as the target response. + If set to 'auto', the response method is tried in the order as + listed above. + + .. versionchanged:: 1.6 + For multiclass problems, 'auto' no longer defaults to 'predict'. + + class_of_interest : int, float, bool or str, default=None + The class to be plotted when `response_method` is 'predict_proba' + or 'decision_function'. If None, `estimator.classes_[1]` is considered + the positive class for binary classifiers. 
For multiclass + classifiers, if None, all classes will be represented in the + decision boundary plot; the class with the highest response value + at each point is plotted. The color of each class can be set via + `multiclass_colors`. + + .. versionadded:: 1.4 + + multiclass_colors : list of str, or str, default=None + Specifies how to color each class when plotting multiclass + 'predict_proba' or 'decision_function' and `class_of_interest` is + None. Ignored in all other cases. + + Possible inputs are: + + * list: list of Matplotlib + `color `_ + strings, of length `n_classes` + * str: name of :class:`matplotlib.colors.Colormap` + * None: 'tab10' colormap is used to sample colors if the number of + classes is less than or equal to 10, otherwise 'gist_rainbow' + colormap. + + Single color colormaps will be generated from the colors in the list or + colors taken from the colormap, and passed to the `cmap` parameter of + the `plot_method`. + + .. versionadded:: 1.7 + + xlabel : str, default=None + The label used for the x-axis. If `None`, an attempt is made to + extract a label from `X` if it is a dataframe, otherwise an empty + string is used. + + ylabel : str, default=None + The label used for the y-axis. If `None`, an attempt is made to + extract a label from `X` if it is a dataframe, otherwise an empty + string is used. + + ax : Matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Additional keyword arguments to be passed to the + `plot_method`. + + Returns + ------- + display : :class:`~sklearn.inspection.DecisionBoundaryDisplay` + Object that stores the result. + + See Also + -------- + DecisionBoundaryDisplay : Decision boundary visualization. + sklearn.metrics.ConfusionMatrixDisplay.from_estimator : Plot the + confusion matrix given an estimator, the data, and the label. + sklearn.metrics.ConfusionMatrixDisplay.from_predictions : Plot the + confusion matrix given the true and predicted labels. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.inspection import DecisionBoundaryDisplay + >>> iris = load_iris() + >>> X = iris.data[:, :2] + >>> classifier = LogisticRegression().fit(X, iris.target) + >>> disp = DecisionBoundaryDisplay.from_estimator( + ... classifier, X, response_method="predict", + ... xlabel=iris.feature_names[0], ylabel=iris.feature_names[1], + ... alpha=0.5, + ... ) + >>> disp.ax_.scatter(X[:, 0], X[:, 1], c=iris.target, edgecolor="k") + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + check_is_fitted(estimator) + import matplotlib as mpl + + if not grid_resolution > 1: + raise ValueError( + "grid_resolution must be greater than 1. Got" + f" {grid_resolution} instead." + ) + + if not eps >= 0: + raise ValueError( + f"eps must be greater than or equal to 0. Got {eps} instead." + ) + + possible_plot_methods = ("contourf", "contour", "pcolormesh") + if plot_method not in possible_plot_methods: + available_methods = ", ".join(possible_plot_methods) + raise ValueError( + f"plot_method must be one of {available_methods}. " + f"Got {plot_method} instead." + ) + + num_features = _num_features(X) + if num_features != 2: + raise ValueError( + f"n_features must be equal to 2. Got {num_features} instead." 
+ ) + + if ( + response_method in ("predict_proba", "decision_function", "auto") + and multiclass_colors is not None + and hasattr(estimator, "classes_") + and (n_classes := len(estimator.classes_)) > 2 + ): + if isinstance(multiclass_colors, list): + if len(multiclass_colors) != n_classes: + raise ValueError( + "When 'multiclass_colors' is a list, it must be of the same " + f"length as 'estimator.classes_' ({n_classes}), got: " + f"{len(multiclass_colors)}." + ) + elif any( + not mpl.colors.is_color_like(col) for col in multiclass_colors + ): + raise ValueError( + "When 'multiclass_colors' is a list, it can only contain valid" + f" Matplotlib color names. Got: {multiclass_colors}" + ) + if isinstance(multiclass_colors, str): + if multiclass_colors not in mpl.pyplot.colormaps(): + raise ValueError( + "When 'multiclass_colors' is a string, it must be a valid " + f"Matplotlib colormap. Got: {multiclass_colors}" + ) + + x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1) + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + xx0, xx1 = np.meshgrid( + np.linspace(x0_min, x0_max, grid_resolution), + np.linspace(x1_min, x1_max, grid_resolution), + ) + + X_grid = np.c_[xx0.ravel(), xx1.ravel()] + if _is_pandas_df(X) or _is_polars_df(X): + adapter = _get_adapter_from_container(X) + X_grid = adapter.create_container( + X_grid, + X_grid, + columns=X.columns, + ) + + prediction_method = _check_boundary_response_method( + estimator, response_method, class_of_interest + ) + try: + response, _, response_method_used = _get_response_values( + estimator, + X_grid, + response_method=prediction_method, + pos_label=class_of_interest, + return_response_method_used=True, + ) + except ValueError as exc: + if "is not a valid label" in str(exc): + # re-raise a more informative error message since `pos_label` is unknown + # to our user when interacting with + # `DecisionBoundaryDisplay.from_estimator` + raise ValueError( + f"class_of_interest={class_of_interest} is not a valid label: It " + f"should be one of {estimator.classes_}" + ) from exc + raise + + # convert classes predictions into integers + if response_method_used == "predict" and hasattr(estimator, "classes_"): + encoder = LabelEncoder() + encoder.classes_ = estimator.classes_ + response = encoder.transform(response) + + if response.ndim == 1: + response = response.reshape(*xx0.shape) + else: + if is_regressor(estimator): + raise ValueError("Multi-output regressors are not supported") + + if class_of_interest is not None: + # For the multiclass case, `_get_response_values` returns the response + # as-is. Thus, we have a column per class and we need to select the + # column corresponding to the positive class. 
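The block below selects the response column matching `class_of_interest`; for reference, a short editorial sketch (not part of the vendored file; iris-style data and parameter values are assumptions) of plotting a single class's probability surface. Passing a label that is not in `estimator.classes_` would trigger the more informative ValueError re-raised in the except-block above.

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay

X, y = load_iris(return_X_y=True)
X = X[:, :2]
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Only the response for class 2 is drawn on the grid.
disp = DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    response_method="predict_proba",
    class_of_interest=2,
    plot_method="pcolormesh",
    cmap="Blues",
)
plt.colorbar(disp.surface_, ax=disp.ax_)
plt.show()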
+ col_idx = np.flatnonzero(estimator.classes_ == class_of_interest)[0] + response = response[:, col_idx].reshape(*xx0.shape) + else: + response = response.reshape(*xx0.shape, response.shape[-1]) + + if xlabel is None: + xlabel = X.columns[0] if hasattr(X, "columns") else "" + + if ylabel is None: + ylabel = X.columns[1] if hasattr(X, "columns") else "" + + display = cls( + xx0=xx0, + xx1=xx1, + response=response, + multiclass_colors=multiclass_colors, + xlabel=xlabel, + ylabel=ylabel, + ) + return display.plot(ax=ax, plot_method=plot_method, **kwargs) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..b31a5070b236b811195f97b6643be7b4c191343e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/partial_dependence.py @@ -0,0 +1,1495 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +from itertools import chain +from math import ceil + +import numpy as np +from scipy import sparse +from scipy.stats.mstats import mquantiles + +from ...base import is_regressor +from ...utils import ( + Bunch, + _safe_indexing, + check_array, + check_random_state, +) +from ...utils._encode import _unique +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._plotting import _validate_style_kwargs +from ...utils.parallel import Parallel, delayed +from .. import partial_dependence +from .._pd_utils import _check_feature_names, _get_feature_index + + +class PartialDependenceDisplay: + """Partial Dependence Plot (PDP) and Individual Conditional Expectation (ICE). + + It is recommended to use + :func:`~sklearn.inspection.PartialDependenceDisplay.from_estimator` to create a + :class:`~sklearn.inspection.PartialDependenceDisplay`. All parameters are stored + as attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Inspection Guide `. + + For an example on how to use this class, see the following example: + :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py`. + + .. versionadded:: 0.22 + + Parameters + ---------- + pd_results : list of Bunch + Results of :func:`~sklearn.inspection.partial_dependence` for + ``features``. + + features : list of (int,) or list of (int, int) + Indices of features for a given plot. A tuple of one integer will plot + a partial dependence curve of one feature. A tuple of two integers will + plot a two-way partial dependence curve as a contour plot. + + feature_names : list of str + Feature names corresponding to the indices in ``features``. + + target_idx : int + + - In a multiclass setting, specifies the class for which the PDPs + should be computed. Note that for binary classification, the + positive class (index 1) is always used. + - In a multioutput setting, specifies the task for which the PDPs + should be computed. + + Ignored in binary classification or classical regression settings. + + deciles : dict + Deciles for feature indices in ``features``. + + kind : {'average', 'individual', 'both'} or list of such str, \ + default='average' + Whether to plot the partial dependence averaged across all the samples + in the dataset or one line per sample or both. 
+ + - ``kind='average'`` results in the traditional PD plot; + - ``kind='individual'`` results in the ICE plot; + - ``kind='both'`` results in plotting both the ICE and PD on the same + plot. + + A list of such strings can be provided to specify `kind` on a per-plot + basis. The length of the list should be the same as the number of + interaction requested in `features`. + + .. note:: + ICE ('individual' or 'both') is not a valid option for 2-ways + interactions plot. As a result, an error will be raised. + 2-ways interaction plots should always be configured to + use the 'average' kind instead. + + .. note:: + The fast ``method='recursion'`` option is only available for + `kind='average'` and `sample_weights=None`. Computing individual + dependencies and doing weighted averages requires using the slower + `method='brute'`. + + .. versionadded:: 0.24 + Add `kind` parameter with `'average'`, `'individual'`, and `'both'` + options. + + .. versionadded:: 1.1 + Add the possibility to pass a list of string specifying `kind` + for each plot. + + subsample : float, int or None, default=1000 + Sampling for ICE curves when `kind` is 'individual' or 'both'. + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to be used to plot ICE curves. If int, represents the + maximum absolute number of samples to use. + + Note that the full dataset is still used to calculate partial + dependence when `kind='both'`. + + .. versionadded:: 0.24 + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the selected samples when subsamples is not + `None`. See :term:`Glossary ` for details. + + .. versionadded:: 0.24 + + is_categorical : list of (bool,) or list of (bool, bool), default=None + Whether each target feature in `features` is categorical or not. + The list should be same size as `features`. If `None`, all features + are assumed to be continuous. + + .. versionadded:: 1.2 + + Attributes + ---------- + bounding_ax_ : matplotlib Axes or None + If `ax` is an axes or None, the `bounding_ax_` is the axes where the + grid of partial dependence plots are drawn. If `ax` is a list of axes + or a numpy array of axes, `bounding_ax_` is None. + + axes_ : ndarray of matplotlib Axes + If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row + and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item + in `ax`. Elements that are None correspond to a nonexisting axes in + that position. + + lines_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `lines_[i, j]` is the partial dependence + curve on the i-th row and j-th column. If `ax` is a list of axes, + `lines_[i]` is the partial dependence curve corresponding to the i-th + item in `ax`. Elements that are None correspond to a nonexisting axes + or an axes that does not include a line plot. + + deciles_vlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the x axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a PDP plot. + + .. versionadded:: 0.23 + + deciles_hlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the y axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. 
Elements that are None correspond to a nonexisting axes or an + axes that does not include a 2-way plot. + + .. versionadded:: 0.23 + + contours_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `contours_[i, j]` is the partial dependence + plot on the i-th row and j-th column. If `ax` is a list of axes, + `contours_[i]` is the partial dependence plot corresponding to the i-th + item in `ax`. Elements that are None correspond to a nonexisting axes + or an axes that does not include a contour plot. + + bars_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `bars_[i, j]` is the partial dependence bar + plot on the i-th row and j-th column (for a categorical feature). + If `ax` is a list of axes, `bars_[i]` is the partial dependence bar + plot corresponding to the i-th item in `ax`. Elements that are None + correspond to a nonexisting axes or an axes that does not include a + bar plot. + + .. versionadded:: 1.2 + + heatmaps_ : ndarray of matplotlib Artists + If `ax` is an axes or None, `heatmaps_[i, j]` is the partial dependence + heatmap on the i-th row and j-th column (for a pair of categorical + features) . If `ax` is a list of axes, `heatmaps_[i]` is the partial + dependence heatmap corresponding to the i-th item in `ax`. Elements + that are None correspond to a nonexisting axes or an axes that does not + include a heatmap. + + .. versionadded:: 1.2 + + figure_ : matplotlib Figure + Figure containing partial dependence plots. + + See Also + -------- + partial_dependence : Compute Partial Dependence values. + PartialDependenceDisplay.from_estimator : Plot Partial Dependence. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.inspection import PartialDependenceDisplay + >>> from sklearn.inspection import partial_dependence + >>> X, y = make_friedman1() + >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) + >>> features, feature_names = [(0,)], [f"Features #{i}" for i in range(X.shape[1])] + >>> deciles = {0: np.linspace(0, 1, num=5)} + >>> pd_results = partial_dependence( + ... clf, X, features=0, kind="average", grid_resolution=5) + >>> display = PartialDependenceDisplay( + ... [pd_results], features=features, feature_names=feature_names, + ... target_idx=0, deciles=deciles + ... ) + >>> display.plot(pdp_lim={1: (-1.38, 0.66)}) + <...> + >>> plt.show() + """ + + def __init__( + self, + pd_results, + *, + features, + feature_names, + target_idx, + deciles, + kind="average", + subsample=1000, + random_state=None, + is_categorical=None, + ): + self.pd_results = pd_results + self.features = features + self.feature_names = feature_names + self.target_idx = target_idx + self.deciles = deciles + self.kind = kind + self.subsample = subsample + self.random_state = random_state + self.is_categorical = is_categorical + + @classmethod + def from_estimator( + cls, + estimator, + X, + features, + *, + sample_weight=None, + categorical_features=None, + feature_names=None, + target=None, + response_method="auto", + n_cols=3, + grid_resolution=100, + percentiles=(0.05, 0.95), + custom_values=None, + method="auto", + n_jobs=None, + verbose=0, + line_kw=None, + ice_lines_kw=None, + pd_line_kw=None, + contour_kw=None, + ax=None, + kind="average", + centered=False, + subsample=1000, + random_state=None, + ): + """Partial dependence (PD) and individual conditional expectation (ICE) plots. 
+ + Partial dependence plots, individual conditional expectation plots, or an + overlay of both can be plotted by setting the `kind` parameter. + This method generates one plot for each entry in `features`. The plots + are arranged in a grid with `n_cols` columns. For one-way partial + dependence plots, the deciles of the feature values are shown on the + x-axis. For two-way plots, the deciles are shown on both axes and PDPs + are contour plots. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Inspection Guide `. + + For an example on how to use this class method, see + :ref:`sphx_glr_auto_examples_inspection_plot_partial_dependence.py`. + + .. note:: + + :func:`PartialDependenceDisplay.from_estimator` does not support using the + same axes with multiple calls. To plot the partial dependence for + multiple estimators, please pass the axes created by the first call to the + second call:: + + >>> from sklearn.inspection import PartialDependenceDisplay + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.linear_model import LinearRegression + >>> from sklearn.ensemble import RandomForestRegressor + >>> X, y = make_friedman1() + >>> est1 = LinearRegression().fit(X, y) + >>> est2 = RandomForestRegressor().fit(X, y) + >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X, + ... [1, 2]) + >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2], + ... ax=disp1.axes_) + + .. warning:: + + For :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, the + `'recursion'` method (used by default) will not account for the `init` + predictor of the boosting process. In practice, this will produce + the same values as `'brute'` up to a constant offset in the target + response, provided that `init` is a constant estimator (which is the + default). However, if `init` is not a constant estimator, the + partial dependence values are incorrect for `'recursion'` because the + offset will be sample-dependent. It is preferable to use the `'brute'` + method. Note that this only applies to + :class:`~sklearn.ensemble.GradientBoostingClassifier` and + :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to + :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : BaseEstimator + A fitted estimator object implementing :term:`predict`, + :term:`predict_proba`, or :term:`decision_function`. + Multioutput-multiclass classifiers are not supported. + + X : {array-like, dataframe} of shape (n_samples, n_features) + ``X`` is used to generate a grid of values for the target + ``features`` (where the partial dependence will be evaluated), and + also to generate values for the complement features when the + `method` is `'brute'`. + + features : list of {int, str, pair of int, pair of str} + The target features for which to create the PDPs. + If `features[i]` is an integer or a string, a one-way PDP is created; + if `features[i]` is a tuple, a two-way PDP is created (only supported + with `kind='average'`). Each tuple must be of size 2. + If any entry is a string, then it must be in ``feature_names``. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights are used to calculate weighted means when averaging the + model output. 
If `None`, then samples are equally weighted. If + `sample_weight` is not `None`, then `method` will be set to `'brute'`. + Note that `sample_weight` is ignored for `kind='individual'`. + + .. versionadded:: 1.3 + + categorical_features : array-like of shape (n_features,) or shape \ + (n_categorical_features,), dtype={bool, int, str}, default=None + Indicates the categorical features. + + - `None`: no feature will be considered categorical; + - boolean array-like: boolean mask of shape `(n_features,)` + indicating which features are categorical. Thus, this array has + the same shape has `X.shape[1]`; + - integer or string array-like: integer indices or strings + indicating categorical features. + + .. versionadded:: 1.2 + + feature_names : array-like of shape (n_features,), dtype=str, default=None + Name of each feature; `feature_names[i]` holds the name of the feature + with index `i`. + By default, the name of the feature corresponds to their numerical + index for NumPy array and their column name for pandas dataframe. + + target : int, default=None + - In a multiclass setting, specifies the class for which the PDPs + should be computed. Note that for binary classification, the + positive class (index 1) is always used. + - In a multioutput setting, specifies the task for which the PDPs + should be computed. + + Ignored in binary classification or classical regression settings. + + response_method : {'auto', 'predict_proba', 'decision_function'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. For regressors + this parameter is ignored and the response is always the output of + :term:`predict`. By default, :term:`predict_proba` is tried first + and we revert to :term:`decision_function` if it doesn't exist. If + ``method`` is `'recursion'`, the response is always the output of + :term:`decision_function`. + + n_cols : int, default=3 + The maximum number of columns in the grid plot. Only active when `ax` + is a single axis or `None`. + + grid_resolution : int, default=100 + The number of equally spaced points on the axes of the plots, for each + target feature. + This parameter is overridden by `custom_values` if that parameter is set. + + percentiles : tuple of float, default=(0.05, 0.95) + The lower and upper percentile used to create the extreme values + for the PDP axes. Must be in [0, 1]. + This parameter is overridden by `custom_values` if that parameter is set. + + custom_values : dict + A dictionary mapping the index of an element of `features` to an + array of values where the partial dependence should be calculated + for that feature. Setting a range of values for a feature overrides + `grid_resolution` and `percentiles`. + + .. versionadded:: 1.7 + + method : str, default='auto' + The method used to calculate the averaged predictions: + + - `'recursion'` is only supported for some tree-based estimators + (namely + :class:`~sklearn.ensemble.GradientBoostingClassifier`, + :class:`~sklearn.ensemble.GradientBoostingRegressor`, + :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, + :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, + :class:`~sklearn.tree.DecisionTreeRegressor`, + :class:`~sklearn.ensemble.RandomForestRegressor` + but is more efficient in terms of speed. + With this method, the target response of a + classifier is always the decision function, not the predicted + probabilities. 
Since the `'recursion'` method implicitly computes + the average of the ICEs by design, it is not compatible with ICE and + thus `kind` must be `'average'`. + + - `'brute'` is supported for any estimator, but is more + computationally intensive. + + - `'auto'`: the `'recursion'` is used for estimators that support it, + and `'brute'` is used otherwise. If `sample_weight` is not `None`, + then `'brute'` is used regardless of the estimator. + + Please see :ref:`this note ` for + differences between the `'brute'` and `'recursion'` method. + + n_jobs : int, default=None + The number of CPUs to use to compute the partial dependences. + Computation is parallelized over features specified by the `features` + parameter. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + Verbose output during PD computations. + + line_kw : dict, default=None + Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. + For one-way partial dependence plots. It can be used to define common + properties for both `ice_lines_kw` and `pdp_line_kw`. + + ice_lines_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For ICE lines in the one-way partial dependence plots. + The key value pairs defined in `ice_lines_kw` takes priority over + `line_kw`. + + pd_line_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For partial dependence in one-way partial dependence plots. + The key value pairs defined in `pd_line_kw` takes priority over + `line_kw`. + + contour_kw : dict, default=None + Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call. + For two-way partial dependence plots. + + ax : Matplotlib axes or array-like of Matplotlib axes, default=None + - If a single axis is passed in, it is treated as a bounding axes + and a grid of partial dependence plots will be drawn within + these bounds. The `n_cols` parameter controls the number of + columns in the grid. + - If an array-like of axes are passed in, the partial dependence + plots will be drawn directly into these axes. + - If `None`, a figure and a bounding axes is created and treated + as the single axes case. + + kind : {'average', 'individual', 'both'}, default='average' + Whether to plot the partial dependence averaged across all the samples + in the dataset or one line per sample or both. + + - ``kind='average'`` results in the traditional PD plot; + - ``kind='individual'`` results in the ICE plot. + + Note that the fast `method='recursion'` option is only available for + `kind='average'` and `sample_weights=None`. Computing individual + dependencies and doing weighted averages requires using the slower + `method='brute'`. + + centered : bool, default=False + If `True`, the ICE and PD lines will start at the origin of the + y-axis. By default, no centering is done. + + .. versionadded:: 1.1 + + subsample : float, int or None, default=1000 + Sampling for ICE curves when `kind` is 'individual' or 'both'. + If `float`, should be between 0.0 and 1.0 and represent the proportion + of the dataset to be used to plot ICE curves. If `int`, represents the + absolute number samples to use. + + Note that the full dataset is still used to calculate averaged partial + dependence when `kind='both'`. 
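Tying together the `subsample` and `kind` parameters described above, a brief editorial sketch (not part of the vendored file; the synthetic regression data and hyperparameters are assumptions) that overlays subsampled ICE curves with the full-data average:

import matplotlib.pyplot as plt
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay

X, y = make_friedman1(n_samples=500, random_state=0)
est = GradientBoostingRegressor(n_estimators=50, random_state=0).fit(X, y)

# ICE curves for 50 subsampled rows plus the averaged PD line (kind="both");
# the average itself is still computed on the full dataset.
PartialDependenceDisplay.from_estimator(
    est,
    X,
    features=[0, 1],
    kind="both",
    subsample=50,
    centered=True,        # start every curve at the origin of the y-axis
    random_state=0,
)
plt.show()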
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of the selected samples when subsamples is not + `None` and `kind` is either `'both'` or `'individual'`. + See :term:`Glossary ` for details. + + Returns + ------- + display : :class:`~sklearn.inspection.PartialDependenceDisplay` + + See Also + -------- + partial_dependence : Compute Partial Dependence values. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> from sklearn.inspection import PartialDependenceDisplay + >>> X, y = make_friedman1() + >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) + >>> PartialDependenceDisplay.from_estimator(clf, X, [0, (0, 1)]) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + import matplotlib.pyplot as plt + + # set target_idx for multi-class estimators + if hasattr(estimator, "classes_") and np.size(estimator.classes_) > 2: + if target is None: + raise ValueError("target must be specified for multi-class") + target_idx = np.searchsorted(estimator.classes_, target) + if ( + not (0 <= target_idx < len(estimator.classes_)) + or estimator.classes_[target_idx] != target + ): + raise ValueError("target not in est.classes_, got {}".format(target)) + else: + # regression and binary classification + target_idx = 0 + + # Use check_array only on lists and other non-array-likes / sparse. Do not + # convert DataFrame into a NumPy array. + if not (hasattr(X, "__array__") or sparse.issparse(X)): + X = check_array(X, ensure_all_finite="allow-nan", dtype=object) + n_features = X.shape[1] + + feature_names = _check_feature_names(X, feature_names) + # expand kind to always be a list of str + kind_ = [kind] * len(features) if isinstance(kind, str) else kind + if len(kind_) != len(features): + raise ValueError( + "When `kind` is provided as a list of strings, it should contain " + f"as many elements as `features`. `kind` contains {len(kind_)} " + f"element(s) and `features` contains {len(features)} element(s)." + ) + + # convert features into a seq of int tuples + tmp_features, ice_for_two_way_pd = [], [] + for kind_plot, fxs in zip(kind_, features): + if isinstance(fxs, (numbers.Integral, str)): + fxs = (fxs,) + try: + fxs = tuple( + _get_feature_index(fx, feature_names=feature_names) for fx in fxs + ) + except TypeError as e: + raise ValueError( + "Each entry in features must be either an int, " + "a string, or an iterable of size at most 2." + ) from e + if not 1 <= np.size(fxs) <= 2: + raise ValueError( + "Each entry in features must be either an int, " + "a string, or an iterable of size at most 2." + ) + # store the information if 2-way PD was requested with ICE to later + # raise a ValueError with an exhaustive list of problematic + # settings. + ice_for_two_way_pd.append(kind_plot != "average" and np.size(fxs) > 1) + + tmp_features.append(fxs) + + if any(ice_for_two_way_pd): + # raise an error and be specific regarding the parameter values + # when 1- and 2-way PD were requested + kind_ = [ + "average" if forcing_average else kind_plot + for forcing_average, kind_plot in zip(ice_for_two_way_pd, kind_) + ] + raise ValueError( + "ICE plot cannot be rendered for 2-way feature interactions. " + "2-way feature interactions mandates PD plots using the " + "'average' kind: " + f"features={features!r} should be configured to use " + f"kind={kind_!r} explicitly." 
+ ) + features = tmp_features + + if categorical_features is None: + is_categorical = [ + (False,) if len(fxs) == 1 else (False, False) for fxs in features + ] + else: + # we need to create a boolean indicator of which features are + # categorical from the categorical_features list. + categorical_features = np.asarray(categorical_features) + if categorical_features.dtype.kind == "b": + # categorical features provided as a list of boolean + if categorical_features.size != n_features: + raise ValueError( + "When `categorical_features` is a boolean array-like, " + "the array should be of shape (n_features,). Got " + f"{categorical_features.size} elements while `X` contains " + f"{n_features} features." + ) + is_categorical = [ + tuple(categorical_features[fx] for fx in fxs) for fxs in features + ] + elif categorical_features.dtype.kind in ("i", "O", "U"): + # categorical features provided as a list of indices or feature names + categorical_features_idx = [ + _get_feature_index(cat, feature_names=feature_names) + for cat in categorical_features + ] + is_categorical = [ + tuple([idx in categorical_features_idx for idx in fxs]) + for fxs in features + ] + else: + raise ValueError( + "Expected `categorical_features` to be an array-like of boolean," + f" integer, or string. Got {categorical_features.dtype} instead." + ) + + for cats in is_categorical: + if np.size(cats) == 2 and (cats[0] != cats[1]): + raise ValueError( + "Two-way partial dependence plots are not supported for pairs" + " of continuous and categorical features." + ) + + # collect the indices of the categorical features targeted by the partial + # dependence computation + categorical_features_targeted = set( + [ + fx + for fxs, cats in zip(features, is_categorical) + for fx in fxs + if any(cats) + ] + ) + if categorical_features_targeted: + min_n_cats = min( + [ + len(_unique(_safe_indexing(X, idx, axis=1))) + for idx in categorical_features_targeted + ] + ) + if grid_resolution < min_n_cats: + raise ValueError( + "The resolution of the computed grid is less than the " + "minimum number of categories in the targeted categorical " + "features. Expect the `grid_resolution` to be greater than " + f"{min_n_cats}. Got {grid_resolution} instead." + ) + + for is_cat, kind_plot in zip(is_categorical, kind_): + if any(is_cat) and kind_plot != "average": + raise ValueError( + "It is not possible to display individual effects for" + " categorical features." + ) + + # Early exit if the axes does not have the correct number of axes + if ax is not None and not isinstance(ax, plt.Axes): + axes = np.asarray(ax, dtype=object) + if axes.size != len(features): + raise ValueError( + "Expected ax to have {} axes, got {}".format( + len(features), axes.size + ) + ) + + for i in chain.from_iterable(features): + if i >= len(feature_names): + raise ValueError( + "All entries of features must be less than " + "len(feature_names) = {0}, got {1}.".format(len(feature_names), i) + ) + + if isinstance(subsample, numbers.Integral): + if subsample <= 0: + raise ValueError( + f"When an integer, subsample={subsample} should be positive." + ) + elif isinstance(subsample, numbers.Real): + if subsample <= 0 or subsample >= 1: + raise ValueError( + f"When a floating-point, subsample={subsample} should be in " + "the (0, 1) range." 
+ ) + + # compute predictions and/or averaged predictions + pd_results = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)( + estimator, + X, + fxs, + sample_weight=sample_weight, + feature_names=feature_names, + categorical_features=categorical_features, + response_method=response_method, + method=method, + grid_resolution=grid_resolution, + percentiles=percentiles, + kind=kind_plot, + custom_values=custom_values, + ) + for kind_plot, fxs in zip(kind_, features) + ) + + # For multioutput regression, we can only check the validity of target + # now that we have the predictions. + # Also note: as multiclass-multioutput classifiers are not supported, + # multiclass and multioutput scenario are mutually exclusive. So there is + # no risk of overwriting target_idx here. + pd_result = pd_results[0] # checking the first result is enough + n_tasks = ( + pd_result.average.shape[0] + if kind_[0] == "average" + else pd_result.individual.shape[0] + ) + if is_regressor(estimator) and n_tasks > 1: + if target is None: + raise ValueError("target must be specified for multi-output regressors") + if not 0 <= target <= n_tasks: + raise ValueError( + "target must be in [0, n_tasks], got {}.".format(target) + ) + target_idx = target + + deciles = {} + for fxs, cats in zip(features, is_categorical): + for fx, cat in zip(fxs, cats): + if not cat and fx not in deciles: + X_col = _safe_indexing(X, fx, axis=1) + deciles[fx] = mquantiles(X_col, prob=np.arange(0.1, 1.0, 0.1)) + + display = cls( + pd_results=pd_results, + features=features, + feature_names=feature_names, + target_idx=target_idx, + deciles=deciles, + kind=kind, + subsample=subsample, + random_state=random_state, + is_categorical=is_categorical, + ) + return display.plot( + ax=ax, + n_cols=n_cols, + line_kw=line_kw, + ice_lines_kw=ice_lines_kw, + pd_line_kw=pd_line_kw, + contour_kw=contour_kw, + centered=centered, + ) + + def _get_sample_count(self, n_samples): + """Compute the number of samples as an integer.""" + if isinstance(self.subsample, numbers.Integral): + if self.subsample < n_samples: + return self.subsample + return n_samples + elif isinstance(self.subsample, numbers.Real): + return ceil(n_samples * self.subsample) + return n_samples + + def _plot_ice_lines( + self, + preds, + feature_values, + n_ice_to_plot, + ax, + pd_plot_idx, + n_total_lines_by_plot, + individual_line_kw, + ): + """Plot the ICE lines. + + Parameters + ---------- + preds : ndarray of shape \ + (n_instances, n_grid_points) + The predictions computed for all points of `feature_values` for a + given feature for all samples in `X`. + feature_values : ndarray of shape (n_grid_points,) + The feature values for which the predictions have been computed. + n_ice_to_plot : int + The number of ICE lines to plot. + ax : Matplotlib axes + The axis on which to plot the ICE lines. + pd_plot_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + n_total_lines_by_plot : int + The total number of lines expected to be plot on the axis. + individual_line_kw : dict + Dict with keywords passed when plotting the ICE lines. 
+ """ + rng = check_random_state(self.random_state) + # subsample ice + ice_lines_idx = rng.choice( + preds.shape[0], + n_ice_to_plot, + replace=False, + ) + ice_lines_subsampled = preds[ice_lines_idx, :] + # plot the subsampled ice + for ice_idx, ice in enumerate(ice_lines_subsampled): + line_idx = np.unravel_index( + pd_plot_idx * n_total_lines_by_plot + ice_idx, self.lines_.shape + ) + self.lines_[line_idx] = ax.plot( + feature_values, ice.ravel(), **individual_line_kw + )[0] + + def _plot_average_dependence( + self, + avg_preds, + feature_values, + ax, + pd_line_idx, + line_kw, + categorical, + bar_kw, + ): + """Plot the average partial dependence. + + Parameters + ---------- + avg_preds : ndarray of shape (n_grid_points,) + The average predictions for all points of `feature_values` for a + given feature for all samples in `X`. + feature_values : ndarray of shape (n_grid_points,) + The feature values for which the predictions have been computed. + ax : Matplotlib axes + The axis on which to plot the average PD. + pd_line_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + line_kw : dict + Dict with keywords passed when plotting the PD plot. + categorical : bool + Whether feature is categorical. + bar_kw: dict + Dict with keywords passed when plotting the PD bars (categorical). + """ + if categorical: + bar_idx = np.unravel_index(pd_line_idx, self.bars_.shape) + self.bars_[bar_idx] = ax.bar(feature_values, avg_preds, **bar_kw)[0] + ax.tick_params(axis="x", rotation=90) + else: + line_idx = np.unravel_index(pd_line_idx, self.lines_.shape) + self.lines_[line_idx] = ax.plot( + feature_values, + avg_preds, + **line_kw, + )[0] + + def _plot_one_way_partial_dependence( + self, + kind, + preds, + avg_preds, + feature_values, + feature_idx, + n_ice_lines, + ax, + n_cols, + pd_plot_idx, + n_lines, + ice_lines_kw, + pd_line_kw, + categorical, + bar_kw, + pdp_lim, + ): + """Plot 1-way partial dependence: ICE and PDP. + + Parameters + ---------- + kind : str + The kind of partial plot to draw. + preds : ndarray of shape \ + (n_instances, n_grid_points) or None + The predictions computed for all points of `feature_values` for a + given feature for all samples in `X`. + avg_preds : ndarray of shape (n_grid_points,) + The average predictions for all points of `feature_values` for a + given feature for all samples in `X`. + feature_values : ndarray of shape (n_grid_points,) + The feature values for which the predictions have been computed. + feature_idx : int + The index corresponding to the target feature. + n_ice_lines : int + The number of ICE lines to plot. + ax : Matplotlib axes + The axis on which to plot the ICE and PDP lines. + n_cols : int or None + The number of column in the axis. + pd_plot_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + n_lines : int + The total number of lines expected to be plot on the axis. + ice_lines_kw : dict + Dict with keywords passed when plotting the ICE lines. + pd_line_kw : dict + Dict with keywords passed when plotting the PD plot. + categorical : bool + Whether feature is categorical. + bar_kw: dict + Dict with keywords passed when plotting the PD bars (categorical). + pdp_lim : dict + Global min and max average predictions, such that all plots will + have the same scale and y limits. `pdp_lim[1]` is the global min + and max for single partial dependence curves. 
+ """ + from matplotlib import transforms + + if kind in ("individual", "both"): + self._plot_ice_lines( + preds[self.target_idx], + feature_values, + n_ice_lines, + ax, + pd_plot_idx, + n_lines, + ice_lines_kw, + ) + + if kind in ("average", "both"): + # the average is stored as the last line + if kind == "average": + pd_line_idx = pd_plot_idx + else: + pd_line_idx = pd_plot_idx * n_lines + n_ice_lines + self._plot_average_dependence( + avg_preds[self.target_idx].ravel(), + feature_values, + ax, + pd_line_idx, + pd_line_kw, + categorical, + bar_kw, + ) + + trans = transforms.blended_transform_factory(ax.transData, ax.transAxes) + # create the decile line for the vertical axis + vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape) + if self.deciles.get(feature_idx[0], None) is not None: + self.deciles_vlines_[vlines_idx] = ax.vlines( + self.deciles[feature_idx[0]], + 0, + 0.05, + transform=trans, + color="k", + ) + # reset ylim which was overwritten by vlines + min_val = min(val[0] for val in pdp_lim.values()) + max_val = max(val[1] for val in pdp_lim.values()) + ax.set_ylim([min_val, max_val]) + + # Set xlabel if it is not already set + if not ax.get_xlabel(): + ax.set_xlabel(self.feature_names[feature_idx[0]]) + + if n_cols is None or pd_plot_idx % n_cols == 0: + if not ax.get_ylabel(): + ax.set_ylabel("Partial dependence") + else: + ax.set_yticklabels([]) + + if pd_line_kw.get("label", None) and kind != "individual" and not categorical: + ax.legend() + + def _plot_two_way_partial_dependence( + self, + avg_preds, + feature_values, + feature_idx, + ax, + pd_plot_idx, + Z_level, + contour_kw, + categorical, + heatmap_kw, + ): + """Plot 2-way partial dependence. + + Parameters + ---------- + avg_preds : ndarray of shape \ + (n_instances, n_grid_points, n_grid_points) + The average predictions for all points of `feature_values[0]` and + `feature_values[1]` for some given features for all samples in `X`. + feature_values : seq of 1d array + A sequence of array of the feature values for which the predictions + have been computed. + feature_idx : tuple of int + The indices of the target features + ax : Matplotlib axes + The axis on which to plot the ICE and PDP lines. + pd_plot_idx : int + The sequential index of the plot. It will be unraveled to find the + matching 2D position in the grid layout. + Z_level : ndarray of shape (8, 8) + The Z-level used to encode the average predictions. + contour_kw : dict + Dict with keywords passed when plotting the contours. + categorical : bool + Whether features are categorical. + heatmap_kw: dict + Dict with keywords passed when plotting the PD heatmap + (categorical). 
+ """ + if categorical: + import matplotlib.pyplot as plt + + default_im_kw = dict(interpolation="nearest", cmap="viridis") + im_kw = {**default_im_kw, **heatmap_kw} + + data = avg_preds[self.target_idx] + im = ax.imshow(data, **im_kw) + text = None + cmap_min, cmap_max = im.cmap(0), im.cmap(1.0) + + text = np.empty_like(data, dtype=object) + # print text with appropriate color depending on background + thresh = (data.max() + data.min()) / 2.0 + + for flat_index in range(data.size): + row, col = np.unravel_index(flat_index, data.shape) + color = cmap_max if data[row, col] < thresh else cmap_min + + values_format = ".2f" + text_data = format(data[row, col], values_format) + + text_kwargs = dict(ha="center", va="center", color=color) + text[row, col] = ax.text(col, row, text_data, **text_kwargs) + + fig = ax.figure + fig.colorbar(im, ax=ax) + ax.set( + xticks=np.arange(len(feature_values[1])), + yticks=np.arange(len(feature_values[0])), + xticklabels=feature_values[1], + yticklabels=feature_values[0], + xlabel=self.feature_names[feature_idx[1]], + ylabel=self.feature_names[feature_idx[0]], + ) + + plt.setp(ax.get_xticklabels(), rotation="vertical") + + heatmap_idx = np.unravel_index(pd_plot_idx, self.heatmaps_.shape) + self.heatmaps_[heatmap_idx] = im + else: + from matplotlib import transforms + + XX, YY = np.meshgrid(feature_values[0], feature_values[1]) + Z = avg_preds[self.target_idx].T + CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors="k") + contour_idx = np.unravel_index(pd_plot_idx, self.contours_.shape) + self.contours_[contour_idx] = ax.contourf( + XX, + YY, + Z, + levels=Z_level, + vmax=Z_level[-1], + vmin=Z_level[0], + **contour_kw, + ) + ax.clabel(CS, fmt="%2.2f", colors="k", fontsize=10, inline=True) + + trans = transforms.blended_transform_factory(ax.transData, ax.transAxes) + # create the decile line for the vertical axis + xlim, ylim = ax.get_xlim(), ax.get_ylim() + vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape) + self.deciles_vlines_[vlines_idx] = ax.vlines( + self.deciles[feature_idx[0]], + 0, + 0.05, + transform=trans, + color="k", + ) + # create the decile line for the horizontal axis + hlines_idx = np.unravel_index(pd_plot_idx, self.deciles_hlines_.shape) + self.deciles_hlines_[hlines_idx] = ax.hlines( + self.deciles[feature_idx[1]], + 0, + 0.05, + transform=trans, + color="k", + ) + # reset xlim and ylim since they are overwritten by hlines and + # vlines + ax.set_xlim(xlim) + ax.set_ylim(ylim) + + # set xlabel if it is not already set + if not ax.get_xlabel(): + ax.set_xlabel(self.feature_names[feature_idx[0]]) + ax.set_ylabel(self.feature_names[feature_idx[1]]) + + def plot( + self, + *, + ax=None, + n_cols=3, + line_kw=None, + ice_lines_kw=None, + pd_line_kw=None, + contour_kw=None, + bar_kw=None, + heatmap_kw=None, + pdp_lim=None, + centered=False, + ): + """Plot partial dependence plots. + + Parameters + ---------- + ax : Matplotlib axes or array-like of Matplotlib axes, default=None + - If a single axis is passed in, it is treated as a bounding axes + and a grid of partial dependence plots will be drawn within + these bounds. The `n_cols` parameter controls the number of + columns in the grid. + - If an array-like of axes are passed in, the partial dependence + plots will be drawn directly into these axes. + - If `None`, a figure and a bounding axes is created and treated + as the single axes case. + + n_cols : int, default=3 + The maximum number of columns in the grid plot. 
Only active when + `ax` is a single axes or `None`. + + line_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.plot` call. + For one-way partial dependence plots. + + ice_lines_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For ICE lines in the one-way partial dependence plots. + The key value pairs defined in `ice_lines_kw` takes priority over + `line_kw`. + + .. versionadded:: 1.0 + + pd_line_kw : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.plot` call. + For partial dependence in one-way partial dependence plots. + The key value pairs defined in `pd_line_kw` takes priority over + `line_kw`. + + .. versionadded:: 1.0 + + contour_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.contourf` + call for two-way partial dependence plots. + + bar_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.bar` + call for one-way categorical partial dependence plots. + + .. versionadded:: 1.2 + + heatmap_kw : dict, default=None + Dict with keywords passed to the `matplotlib.pyplot.imshow` + call for two-way categorical partial dependence plots. + + .. versionadded:: 1.2 + + pdp_lim : dict, default=None + Global min and max average predictions, such that all plots will have the + same scale and y limits. `pdp_lim[1]` is the global min and max for single + partial dependence curves. `pdp_lim[2]` is the global min and max for + two-way partial dependence curves. If `None` (default), the limit will be + inferred from the global minimum and maximum of all predictions. + + .. versionadded:: 1.1 + + centered : bool, default=False + If `True`, the ICE and PD lines will start at the origin of the + y-axis. By default, no centering is done. + + .. versionadded:: 1.1 + + Returns + ------- + display : :class:`~sklearn.inspection.PartialDependenceDisplay` + Returns a :class:`~sklearn.inspection.PartialDependenceDisplay` + object that contains the partial dependence plots. + """ + + check_matplotlib_support("plot_partial_dependence") + import matplotlib.pyplot as plt + from matplotlib.gridspec import GridSpecFromSubplotSpec + + if isinstance(self.kind, str): + kind = [self.kind] * len(self.features) + else: + kind = self.kind + + if self.is_categorical is None: + is_categorical = [ + (False,) if len(fx) == 1 else (False, False) for fx in self.features + ] + else: + is_categorical = self.is_categorical + + if len(kind) != len(self.features): + raise ValueError( + "When `kind` is provided as a list of strings, it should " + "contain as many elements as `features`. `kind` contains " + f"{len(kind)} element(s) and `features` contains " + f"{len(self.features)} element(s)." + ) + + valid_kinds = {"average", "individual", "both"} + if any([k not in valid_kinds for k in kind]): + raise ValueError( + f"Values provided to `kind` must be one of: {valid_kinds!r} or a list" + f" of such values. 
Currently, kind={self.kind!r}" + ) + + # Center results before plotting + if not centered: + pd_results_ = self.pd_results + else: + pd_results_ = [] + for kind_plot, pd_result in zip(kind, self.pd_results): + current_results = {"grid_values": pd_result["grid_values"]} + + if kind_plot in ("individual", "both"): + preds = pd_result.individual + preds = preds - preds[self.target_idx, :, 0, None] + current_results["individual"] = preds + + if kind_plot in ("average", "both"): + avg_preds = pd_result.average + avg_preds = avg_preds - avg_preds[self.target_idx, 0, None] + current_results["average"] = avg_preds + + pd_results_.append(Bunch(**current_results)) + + if pdp_lim is None: + # get global min and max average predictions of PD grouped by plot type + pdp_lim = {} + for kind_plot, pdp in zip(kind, pd_results_): + values = pdp["grid_values"] + preds = pdp.average if kind_plot == "average" else pdp.individual + min_pd = preds[self.target_idx].min() + max_pd = preds[self.target_idx].max() + + # expand the limits to account so that the plotted lines do not touch + # the edges of the plot + span = max_pd - min_pd + min_pd -= 0.05 * span + max_pd += 0.05 * span + + n_fx = len(values) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + if line_kw is None: + line_kw = {} + if ice_lines_kw is None: + ice_lines_kw = {} + if pd_line_kw is None: + pd_line_kw = {} + if bar_kw is None: + bar_kw = {} + if heatmap_kw is None: + heatmap_kw = {} + + if ax is None: + _, ax = plt.subplots() + + if contour_kw is None: + contour_kw = {} + default_contour_kws = {"alpha": 0.75} + contour_kw = _validate_style_kwargs(default_contour_kws, contour_kw) + + n_features = len(self.features) + is_average_plot = [kind_plot == "average" for kind_plot in kind] + if all(is_average_plot): + # only average plots are requested + n_ice_lines = 0 + n_lines = 1 + else: + # we need to determine the number of ICE samples computed + ice_plot_idx = is_average_plot.index(False) + n_ice_lines = self._get_sample_count( + len(pd_results_[ice_plot_idx].individual[0]) + ) + if any([kind_plot == "both" for kind_plot in kind]): + n_lines = n_ice_lines + 1 # account for the average line + else: + n_lines = n_ice_lines + + if isinstance(ax, plt.Axes): + # If ax was set off, it has most likely been set to off + # by a previous call to plot. 
+ if not ax.axison: + raise ValueError( + "The ax was already used in another plot " + "function, please set ax=display.axes_ " + "instead" + ) + + ax.set_axis_off() + self.bounding_ax_ = ax + self.figure_ = ax.figure + + n_cols = min(n_cols, n_features) + n_rows = int(np.ceil(n_features / float(n_cols))) + + self.axes_ = np.empty((n_rows, n_cols), dtype=object) + if all(is_average_plot): + self.lines_ = np.empty((n_rows, n_cols), dtype=object) + else: + self.lines_ = np.empty((n_rows, n_cols, n_lines), dtype=object) + self.contours_ = np.empty((n_rows, n_cols), dtype=object) + self.bars_ = np.empty((n_rows, n_cols), dtype=object) + self.heatmaps_ = np.empty((n_rows, n_cols), dtype=object) + + axes_ravel = self.axes_.ravel() + + gs = GridSpecFromSubplotSpec( + n_rows, n_cols, subplot_spec=ax.get_subplotspec() + ) + for i, spec in zip(range(n_features), gs): + axes_ravel[i] = self.figure_.add_subplot(spec) + + else: # array-like + ax = np.asarray(ax, dtype=object) + if ax.size != n_features: + raise ValueError( + "Expected ax to have {} axes, got {}".format(n_features, ax.size) + ) + + if ax.ndim == 2: + n_cols = ax.shape[1] + else: + n_cols = None + + self.bounding_ax_ = None + self.figure_ = ax.ravel()[0].figure + self.axes_ = ax + if all(is_average_plot): + self.lines_ = np.empty_like(ax, dtype=object) + else: + self.lines_ = np.empty(ax.shape + (n_lines,), dtype=object) + self.contours_ = np.empty_like(ax, dtype=object) + self.bars_ = np.empty_like(ax, dtype=object) + self.heatmaps_ = np.empty_like(ax, dtype=object) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + self.deciles_vlines_ = np.empty_like(self.axes_, dtype=object) + self.deciles_hlines_ = np.empty_like(self.axes_, dtype=object) + + for pd_plot_idx, (axi, feature_idx, cat, pd_result, kind_plot) in enumerate( + zip( + self.axes_.ravel(), + self.features, + is_categorical, + pd_results_, + kind, + ) + ): + avg_preds = None + preds = None + feature_values = pd_result["grid_values"] + if kind_plot == "individual": + preds = pd_result.individual + elif kind_plot == "average": + avg_preds = pd_result.average + else: # kind_plot == 'both' + avg_preds = pd_result.average + preds = pd_result.individual + + if len(feature_values) == 1: + # define the line-style for the current plot + default_line_kws = { + "color": "C0", + "label": "average" if kind_plot == "both" else None, + } + if kind_plot == "individual": + default_ice_lines_kws = {"alpha": 0.3, "linewidth": 0.5} + default_pd_lines_kws = {} + elif kind_plot == "both": + # by default, we need to distinguish the average line from + # the individual lines via color and line style + default_ice_lines_kws = { + "alpha": 0.3, + "linewidth": 0.5, + "color": "tab:blue", + } + default_pd_lines_kws = { + "color": "tab:orange", + "linestyle": "--", + } + else: + default_ice_lines_kws = {} + default_pd_lines_kws = {} + + default_ice_lines_kws = {**default_line_kws, **default_ice_lines_kws} + default_pd_lines_kws = {**default_line_kws, **default_pd_lines_kws} + + line_kw = _validate_style_kwargs(default_line_kws, line_kw) + + ice_lines_kw = _validate_style_kwargs( + _validate_style_kwargs(default_ice_lines_kws, line_kw), ice_lines_kw + ) + del ice_lines_kw["label"] + + pd_line_kw = _validate_style_kwargs( + _validate_style_kwargs(default_pd_lines_kws, line_kw), pd_line_kw + ) + + default_bar_kws = {"color": "C0"} + bar_kw = _validate_style_kwargs(default_bar_kws, bar_kw) + + default_heatmap_kw = {} + heatmap_kw = 
_validate_style_kwargs(default_heatmap_kw, heatmap_kw) + + self._plot_one_way_partial_dependence( + kind_plot, + preds, + avg_preds, + feature_values[0], + feature_idx, + n_ice_lines, + axi, + n_cols, + pd_plot_idx, + n_lines, + ice_lines_kw, + pd_line_kw, + cat[0], + bar_kw, + pdp_lim, + ) + else: + self._plot_two_way_partial_dependence( + avg_preds, + feature_values, + feature_idx, + axi, + pd_plot_idx, + Z_level, + contour_kw, + cat[0] and cat[1], + heatmap_kw, + ) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_boundary_decision_display.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_boundary_decision_display.py new file mode 100644 index 0000000000000000000000000000000000000000..f409a50ab58c0865c17082f95122247bb0d5344d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_boundary_decision_display.py @@ -0,0 +1,710 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_multilabel_classification, +) +from sklearn.ensemble import IsolationForest +from sklearn.inspection import DecisionBoundaryDisplay +from sklearn.inspection._plot.decision_boundary import _check_boundary_response_method +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import scale +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) +from sklearn.utils.fixes import parse_version + +X, y = make_classification( + n_informative=1, + n_redundant=1, + n_clusters_per_class=1, + n_features=2, + random_state=42, +) + + +def load_iris_2d_scaled(): + X, y = load_iris(return_X_y=True) + X = scale(X)[:, :2] + return X, y + + +@pytest.fixture(scope="module") +def fitted_clf(): + return LogisticRegression().fit(X, y) + + +def test_input_data_dimension(pyplot): + """Check that we raise an error when `X` does not have exactly 2 features.""" + X, y = make_classification(n_samples=10, n_features=4, random_state=0) + + clf = LogisticRegression().fit(X, y) + msg = "n_features must be equal to 2. Got 4 instead." + with pytest.raises(ValueError, match=msg): + DecisionBoundaryDisplay.from_estimator(estimator=clf, X=X) + + +def test_check_boundary_response_method_error(): + """Check error raised for multi-output multi-class classifiers by + `_check_boundary_response_method`. 
+ """ + + class MultiLabelClassifier: + classes_ = [np.array([0, 1]), np.array([0, 1])] + + err_msg = "Multi-label and multi-output multi-class classifiers are not supported" + with pytest.raises(ValueError, match=err_msg): + _check_boundary_response_method(MultiLabelClassifier(), "predict", None) + + +@pytest.mark.parametrize( + "estimator, response_method, class_of_interest, expected_prediction_method", + [ + (DecisionTreeRegressor(), "predict", None, "predict"), + (DecisionTreeRegressor(), "auto", None, "predict"), + (LogisticRegression().fit(*load_iris_2d_scaled()), "predict", None, "predict"), + ( + LogisticRegression().fit(*load_iris_2d_scaled()), + "auto", + None, + ["decision_function", "predict_proba", "predict"], + ), + ( + LogisticRegression().fit(*load_iris_2d_scaled()), + "predict_proba", + 0, + "predict_proba", + ), + ( + LogisticRegression().fit(*load_iris_2d_scaled()), + "decision_function", + 0, + "decision_function", + ), + ( + LogisticRegression().fit(X, y), + "auto", + None, + ["decision_function", "predict_proba", "predict"], + ), + (LogisticRegression().fit(X, y), "predict", None, "predict"), + ( + LogisticRegression().fit(X, y), + ["predict_proba", "decision_function"], + None, + ["predict_proba", "decision_function"], + ), + ], +) +def test_check_boundary_response_method( + estimator, response_method, class_of_interest, expected_prediction_method +): + """Check the behaviour of `_check_boundary_response_method` for the supported + cases. + """ + prediction_method = _check_boundary_response_method( + estimator, response_method, class_of_interest + ) + assert prediction_method == expected_prediction_method + + +def test_multiclass_predict(pyplot): + """Check multiclass `response=predict` gives expected results.""" + grid_resolution = 10 + eps = 1.0 + X, y = make_classification(n_classes=3, n_informative=3, random_state=0) + X = X[:, [0, 1]] + lr = LogisticRegression(random_state=0).fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + lr, X, response_method="predict", grid_resolution=grid_resolution, eps=1.0 + ) + + x0_min, x0_max = X[:, 0].min() - eps, X[:, 0].max() + eps + x1_min, x1_max = X[:, 1].min() - eps, X[:, 1].max() + eps + xx0, xx1 = np.meshgrid( + np.linspace(x0_min, x0_max, grid_resolution), + np.linspace(x1_min, x1_max, grid_resolution), + ) + response = lr.predict(np.c_[xx0.ravel(), xx1.ravel()]) + assert_allclose(disp.response, response.reshape(xx0.shape)) + assert_allclose(disp.xx0, xx0) + assert_allclose(disp.xx1, xx1) + + +@pytest.mark.parametrize( + "kwargs, error_msg", + [ + ( + {"plot_method": "hello_world"}, + r"plot_method must be one of contourf, contour, pcolormesh. Got hello_world" + r" instead.", + ), + ( + {"grid_resolution": 1}, + r"grid_resolution must be greater than 1. Got 1 instead", + ), + ( + {"grid_resolution": -1}, + r"grid_resolution must be greater than 1. Got -1 instead", + ), + ({"eps": -1.1}, r"eps must be greater than or equal to 0. 
Got -1.1 instead"), + ], +) +def test_input_validation_errors(pyplot, kwargs, error_msg, fitted_clf): + """Check input validation from_estimator.""" + with pytest.raises(ValueError, match=error_msg): + DecisionBoundaryDisplay.from_estimator(fitted_clf, X, **kwargs) + + +@pytest.mark.parametrize( + "kwargs, error_msg", + [ + ( + {"multiclass_colors": {"dict": "not_list"}}, + "'multiclass_colors' must be a list or a str.", + ), + ({"multiclass_colors": "not_cmap"}, "it must be a valid Matplotlib colormap"), + ({"multiclass_colors": ["red", "green"]}, "it must be of the same length"), + ( + {"multiclass_colors": ["red", "green", "not color"]}, + "it can only contain valid Matplotlib color names", + ), + ], +) +def test_input_validation_errors_multiclass_colors(pyplot, kwargs, error_msg): + """Check input validation for `multiclass_colors` in `from_estimator`.""" + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + with pytest.raises(ValueError, match=error_msg): + DecisionBoundaryDisplay.from_estimator(clf, X, **kwargs) + + +def test_display_plot_input_error(pyplot, fitted_clf): + """Check input validation for `plot`.""" + disp = DecisionBoundaryDisplay.from_estimator(fitted_clf, X, grid_resolution=5) + + with pytest.raises(ValueError, match="plot_method must be 'contourf'"): + disp.plot(plot_method="hello_world") + + +@pytest.mark.parametrize( + "response_method", ["auto", "predict", "predict_proba", "decision_function"] +) +@pytest.mark.parametrize("plot_method", ["contourf", "contour"]) +def test_decision_boundary_display_classifier( + pyplot, fitted_clf, response_method, plot_method +): + """Check that decision boundary is correct.""" + fig, ax = pyplot.subplots() + eps = 2.0 + disp = DecisionBoundaryDisplay.from_estimator( + fitted_clf, + X, + grid_resolution=5, + response_method=response_method, + plot_method=plot_method, + eps=eps, + ax=ax, + ) + assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet) + assert disp.ax_ == ax + assert disp.figure_ == fig + + x0, x1 = X[:, 0], X[:, 1] + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + assert disp.xx0.min() == pytest.approx(x0_min) + assert disp.xx0.max() == pytest.approx(x0_max) + assert disp.xx1.min() == pytest.approx(x1_min) + assert disp.xx1.max() == pytest.approx(x1_max) + + fig2, ax2 = pyplot.subplots() + # change plotting method for second plot + disp.plot(plot_method="pcolormesh", ax=ax2, shading="auto") + assert isinstance(disp.surface_, pyplot.matplotlib.collections.QuadMesh) + assert disp.ax_ == ax2 + assert disp.figure_ == fig2 + + +@pytest.mark.parametrize("response_method", ["auto", "predict", "decision_function"]) +@pytest.mark.parametrize("plot_method", ["contourf", "contour"]) +def test_decision_boundary_display_outlier_detector( + pyplot, response_method, plot_method +): + """Check that decision boundary is correct for outlier detector.""" + fig, ax = pyplot.subplots() + eps = 2.0 + outlier_detector = IsolationForest(random_state=0).fit(X, y) + disp = DecisionBoundaryDisplay.from_estimator( + outlier_detector, + X, + grid_resolution=5, + response_method=response_method, + plot_method=plot_method, + eps=eps, + ax=ax, + ) + assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet) + assert disp.ax_ == ax + assert disp.figure_ == fig + + x0, x1 = X[:, 0], X[:, 1] + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + assert disp.xx0.min() == pytest.approx(x0_min) + 
assert disp.xx0.max() == pytest.approx(x0_max) + assert disp.xx1.min() == pytest.approx(x1_min) + assert disp.xx1.max() == pytest.approx(x1_max) + + +@pytest.mark.parametrize("response_method", ["auto", "predict"]) +@pytest.mark.parametrize("plot_method", ["contourf", "contour"]) +def test_decision_boundary_display_regressor(pyplot, response_method, plot_method): + """Check that we can display the decision boundary for a regressor.""" + X, y = load_diabetes(return_X_y=True) + X = X[:, :2] + tree = DecisionTreeRegressor().fit(X, y) + fig, ax = pyplot.subplots() + eps = 2.0 + disp = DecisionBoundaryDisplay.from_estimator( + tree, + X, + response_method=response_method, + ax=ax, + eps=eps, + plot_method=plot_method, + ) + assert isinstance(disp.surface_, pyplot.matplotlib.contour.QuadContourSet) + assert disp.ax_ == ax + assert disp.figure_ == fig + + x0, x1 = X[:, 0], X[:, 1] + + x0_min, x0_max = x0.min() - eps, x0.max() + eps + x1_min, x1_max = x1.min() - eps, x1.max() + eps + + assert disp.xx0.min() == pytest.approx(x0_min) + assert disp.xx0.max() == pytest.approx(x0_max) + assert disp.xx1.min() == pytest.approx(x1_min) + assert disp.xx1.max() == pytest.approx(x1_max) + + fig2, ax2 = pyplot.subplots() + # change plotting method for second plot + disp.plot(plot_method="pcolormesh", ax=ax2, shading="auto") + assert isinstance(disp.surface_, pyplot.matplotlib.collections.QuadMesh) + assert disp.ax_ == ax2 + assert disp.figure_ == fig2 + + +@pytest.mark.parametrize( + "response_method, msg", + [ + ( + "predict_proba", + "MyClassifier has none of the following attributes: predict_proba", + ), + ( + "decision_function", + "MyClassifier has none of the following attributes: decision_function", + ), + ( + "auto", + ( + "MyClassifier has none of the following attributes: decision_function, " + "predict_proba, predict" + ), + ), + ( + "bad_method", + "MyClassifier has none of the following attributes: bad_method", + ), + ], +) +def test_error_bad_response(pyplot, response_method, msg): + """Check errors for bad response.""" + + class MyClassifier(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + self.fitted_ = True + self.classes_ = [0, 1] + return self + + clf = MyClassifier().fit(X, y) + + with pytest.raises(AttributeError, match=msg): + DecisionBoundaryDisplay.from_estimator(clf, X, response_method=response_method) + + +@pytest.mark.parametrize("response_method", ["auto", "predict", "predict_proba"]) +def test_multilabel_classifier_error(pyplot, response_method): + """Check that multilabel classifier raises correct error.""" + X, y = make_multilabel_classification(random_state=0) + X = X[:, :2] + tree = DecisionTreeClassifier().fit(X, y) + + msg = "Multi-label and multi-output multi-class classifiers are not supported" + with pytest.raises(ValueError, match=msg): + DecisionBoundaryDisplay.from_estimator( + tree, + X, + response_method=response_method, + ) + + +@pytest.mark.parametrize("response_method", ["auto", "predict", "predict_proba"]) +def test_multi_output_multi_class_classifier_error(pyplot, response_method): + """Check that multi-output multi-class classifier raises correct error.""" + X = np.asarray([[0, 1], [1, 2]]) + y = np.asarray([["tree", "cat"], ["cat", "tree"]]) + tree = DecisionTreeClassifier().fit(X, y) + + msg = "Multi-label and multi-output multi-class classifiers are not supported" + with pytest.raises(ValueError, match=msg): + DecisionBoundaryDisplay.from_estimator( + tree, + X, + response_method=response_method, + ) + + +def 
test_multioutput_regressor_error(pyplot): + """Check that multioutput regressor raises correct error.""" + X = np.asarray([[0, 1], [1, 2]]) + y = np.asarray([[0, 1], [4, 1]]) + tree = DecisionTreeRegressor().fit(X, y) + with pytest.raises(ValueError, match="Multi-output regressors are not supported"): + DecisionBoundaryDisplay.from_estimator(tree, X, response_method="predict") + + +@pytest.mark.parametrize( + "response_method", + ["predict_proba", "decision_function", ["predict_proba", "predict"]], +) +def test_regressor_unsupported_response(pyplot, response_method): + """Check that we can display the decision boundary for a regressor.""" + X, y = load_diabetes(return_X_y=True) + X = X[:, :2] + tree = DecisionTreeRegressor().fit(X, y) + err_msg = "should either be a classifier to be used with response_method" + with pytest.raises(ValueError, match=err_msg): + DecisionBoundaryDisplay.from_estimator(tree, X, response_method=response_method) + + +@pytest.mark.filterwarnings( + # We expect to raise the following warning because the classifier is fit on a + # NumPy array + "ignore:X has feature names, but LogisticRegression was fitted without" +) +def test_dataframe_labels_used(pyplot, fitted_clf): + """Check that column names are used for pandas.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame(X, columns=["col_x", "col_y"]) + + # pandas column names are used by default + _, ax = pyplot.subplots() + disp = DecisionBoundaryDisplay.from_estimator(fitted_clf, df, ax=ax) + assert ax.get_xlabel() == "col_x" + assert ax.get_ylabel() == "col_y" + + # second call to plot will have the names + fig, ax = pyplot.subplots() + disp.plot(ax=ax) + assert ax.get_xlabel() == "col_x" + assert ax.get_ylabel() == "col_y" + + # axes with a label will not get overridden + fig, ax = pyplot.subplots() + ax.set(xlabel="hello", ylabel="world") + disp.plot(ax=ax) + assert ax.get_xlabel() == "hello" + assert ax.get_ylabel() == "world" + + # labels get overridden only if provided to the `plot` method + disp.plot(ax=ax, xlabel="overwritten_x", ylabel="overwritten_y") + assert ax.get_xlabel() == "overwritten_x" + assert ax.get_ylabel() == "overwritten_y" + + # labels do not get inferred if provided to `from_estimator` + _, ax = pyplot.subplots() + disp = DecisionBoundaryDisplay.from_estimator( + fitted_clf, df, ax=ax, xlabel="overwritten_x", ylabel="overwritten_y" + ) + assert ax.get_xlabel() == "overwritten_x" + assert ax.get_ylabel() == "overwritten_y" + + +def test_string_target(pyplot): + """Check that decision boundary works with classifiers trained on string labels.""" + iris = load_iris() + X = iris.data[:, [0, 1]] + + # Use strings as target + y = iris.target_names[iris.target] + log_reg = LogisticRegression().fit(X, y) + + # Does not raise + DecisionBoundaryDisplay.from_estimator( + log_reg, + X, + grid_resolution=5, + response_method="predict", + ) + + +@pytest.mark.parametrize("constructor_name", ["pandas", "polars"]) +def test_dataframe_support(pyplot, constructor_name): + """Check that passing a dataframe at fit and to the Display does not + raise warnings. 
+ + Non-regression test for: + * https://github.com/scikit-learn/scikit-learn/issues/23311 + * https://github.com/scikit-learn/scikit-learn/issues/28717 + """ + df = _convert_container( + X, constructor_name=constructor_name, columns_name=["col_x", "col_y"] + ) + estimator = LogisticRegression().fit(df, y) + + with warnings.catch_warnings(): + # no warnings linked to feature names validation should be raised + warnings.simplefilter("error", UserWarning) + DecisionBoundaryDisplay.from_estimator(estimator, df, response_method="predict") + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_class_of_interest_binary(pyplot, response_method): + """Check the behaviour of passing `class_of_interest` for plotting the output of + `predict_proba` and `decision_function` in the binary case. + """ + iris = load_iris() + X = iris.data[:100, :2] + y = iris.target[:100] + assert_array_equal(np.unique(y), [0, 1]) + + estimator = LogisticRegression().fit(X, y) + # We will check that `class_of_interest=None` is equivalent to + # `class_of_interest=estimator.classes_[1]` + disp_default = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=None, + ) + disp_class_1 = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=estimator.classes_[1], + ) + + assert_allclose(disp_default.response, disp_class_1.response) + + # we can check that `_get_response_values` modifies the response when targeting + # the other class, i.e. 1 - p(y=1|x) for `predict_proba` and -decision_function + # for `decision_function`. + disp_class_0 = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=estimator.classes_[0], + ) + + if response_method == "predict_proba": + assert_allclose(disp_default.response, 1 - disp_class_0.response) + else: + assert response_method == "decision_function" + assert_allclose(disp_default.response, -disp_class_0.response) + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_class_of_interest_multiclass(pyplot, response_method): + """Check the behaviour of passing `class_of_interest` for plotting the output of + `predict_proba` and `decision_function` in the multiclass case. 
+ """ + iris = load_iris() + X = iris.data[:, :2] + y = iris.target # the target are numerical labels + class_of_interest_idx = 2 + + estimator = LogisticRegression().fit(X, y) + disp = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=class_of_interest_idx, + ) + + # we will check that we plot the expected values as response + grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1) + response = getattr(estimator, response_method)(grid)[:, class_of_interest_idx] + assert_allclose(response.reshape(*disp.response.shape), disp.response) + + # make the same test but this time using target as strings + y = iris.target_names[iris.target] + estimator = LogisticRegression().fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=iris.target_names[class_of_interest_idx], + ) + + grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1) + response = getattr(estimator, response_method)(grid)[:, class_of_interest_idx] + assert_allclose(response.reshape(*disp.response.shape), disp.response) + + # check that we raise an error for unknown labels + # this test should already be handled in `_get_response_values` but we can have this + # test here as well + err_msg = "class_of_interest=2 is not a valid label: It should be one of" + with pytest.raises(ValueError, match=err_msg): + DecisionBoundaryDisplay.from_estimator( + estimator, + X, + response_method=response_method, + class_of_interest=class_of_interest_idx, + ) + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_multiclass_plot_max_class(pyplot, response_method): + """Check plot correct when plotting max multiclass class.""" + import matplotlib as mpl + + # In matplotlib < v3.5, default value of `pcolormesh(shading)` is 'flat', which + # results in the last row and column being dropped. Thus older versions produce + # a 99x99 grid, while newer versions produce a 100x100 grid. + if parse_version(mpl.__version__) < parse_version("3.5"): + pytest.skip("`pcolormesh` in Matplotlib >= 3.5 gives smaller grid size.") + + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + clf, + X, + plot_method="pcolormesh", + response_method=response_method, + ) + + grid = np.concatenate([disp.xx0.reshape(-1, 1), disp.xx1.reshape(-1, 1)], axis=1) + response = getattr(clf, response_method)(grid).reshape(*disp.response.shape) + assert_allclose(response, disp.response) + + assert len(disp.surface_) == len(clf.classes_) + # Get which class has highest response and check it is plotted + highest_class = np.argmax(response, axis=2) + for idx, quadmesh in enumerate(disp.surface_): + # Note quadmesh mask is True (i.e. 
masked) when `idx` is NOT the highest class + assert_array_equal( + highest_class != idx, + quadmesh.get_array().mask.reshape(*highest_class.shape), + ) + + +@pytest.mark.parametrize( + "multiclass_colors", + [ + "plasma", + "Blues", + ["red", "green", "blue"], + ], +) +@pytest.mark.parametrize("plot_method", ["contourf", "contour", "pcolormesh"]) +def test_multiclass_colors_cmap(pyplot, plot_method, multiclass_colors): + """Check correct cmap used for all `multiclass_colors` inputs.""" + import matplotlib as mpl + + if parse_version(mpl.__version__) < parse_version("3.5"): + pytest.skip( + "Matplotlib >= 3.5 is needed for `==` to check equivalence of colormaps" + ) + + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + + disp = DecisionBoundaryDisplay.from_estimator( + clf, + X, + plot_method=plot_method, + multiclass_colors=multiclass_colors, + ) + + if multiclass_colors == "plasma": + colors = mpl.pyplot.get_cmap(multiclass_colors, len(clf.classes_)).colors + elif multiclass_colors == "Blues": + cmap = mpl.pyplot.get_cmap(multiclass_colors, len(clf.classes_)) + colors = cmap(np.linspace(0, 1, len(clf.classes_))) + else: + colors = [mpl.colors.to_rgba(color) for color in multiclass_colors] + + if plot_method != "contour": + cmaps = [ + mpl.colors.LinearSegmentedColormap.from_list( + f"colormap_{class_idx}", [(1.0, 1.0, 1.0, 1.0), (r, g, b, 1.0)] + ) + for class_idx, (r, g, b, _) in enumerate(colors) + ] + for idx, quad in enumerate(disp.surface_): + assert quad.cmap == cmaps[idx] + else: + assert_allclose(disp.surface_.colors, colors) + + +def test_cmap_and_colors_logic(pyplot): + """Check the handling logic for `cmap` and `colors`.""" + X, y = load_iris_2d_scaled() + clf = LogisticRegression().fit(X, y) + + with pytest.warns( + UserWarning, + match="'cmap' is ignored in favor of 'multiclass_colors'", + ): + DecisionBoundaryDisplay.from_estimator( + clf, + X, + multiclass_colors="plasma", + cmap="Blues", + ) + + with pytest.warns( + UserWarning, + match="'colors' is ignored in favor of 'multiclass_colors'", + ): + DecisionBoundaryDisplay.from_estimator( + clf, + X, + multiclass_colors="plasma", + colors="blue", + ) + + +def test_subclass_named_constructors_return_type_is_subclass(pyplot): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + clf = LogisticRegression().fit(X, y) + + class SubclassOfDisplay(DecisionBoundaryDisplay): + pass + + curve = SubclassOfDisplay.from_estimator(estimator=clf, X=X) + + assert isinstance(curve, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..75869079be9cc4fd2113a5186960e7acbc3722d4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -0,0 +1,1315 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy.stats.mstats import mquantiles + +from sklearn.compose import make_column_transformer +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor +from sklearn.inspection import PartialDependenceDisplay +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils._testing import _convert_container + + +@pytest.fixture(scope="module") +def diabetes(): + # diabetes dataset, subsampled for speed + data = load_diabetes() + data.data = data.data[:50] + data.target = data.target[:50] + return data + + +@pytest.fixture(scope="module") +def clf_diabetes(diabetes): + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(diabetes.data, diabetes.target) + return clf + + +def custom_values_helper(feature, grid_resolution): + return np.linspace( + *mquantiles(feature, (0.05, 0.95), axis=0), num=grid_resolution, endpoint=True + ) + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("grid_resolution", [10, 20]) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence( + use_custom_values, + grid_resolution, + pyplot, + clf_diabetes, + diabetes, +): + # Test partial dependence plot function. 
+ # Use columns 0 & 2 as 1 is not quantitative (sex) + feature_names = diabetes.feature_names + custom_values = None + if use_custom_values: + custom_values = { + 0: custom_values_helper(diabetes.data[:, 0], grid_resolution), + 2: custom_values_helper(diabetes.data[:, 2], grid_resolution), + } + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2, (0, 2)], + grid_resolution=grid_resolution, + feature_names=feature_names, + contour_kw={"cmap": "jet"}, + custom_values=custom_values, + ) + fig = pyplot.gcf() + axs = fig.get_axes() + assert disp.figure_ is fig + assert len(axs) == 4 + + assert disp.bounding_ax_ is not None + assert disp.axes_.shape == (1, 3) + assert disp.lines_.shape == (1, 3) + assert disp.contours_.shape == (1, 3) + assert disp.deciles_vlines_.shape == (1, 3) + assert disp.deciles_hlines_.shape == (1, 3) + + assert disp.lines_[0, 2] is None + assert disp.contours_[0, 0] is None + assert disp.contours_[0, 1] is None + + # deciles lines: always show on xaxis, only show on yaxis if 2-way PDP + for i in range(3): + assert disp.deciles_vlines_[0, i] is not None + assert disp.deciles_hlines_[0, 0] is None + assert disp.deciles_hlines_[0, 1] is None + assert disp.deciles_hlines_[0, 2] is not None + + assert disp.features == [(0,), (2,), (0, 2)] + assert np.all(disp.feature_names == feature_names) + assert len(disp.deciles) == 2 + for i in [0, 2]: + assert_allclose( + disp.deciles[i], + mquantiles(diabetes.data[:, i], prob=np.arange(0.1, 1.0, 0.1)), + ) + + single_feature_positions = [(0, (0, 0)), (2, (0, 1))] + expected_ylabels = ["Partial dependence", ""] + + for i, (feat_col, pos) in enumerate(single_feature_positions): + ax = disp.axes_[pos] + assert ax.get_ylabel() == expected_ylabels[i] + assert ax.get_xlabel() == diabetes.feature_names[feat_col] + + line = disp.lines_[pos] + + avg_preds = disp.pd_results[i] + assert avg_preds.average.shape == (1, grid_resolution) + target_idx = disp.target_idx + + line_data = line.get_data() + assert_allclose(line_data[0], avg_preds["grid_values"][0]) + assert_allclose(line_data[1], avg_preds.average[target_idx].ravel()) + + # two feature position + ax = disp.axes_[0, 2] + coutour = disp.contours_[0, 2] + assert coutour.get_cmap().name == "jet" + assert ax.get_xlabel() == diabetes.feature_names[0] + assert ax.get_ylabel() == diabetes.feature_names[2] + + +@pytest.mark.parametrize( + "kind, centered, subsample, shape", + [ + ("average", False, None, (1, 3)), + ("individual", False, None, (1, 3, 50)), + ("both", False, None, (1, 3, 51)), + ("individual", False, 20, (1, 3, 20)), + ("both", False, 20, (1, 3, 21)), + ("individual", False, 0.5, (1, 3, 25)), + ("both", False, 0.5, (1, 3, 26)), + ("average", True, None, (1, 3)), + ("individual", True, None, (1, 3, 50)), + ("both", True, None, (1, 3, 51)), + ("individual", True, 20, (1, 3, 20)), + ("both", True, 20, (1, 3, 21)), + ], +) +def test_plot_partial_dependence_kind( + pyplot, + kind, + centered, + subsample, + shape, + clf_diabetes, + diabetes, +): + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1, 2], + kind=kind, + centered=centered, + subsample=subsample, + ) + + assert disp.axes_.shape == (1, 3) + assert disp.lines_.shape == shape + assert disp.contours_.shape == (1, 3) + + assert disp.contours_[0, 0] is None + assert disp.contours_[0, 1] is None + assert disp.contours_[0, 2] is None + + if centered: + assert all([ln._y[0] == 0.0 for ln in disp.lines_.ravel() if ln is not None]) + else: + assert all([ln._y[0] 
!= 0.0 for ln in disp.lines_.ravel() if ln is not None]) + + +@pytest.mark.parametrize( + "input_type, feature_names_type", + [ + ("dataframe", None), + ("dataframe", "list"), + ("list", "list"), + ("array", "list"), + ("dataframe", "array"), + ("list", "array"), + ("array", "array"), + ("dataframe", "series"), + ("list", "series"), + ("array", "series"), + ("dataframe", "index"), + ("list", "index"), + ("array", "index"), + ], +) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_str_features( + pyplot, + use_custom_values, + clf_diabetes, + diabetes, + input_type, + feature_names_type, +): + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + + if input_type == "dataframe": + pd = pytest.importorskip("pandas") + X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) + elif input_type == "list": + X = diabetes.data.tolist() + else: + X = diabetes.data + + if feature_names_type is None: + feature_names = None + else: + feature_names = _convert_container(diabetes.feature_names, feature_names_type) + + grid_resolution = 25 + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + # check with str features and array feature names and single column + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + X, + [("age", "bmi"), "bmi"], + grid_resolution=grid_resolution, + feature_names=feature_names, + n_cols=1, + line_kw={"alpha": 0.8}, + custom_values=custom_values, + ) + fig = pyplot.gcf() + axs = fig.get_axes() + assert len(axs) == 3 + + assert disp.figure_ is fig + assert disp.axes_.shape == (2, 1) + assert disp.lines_.shape == (2, 1) + assert disp.contours_.shape == (2, 1) + assert disp.deciles_vlines_.shape == (2, 1) + assert disp.deciles_hlines_.shape == (2, 1) + + assert disp.lines_[0, 0] is None + assert disp.deciles_vlines_[0, 0] is not None + assert disp.deciles_hlines_[0, 0] is not None + assert disp.contours_[1, 0] is None + assert disp.deciles_hlines_[1, 0] is None + assert disp.deciles_vlines_[1, 0] is not None + + # line + ax = disp.axes_[1, 0] + assert ax.get_xlabel() == "bmi" + assert ax.get_ylabel() == "Partial dependence" + + line = disp.lines_[1, 0] + avg_preds = disp.pd_results[1] + target_idx = disp.target_idx + assert line.get_alpha() == 0.8 + + line_data = line.get_data() + assert_allclose(line_data[0], avg_preds["grid_values"][0]) + assert_allclose(line_data[1], avg_preds.average[target_idx].ravel()) + + # contour + ax = disp.axes_[0, 0] + assert ax.get_xlabel() == "age" + assert ax.get_ylabel() == "bmi" + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_custom_axes( + use_custom_values, pyplot, clf_diabetes, diabetes +): + grid_resolution = 25 + fig, (ax1, ax2) = pyplot.subplots(1, 2) + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", ("age", "bmi")], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + ax=[ax1, ax2], + 
custom_values=custom_values, + ) + assert fig is disp.figure_ + assert disp.bounding_ax_ is None + assert disp.axes_.shape == (2,) + assert disp.axes_[0] is ax1 + assert disp.axes_[1] is ax2 + + ax = disp.axes_[0] + assert ax.get_xlabel() == "age" + assert ax.get_ylabel() == "Partial dependence" + + line = disp.lines_[0] + avg_preds = disp.pd_results[0] + target_idx = disp.target_idx + + line_data = line.get_data() + assert_allclose(line_data[0], avg_preds["grid_values"][0]) + assert_allclose(line_data[1], avg_preds.average[target_idx].ravel()) + + # contour + ax = disp.axes_[1] + assert ax.get_xlabel() == "age" + assert ax.get_ylabel() == "bmi" + + +@pytest.mark.parametrize( + "kind, lines", [("average", 1), ("individual", 50), ("both", 51)] +) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_passing_numpy_axes( + pyplot, + clf_diabetes, + diabetes, + use_custom_values, + kind, + lines, +): + grid_resolution = 25 + feature_names = diabetes.feature_names + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp1 = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + kind=kind, + grid_resolution=grid_resolution, + feature_names=feature_names, + custom_values=custom_values, + ) + assert disp1.axes_.shape == (1, 2) + assert disp1.axes_[0, 0].get_ylabel() == "Partial dependence" + assert disp1.axes_[0, 1].get_ylabel() == "" + assert len(disp1.axes_[0, 0].get_lines()) == lines + assert len(disp1.axes_[0, 1].get_lines()) == lines + + lr = LinearRegression() + lr.fit(diabetes.data, diabetes.target) + + disp2 = PartialDependenceDisplay.from_estimator( + lr, + diabetes.data, + ["age", "bmi"], + kind=kind, + grid_resolution=grid_resolution, + feature_names=feature_names, + ax=disp1.axes_, + ) + + assert np.all(disp1.axes_ == disp2.axes_) + assert len(disp2.axes_[0, 0].get_lines()) == 2 * lines + assert len(disp2.axes_[0, 1].get_lines()) == 2 * lines + + +@pytest.mark.parametrize("nrows, ncols", [(2, 2), (3, 1)]) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_incorrent_num_axes( + pyplot, + clf_diabetes, + diabetes, + use_custom_values, + nrows, + ncols, +): + grid_resolution = 5 + fig, axes = pyplot.subplots(nrows, ncols) + axes_formats = [list(axes.ravel()), tuple(axes.ravel()), axes] + + msg = "Expected ax to have 2 axes, got {}".format(nrows * ncols) + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + custom_values=custom_values, + ) + + for ax_format in axes_formats: + with 
pytest.raises(ValueError, match=msg): + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + ax=ax_format, + custom_values=custom_values, + ) + + # with axes object + with pytest.raises(ValueError, match=msg): + disp.plot(ax=ax_format) + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_with_same_axes( + use_custom_values, pyplot, clf_diabetes, diabetes +): + # The first call to plot_partial_dependence will create two new axes to + # place in the space of the passed in axes, which results in a total of + # three axes in the figure. + # Currently the API does not allow for the second call to + # plot_partial_dependence to use the same axes again, because it will + # create two new axes in the space resulting in five axes. To get the + # expected behavior one needs to pass the generated axes into the second + # call: + # disp1 = plot_partial_dependence(...) + # disp2 = plot_partial_dependence(..., ax=disp1.axes_) + + grid_resolution = 25 + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + fig, ax = pyplot.subplots() + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + ax=ax, + custom_values=custom_values, + ) + + msg = ( + "The ax was already used in another plot function, please set " + "ax=display.axes_ instead" + ) + + with pytest.raises(ValueError, match=msg): + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + grid_resolution=grid_resolution, + feature_names=diabetes.feature_names, + custom_values=custom_values, + ax=ax, + ) + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_feature_name_reuse( + use_custom_values, pyplot, clf_diabetes, diabetes +): + # second call to plot does not change the feature names from the first + # call + grid_resolution = 10 + + custom_values = None + if use_custom_values: + custom_values = { + 0: custom_values_helper(diabetes.data[:, 0], grid_resolution), + 1: custom_values_helper(diabetes.data[:, 1], grid_resolution), + } + + feature_names = diabetes.feature_names + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + grid_resolution=grid_resolution, + feature_names=feature_names, + custom_values=custom_values, + ) + + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + grid_resolution=grid_resolution, + ax=disp.axes_, + custom_values=custom_values, + ) + + for i, ax in enumerate(disp.axes_.ravel()): + assert ax.get_xlabel() == feature_names[i] + + +@pytest.mark.filterwarnings("ignore:A Bunch will be returned") +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_multiclass(use_custom_values, pyplot): + grid_resolution = 25 + clf_int = GradientBoostingClassifier(n_estimators=10, random_state=1) + iris = load_iris() + + custom_values = None + if use_custom_values: + custom_values = { + 0: 
custom_values_helper(iris.data[:, 0], grid_resolution), + 1: custom_values_helper(iris.data[:, 1], grid_resolution), + } + + # Test partial dependence plot function on multi-class input. + clf_int.fit(iris.data, iris.target) + + disp_target_0 = PartialDependenceDisplay.from_estimator( + clf_int, + iris.data, + [0, 1], + target=0, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + assert disp_target_0.figure_ is pyplot.gcf() + assert disp_target_0.axes_.shape == (1, 2) + assert disp_target_0.lines_.shape == (1, 2) + assert disp_target_0.contours_.shape == (1, 2) + assert disp_target_0.deciles_vlines_.shape == (1, 2) + assert disp_target_0.deciles_hlines_.shape == (1, 2) + assert all(c is None for c in disp_target_0.contours_.flat) + assert disp_target_0.target_idx == 0 + + # now with symbol labels + target = iris.target_names[iris.target] + clf_symbol = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf_symbol.fit(iris.data, target) + + disp_symbol = PartialDependenceDisplay.from_estimator( + clf_symbol, + iris.data, + [0, 1], + target="setosa", + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + assert disp_symbol.figure_ is pyplot.gcf() + assert disp_symbol.axes_.shape == (1, 2) + assert disp_symbol.lines_.shape == (1, 2) + assert disp_symbol.contours_.shape == (1, 2) + assert disp_symbol.deciles_vlines_.shape == (1, 2) + assert disp_symbol.deciles_hlines_.shape == (1, 2) + assert all(c is None for c in disp_symbol.contours_.flat) + assert disp_symbol.target_idx == 0 + + for int_result, symbol_result in zip( + disp_target_0.pd_results, disp_symbol.pd_results + ): + assert_allclose(int_result.average, symbol_result.average) + assert_allclose(int_result["grid_values"], symbol_result["grid_values"]) + + # check that the pd plots are different for another target + + disp_target_1 = PartialDependenceDisplay.from_estimator( + clf_int, + iris.data, + [0, 3], + target=1, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + target_0_data_y = disp_target_0.lines_[0, 0].get_data()[1] + target_1_data_y = disp_target_1.lines_[0, 0].get_data()[1] + assert any(target_0_data_y != target_1_data_y) + + +multioutput_regression_data = make_regression(n_samples=50, n_targets=2, random_state=0) + + +@pytest.mark.parametrize("target", [0, 1]) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_multioutput(use_custom_values, pyplot, target): + # Test partial dependence plot function on multi-output input. 
+ X, y = multioutput_regression_data + clf = LinearRegression().fit(X, y) + + grid_resolution = 25 + + custom_values = None + if use_custom_values: + custom_values = { + 0: custom_values_helper(X[:, 0], grid_resolution), + 1: custom_values_helper(X[:, 1], grid_resolution), + } + + disp = PartialDependenceDisplay.from_estimator( + clf, + X, + [0, 1], + target=target, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + fig = pyplot.gcf() + axs = fig.get_axes() + assert len(axs) == 3 + assert disp.target_idx == target + assert disp.bounding_ax_ is not None + + positions = [(0, 0), (0, 1)] + expected_label = ["Partial dependence", ""] + + for i, pos in enumerate(positions): + ax = disp.axes_[pos] + assert ax.get_ylabel() == expected_label[i] + assert ax.get_xlabel() == f"x{i}" + + +def test_plot_partial_dependence_dataframe(pyplot, clf_diabetes, diabetes): + pd = pytest.importorskip("pandas") + df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) + + grid_resolution = 25 + + PartialDependenceDisplay.from_estimator( + clf_diabetes, + df, + ["bp", "s1"], + grid_resolution=grid_resolution, + feature_names=df.columns.tolist(), + ) + + +dummy_classification_data = make_classification(random_state=0) + + +@pytest.mark.parametrize( + "data, params, err_msg", + [ + ( + multioutput_regression_data, + {"target": None, "features": [0]}, + "target must be specified for multi-output", + ), + ( + multioutput_regression_data, + {"target": -1, "features": [0]}, + r"target must be in \[0, n_tasks\]", + ), + ( + multioutput_regression_data, + {"target": 100, "features": [0]}, + r"target must be in \[0, n_tasks\]", + ), + ( + dummy_classification_data, + {"features": ["foobar"], "feature_names": None}, + "Feature 'foobar' not in feature_names", + ), + ( + dummy_classification_data, + {"features": ["foobar"], "feature_names": ["abcd", "def"]}, + "Feature 'foobar' not in feature_names", + ), + ( + dummy_classification_data, + {"features": [(1, 2, 3)]}, + "Each entry in features must be either an int, ", + ), + ( + dummy_classification_data, + {"features": [1, {}]}, + "Each entry in features must be either an int, ", + ), + ( + dummy_classification_data, + {"features": [tuple()]}, + "Each entry in features must be either an int, ", + ), + ( + dummy_classification_data, + {"features": [123], "feature_names": ["blahblah"]}, + "All entries of features must be less than ", + ), + ( + dummy_classification_data, + {"features": [0, 1, 2], "feature_names": ["a", "b", "a"]}, + "feature_names should not contain duplicates", + ), + ( + dummy_classification_data, + {"features": [1, 2], "kind": ["both"]}, + "When `kind` is provided as a list of strings, it should contain", + ), + ( + dummy_classification_data, + {"features": [1], "subsample": -1}, + "When an integer, subsample=-1 should be positive.", + ), + ( + dummy_classification_data, + {"features": [1], "subsample": 1.2}, + r"When a floating-point, subsample=1.2 should be in the \(0, 1\) range", + ), + ( + dummy_classification_data, + {"features": [1, 2], "categorical_features": [1.0, 2.0]}, + "Expected `categorical_features` to be an array-like of boolean,", + ), + ( + dummy_classification_data, + {"features": [(1, 2)], "categorical_features": [2]}, + "Two-way partial dependence plots are not supported for pairs", + ), + ( + dummy_classification_data, + {"features": [1], "categorical_features": [1], "kind": "individual"}, + "It is not possible to display individual effects", + ), + ], +) +def test_plot_partial_dependence_error(pyplot, data, 
params, err_msg): + X, y = data + estimator = LinearRegression().fit(X, y) + + with pytest.raises(ValueError, match=err_msg): + PartialDependenceDisplay.from_estimator(estimator, X, **params) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"target": 4, "features": [0]}, "target not in est.classes_, got 4"), + ({"target": None, "features": [0]}, "target must be specified for multi-class"), + ( + {"target": 1, "features": [4.5]}, + "Each entry in features must be either an int,", + ), + ], +) +def test_plot_partial_dependence_multiclass_error(pyplot, params, err_msg): + iris = load_iris() + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + with pytest.raises(ValueError, match=err_msg): + PartialDependenceDisplay.from_estimator(clf, iris.data, **params) + + +def test_plot_partial_dependence_does_not_override_ylabel( + pyplot, clf_diabetes, diabetes +): + # Non-regression test to be sure to not override the ylabel if it has been + # See https://github.com/scikit-learn/scikit-learn/issues/15772 + _, axes = pyplot.subplots(1, 2) + axes[0].set_ylabel("Hello world") + PartialDependenceDisplay.from_estimator( + clf_diabetes, diabetes.data, [0, 1], ax=axes + ) + + assert axes[0].get_ylabel() == "Hello world" + assert axes[1].get_ylabel() == "Partial dependence" + + +@pytest.mark.parametrize( + "categorical_features, array_type", + [ + (["col_A", "col_C"], "dataframe"), + ([0, 2], "array"), + ([True, False, True], "array"), + ], +) +def test_plot_partial_dependence_with_categorical( + pyplot, categorical_features, array_type +): + X = [[1, 1, "A"], [2, 0, "C"], [3, 2, "B"]] + column_name = ["col_A", "col_B", "col_C"] + X = _convert_container(X, array_type, columns_name=column_name) + y = np.array([1.2, 0.5, 0.45]).T + + preprocessor = make_column_transformer((OneHotEncoder(), categorical_features)) + model = make_pipeline(preprocessor, LinearRegression()) + model.fit(X, y) + + # single feature + disp = PartialDependenceDisplay.from_estimator( + model, + X, + features=["col_C"], + feature_names=column_name, + categorical_features=categorical_features, + ) + + assert disp.figure_ is pyplot.gcf() + assert disp.bars_.shape == (1, 1) + assert disp.bars_[0][0] is not None + assert disp.lines_.shape == (1, 1) + assert disp.lines_[0][0] is None + assert disp.contours_.shape == (1, 1) + assert disp.contours_[0][0] is None + assert disp.deciles_vlines_.shape == (1, 1) + assert disp.deciles_vlines_[0][0] is None + assert disp.deciles_hlines_.shape == (1, 1) + assert disp.deciles_hlines_[0][0] is None + assert disp.axes_[0, 0].get_legend() is None + + # interaction between two features + disp = PartialDependenceDisplay.from_estimator( + model, + X, + features=[("col_A", "col_C")], + feature_names=column_name, + categorical_features=categorical_features, + ) + + assert disp.figure_ is pyplot.gcf() + assert disp.bars_.shape == (1, 1) + assert disp.bars_[0][0] is None + assert disp.lines_.shape == (1, 1) + assert disp.lines_[0][0] is None + assert disp.contours_.shape == (1, 1) + assert disp.contours_[0][0] is None + assert disp.deciles_vlines_.shape == (1, 1) + assert disp.deciles_vlines_[0][0] is None + assert disp.deciles_hlines_.shape == (1, 1) + assert disp.deciles_hlines_[0][0] is None + assert disp.axes_[0, 0].get_legend() is None + + +def test_plot_partial_dependence_legend(pyplot): + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "col_A": ["A", "B", "C"], + "col_B": [1.0, 0.0, 2.0], + "col_C": ["C", "B", "A"], + } + ) + y = 
np.array([1.2, 0.5, 0.45]).T + + categorical_features = ["col_A", "col_C"] + preprocessor = make_column_transformer((OneHotEncoder(), categorical_features)) + model = make_pipeline(preprocessor, LinearRegression()) + model.fit(X, y) + + disp = PartialDependenceDisplay.from_estimator( + model, + X, + features=["col_B", "col_C"], + categorical_features=categorical_features, + kind=["both", "average"], + ) + + legend_text = disp.axes_[0, 0].get_legend().get_texts() + assert len(legend_text) == 1 + assert legend_text[0].get_text() == "average" + assert disp.axes_[0, 1].get_legend() is None + + +@pytest.mark.parametrize( + "kind, expected_shape", + [("average", (1, 2)), ("individual", (1, 2, 20)), ("both", (1, 2, 21))], +) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_plot_partial_dependence_subsampling( + pyplot, + clf_diabetes, + diabetes, + use_custom_values, + kind, + expected_shape, +): + # check that the subsampling is properly working + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/pull/18359 + matplotlib = pytest.importorskip("matplotlib") + grid_resolution = 25 + feature_names = diabetes.feature_names + + age = diabetes.data[:, diabetes.feature_names.index("age")] + bmi = diabetes.data[:, diabetes.feature_names.index("bmi")] + + custom_values = None + if use_custom_values: + custom_values = { + "age": custom_values_helper(age, grid_resolution), + "bmi": custom_values_helper(bmi, grid_resolution), + } + + disp1 = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + ["age", "bmi"], + kind=kind, + grid_resolution=grid_resolution, + feature_names=feature_names, + subsample=20, + random_state=0, + custom_values=custom_values, + ) + + assert disp1.lines_.shape == expected_shape + assert all( + [isinstance(line, matplotlib.lines.Line2D) for line in disp1.lines_.ravel()] + ) + + +@pytest.mark.parametrize( + "kind, line_kw, label", + [ + ("individual", {}, None), + ("individual", {"label": "xxx"}, None), + ("average", {}, None), + ("average", {"label": "xxx"}, "xxx"), + ("both", {}, "average"), + ("both", {"label": "xxx"}, "xxx"), + ], +) +def test_partial_dependence_overwrite_labels( + pyplot, + clf_diabetes, + diabetes, + kind, + line_kw, + label, +): + """Check that we can overwrite the label of the PDP plot.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2], + grid_resolution=25, + feature_names=diabetes.feature_names, + kind=kind, + line_kw=line_kw, + ) + + for ax in disp.axes_.ravel(): + if label is None: + assert ax.get_legend() is None + else: + legend_text = ax.get_legend().get_texts() + assert len(legend_text) == 1 + assert legend_text[0].get_text() == label + + +@pytest.mark.parametrize( + "categorical_features, array_type", + [ + (["col_A", "col_C"], "dataframe"), + ([0, 2], "array"), + ([True, False, True], "array"), + ], +) +def test_grid_resolution_with_categorical(pyplot, categorical_features, array_type): + """Check that we raise a ValueError when the grid_resolution is too small + with respect to the number of categories in the categorical features targeted. 
+ """ + X = [["A", 1, "A"], ["B", 0, "C"], ["C", 2, "B"]] + column_name = ["col_A", "col_B", "col_C"] + X = _convert_container(X, array_type, columns_name=column_name) + y = np.array([1.2, 0.5, 0.45]).T + + preprocessor = make_column_transformer((OneHotEncoder(), categorical_features)) + model = make_pipeline(preprocessor, LinearRegression()) + model.fit(X, y) + + err_msg = ( + "resolution of the computed grid is less than the minimum number of categories" + ) + with pytest.raises(ValueError, match=err_msg): + PartialDependenceDisplay.from_estimator( + model, + X, + features=["col_C"], + feature_names=column_name, + categorical_features=categorical_features, + grid_resolution=2, + ) + + +@pytest.mark.parametrize("kind", ["individual", "average", "both"]) +@pytest.mark.parametrize("centered", [True, False]) +def test_partial_dependence_plot_limits_one_way( + pyplot, clf_diabetes, diabetes, kind, centered +): + """Check that the PD limit on the plots are properly set on one-way plots.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=(0, 1), + kind=kind, + grid_resolution=25, + feature_names=diabetes.feature_names, + ) + + range_pd = np.array([-1, 1], dtype=np.float64) + for pd in disp.pd_results: + if "average" in pd: + pd["average"][...] = range_pd[1] + pd["average"][0, 0] = range_pd[0] + if "individual" in pd: + pd["individual"][...] = range_pd[1] + pd["individual"][0, 0, 0] = range_pd[0] + + disp.plot(centered=centered) + # check that we anchor to zero x-axis when centering + y_lim = range_pd - range_pd[0] if centered else range_pd + padding = 0.05 * (y_lim[1] - y_lim[0]) + y_lim[0] -= padding + y_lim[1] += padding + for ax in disp.axes_.ravel(): + assert_allclose(ax.get_ylim(), y_lim) + + +@pytest.mark.parametrize("centered", [True, False]) +def test_partial_dependence_plot_limits_two_way( + pyplot, clf_diabetes, diabetes, centered +): + """Check that the PD limit on the plots are properly set on two-way plots.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=[(0, 1)], + kind="average", + grid_resolution=25, + feature_names=diabetes.feature_names, + ) + + range_pd = np.array([-1, 1], dtype=np.float64) + for pd in disp.pd_results: + pd["average"][...] 
= range_pd[1] + pd["average"][0, 0] = range_pd[0] + + disp.plot(centered=centered) + contours = disp.contours_[0, 0] + levels = range_pd - range_pd[0] if centered else range_pd + + padding = 0.05 * (levels[1] - levels[0]) + levels[0] -= padding + levels[1] += padding + expect_levels = np.linspace(*levels, num=8) + assert_allclose(contours.levels, expect_levels) + + +def test_partial_dependence_kind_list( + pyplot, + clf_diabetes, + diabetes, +): + """Check that we can provide a list of strings to kind parameter.""" + matplotlib = pytest.importorskip("matplotlib") + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=[0, 2, (1, 2)], + grid_resolution=20, + kind=["both", "both", "average"], + ) + + for idx in [0, 1]: + assert all( + [ + isinstance(line, matplotlib.lines.Line2D) + for line in disp.lines_[0, idx].ravel() + ] + ) + assert disp.contours_[0, idx] is None + + assert disp.contours_[0, 2] is not None + assert all([line is None for line in disp.lines_[0, 2].ravel()]) + + +@pytest.mark.parametrize( + "features, kind", + [ + ([0, 2, (1, 2)], "individual"), + ([0, 2, (1, 2)], "both"), + ([(0, 1), (0, 2), (1, 2)], "individual"), + ([(0, 1), (0, 2), (1, 2)], "both"), + ([0, 2, (1, 2)], ["individual", "individual", "individual"]), + ([0, 2, (1, 2)], ["both", "both", "both"]), + ], +) +def test_partial_dependence_kind_error( + pyplot, + clf_diabetes, + diabetes, + features, + kind, +): + """Check that we raise an informative error when 2-way PD is requested + together with 1-way PD/ICE""" + warn_msg = ( + "ICE plot cannot be rendered for 2-way feature interactions. 2-way " + "feature interactions mandates PD plots using the 'average' kind" + ) + with pytest.raises(ValueError, match=warn_msg): + PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=features, + grid_resolution=20, + kind=kind, + ) + + +@pytest.mark.parametrize( + "line_kw, pd_line_kw, ice_lines_kw, expected_colors", + [ + ({"color": "r"}, {"color": "g"}, {"color": "b"}, ("g", "b")), + (None, {"color": "g"}, {"color": "b"}, ("g", "b")), + ({"color": "r"}, None, {"color": "b"}, ("r", "b")), + ({"color": "r"}, {"color": "g"}, None, ("g", "r")), + ({"color": "r"}, None, None, ("r", "r")), + ({"color": "r"}, {"linestyle": "--"}, {"linestyle": "-."}, ("r", "r")), + ({"c": "r"}, None, None, ("r", "r")), + ({"c": "r", "ls": "-."}, {"color": "g"}, {"color": "b"}, ("g", "b")), + ({"c": "r"}, {"c": "g"}, {"c": "b"}, ("g", "b")), + ({"c": "r"}, {"ls": "--"}, {"ls": "-."}, ("r", "r")), + ], +) +def test_plot_partial_dependence_lines_kw( + pyplot, + clf_diabetes, + diabetes, + line_kw, + pd_line_kw, + ice_lines_kw, + expected_colors, +): + """Check that passing `pd_line_kw` and `ice_lines_kw` will act on the + specific lines in the plot. 
+ """ + + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2], + grid_resolution=20, + feature_names=diabetes.feature_names, + n_cols=2, + kind="both", + line_kw=line_kw, + pd_line_kw=pd_line_kw, + ice_lines_kw=ice_lines_kw, + ) + + line = disp.lines_[0, 0, -1] + assert line.get_color() == expected_colors[0], ( + f"{line.get_color()}!={expected_colors[0]}\n{line_kw} and {pd_line_kw}" + ) + if pd_line_kw is not None: + if "linestyle" in pd_line_kw: + assert line.get_linestyle() == pd_line_kw["linestyle"] + elif "ls" in pd_line_kw: + assert line.get_linestyle() == pd_line_kw["ls"] + else: + assert line.get_linestyle() == "--" + + line = disp.lines_[0, 0, 0] + assert line.get_color() == expected_colors[1], ( + f"{line.get_color()}!={expected_colors[1]}" + ) + if ice_lines_kw is not None: + if "linestyle" in ice_lines_kw: + assert line.get_linestyle() == ice_lines_kw["linestyle"] + elif "ls" in ice_lines_kw: + assert line.get_linestyle() == ice_lines_kw["ls"] + else: + assert line.get_linestyle() == "-" + + +def test_partial_dependence_display_wrong_len_kind( + pyplot, + clf_diabetes, + diabetes, +): + """Check that we raise an error when `kind` is a list with a wrong length. + + This case can only be triggered using the `PartialDependenceDisplay.from_estimator` + method. + """ + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + features=[0, 2], + grid_resolution=20, + kind="average", # len(kind) != len(features) + ) + + # alter `kind` to be a list with a length different from length of `features` + disp.kind = ["average"] + err_msg = ( + r"When `kind` is provided as a list of strings, it should contain as many" + r" elements as `features`. `kind` contains 1 element\(s\) and `features`" + r" contains 2 element\(s\)." + ) + with pytest.raises(ValueError, match=err_msg): + disp.plot() + + +@pytest.mark.parametrize( + "kind", + ["individual", "both", "average", ["average", "both"], ["individual", "both"]], +) +def test_partial_dependence_display_kind_centered_interaction( + pyplot, + kind, + clf_diabetes, + diabetes, +): + """Check that we properly center ICE and PD when passing kind as a string and as a + list.""" + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + kind=kind, + centered=True, + subsample=5, + ) + + assert all([ln._y[0] == 0.0 for ln in disp.lines_.ravel() if ln is not None]) + + +def test_partial_dependence_display_with_constant_sample_weight( + pyplot, + clf_diabetes, + diabetes, +): + """Check that the utilization of a constant sample weight maintains the + standard behavior. + """ + disp = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + kind="average", + method="brute", + ) + + sample_weight = np.ones_like(diabetes.target) + disp_sw = PartialDependenceDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 1], + sample_weight=sample_weight, + kind="average", + method="brute", + ) + + assert np.array_equal( + disp.pd_results[0]["average"], disp_sw.pd_results[0]["average"] + ) + + +def test_subclass_named_constructors_return_type_is_subclass( + pyplot, diabetes, clf_diabetes +): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + + class SubclassOfDisplay(PartialDependenceDisplay): + pass + + curve = SubclassOfDisplay.from_estimator( + clf_diabetes, + diabetes.data, + [0, 2, (0, 2)], + ) + + assert isinstance(curve, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_partial_dependence.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_partial_dependence.py new file mode 100644 index 0000000000000000000000000000000000000000..816fe5512edc4a142380c3d84bc59a030e1168ff --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_partial_dependence.py @@ -0,0 +1,1217 @@ +""" +Testing for the partial dependence module. +""" + +import re +import warnings + +import numpy as np +import pytest + +import sklearn +from sklearn.base import BaseEstimator, ClassifierMixin, clone, is_regressor +from sklearn.cluster import KMeans +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_iris, make_classification, make_regression +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import ( + GradientBoostingClassifier, + GradientBoostingRegressor, + HistGradientBoostingClassifier, + HistGradientBoostingRegressor, + RandomForestRegressor, +) +from sklearn.exceptions import NotFittedError +from sklearn.impute import SimpleImputer +from sklearn.inspection import partial_dependence +from sklearn.inspection._partial_dependence import ( + _grid_from_X, + _partial_dependence_brute, + _partial_dependence_recursion, +) +from sklearn.linear_model import LinearRegression, LogisticRegression, MultiTaskLasso +from sklearn.metrics import r2_score +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + OneHotEncoder, + PolynomialFeatures, + RobustScaler, + StandardScaler, + scale, +) +from sklearn.tree import DecisionTreeRegressor +from sklearn.tree.tests.test_tree import assert_is_subtree +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import _IS_32BIT +from sklearn.utils.validation import check_random_state + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] + + +# (X, y), n_targets <-- as expected in the output of partial_dep() +binary_classification_data = (make_classification(n_samples=50, random_state=0), 1) +multiclass_classification_data = ( + make_classification( + n_samples=50, n_classes=3, n_clusters_per_class=1, random_state=0 + ), + 3, +) +regression_data = (make_regression(n_samples=50, random_state=0), 1) +multioutput_regression_data = ( + make_regression(n_samples=50, n_targets=2, random_state=0), + 2, +) + +# iris +iris = load_iris() + + +@pytest.mark.parametrize( + "Estimator, method, data", + [ + (GradientBoostingClassifier, "auto", binary_classification_data), + (GradientBoostingClassifier, "auto", multiclass_classification_data), + (GradientBoostingClassifier, "brute", binary_classification_data), + (GradientBoostingClassifier, "brute", multiclass_classification_data), + (GradientBoostingRegressor, "auto", regression_data), + (GradientBoostingRegressor, "brute", regression_data), + (DecisionTreeRegressor, "brute", 
regression_data), + (LinearRegression, "brute", regression_data), + (LinearRegression, "brute", multioutput_regression_data), + (LogisticRegression, "brute", binary_classification_data), + (LogisticRegression, "brute", multiclass_classification_data), + (MultiTaskLasso, "brute", multioutput_regression_data), + ], +) +@pytest.mark.parametrize("grid_resolution", (5, 10)) +@pytest.mark.parametrize("features", ([1], [1, 2])) +@pytest.mark.parametrize("kind", ("average", "individual", "both")) +@pytest.mark.parametrize("use_custom_values", [True, False]) +def test_output_shape( + Estimator, method, data, grid_resolution, features, kind, use_custom_values +): + # Check that partial_dependence has consistent output shape for different + # kinds of estimators: + # - classifiers with binary and multiclass settings + # - regressors + # - multi-task regressors + + est = Estimator() + if hasattr(est, "n_estimators"): + est.set_params(n_estimators=2) # speed-up computations + + # n_target corresponds to the number of classes (1 for binary classif) or + # the number of tasks / outputs in multi task settings. It's equal to 1 for + # classical regression_data. + (X, y), n_targets = data + n_instances = X.shape[0] + + custom_values = None + if use_custom_values: + grid_resolution = 5 + custom_values = {f: X[:grid_resolution, f] for f in features} + + est.fit(X, y) + result = partial_dependence( + est, + X=X, + features=features, + method=method, + kind=kind, + grid_resolution=grid_resolution, + custom_values=custom_values, + ) + pdp, axes = result, result["grid_values"] + + expected_pdp_shape = (n_targets, *[grid_resolution for _ in range(len(features))]) + expected_ice_shape = ( + n_targets, + n_instances, + *[grid_resolution for _ in range(len(features))], + ) + if kind == "average": + assert pdp.average.shape == expected_pdp_shape + elif kind == "individual": + assert pdp.individual.shape == expected_ice_shape + else: # 'both' + assert pdp.average.shape == expected_pdp_shape + assert pdp.individual.shape == expected_ice_shape + + expected_axes_shape = (len(features), grid_resolution) + assert axes is not None + assert np.asarray(axes).shape == expected_axes_shape + + +def test_grid_from_X(): + # tests for _grid_from_X: sanity check for output, and for shapes. + + # Make sure that the grid is a cartesian product of the input (it will use + # the unique values instead of the percentiles) + percentiles = (0.05, 0.95) + grid_resolution = 100 + is_categorical = [False, False] + X = np.asarray([[1, 2], [3, 4]]) + grid, axes = _grid_from_X(X, percentiles, is_categorical, grid_resolution, {}) + assert_array_equal(grid, [[1, 2], [1, 4], [3, 2], [3, 4]]) + assert_array_equal(axes, X.T) + + # test shapes of returned objects depending on the number of unique values + # for a feature. 
+ rng = np.random.RandomState(0) + grid_resolution = 15 + + # n_unique_values > grid_resolution + X = rng.normal(size=(20, 2)) + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + assert grid.shape == (grid_resolution * grid_resolution, X.shape[1]) + assert np.asarray(axes).shape == (2, grid_resolution) + assert grid.dtype == X.dtype + + # n_unique_values < grid_resolution, will use actual values + n_unique_values = 12 + X[n_unique_values - 1 :, 0] = 12345 + rng.shuffle(X) # just to make sure the order is irrelevant + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + assert grid.shape == (n_unique_values * grid_resolution, X.shape[1]) + # axes is a list of arrays of different shapes + assert axes[0].shape == (n_unique_values,) + assert axes[1].shape == (grid_resolution,) + assert grid.dtype == X.dtype + + # Check that uses custom_range + X = rng.normal(size=(20, 2)) + X[n_unique_values - 1 :, 0] = 12345 + col_1_range = [0, 2, 3] + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical=is_categorical, + grid_resolution=grid_resolution, + custom_values={1: col_1_range}, + ) + assert grid.shape == (n_unique_values * len(col_1_range), X.shape[1]) + # axes is a list of arrays of different shapes + assert axes[0].shape == (n_unique_values,) + assert axes[1].shape == (len(col_1_range),) + assert grid.dtype == X.dtype + + # Check that grid_resolution does not impact custom_range + X = rng.normal(size=(20, 2)) + col_0_range = [0, 2, 3, 4, 5, 6] + grid_resolution = 5 + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical=is_categorical, + grid_resolution=grid_resolution, + custom_values={0: col_0_range}, + ) + assert grid.shape == (grid_resolution * len(col_0_range), X.shape[1]) + # axes is a list of arrays of different shapes + assert axes[0].shape == (len(col_0_range),) + assert axes[1].shape == (grid_resolution,) + assert grid.dtype == np.result_type(X, np.asarray(col_0_range).dtype) + + X = np.array([[0, "a"], [1, "b"], [2, "c"]]) + + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical=is_categorical, + grid_resolution=grid_resolution, + custom_values={1: ["a", "b", "c"]}, + ) + assert grid.dtype == object + + +@pytest.mark.parametrize( + "grid_resolution", + [ + 2, # since n_categories > 2, we should not use quantiles resampling + 100, + ], +) +def test_grid_from_X_with_categorical(grid_resolution): + """Check that `_grid_from_X` always sample from categories and does not + depend from the percentiles. + """ + pd = pytest.importorskip("pandas") + percentiles = (0.05, 0.95) + is_categorical = [True] + X = pd.DataFrame({"cat_feature": ["A", "B", "C", "A", "B", "D", "E"]}) + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + assert grid.shape == (5, X.shape[1]) + assert axes[0].shape == (5,) + + +@pytest.mark.parametrize("grid_resolution", [3, 100]) +def test_grid_from_X_heterogeneous_type(grid_resolution): + """Check that `_grid_from_X` always sample from categories and does not + depend from the percentiles. 
+ """ + pd = pytest.importorskip("pandas") + percentiles = (0.05, 0.95) + is_categorical = [True, False] + X = pd.DataFrame( + { + "cat": ["A", "B", "C", "A", "B", "D", "E", "A", "B", "D"], + "num": [1, 1, 1, 2, 5, 6, 6, 6, 6, 8], + } + ) + nunique = X.nunique() + + grid, axes = _grid_from_X( + X, + percentiles, + is_categorical, + grid_resolution=grid_resolution, + custom_values={}, + ) + if grid_resolution == 3: + assert grid.shape == (15, 2) + assert axes[0].shape[0] == nunique["num"] + assert axes[1].shape[0] == grid_resolution + else: + assert grid.shape == (25, 2) + assert axes[0].shape[0] == nunique["cat"] + assert axes[1].shape[0] == nunique["cat"] + + +@pytest.mark.parametrize( + "grid_resolution, percentiles, err_msg", + [ + (2, (0, 0.0001), "percentiles are too close"), + (100, (1, 2, 3, 4), "'percentiles' must be a sequence of 2 elements"), + (100, 12345, "'percentiles' must be a sequence of 2 elements"), + (100, (-1, 0.95), r"'percentiles' values must be in \[0, 1\]"), + (100, (0.05, 2), r"'percentiles' values must be in \[0, 1\]"), + (100, (0.9, 0.1), r"percentiles\[0\] must be strictly less than"), + (1, (0.05, 0.95), "'grid_resolution' must be strictly greater than 1"), + ], +) +def test_grid_from_X_error(grid_resolution, percentiles, err_msg): + X = np.asarray([[1, 2], [3, 4]]) + is_categorical = [False] + with pytest.raises(ValueError, match=err_msg): + _grid_from_X(X, percentiles, is_categorical, grid_resolution, custom_values={}) + + +@pytest.mark.parametrize("target_feature", range(5)) +@pytest.mark.parametrize( + "est, method", + [ + (LinearRegression(), "brute"), + (GradientBoostingRegressor(random_state=0), "brute"), + (GradientBoostingRegressor(random_state=0), "recursion"), + (HistGradientBoostingRegressor(random_state=0), "brute"), + (HistGradientBoostingRegressor(random_state=0), "recursion"), + ], +) +def test_partial_dependence_helpers(est, method, target_feature): + # Check that what is returned by _partial_dependence_brute or + # _partial_dependence_recursion is equivalent to manually setting a target + # feature to a given value, and computing the average prediction over all + # samples. + # This also checks that the brute and recursion methods give the same + # output. + # Note that even on the trainset, the brute and the recursion methods + # aren't always strictly equivalent, in particular when the slow method + # generates unrealistic samples that have low mass in the joint + # distribution of the input features, and when some of the features are + # dependent. Hence the high tolerance on the checks. + + X, y = make_regression(random_state=0, n_features=5, n_informative=5) + # The 'init' estimator for GBDT (here the average prediction) isn't taken + # into account with the recursion method, for technical reasons. We set + # the mean to 0 to that this 'bug' doesn't have any effect. + y = y - y.mean() + + # Clone is necessary to make the test thread-safe. 
+ est = clone(est).fit(X, y) + + # target feature will be set to .5 and then to 123 + features = np.array([target_feature], dtype=np.intp) + grid = np.array([[0.5], [123]]) + + if method == "brute": + pdp, predictions = _partial_dependence_brute( + est, grid, features, X, response_method="auto" + ) + else: + pdp = _partial_dependence_recursion(est, grid, features) + + mean_predictions = [] + for val in (0.5, 123): + X_ = X.copy() + X_[:, target_feature] = val + mean_predictions.append(est.predict(X_).mean()) + + pdp = pdp[0] # (shape is (1, 2) so make it (2,)) + + # allow for greater margin for error with recursion method + rtol = 1e-1 if method == "recursion" else 1e-3 + assert np.allclose(pdp, mean_predictions, rtol=rtol) + + +@pytest.mark.parametrize("seed", range(1)) +def test_recursion_decision_tree_vs_forest_and_gbdt(seed): + # Make sure that the recursion method gives the same results on a + # DecisionTreeRegressor and a GradientBoostingRegressor or a + # RandomForestRegressor with 1 tree and equivalent parameters. + + rng = np.random.RandomState(seed) + + # Purely random dataset to avoid correlated features + n_samples = 1000 + n_features = 5 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) * 10 + + # The 'init' estimator for GBDT (here the average prediction) isn't taken + # into account with the recursion method, for technical reasons. We set + # the mean to 0 to that this 'bug' doesn't have any effect. + y = y - y.mean() + + # set max_depth not too high to avoid splits with same gain but different + # features + max_depth = 5 + + tree_seed = 0 + forest = RandomForestRegressor( + n_estimators=1, + max_features=None, + bootstrap=False, + max_depth=max_depth, + random_state=tree_seed, + ) + # The forest will use ensemble.base._set_random_states to set the + # random_state of the tree sub-estimator. We simulate this here to have + # equivalent estimators. + equiv_random_state = check_random_state(tree_seed).randint(np.iinfo(np.int32).max) + gbdt = GradientBoostingRegressor( + n_estimators=1, + learning_rate=1, + criterion="squared_error", + max_depth=max_depth, + random_state=equiv_random_state, + ) + tree = DecisionTreeRegressor(max_depth=max_depth, random_state=equiv_random_state) + + forest.fit(X, y) + gbdt.fit(X, y) + tree.fit(X, y) + + # sanity check: if the trees aren't the same, the PD values won't be equal + try: + assert_is_subtree(tree.tree_, gbdt[0, 0].tree_) + assert_is_subtree(tree.tree_, forest[0].tree_) + except AssertionError: + # For some reason the trees aren't exactly equal on 32bits, so the PDs + # cannot be equal either. 
See + # https://github.com/scikit-learn/scikit-learn/issues/8853 + assert _IS_32BIT, "this should only fail on 32 bit platforms" + return + + grid = rng.randn(50).reshape(-1, 1) + for f in range(n_features): + features = np.array([f], dtype=np.intp) + + pdp_forest = _partial_dependence_recursion(forest, grid, features) + pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features) + pdp_tree = _partial_dependence_recursion(tree, grid, features) + + np.testing.assert_allclose(pdp_gbdt, pdp_tree) + np.testing.assert_allclose(pdp_forest, pdp_tree) + + +@pytest.mark.parametrize( + "est", + ( + GradientBoostingClassifier(random_state=0), + HistGradientBoostingClassifier(random_state=0), + ), +) +@pytest.mark.parametrize("target_feature", (0, 1, 2, 3, 4, 5)) +def test_recursion_decision_function(est, target_feature): + # Make sure the recursion method (implicitly uses decision_function) has + # the same result as using brute method with + # response_method=decision_function + + X, y = make_classification(n_classes=2, n_clusters_per_class=1, random_state=1) + assert np.mean(y) == 0.5 # make sure the init estimator predicts 0 anyway + + est = clone(est).fit(X, y) + + preds_1 = partial_dependence( + est, + X, + [target_feature], + response_method="decision_function", + method="recursion", + kind="average", + ) + preds_2 = partial_dependence( + est, + X, + [target_feature], + response_method="decision_function", + method="brute", + kind="average", + ) + + assert_allclose(preds_1["average"], preds_2["average"], atol=1e-7) + + +@pytest.mark.parametrize( + "est", + ( + LinearRegression(), + GradientBoostingRegressor(random_state=0), + HistGradientBoostingRegressor( + random_state=0, min_samples_leaf=1, max_leaf_nodes=None, max_iter=1 + ), + DecisionTreeRegressor(random_state=0), + ), +) +@pytest.mark.parametrize("power", (1, 2)) +def test_partial_dependence_easy_target(est, power): + # If the target y only depends on one feature in an obvious way (linear or + # quadratic) then the partial dependence for that feature should reflect + # it. + # We here fit a linear regression_data model (with polynomial features if + # needed) and compute r_squared to check that the partial dependence + # correctly reflects the target. 
+ + rng = np.random.RandomState(0) + n_samples = 200 + target_variable = 2 + X = rng.normal(size=(n_samples, 5)) + y = X[:, target_variable] ** power + + est = clone(est).fit(X, y) + + pdp = partial_dependence( + est, features=[target_variable], X=X, grid_resolution=1000, kind="average" + ) + + new_X = pdp["grid_values"][0].reshape(-1, 1) + new_y = pdp["average"][0] + # add polynomial features if needed + new_X = PolynomialFeatures(degree=power).fit_transform(new_X) + + lr = LinearRegression().fit(new_X, new_y) + r2 = r2_score(new_y, lr.predict(new_X)) + + assert r2 > 0.99 + + +@pytest.mark.parametrize( + "Estimator", + ( + sklearn.tree.DecisionTreeClassifier, + sklearn.tree.ExtraTreeClassifier, + sklearn.ensemble.ExtraTreesClassifier, + sklearn.neighbors.KNeighborsClassifier, + sklearn.neighbors.RadiusNeighborsClassifier, + sklearn.ensemble.RandomForestClassifier, + ), +) +def test_multiclass_multioutput(Estimator): + # Make sure error is raised for multiclass-multioutput classifiers + + # make multiclass-multioutput dataset + X, y = make_classification(n_classes=3, n_clusters_per_class=1, random_state=0) + y = np.array([y, y]).T + + est = Estimator() + est.fit(X, y) + + with pytest.raises( + ValueError, match="Multiclass-multioutput estimators are not supported" + ): + partial_dependence(est, X, [0]) + + +class NoPredictProbaNoDecisionFunction(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + # simulate that we have some classes + self.classes_ = [0, 1] + return self + + +@pytest.mark.parametrize( + "estimator, params, err_msg", + [ + ( + KMeans(random_state=0, n_init="auto"), + {"features": [0]}, + "'estimator' must be a fitted regressor or classifier", + ), + ( + LinearRegression(), + {"features": [0], "response_method": "predict_proba"}, + "The response_method parameter is ignored for regressors", + ), + ( + GradientBoostingClassifier(random_state=0), + { + "features": [0], + "response_method": "predict_proba", + "method": "recursion", + }, + "'recursion' method, the response_method must be 'decision_function'", + ), + ( + GradientBoostingClassifier(random_state=0), + {"features": [0], "response_method": "predict_proba", "method": "auto"}, + "'recursion' method, the response_method must be 'decision_function'", + ), + ( + LinearRegression(), + {"features": [0], "method": "recursion", "kind": "individual"}, + "The 'recursion' method only applies when 'kind' is set to 'average'", + ), + ( + LinearRegression(), + {"features": [0], "method": "recursion", "kind": "both"}, + "The 'recursion' method only applies when 'kind' is set to 'average'", + ), + ( + LinearRegression(), + {"features": [0], "method": "recursion"}, + "Only the following estimators support the 'recursion' method:", + ), + ( + LinearRegression(), + {"features": [0, 1], "custom_values": {0: [1, 2, 3], 1: np.ones((3, 3))}}, + ( + "The custom grid for some features is not a one-dimensional array. 
" + "Feature 1: 2 dimensions" + ), + ), + ], +) +def test_partial_dependence_error(estimator, params, err_msg): + X, y = make_classification(random_state=0) + estimator = clone(estimator).fit(X, y) + + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, **params) + + +@pytest.mark.parametrize( + "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +@pytest.mark.parametrize("features", [-1, 10000]) +def test_partial_dependence_unknown_feature_indices(estimator, features): + X, y = make_classification(random_state=0) + estimator = clone(estimator).fit(X, y) + + err_msg = "all features must be in" + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, [features]) + + +@pytest.mark.parametrize( + "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +def test_partial_dependence_unknown_feature_string(estimator): + pd = pytest.importorskip("pandas") + X, y = make_classification(random_state=0) + df = pd.DataFrame(X) + estimator = clone(estimator).fit(df, y) + + features = ["random"] + err_msg = "A given column is not a column of the dataframe" + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, df, features) + + +@pytest.mark.parametrize( + "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +def test_partial_dependence_X_list(estimator): + # check that array-like objects are accepted + X, y = make_classification(random_state=0) + estimator = clone(estimator).fit(X, y) + partial_dependence(estimator, list(X), [0], kind="average") + + +def test_warning_recursion_non_constant_init(): + # make sure that passing a non-constant init parameter to a GBDT and using + # recursion method yields a warning. 
+ + gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0) + gbc.fit(X, y) + + with pytest.warns( + UserWarning, match="Using recursion method with a non-constant init predictor" + ): + partial_dependence(gbc, X, [0], method="recursion", kind="average") + + with pytest.warns( + UserWarning, match="Using recursion method with a non-constant init predictor" + ): + partial_dependence(gbc, X, [0], method="recursion", kind="average") + + +def test_partial_dependence_sample_weight_of_fitted_estimator(): + # Test near perfect correlation between partial dependence and diagonal + # when sample weights emphasize y = x predictions + # non-regression test for #13193 + # TODO: extend to HistGradientBoosting once sample_weight is supported + N = 1000 + rng = np.random.RandomState(123456) + mask = rng.randint(2, size=N, dtype=bool) + + x = rng.rand(N) + # set y = x on mask and y = -x outside + y = x.copy() + y[~mask] = -y[~mask] + X = np.c_[mask, x] + # sample weights to emphasize data points where y = x + sample_weight = np.ones(N) + sample_weight[mask] = 1000.0 + + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(X, y, sample_weight=sample_weight) + + pdp = partial_dependence(clf, X, features=[1], kind="average") + + assert np.corrcoef(pdp["average"], pdp["grid_values"])[0, 1] > 0.99 + + +def test_hist_gbdt_sw_not_supported(): + # TODO: remove/fix when PDP supports HGBT with sample weights + clf = HistGradientBoostingRegressor(random_state=1) + clf.fit(X, y, sample_weight=np.ones(len(X))) + + with pytest.raises( + NotImplementedError, match="does not support partial dependence" + ): + partial_dependence(clf, X, features=[1]) + + +def test_partial_dependence_pipeline(): + # check that the partial dependence support pipeline + iris = load_iris() + + scaler = StandardScaler() + clf = DummyClassifier(random_state=42) + pipe = make_pipeline(scaler, clf) + + clf.fit(scaler.fit_transform(iris.data), iris.target) + pipe.fit(iris.data, iris.target) + + features = 0 + pdp_pipe = partial_dependence( + pipe, iris.data, features=[features], grid_resolution=10, kind="average" + ) + pdp_clf = partial_dependence( + clf, + scaler.transform(iris.data), + features=[features], + grid_resolution=10, + kind="average", + ) + assert_allclose(pdp_pipe["average"], pdp_clf["average"]) + assert_allclose( + pdp_pipe["grid_values"][0], + pdp_clf["grid_values"][0] * scaler.scale_[features] + scaler.mean_[features], + ) + + +@pytest.mark.parametrize( + "features, grid_resolution, n_vals_expected", + [ + (["a"], 10, 10), + (["a"], 2, 2), + ], +) +def test_partial_dependence_binary_model_grid_resolution( + features, grid_resolution, n_vals_expected +): + pd = pytest.importorskip("pandas") + model = DummyClassifier() + + rng = np.random.RandomState(0) + X = pd.DataFrame( + { + "a": rng.randint(0, 10, size=100).astype(np.float64), + "b": rng.randint(0, 10, size=100).astype(np.float64), + } + ) + y = pd.Series(rng.randint(0, 2, size=100)) + model.fit(X, y) + + part_dep = partial_dependence( + model, + X, + features=features, + grid_resolution=grid_resolution, + kind="average", + ) + assert part_dep["average"].size == n_vals_expected + + +@pytest.mark.parametrize( + "features, custom_values, n_vals_expected", + [ + (["a"], {"a": [1.0, 2.0, 3.0, 4.0]}, 4), + (["a"], {"a": [1.0, 2.0]}, 2), + (["a"], {"a": [1.0]}, 1), + ], +) +def test_partial_dependence_binary_model_custom_values( + features, custom_values, n_vals_expected +): + pd = pytest.importorskip("pandas") + model = DummyClassifier() + + X 
= pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [6.0, 7.0, 8.0, 9.0]}) + y = pd.Series([0, 1, 0, 1]) + model.fit(X, y) + + part_dep = partial_dependence( + model, + X, + features=features, + grid_resolution=3, + custom_values=custom_values, + kind="average", + ) + assert part_dep["average"].size == n_vals_expected + + +@pytest.mark.parametrize( + "features, custom_values, n_vals_expected", + [ + (["b"], {"b": ["a", "b"]}, 2), + (["b"], {"b": ["a"]}, 1), + (["a", "b"], {"a": [1.0, 2.0], "b": ["a", "b"]}, 4), + ], +) +def test_partial_dependence_pipeline_custom_values( + features, custom_values, n_vals_expected +): + pd = pytest.importorskip("pandas") + pl = make_pipeline( + SimpleImputer(strategy="most_frequent"), OneHotEncoder(), DummyClassifier() + ) + + X = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["a", "b", "a", "b"]}) + y = pd.Series([0, 1, 0, 1]) + pl.fit(X, y) + + X_holdout = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["a", "b", "a", None]}) + part_dep = partial_dependence( + pl, + X_holdout, + features=features, + grid_resolution=3, + custom_values=custom_values, + kind="average", + ) + assert part_dep["average"].size == n_vals_expected + + +@pytest.mark.parametrize( + "estimator", + [ + LogisticRegression(max_iter=1000, random_state=0), + GradientBoostingClassifier(random_state=0, n_estimators=5), + ], + ids=["estimator-brute", "estimator-recursion"], +) +@pytest.mark.parametrize( + "preprocessor", + [ + None, + make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]), + ), + make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + remainder="passthrough", + ), + ], + ids=["None", "column-transformer", "column-transformer-passthrough"], +) +@pytest.mark.parametrize( + "features", + [[0, 2], [iris.feature_names[i] for i in (0, 2)]], + ids=["features-integer", "features-string"], +) +def test_partial_dependence_dataframe(estimator, preprocessor, features): + # check that the partial dependence support dataframe and pipeline + # including a column transformer + pd = pytest.importorskip("pandas") + df = pd.DataFrame(scale(iris.data), columns=iris.feature_names) + + pipe = make_pipeline(preprocessor, clone(estimator)) + pipe.fit(df, iris.target) + pdp_pipe = partial_dependence( + pipe, df, features=features, grid_resolution=10, kind="average" + ) + + # the column transformer will reorder the column when transforming + # we mixed the index to be sure that we are computing the partial + # dependence of the right columns + if preprocessor is not None: + X_proc = clone(preprocessor).fit_transform(df) + features_clf = [0, 1] + else: + X_proc = df + features_clf = [0, 2] + + clf = clone(estimator).fit(X_proc, iris.target) + pdp_clf = partial_dependence( + clf, + X_proc, + features=features_clf, + method="brute", + grid_resolution=10, + kind="average", + ) + + assert_allclose(pdp_pipe["average"], pdp_clf["average"]) + if preprocessor is not None: + scaler = preprocessor.named_transformers_["standardscaler"] + assert_allclose( + pdp_pipe["grid_values"][1], + pdp_clf["grid_values"][1] * scaler.scale_[1] + scaler.mean_[1], + ) + else: + assert_allclose(pdp_pipe["grid_values"][1], pdp_clf["grid_values"][1]) + + +@pytest.mark.parametrize( + "features, custom_values, expected_pd_shape", + [ + (0, None, (3, 10)), + (0, {0: [1.0, 2.0, 3.0]}, (3, 3)), + (iris.feature_names[0], None, (3, 10)), + (iris.feature_names[0], {iris.feature_names[0]: np.array([1.0, 2.0])}, (3, 2)), 
+ ([0, 2], None, (3, 10, 10)), + ([0, 2], {2: [7, 8, 9, 10]}, (3, 10, 4)), + ([iris.feature_names[i] for i in (0, 2)], None, (3, 10, 10)), + ( + [iris.feature_names[i] for i in (0, 2)], + {iris.feature_names[2]: [1, 2, 3, 10]}, + (3, 10, 4), + ), + ([iris.feature_names[i] for i in (0, 2)], {2: [1, 2, 3, 10]}, (3, 10, 10)), + ( + [iris.feature_names[i] for i in (0, 2, 3)], + {iris.feature_names[2]: [1, 10]}, + (3, 10, 2, 10), + ), + ([True, False, True, False], None, (3, 10, 10)), + ], + ids=[ + "scalar-int", + "scalar-int-custom-values", + "scalar-str", + "scalar-str-custom-values", + "list-int", + "list-int-custom-values", + "list-str", + "list-str-custom-values", + "list-str-custom-values-incorrect", + "list-str-three-features", + "mask", + ], +) +def test_partial_dependence_feature_type(features, custom_values, expected_pd_shape): + # check all possible features type supported in PDP + pd = pytest.importorskip("pandas") + df = pd.DataFrame(iris.data, columns=iris.feature_names) + + preprocessor = make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]), + ) + pipe = make_pipeline( + preprocessor, LogisticRegression(max_iter=1000, random_state=0) + ) + pipe.fit(df, iris.target) + pdp_pipe = partial_dependence( + pipe, + df, + features=features, + grid_resolution=10, + kind="average", + custom_values=custom_values, + ) + assert pdp_pipe["average"].shape == expected_pd_shape + assert len(pdp_pipe["grid_values"]) == len(pdp_pipe["average"].shape) - 1 + + +@pytest.mark.parametrize( + "estimator", + [ + LinearRegression(), + LogisticRegression(), + GradientBoostingRegressor(), + GradientBoostingClassifier(), + ], +) +def test_partial_dependence_unfitted(estimator): + X = iris.data + preprocessor = make_column_transformer( + (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]) + ) + pipe = make_pipeline(preprocessor, estimator) + with pytest.raises(NotFittedError, match="is not fitted yet"): + partial_dependence(pipe, X, features=[0, 2], grid_resolution=10) + with pytest.raises(NotFittedError, match="is not fitted yet"): + partial_dependence(estimator, X, features=[0, 2], grid_resolution=10) + + +@pytest.mark.parametrize( + "Estimator, data", + [ + (LinearRegression, multioutput_regression_data), + (LogisticRegression, binary_classification_data), + ], +) +def test_kind_average_and_average_of_individual(Estimator, data): + est = Estimator() + (X, y), n_targets = data + est.fit(X, y) + + pdp_avg = partial_dependence(est, X=X, features=[1, 2], kind="average") + pdp_ind = partial_dependence(est, X=X, features=[1, 2], kind="individual") + avg_ind = np.mean(pdp_ind["individual"], axis=1) + assert_allclose(avg_ind, pdp_avg["average"]) + + +@pytest.mark.parametrize( + "Estimator, data", + [ + (LinearRegression, multioutput_regression_data), + (LogisticRegression, binary_classification_data), + ], +) +def test_partial_dependence_kind_individual_ignores_sample_weight(Estimator, data): + """Check that `sample_weight` does not have any effect on reported ICE.""" + est = Estimator() + (X, y), n_targets = data + sample_weight = np.arange(X.shape[0]) + est.fit(X, y) + + pdp_nsw = partial_dependence(est, X=X, features=[1, 2], kind="individual") + pdp_sw = partial_dependence( + est, X=X, features=[1, 2], kind="individual", sample_weight=sample_weight + ) + assert_allclose(pdp_nsw["individual"], pdp_sw["individual"]) + assert_allclose(pdp_nsw["grid_values"], pdp_sw["grid_values"]) + + +@pytest.mark.parametrize( + 
"estimator", + [ + LinearRegression(), + LogisticRegression(), + RandomForestRegressor(), + GradientBoostingClassifier(), + ], +) +@pytest.mark.parametrize("non_null_weight_idx", [0, 1, -1]) +def test_partial_dependence_non_null_weight_idx(estimator, non_null_weight_idx): + """Check that if we pass a `sample_weight` of zeros with only one index with + sample weight equals one, then the average `partial_dependence` with this + `sample_weight` is equal to the individual `partial_dependence` of the + corresponding index. + """ + X, y = iris.data, iris.target + preprocessor = make_column_transformer( + (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]) + ) + pipe = make_pipeline(preprocessor, clone(estimator)).fit(X, y) + + sample_weight = np.zeros_like(y) + sample_weight[non_null_weight_idx] = 1 + pdp_sw = partial_dependence( + pipe, + X, + [2, 3], + kind="average", + sample_weight=sample_weight, + grid_resolution=10, + ) + pdp_ind = partial_dependence(pipe, X, [2, 3], kind="individual", grid_resolution=10) + output_dim = 1 if is_regressor(pipe) else len(np.unique(y)) + for i in range(output_dim): + assert_allclose( + pdp_ind["individual"][i][non_null_weight_idx], + pdp_sw["average"][i], + ) + + +@pytest.mark.parametrize( + "Estimator, data", + [ + (LinearRegression, multioutput_regression_data), + (LogisticRegression, binary_classification_data), + ], +) +def test_partial_dependence_equivalence_equal_sample_weight(Estimator, data): + """Check that `sample_weight=None` is equivalent to having equal weights.""" + + est = Estimator() + (X, y), n_targets = data + est.fit(X, y) + + sample_weight, params = None, {"X": X, "features": [1, 2], "kind": "average"} + pdp_sw_none = partial_dependence(est, **params, sample_weight=sample_weight) + sample_weight = np.ones(len(y)) + pdp_sw_unit = partial_dependence(est, **params, sample_weight=sample_weight) + assert_allclose(pdp_sw_none["average"], pdp_sw_unit["average"]) + sample_weight = 2 * np.ones(len(y)) + pdp_sw_doubling = partial_dependence(est, **params, sample_weight=sample_weight) + assert_allclose(pdp_sw_none["average"], pdp_sw_doubling["average"]) + + +def test_partial_dependence_sample_weight_size_error(): + """Check that we raise an error when the size of `sample_weight` is not + consistent with `X` and `y`. + """ + est = LogisticRegression() + (X, y), n_targets = binary_classification_data + sample_weight = np.ones_like(y) + est.fit(X, y) + + with pytest.raises(ValueError, match="sample_weight.shape =="): + partial_dependence( + est, X, features=[0], sample_weight=sample_weight[1:], grid_resolution=10 + ) + + +def test_partial_dependence_sample_weight_with_recursion(): + """Check that we raise an error when `sample_weight` is provided with + `"recursion"` method. 
+ """ + est = RandomForestRegressor() + (X, y), n_targets = regression_data + sample_weight = np.ones_like(y) + est.fit(X, y, sample_weight=sample_weight) + + with pytest.raises(ValueError, match="'recursion' method can only be applied when"): + partial_dependence( + est, X, features=[0], method="recursion", sample_weight=sample_weight + ) + + +def test_mixed_type_categorical(): + """Check that we raise a proper error when a column has mixed types and + the sorting of `np.unique` will fail.""" + X = np.array(["A", "B", "C", np.nan], dtype=object).reshape(-1, 1) + y = np.array([0, 1, 0, 1]) + + from sklearn.preprocessing import OrdinalEncoder + + clf = make_pipeline( + OrdinalEncoder(encoded_missing_value=-1), + LogisticRegression(), + ).fit(X, y) + with pytest.raises(ValueError, match="The column #0 contains mixed data types"): + partial_dependence(clf, X, features=[0]) + + +def test_reject_array_with_integer_dtype(): + X = np.arange(8).reshape(4, 2) + y = np.array([0, 1, 0, 1]) + clf = DummyClassifier() + clf.fit(X, y) + with pytest.warns( + FutureWarning, match=re.escape("The column 0 contains integer data.") + ): + partial_dependence(clf, X, features=0) + + with pytest.warns( + FutureWarning, match=re.escape("The column 1 contains integer data.") + ): + partial_dependence(clf, X, features=[1], categorical_features=[0]) + + with pytest.warns( + FutureWarning, match=re.escape("The column 0 contains integer data.") + ): + partial_dependence(clf, X, features=[0, 1]) + + # The following should not raise as we do not compute numerical partial + # dependence on integer columns. + with warnings.catch_warnings(): + warnings.simplefilter("error") + partial_dependence(clf, X, features=1, categorical_features=[1]) + + +def test_reject_pandas_with_integer_dtype(): + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "a": [1.0, 2.0, 3.0], + "b": [1, 2, 3], + "c": [1, 2, 3], + } + ) + y = np.array([0, 1, 0]) + clf = DummyClassifier() + clf.fit(X, y) + + with pytest.warns( + FutureWarning, match=re.escape("The column 'c' contains integer data.") + ): + partial_dependence(clf, X, features="c") + + with pytest.warns( + FutureWarning, match=re.escape("The column 'c' contains integer data.") + ): + partial_dependence(clf, X, features=["a", "c"]) + + # The following should not raise as we do not compute numerical partial + # dependence on integer columns. + with warnings.catch_warnings(): + warnings.simplefilter("error") + partial_dependence(clf, X, features=["a"]) + partial_dependence(clf, X, features=["c"], categorical_features=["c"]) + + +def test_partial_dependence_empty_categorical_features(): + """Check that we raise the proper exception when `categorical_features` + is an empty list""" + clf = make_pipeline(StandardScaler(), LogisticRegression()) + clf.fit(iris.data, iris.target) + + with pytest.raises( + ValueError, + match=re.escape( + "Passing an empty list (`[]`) to `categorical_features` is not " + "supported. Use `None` instead to indicate that there are no " + "categorical features." 
+ ), + ): + partial_dependence( + estimator=clf, X=iris.data, features=[0], categorical_features=[] + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_pd_utils.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_pd_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5dea3834a77a70891a4efab25a560d09a49a13e1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_pd_utils.py @@ -0,0 +1,47 @@ +import numpy as np +import pytest + +from sklearn.inspection._pd_utils import _check_feature_names, _get_feature_index +from sklearn.utils._testing import _convert_container + + +@pytest.mark.parametrize( + "feature_names, array_type, expected_feature_names", + [ + (None, "array", ["x0", "x1", "x2"]), + (None, "dataframe", ["a", "b", "c"]), + (np.array(["a", "b", "c"]), "array", ["a", "b", "c"]), + ], +) +def test_check_feature_names(feature_names, array_type, expected_feature_names): + X = np.random.randn(10, 3) + column_names = ["a", "b", "c"] + X = _convert_container(X, constructor_name=array_type, columns_name=column_names) + feature_names_validated = _check_feature_names(X, feature_names) + assert feature_names_validated == expected_feature_names + + +def test_check_feature_names_error(): + X = np.random.randn(10, 3) + feature_names = ["a", "b", "c", "a"] + msg = "feature_names should not contain duplicates." + with pytest.raises(ValueError, match=msg): + _check_feature_names(X, feature_names) + + +@pytest.mark.parametrize("fx, idx", [(0, 0), (1, 1), ("a", 0), ("b", 1), ("c", 2)]) +def test_get_feature_index(fx, idx): + feature_names = ["a", "b", "c"] + assert _get_feature_index(fx, feature_names) == idx + + +@pytest.mark.parametrize( + "fx, feature_names, err_msg", + [ + ("a", None, "Cannot plot partial dependence for feature 'a'"), + ("d", ["a", "b", "c"], "Feature 'd' not in feature_names"), + ], +) +def test_get_feature_names_error(fx, feature_names, err_msg): + with pytest.raises(ValueError, match=err_msg): + _get_feature_index(fx, feature_names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_permutation_importance.py b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_permutation_importance.py new file mode 100644 index 0000000000000000000000000000000000000000..b51ad7b71f66dc897ae2700f20e6f968da56e758 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/inspection/tests/test_permutation_importance.py @@ -0,0 +1,540 @@ +import numpy as np +import pytest +from joblib import parallel_backend +from numpy.testing import assert_allclose + +from sklearn.compose import ColumnTransformer +from sklearn.datasets import ( + load_diabetes, + load_iris, + make_classification, + make_regression, +) +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.impute import SimpleImputer +from sklearn.inspection import permutation_importance +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.metrics import ( + get_scorer, + mean_squared_error, + r2_score, +) +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, scale +from sklearn.utils._testing import _convert_container + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +@pytest.mark.parametrize("sample_weight", [None, 
"ones"]) +def test_permutation_importance_correlated_feature_regression( + n_jobs, max_samples, sample_weight +): + # Make sure that feature highly correlated to the target have a higher + # importance + rng = np.random.RandomState(42) + n_repeats = 5 + + X, y = load_diabetes(return_X_y=True) + y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1) + + X = np.hstack([X, y_with_little_noise]) + + weights = np.ones_like(y) if sample_weight == "ones" else sample_weight + clf = RandomForestRegressor(n_estimators=10, random_state=42) + clf.fit(X, y) + + result = permutation_importance( + clf, + X, + y, + sample_weight=weights, + n_repeats=n_repeats, + random_state=rng, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + assert result.importances.shape == (X.shape[1], n_repeats) + + # the correlated feature with y was added as the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +def test_permutation_importance_correlated_feature_regression_pandas( + n_jobs, max_samples +): + pd = pytest.importorskip("pandas") + + # Make sure that feature highly correlated to the target have a higher + # importance + rng = np.random.RandomState(42) + n_repeats = 5 + + dataset = load_iris() + X, y = dataset.data, dataset.target + y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1) + + # Adds feature correlated with y as the last column + X = pd.DataFrame(X, columns=dataset.feature_names) + X["correlated_feature"] = y_with_little_noise + + clf = RandomForestClassifier(n_estimators=10, random_state=42) + clf.fit(X, y) + + result = permutation_importance( + clf, + X, + y, + n_repeats=n_repeats, + random_state=rng, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + assert result.importances.shape == (X.shape[1], n_repeats) + + # the correlated feature with y was added as the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +def test_robustness_to_high_cardinality_noisy_feature(n_jobs, max_samples, seed=42): + # Permutation variable importance should not be affected by the high + # cardinality bias of traditional feature importances, especially when + # computed on a held-out test set: + rng = np.random.RandomState(seed) + n_repeats = 5 + n_samples = 1000 + n_classes = 5 + n_informative_features = 2 + n_noise_features = 1 + n_features = n_informative_features + n_noise_features + + # Generate a multiclass classification dataset and a set of informative + # binary features that can be used to predict some classes of y exactly + # while leaving some classes unexplained to make the problem harder. + classes = np.arange(n_classes) + y = rng.choice(classes, size=n_samples) + X = np.hstack([(y == c).reshape(-1, 1) for c in classes[:n_informative_features]]) + X = X.astype(np.float32) + + # Not all target classes are explained by the binary class indicator + # features: + assert n_informative_features < n_classes + + # Add 10 other noisy features with high cardinality (numerical) values + # that can be used to overfit the training data. 
+ X = np.concatenate([X, rng.randn(n_samples, n_noise_features)], axis=1) + assert X.shape == (n_samples, n_features) + + # Split the dataset to be able to evaluate on a held-out test set. The + # test size should be large enough for importance measurements to be + # stable: + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, random_state=rng + ) + clf = RandomForestClassifier(n_estimators=5, random_state=rng) + clf.fit(X_train, y_train) + + # Variable importances computed by impurity decrease on the tree node + # splits often use the noisy features in splits. This can give the misleading + # impression that high cardinality noisy variables are the most important: + tree_importances = clf.feature_importances_ + informative_tree_importances = tree_importances[:n_informative_features] + noisy_tree_importances = tree_importances[n_informative_features:] + assert informative_tree_importances.max() < noisy_tree_importances.min() + + # Let's check that permutation-based feature importances do not have this + # problem. + r = permutation_importance( + clf, + X_test, + y_test, + n_repeats=n_repeats, + random_state=rng, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + assert r.importances.shape == (X.shape[1], n_repeats) + + # Split the importances between informative and noisy features + informative_importances = r.importances_mean[:n_informative_features] + noisy_importances = r.importances_mean[n_informative_features:] + + # Because we do not have a binary variable explaining each target class, + # the RF model will have to use the random variable to make some + # (overfitting) splits (as max_depth is not set). Therefore the noisy + # variables will be non-zero but with small values oscillating around + # zero: + assert max(np.abs(noisy_importances)) > 1e-7 + assert noisy_importances.max() < 0.05 + + # The binary features correlated with y should have a higher importance + # than the high cardinality noisy features. + # The maximum test accuracy is 2 / 5 == 0.4, each informative feature + # contributing approximately a bit more than 0.2 of accuracy.
+ assert informative_importances.min() > 0.15 + + +def test_permutation_importance_mixed_types(): + rng = np.random.RandomState(42) + n_repeats = 4 + + # Last column is correlated with y + X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T + y = np.array([0, 1, 0, 1]) + + clf = make_pipeline(SimpleImputer(), LogisticRegression(solver="lbfgs")) + clf.fit(X, y) + result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) + + assert result.importances.shape == (X.shape[1], n_repeats) + + # the correlated feature with y is the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + # use another random state + rng = np.random.RandomState(0) + result2 = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) + assert result2.importances.shape == (X.shape[1], n_repeats) + + assert not np.allclose(result.importances, result2.importances) + + # the correlated feature with y is the last column and should + # have the highest importance + assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1]) + + +def test_permutation_importance_mixed_types_pandas(): + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(42) + n_repeats = 5 + + # Last column is correlated with y + X = pd.DataFrame({"col1": [1.0, 2.0, 3.0, np.nan], "col2": ["a", "b", "a", "b"]}) + y = np.array([0, 1, 0, 1]) + + num_preprocess = make_pipeline(SimpleImputer(), StandardScaler()) + preprocess = ColumnTransformer( + [("num", num_preprocess, ["col1"]), ("cat", OneHotEncoder(), ["col2"])] + ) + clf = make_pipeline(preprocess, LogisticRegression(solver="lbfgs")) + clf.fit(X, y) + + result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng) + + assert result.importances.shape == (X.shape[1], n_repeats) + # the correlated feature with y is the last column and should + # have the highest importance + assert np.all(result.importances_mean[-1] > result.importances_mean[:-1]) + + +def test_permutation_importance_linear_regresssion(): + X, y = make_regression(n_samples=500, n_features=10, random_state=0) + + X = scale(X) + y = scale(y) + + lr = LinearRegression().fit(X, y) + + # this relationship can be computed in closed form + expected_importances = 2 * lr.coef_**2 + results = permutation_importance( + lr, X, y, n_repeats=50, scoring="neg_mean_squared_error" + ) + assert_allclose( + expected_importances, results.importances_mean, rtol=1e-1, atol=1e-6 + ) + + +@pytest.mark.parametrize("max_samples", [500, 1.0]) +def test_permutation_importance_equivalence_sequential_parallel(max_samples): + # regression test to make sure that sequential and parallel calls will + # output the same results. 
+ # Also tests that max_samples equal to the number of samples is equivalent to 1.0 + X, y = make_regression(n_samples=500, n_features=10, random_state=0) + lr = LinearRegression().fit(X, y) + + importance_sequential = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=1, max_samples=max_samples + ) + + # First check that the problem is structured enough and that the model is + # complex enough to not yield trivial, constant importances: + imp_min = importance_sequential["importances"].min() + imp_max = importance_sequential["importances"].max() + assert imp_max - imp_min > 0.3 + + # Then actually check that parallelism does not impact the results, + # either with shared memory (threading) or with isolated memory + # via process-based parallelism using the default backend + # ('loky' or 'multiprocessing') depending on the joblib version: + + # process-based parallelism (by default): + importance_processes = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=2 + ) + assert_allclose( + importance_processes["importances"], importance_sequential["importances"] + ) + + # thread-based parallelism: + with parallel_backend("threading"): + importance_threading = permutation_importance( + lr, X, y, n_repeats=5, random_state=0, n_jobs=2 + ) + assert_allclose( + importance_threading["importances"], importance_sequential["importances"] + ) + + +@pytest.mark.parametrize("n_jobs", [None, 1, 2]) +@pytest.mark.parametrize("max_samples", [0.5, 1.0]) +def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples): + # This test checks that the column shuffling logic has the same behavior + # on both a dataframe and a simple numpy array. + pd = pytest.importorskip("pandas") + + # regression test to make sure that sequential and parallel calls will + # output the same results. + X, y = make_regression(n_samples=100, n_features=5, random_state=0) + X_df = pd.DataFrame(X) + + # Add a categorical feature that is statistically linked to y: + binner = KBinsDiscretizer( + n_bins=3, + encode="ordinal", + quantile_method="averaged_inverted_cdf", + ) + cat_column = binner.fit_transform(y.reshape(-1, 1)) + + # Concatenate the extra column to the numpy array: integers will be + # cast to float values + X = np.hstack([X, cat_column]) + assert X.dtype.kind == "f" + + # Insert extra column as a non-numpy-native dtype: + cat_column = pd.Categorical(cat_column.ravel()) + new_col_idx = len(X_df.columns) + X_df[new_col_idx] = cat_column + assert X_df[new_col_idx].dtype == cat_column.dtype + + # Stitch an arbitrary index to the dataframe: + X_df.index = np.arange(len(X_df)).astype(str) + + rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0) + rf.fit(X, y) + + n_repeats = 3 + importance_array = permutation_importance( + rf, + X, + y, + n_repeats=n_repeats, + random_state=0, + n_jobs=n_jobs, + max_samples=max_samples, + ) + + # First check that the problem is structured enough and that the model is + # complex enough to not yield trivial, constant importances: + imp_min = importance_array["importances"].min() + imp_max = importance_array["importances"].max() + assert imp_max - imp_min > 0.3 + + # Now check that importances computed on the dataframe match the values + # of those computed on the array with the same data.
+ importance_dataframe = permutation_importance( + rf, + X_df, + y, + n_repeats=n_repeats, + random_state=0, + n_jobs=n_jobs, + max_samples=max_samples, + ) + assert_allclose( + importance_array["importances"], importance_dataframe["importances"] + ) + + +@pytest.mark.parametrize("input_type", ["array", "dataframe"]) +def test_permutation_importance_large_memmaped_data(input_type): + # Smoke, non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15810 + n_samples, n_features = int(5e4), 4 + X, y = make_classification( + n_samples=n_samples, n_features=n_features, random_state=0 + ) + assert X.nbytes > 1e6 # trigger joblib memmapping + + X = _convert_container(X, input_type) + clf = DummyClassifier(strategy="prior").fit(X, y) + + # Actual smoke test: should not raise any error: + n_repeats = 5 + r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2) + + # Auxiliary check: DummyClassifier is feature independent: + # permuting a feature should not change the predictions + expected_importances = np.zeros((n_features, n_repeats)) + assert_allclose(expected_importances, r.importances) + + +def test_permutation_importance_sample_weight(): + # Creating data with 2 features and 1000 samples, where the target + # variable is a linear combination of the two features, such that + # in half of the samples the impact of feature 1 is twice the impact of + # feature 2, and vice versa on the other half of the samples. + rng = np.random.RandomState(1) + n_samples = 1000 + n_features = 2 + n_half_samples = n_samples // 2 + x = rng.normal(0.0, 0.001, (n_samples, n_features)) + y = np.zeros(n_samples) + y[:n_half_samples] = 2 * x[:n_half_samples, 0] + x[:n_half_samples, 1] + y[n_half_samples:] = x[n_half_samples:, 0] + 2 * x[n_half_samples:, 1] + + # Fitting linear regression with perfect prediction + lr = LinearRegression(fit_intercept=False) + lr.fit(x, y) + + # When all samples are weighted with the same weights, the ratio of + # the two feature importances should equal 1 in expectation (when using + # mean absolute error as the loss function). + pi = permutation_importance( + lr, x, y, random_state=1, scoring="neg_mean_absolute_error", n_repeats=200 + ) + x1_x2_imp_ratio_w_none = pi.importances_mean[0] / pi.importances_mean[1] + assert x1_x2_imp_ratio_w_none == pytest.approx(1, 0.01) + + # When passing a vector of ones as the sample_weight, results should be + # the same as in the case where sample_weight=None. + w = np.ones(n_samples) + pi = permutation_importance( + lr, + x, + y, + random_state=1, + scoring="neg_mean_absolute_error", + n_repeats=200, + sample_weight=w, + ) + x1_x2_imp_ratio_w_ones = pi.importances_mean[0] / pi.importances_mean[1] + assert x1_x2_imp_ratio_w_ones == pytest.approx(x1_x2_imp_ratio_w_none, 0.01) + + # When the ratio between the weights of the first half of the samples and + # the second half of the samples approaches infinity, the ratio of + # the two feature importances should equal 2 in expectation (when using + # mean absolute error as the loss function).
+ w = np.hstack([np.repeat(10.0**10, n_half_samples), np.repeat(1.0, n_half_samples)]) + lr.fit(x, y, w) + pi = permutation_importance( + lr, + x, + y, + random_state=1, + scoring="neg_mean_absolute_error", + n_repeats=200, + sample_weight=w, + ) + x1_x2_imp_ratio_w = pi.importances_mean[0] / pi.importances_mean[1] + assert x1_x2_imp_ratio_w / x1_x2_imp_ratio_w_none == pytest.approx(2, 0.01) + + +def test_permutation_importance_no_weights_scoring_function(): + # Creating a scorer function that does not takes sample_weight + def my_scorer(estimator, X, y): + return 1 + + # Creating some data and estimator for the permutation test + x = np.array([[1, 2], [3, 4]]) + y = np.array([1, 2]) + w = np.array([1, 1]) + lr = LinearRegression() + lr.fit(x, y) + + # test that permutation_importance does not return error when + # sample_weight is None + try: + permutation_importance(lr, x, y, random_state=1, scoring=my_scorer, n_repeats=1) + except TypeError: + pytest.fail( + "permutation_test raised an error when using a scorer " + "function that does not accept sample_weight even though " + "sample_weight was None" + ) + + # test that permutation_importance raise exception when sample_weight is + # not None + with pytest.raises(TypeError): + permutation_importance( + lr, x, y, random_state=1, scoring=my_scorer, n_repeats=1, sample_weight=w + ) + + +@pytest.mark.parametrize( + "list_single_scorer, multi_scorer", + [ + (["r2", "neg_mean_squared_error"], ["r2", "neg_mean_squared_error"]), + ( + ["r2", "neg_mean_squared_error"], + { + "r2": get_scorer("r2"), + "neg_mean_squared_error": get_scorer("neg_mean_squared_error"), + }, + ), + ( + ["r2", "neg_mean_squared_error"], + lambda estimator, X, y: { + "r2": r2_score(y, estimator.predict(X)), + "neg_mean_squared_error": -mean_squared_error(y, estimator.predict(X)), + }, + ), + ], +) +def test_permutation_importance_multi_metric(list_single_scorer, multi_scorer): + # Test permutation importance when scoring contains multiple scorers + + # Creating some data and estimator for the permutation test + x, y = make_regression(n_samples=500, n_features=10, random_state=0) + lr = LinearRegression().fit(x, y) + + multi_importance = permutation_importance( + lr, x, y, random_state=1, scoring=multi_scorer, n_repeats=2 + ) + assert set(multi_importance.keys()) == set(list_single_scorer) + + for scorer in list_single_scorer: + multi_result = multi_importance[scorer] + single_result = permutation_importance( + lr, x, y, random_state=1, scoring=scorer, n_repeats=2 + ) + + assert_allclose(multi_result.importances, single_result.importances) + + +def test_permutation_importance_max_samples_error(): + """Check that a proper error message is raised when `max_samples` is not + set to a valid input value. 
+ """ + X = np.array([(1.0, 2.0, 3.0, 4.0)]).T + y = np.array([0, 1, 0, 1]) + + clf = LogisticRegression() + clf.fit(X, y) + + err_msg = r"max_samples must be <= n_samples" + + with pytest.raises(ValueError, match=err_msg): + permutation_importance(clf, X, y, max_samples=5) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..541f164daf46a336719b4148b7b25cea73fe212c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/__init__.py @@ -0,0 +1,95 @@ +"""A variety of linear models.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# See http://scikit-learn.sourceforge.net/modules/sgd.html and +# http://scikit-learn.sourceforge.net/modules/linear_model.html for +# complete documentation. + +from ._base import LinearRegression +from ._bayes import ARDRegression, BayesianRidge +from ._coordinate_descent import ( + ElasticNet, + ElasticNetCV, + Lasso, + LassoCV, + MultiTaskElasticNet, + MultiTaskElasticNetCV, + MultiTaskLasso, + MultiTaskLassoCV, + enet_path, + lasso_path, +) +from ._glm import GammaRegressor, PoissonRegressor, TweedieRegressor +from ._huber import HuberRegressor +from ._least_angle import ( + Lars, + LarsCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + lars_path, + lars_path_gram, +) +from ._logistic import LogisticRegression, LogisticRegressionCV +from ._omp import ( + OrthogonalMatchingPursuit, + OrthogonalMatchingPursuitCV, + orthogonal_mp, + orthogonal_mp_gram, +) +from ._passive_aggressive import PassiveAggressiveClassifier, PassiveAggressiveRegressor +from ._perceptron import Perceptron +from ._quantile import QuantileRegressor +from ._ransac import RANSACRegressor +from ._ridge import Ridge, RidgeClassifier, RidgeClassifierCV, RidgeCV, ridge_regression +from ._stochastic_gradient import SGDClassifier, SGDOneClassSVM, SGDRegressor +from ._theil_sen import TheilSenRegressor + +__all__ = [ + "ARDRegression", + "BayesianRidge", + "ElasticNet", + "ElasticNetCV", + "GammaRegressor", + "HuberRegressor", + "Lars", + "LarsCV", + "Lasso", + "LassoCV", + "LassoLars", + "LassoLarsCV", + "LassoLarsIC", + "LinearRegression", + "LogisticRegression", + "LogisticRegressionCV", + "MultiTaskElasticNet", + "MultiTaskElasticNetCV", + "MultiTaskLasso", + "MultiTaskLassoCV", + "OrthogonalMatchingPursuit", + "OrthogonalMatchingPursuitCV", + "PassiveAggressiveClassifier", + "PassiveAggressiveRegressor", + "Perceptron", + "PoissonRegressor", + "QuantileRegressor", + "RANSACRegressor", + "Ridge", + "RidgeCV", + "RidgeClassifier", + "RidgeClassifierCV", + "SGDClassifier", + "SGDOneClassSVM", + "SGDRegressor", + "TheilSenRegressor", + "TweedieRegressor", + "enet_path", + "lars_path", + "lars_path_gram", + "lasso_path", + "orthogonal_mp", + "orthogonal_mp_gram", + "ridge_regression", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c059e3fa84310e4bc022d43cf159eaed3aa752fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py @@ -0,0 +1,869 @@ +""" +Generalized Linear Models. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy import linalg, optimize, sparse +from scipy.sparse.linalg import lsqr +from scipy.special import expit + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + _fit_context, +) +from ..utils import check_array, check_random_state +from ..utils._array_api import ( + _asarray_with_order, + _average, + get_namespace, + get_namespace_and_device, + indexing_dtype, + supported_float_dtypes, +) +from ..utils._param_validation import Interval +from ..utils._seq_dataset import ( + ArrayDataset32, + ArrayDataset64, + CSRDataset32, + CSRDataset64, +) +from ..utils.extmath import safe_sparse_dot +from ..utils.parallel import Parallel, delayed +from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data + +# TODO: bayesian_ridge_regression and bayesian_regression_ard +# should be squashed into its respective objects. + +SPARSE_INTERCEPT_DECAY = 0.01 +# For sparse data intercept updates are scaled by this decay factor to avoid +# intercept oscillation. + + +def make_dataset(X, y, sample_weight, random_state=None): + """Create ``Dataset`` abstraction for sparse and dense inputs. + + This also returns the ``intercept_decay`` which is different + for sparse datasets. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data + + y : array-like, shape (n_samples, ) + Target values. + + sample_weight : numpy array of shape (n_samples,) + The weight of each sample + + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset random sampling. It is not + used for dataset shuffling. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + dataset + The ``Dataset`` abstraction + intercept_decay + The intercept decay + """ + + rng = check_random_state(random_state) + # seed should never be 0 in SequentialDataset64 + seed = rng.randint(1, np.iinfo(np.int32).max) + + if X.dtype == np.float32: + CSRData = CSRDataset32 + ArrayData = ArrayDataset32 + else: + CSRData = CSRDataset64 + ArrayData = ArrayDataset64 + + if sp.issparse(X): + dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight, seed=seed) + intercept_decay = SPARSE_INTERCEPT_DECAY + else: + X = np.ascontiguousarray(X) + dataset = ArrayData(X, y, sample_weight, seed=seed) + intercept_decay = 1.0 + + return dataset, intercept_decay + + +def _preprocess_data( + X, + y, + *, + fit_intercept, + copy=True, + copy_y=True, + sample_weight=None, + check_input=True, +): + """Common data preprocessing for fitting linear models. + + This helper is in charge of the following steps: + + - Ensure that `sample_weight` is an array or `None`. + - If `check_input=True`, perform standard input validation of `X`, `y`. + - Perform copies if requested to avoid side-effects in case of inplace + modifications of the input. + + Then, if `fit_intercept=True` this preprocessing centers both `X` and `y` as + follows: + - if `X` is dense, center the data and + store the mean vector in `X_offset`. + - if `X` is sparse, store the mean in `X_offset` + without centering `X`. The centering is expected to be handled by the + linear solver where appropriate. 
+ - in either case, always center `y` and store the mean in `y_offset`. + - both `X_offset` and `y_offset` are always weighted by `sample_weight` + if not set to `None`. + + If `fit_intercept=False`, no centering is performed and `X_offset`, `y_offset` + are set to zero. + + Returns + ------- + X_out : {ndarray, sparse matrix} of shape (n_samples, n_features) + If copy=True a copy of the input X is triggered, otherwise operations are + inplace. + If input X is dense, then X_out is centered. + y_out : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets) + Centered version of y. Possibly performed inplace on input y depending + on the copy_y parameter. + X_offset : ndarray of shape (n_features,) + The mean per column of input X. + y_offset : float or ndarray of shape (n_features,) + X_scale : ndarray of shape (n_features,) + Always an array of ones. TODO: refactor the code base to make it + possible to remove this unused variable. + """ + xp, _, device_ = get_namespace_and_device(X, y, sample_weight) + n_samples, n_features = X.shape + X_is_sparse = sp.issparse(X) + + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + sample_weight = xp.asarray(sample_weight) + + if check_input: + X = check_array( + X, copy=copy, accept_sparse=["csr", "csc"], dtype=supported_float_dtypes(xp) + ) + y = check_array(y, dtype=X.dtype, copy=copy_y, ensure_2d=False) + else: + y = xp.astype(y, X.dtype, copy=copy_y) + if copy: + if X_is_sparse: + X = X.copy() + else: + X = _asarray_with_order(X, order="K", copy=True, xp=xp) + + dtype_ = X.dtype + + if fit_intercept: + if X_is_sparse: + X_offset, X_var = mean_variance_axis(X, axis=0, weights=sample_weight) + else: + X_offset = _average(X, axis=0, weights=sample_weight, xp=xp) + + X_offset = xp.astype(X_offset, X.dtype, copy=False) + X -= X_offset + + y_offset = _average(y, axis=0, weights=sample_weight, xp=xp) + y -= y_offset + else: + X_offset = xp.zeros(n_features, dtype=X.dtype, device=device_) + if y.ndim == 1: + y_offset = xp.asarray(0.0, dtype=dtype_, device=device_) + else: + y_offset = xp.zeros(y.shape[1], dtype=dtype_, device=device_) + + # XXX: X_scale is no longer needed. It is an historic artifact from the + # time where linear model exposed the normalize parameter. + X_scale = xp.ones(n_features, dtype=X.dtype, device=device_) + return X, y, X_offset, y_offset, X_scale + + +# TODO: _rescale_data should be factored into _preprocess_data. +# Currently, the fact that sag implements its own way to deal with +# sample_weight makes the refactoring tricky. + + +def _rescale_data(X, y, sample_weight, inplace=False): + """Rescale data sample-wise by square root of sample_weight. + + For many linear models, this enables easy support for sample_weight because + + (y - X w)' S (y - X w) + + with S = diag(sample_weight) becomes + + ||y_rescaled - X_rescaled w||_2^2 + + when setting + + y_rescaled = sqrt(S) y + X_rescaled = sqrt(S) X + + Returns + ------- + X_rescaled : {array-like, sparse matrix} + + y_rescaled : {array-like, sparse matrix} + """ + # Assume that _validate_data and _check_sample_weight have been called by + # the caller. 
+ xp, _ = get_namespace(X, y, sample_weight) + n_samples = X.shape[0] + sample_weight_sqrt = xp.sqrt(sample_weight) + + if sp.issparse(X) or sp.issparse(y): + sw_matrix = sparse.dia_matrix( + (sample_weight_sqrt, 0), shape=(n_samples, n_samples) + ) + + if sp.issparse(X): + X = safe_sparse_dot(sw_matrix, X) + else: + if inplace: + X *= sample_weight_sqrt[:, None] + else: + X = X * sample_weight_sqrt[:, None] + + if sp.issparse(y): + y = safe_sparse_dot(sw_matrix, y) + else: + if inplace: + if y.ndim == 1: + y *= sample_weight_sqrt + else: + y *= sample_weight_sqrt[:, None] + else: + if y.ndim == 1: + y = y * sample_weight_sqrt + else: + y = y * sample_weight_sqrt[:, None] + return X, y, sample_weight_sqrt + + +class LinearModel(BaseEstimator, metaclass=ABCMeta): + """Base class for Linear Models""" + + @abstractmethod + def fit(self, X, y): + """Fit model.""" + + def _decision_function(self, X): + check_is_fitted(self) + + X = validate_data(self, X, accept_sparse=["csr", "csc", "coo"], reset=False) + coef_ = self.coef_ + if coef_.ndim == 1: + return X @ coef_ + self.intercept_ + else: + return X @ coef_.T + self.intercept_ + + def predict(self, X): + """ + Predict using the linear model. + + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values. + """ + return self._decision_function(X) + + def _set_intercept(self, X_offset, y_offset, X_scale): + """Set the intercept_""" + + xp, _ = get_namespace(X_offset, y_offset, X_scale) + + if self.fit_intercept: + # We always want coef_.dtype=X.dtype. For instance, X.dtype can differ from + # coef_.dtype if warm_start=True. + coef_ = xp.astype(self.coef_, X_scale.dtype, copy=False) + coef_ = self.coef_ = xp.divide(coef_, X_scale) + + if coef_.ndim == 1: + intercept_ = y_offset - X_offset @ coef_ + else: + intercept_ = y_offset - X_offset @ coef_.T + + self.intercept_ = intercept_ + + else: + self.intercept_ = 0.0 + + +# XXX Should this derive from LinearModel? It should be a mixin, not an ABC. +# Maybe the n_features checking can be moved to LinearModel. +class LinearClassifierMixin(ClassifierMixin): + """Mixin for linear classifiers. + + Handles prediction for sparse and dense X. + """ + + def decision_function(self, X): + """ + Predict confidence scores for samples. + + The confidence score for a sample is proportional to the signed + distance of that sample to the hyperplane. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix for which we want to get the confidence scores. + + Returns + ------- + scores : ndarray of shape (n_samples,) or (n_samples, n_classes) + Confidence scores per `(n_samples, n_classes)` combination. In the + binary case, confidence score for `self.classes_[1]` where >0 means + this class would be predicted. + """ + check_is_fitted(self) + xp, _ = get_namespace(X) + + X = validate_data(self, X, accept_sparse="csr", reset=False) + scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ + return ( + xp.reshape(scores, (-1,)) + if (scores.ndim > 1 and scores.shape[1] == 1) + else scores + ) + + def predict(self, X): + """ + Predict class labels for samples in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data matrix for which we want to get the predictions. 
+ + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Vector containing the class labels for each sample. + """ + xp, _ = get_namespace(X) + scores = self.decision_function(X) + if len(scores.shape) == 1: + indices = xp.astype(scores > 0, indexing_dtype(xp)) + else: + indices = xp.argmax(scores, axis=1) + + return xp.take(self.classes_, indices, axis=0) + + def _predict_proba_lr(self, X): + """Probability estimation for OvR logistic regression. + + Positive class probabilities are computed as + 1. / (1. + np.exp(-self.decision_function(X))); + multiclass is handled by normalizing that over all classes. + """ + prob = self.decision_function(X) + expit(prob, out=prob) + if prob.ndim == 1: + return np.vstack([1 - prob, prob]).T + else: + # OvR normalization, like LibLinear's predict_probability + prob /= prob.sum(axis=1).reshape((prob.shape[0], -1)) + return prob + + +class SparseCoefMixin: + """Mixin for converting coef_ to and from CSR format. + + L1-regularizing estimators should inherit this. + """ + + def densify(self): + """ + Convert coefficient matrix to dense array format. + + Converts the ``coef_`` member (back) to a numpy.ndarray. This is the + default format of ``coef_`` and is required for fitting, so calling + this method is only required on models that have previously been + sparsified; otherwise, it is a no-op. + + Returns + ------- + self + Fitted estimator. + """ + msg = "Estimator, %(name)s, must be fitted before densifying." + check_is_fitted(self, msg=msg) + if sp.issparse(self.coef_): + self.coef_ = self.coef_.toarray() + return self + + def sparsify(self): + """ + Convert coefficient matrix to sparse format. + + Converts the ``coef_`` member to a scipy.sparse matrix, which for + L1-regularized models can be much more memory- and storage-efficient + than the usual numpy.ndarray representation. + + The ``intercept_`` member is not converted. + + Returns + ------- + self + Fitted estimator. + + Notes + ----- + For non-sparse models, i.e. when there are not many zeros in ``coef_``, + this may actually *increase* memory usage, so use this method with + care. A rule of thumb is that the number of zero elements, which can + be computed with ``(coef_ == 0).sum()``, must be more than 50% for this + to provide significant benefits. + + After calling this method, further fitting with the partial_fit + method (if any) will not work until you call densify. + """ + msg = "Estimator, %(name)s, must be fitted before sparsifying." + check_is_fitted(self, msg=msg) + self.coef_ = sp.csr_matrix(self.coef_) + return self + + +class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): + """ + Ordinary least squares Linear Regression. + + LinearRegression fits a linear model with coefficients w = (w1, ..., wp) + to minimize the residual sum of squares between the observed targets in + the dataset, and the targets predicted by the linear approximation. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + tol : float, default=1e-6 + The precision of the solution (`coef_`) is determined by `tol` which + specifies a different convergence criterion for the `lsqr` solver. + `tol` is set as `atol` and `btol` of `scipy.sparse.linalg.lsqr` when + fitting on sparse training data. 
This parameter has no effect when fitting + on dense data. + + .. versionadded:: 1.7 + + n_jobs : int, default=None + The number of jobs to use for the computation. This will only provide + speedup in case of sufficiently large problems, that is if firstly + `n_targets > 1` and secondly `X` is sparse or if `positive` is set + to `True`. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. This + option is only supported for dense arrays. + + For a comparison between a linear regression model with positive constraints + on the regression coefficients and a linear regression without such constraints, + see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. + + .. versionadded:: 0.24 + + Attributes + ---------- + coef_ : array of shape (n_features, ) or (n_targets, n_features) + Estimated coefficients for the linear regression problem. + If multiple targets are passed during the fit (y 2D), this + is a 2D array of shape (n_targets, n_features), while if only + one target is passed, this is a 1D array of length n_features. + + rank_ : int + Rank of matrix `X`. Only available when `X` is dense. + + singular_ : array of shape (min(X, y),) + Singular values of `X`. Only available when `X` is dense. + + intercept_ : float or array of shape (n_targets,) + Independent term in the linear model. Set to 0.0 if + `fit_intercept = False`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Ridge : Ridge regression addresses some of the + problems of Ordinary Least Squares by imposing a penalty on the + size of the coefficients with l2 regularization. + Lasso : The Lasso is a linear model that estimates + sparse coefficients with l1 regularization. + ElasticNet : Elastic-Net is a linear regression + model trained with both l1 and l2 -norm regularization of the + coefficients. + + Notes + ----- + From the implementation point of view, this is just plain Ordinary + Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares + (scipy.optimize.nnls) wrapped as a predictor object. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import LinearRegression + >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) + >>> # y = 1 * x_0 + 2 * x_1 + 3 + >>> y = np.dot(X, np.array([1, 2])) + 3 + >>> reg = LinearRegression().fit(X, y) + >>> reg.score(X, y) + 1.0 + >>> reg.coef_ + array([1., 2.]) + >>> reg.intercept_ + np.float64(3.0) + >>> reg.predict(np.array([[3, 5]])) + array([16.]) + """ + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "n_jobs": [None, Integral], + "positive": ["boolean"], + "tol": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + *, + fit_intercept=True, + copy_X=True, + tol=1e-6, + n_jobs=None, + positive=False, + ): + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.tol = tol + self.n_jobs = n_jobs + self.positive = positive + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """ + Fit linear model. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.17 + parameter *sample_weight* support to LinearRegression. + + Returns + ------- + self : object + Fitted Estimator. + """ + n_jobs_ = self.n_jobs + + accept_sparse = False if self.positive else ["csr", "csc", "coo"] + + X, y = validate_data( + self, + X, + y, + accept_sparse=accept_sparse, + y_numeric=True, + multi_output=True, + force_writeable=True, + ) + + has_sw = sample_weight is not None + if has_sw: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, ensure_non_negative=True + ) + + # Note that neither _rescale_data nor the rest of the fit method of + # LinearRegression can benefit from in-place operations when X is a + # sparse matrix. Therefore, let's not copy X when it is sparse. + copy_X_in_preprocess_data = self.copy_X and not sp.issparse(X) + + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=copy_X_in_preprocess_data, + sample_weight=sample_weight, + ) + + if has_sw: + # Sample weight can be implemented via a simple rescaling. Note + # that we safely do inplace rescaling when _preprocess_data has + # already made a copy if requested. + X, y, sample_weight_sqrt = _rescale_data( + X, y, sample_weight, inplace=copy_X_in_preprocess_data + ) + + if self.positive: + if y.ndim < 2: + self.coef_ = optimize.nnls(X, y)[0] + else: + # scipy.optimize.nnls cannot handle y with shape (M, K) + outs = Parallel(n_jobs=n_jobs_)( + delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1]) + ) + self.coef_ = np.vstack([out[0] for out in outs]) + elif sp.issparse(X): + X_offset_scale = X_offset / X_scale + + if has_sw: + + def matvec(b): + return X.dot(b) - sample_weight_sqrt * b.dot(X_offset_scale) + + def rmatvec(b): + return X.T.dot(b) - X_offset_scale * b.dot(sample_weight_sqrt) + + else: + + def matvec(b): + return X.dot(b) - b.dot(X_offset_scale) + + def rmatvec(b): + return X.T.dot(b) - X_offset_scale * b.sum() + + X_centered = sparse.linalg.LinearOperator( + shape=X.shape, matvec=matvec, rmatvec=rmatvec + ) + + if y.ndim < 2: + self.coef_ = lsqr(X_centered, y, atol=self.tol, btol=self.tol)[0] + else: + # sparse_lstsq cannot handle y with shape (M, K) + outs = Parallel(n_jobs=n_jobs_)( + delayed(lsqr)( + X_centered, y[:, j].ravel(), atol=self.tol, btol=self.tol + ) + for j in range(y.shape[1]) + ) + self.coef_ = np.vstack([out[0] for out in outs]) + else: + # cut-off ratio for small singular values + cond = max(X.shape) * np.finfo(X.dtype).eps + self.coef_, _, self.rank_, self.singular_ = linalg.lstsq(X, y, cond=cond) + self.coef_ = self.coef_.T + + if y.ndim == 1: + self.coef_ = np.ravel(self.coef_) + self._set_intercept(X_offset, y_offset, X_scale) + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = not self.positive + return tags + + +def _check_precomputed_gram_matrix( + X, precompute, X_offset, X_scale, rtol=None, atol=1e-5 +): + """Computes a single element of the gram matrix and compares it to + the corresponding element of the user supplied gram matrix. + + If the values do not match a ValueError will be thrown. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data array. + + precompute : array-like of shape (n_features, n_features) + User-supplied gram matrix. + + X_offset : ndarray of shape (n_features,) + Array of feature means used to center design matrix. + + X_scale : ndarray of shape (n_features,) + Array of feature scale factors used to normalize design matrix. + + rtol : float, default=None + Relative tolerance; see numpy.allclose + If None, it is set to 1e-4 for arrays of dtype numpy.float32 and 1e-7 + otherwise. + + atol : float, default=1e-5 + absolute tolerance; see :func`numpy.allclose`. Note that the default + here is more tolerant than the default for + :func:`numpy.testing.assert_allclose`, where `atol=0`. + + Raises + ------ + ValueError + Raised when the provided Gram matrix is not consistent. + """ + + n_features = X.shape[1] + f1 = n_features // 2 + f2 = min(f1 + 1, n_features - 1) + + v1 = (X[:, f1] - X_offset[f1]) * X_scale[f1] + v2 = (X[:, f2] - X_offset[f2]) * X_scale[f2] + + expected = np.dot(v1, v2) + actual = precompute[f1, f2] + + dtypes = [precompute.dtype, expected.dtype] + if rtol is None: + rtols = [1e-4 if dtype == np.float32 else 1e-7 for dtype in dtypes] + rtol = max(rtols) + + if not np.isclose(expected, actual, rtol=rtol, atol=atol): + raise ValueError( + "Gram matrix passed in via 'precompute' parameter " + "did not pass validation when a single element was " + "checked - please check that it was computed " + f"properly. For element ({f1},{f2}) we computed " + f"{expected} but the user-supplied value was " + f"{actual}." + ) + + +def _pre_fit( + X, + y, + Xy, + precompute, + fit_intercept, + copy, + check_input=True, + sample_weight=None, +): + """Function used at beginning of fit in linear models with L1 or L0 penalty. + + This function applies _preprocess_data and additionally computes the gram matrix + `precompute` as needed as well as `Xy`. + """ + n_samples, n_features = X.shape + + if sparse.issparse(X): + # copy is not needed here as X is not modified inplace when X is sparse + precompute = False + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=fit_intercept, + copy=False, + check_input=check_input, + sample_weight=sample_weight, + ) + else: + # copy was done in fit if necessary + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=fit_intercept, + copy=copy, + check_input=check_input, + sample_weight=sample_weight, + ) + # Rescale only in dense case. Sparse cd solver directly deals with + # sample_weight. + if sample_weight is not None: + # This triggers copies anyway. + X, y, _ = _rescale_data(X, y, sample_weight=sample_weight) + + if hasattr(precompute, "__array__"): + if fit_intercept and not np.allclose(X_offset, np.zeros(n_features)): + warnings.warn( + ( + "Gram matrix was provided but X was centered to fit " + "intercept: recomputing Gram matrix." + ), + UserWarning, + ) + # TODO: instead of warning and recomputing, we could just center + # the user provided Gram matrix a-posteriori (after making a copy + # when `copy=True`). + # recompute Gram + precompute = "auto" + Xy = None + elif check_input: + # If we're going to use the user's precomputed gram matrix, we + # do a quick check to make sure its not totally bogus. 
+ _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale) + + # precompute if n_samples > n_features + if isinstance(precompute, str) and precompute == "auto": + precompute = n_samples > n_features + + if precompute is True: + # make sure that the 'precompute' array is contiguous. + precompute = np.empty(shape=(n_features, n_features), dtype=X.dtype, order="C") + np.dot(X.T, X, out=precompute) + + if not hasattr(precompute, "__array__"): + Xy = None # cannot use Xy if precompute is not Gram + + if hasattr(precompute, "__array__") and Xy is None: + common_dtype = np.result_type(X.dtype, y.dtype) + if y.ndim == 1: + # Xy is 1d, make sure it is contiguous. + Xy = np.empty(shape=n_features, dtype=common_dtype, order="C") + np.dot(X.T, y, out=Xy) + else: + # Make sure that Xy is always F contiguous even if X or y are not + # contiguous: the goal is to make it fast to extract the data for a + # specific target. + n_targets = y.shape[1] + Xy = np.empty(shape=(n_features, n_targets), dtype=common_dtype, order="F") + np.dot(y.T, X, out=Xy.T) + + return X, y, X_offset, y_offset, X_scale, precompute, Xy diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_bayes.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_bayes.py new file mode 100644 index 0000000000000000000000000000000000000000..e519660323d80f8d8f7f607451f59d26ecb62f19 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_bayes.py @@ -0,0 +1,826 @@ +""" +Various bayesian regression +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from math import log +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.linalg import pinvh + +from ..base import RegressorMixin, _fit_context +from ..utils import _safe_indexing +from ..utils._param_validation import Interval +from ..utils.extmath import fast_logdet +from ..utils.validation import _check_sample_weight, validate_data +from ._base import LinearModel, _preprocess_data, _rescale_data + +############################################################################### +# BayesianRidge regression + + +class BayesianRidge(RegressorMixin, LinearModel): + """Bayesian ridge regression. + + Fit a Bayesian ridge model. See the Notes section for details on this + implementation and the optimization of the regularization parameters + lambda (precision of the weights) and alpha (precision of the noise). + + Read more in the :ref:`User Guide `. + For an intuitive visualization of how the sinusoid is approximated by + a polynomial using different pairs of initial values, see + :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`. + + Parameters + ---------- + max_iter : int, default=300 + Maximum number of iterations over the complete dataset before + stopping independently of any early stopping criterion. + + .. versionchanged:: 1.3 + + tol : float, default=1e-3 + Stop the algorithm if w has converged. + + alpha_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the alpha parameter. + + alpha_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the alpha parameter. + + lambda_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the lambda parameter. 
+ + lambda_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the lambda parameter. + + alpha_init : float, default=None + Initial value for alpha (precision of the noise). + If not set, alpha_init is 1/Var(y). + + .. versionadded:: 0.22 + + lambda_init : float, default=None + Initial value for lambda (precision of the weights). + If not set, lambda_init is 1. + + .. versionadded:: 0.22 + + compute_score : bool, default=False + If True, compute the log marginal likelihood at each iteration of the + optimization. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. + The intercept is not treated as a probabilistic parameter + and thus has no associated variance. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + verbose : bool, default=False + Verbose mode when fitting the model. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + Coefficients of the regression model (mean of distribution) + + intercept_ : float + Independent term in decision function. Set to 0.0 if + `fit_intercept = False`. + + alpha_ : float + Estimated precision of the noise. + + lambda_ : float + Estimated precision of the weights. + + sigma_ : array-like of shape (n_features, n_features) + Estimated variance-covariance matrix of the weights + + scores_ : array-like of shape (n_iter_+1,) + If computed_score is True, value of the log marginal likelihood (to be + maximized) at each iteration of the optimization. The array starts + with the value of the log marginal likelihood obtained for the initial + values of alpha and lambda and ends with the value obtained for the + estimated alpha and lambda. + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + X_offset_ : ndarray of shape (n_features,) + If `fit_intercept=True`, offset subtracted for centering data to a + zero mean. Set to np.zeros(n_features) otherwise. + + X_scale_ : ndarray of shape (n_features,) + Set to np.ones(n_features). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + ARDRegression : Bayesian ARD regression. + + Notes + ----- + There exist several strategies to perform Bayesian ridge regression. This + implementation is based on the algorithm described in Appendix A of + (Tipping, 2001) where updates of the regularization parameters are done as + suggested in (MacKay, 1992). Note that according to A New + View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these + update rules do not guarantee that the marginal likelihood is increasing + between two consecutive iterations of the optimization. + + References + ---------- + D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems, + Vol. 4, No. 3, 1992. + + M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, + Journal of Machine Learning Research, Vol. 1, 2001. 
+ + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.BayesianRidge() + >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) + BayesianRidge() + >>> clf.predict([[1, 1]]) + array([1.]) + """ + + _parameter_constraints: dict = { + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="neither")], + "alpha_1": [Interval(Real, 0, None, closed="left")], + "alpha_2": [Interval(Real, 0, None, closed="left")], + "lambda_1": [Interval(Real, 0, None, closed="left")], + "lambda_2": [Interval(Real, 0, None, closed="left")], + "alpha_init": [None, Interval(Real, 0, None, closed="left")], + "lambda_init": [None, Interval(Real, 0, None, closed="left")], + "compute_score": ["boolean"], + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + max_iter=300, + tol=1.0e-3, + alpha_1=1.0e-6, + alpha_2=1.0e-6, + lambda_1=1.0e-6, + lambda_2=1.0e-6, + alpha_init=None, + lambda_init=None, + compute_score=False, + fit_intercept=True, + copy_X=True, + verbose=False, + ): + self.max_iter = max_iter + self.tol = tol + self.alpha_1 = alpha_1 + self.alpha_2 = alpha_2 + self.lambda_1 = lambda_1 + self.lambda_2 = lambda_2 + self.alpha_init = alpha_init + self.lambda_init = lambda_init + self.compute_score = compute_score + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : ndarray of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.20 + parameter *sample_weight* support to BayesianRidge. + + Returns + ------- + self : object + Returns the instance itself. + """ + X, y = validate_data( + self, + X, + y, + dtype=[np.float64, np.float32], + force_writeable=True, + y_numeric=True, + ) + dtype = X.dtype + n_samples, n_features = X.shape + + sw_sum = n_samples + y_var = y.var() + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype) + sw_sum = sample_weight.sum() + y_mean = np.average(y, weights=sample_weight) + y_var = np.average((y - y_mean) ** 2, weights=sample_weight) + + X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=self.copy_X, + sample_weight=sample_weight, + ) + + if sample_weight is not None: + # Sample weight can be implemented via a simple rescaling. + X, y, _ = _rescale_data(X, y, sample_weight) + + self.X_offset_ = X_offset_ + self.X_scale_ = X_scale_ + + # Initialization of the values of the parameters + eps = np.finfo(np.float64).eps + # Add `eps` in the denominator to omit division by zero + alpha_ = self.alpha_init + lambda_ = self.lambda_init + if alpha_ is None: + alpha_ = 1.0 / (y_var + eps) + if lambda_ is None: + lambda_ = 1.0 + + # Avoid unintended type promotion to float64 with numpy 2 + alpha_ = np.asarray(alpha_, dtype=dtype) + lambda_ = np.asarray(lambda_, dtype=dtype) + + verbose = self.verbose + lambda_1 = self.lambda_1 + lambda_2 = self.lambda_2 + alpha_1 = self.alpha_1 + alpha_2 = self.alpha_2 + + self.scores_ = list() + coef_old_ = None + + XT_y = np.dot(X.T, y) + # Let M, N = n_samples, n_features and K = min(M, N). 
+ # The posterior covariance matrix needs Vh_full: (N, N). + # The full SVD is only required when n_samples < n_features. + # When n_samples < n_features, K=M and full_matrices=True + # U: (M, M), S: M, Vh_full: (N, N), Vh: (M, N) + # When n_samples > n_features, K=N and full_matrices=False + # U: (M, N), S: N, Vh_full: (N, N), Vh: (N, N) + U, S, Vh_full = linalg.svd(X, full_matrices=(n_samples < n_features)) + K = len(S) + eigen_vals_ = S**2 + eigen_vals_full = np.zeros(n_features, dtype=dtype) + eigen_vals_full[0:K] = eigen_vals_ + Vh = Vh_full[0:K, :] + + # Convergence loop of the bayesian ridge regression + for iter_ in range(self.max_iter): + # update posterior mean coef_ based on alpha_ and lambda_ and + # compute corresponding sse (sum of squared errors) + coef_, sse_ = self._update_coef_( + X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ + ) + if self.compute_score: + # compute the log marginal likelihood + s = self._log_marginal_likelihood( + n_samples, + n_features, + sw_sum, + eigen_vals_, + alpha_, + lambda_, + coef_, + sse_, + ) + self.scores_.append(s) + + # Update alpha and lambda according to (MacKay, 1992) + gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_)) + lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2) + alpha_ = (sw_sum - gamma_ + 2 * alpha_1) / (sse_ + 2 * alpha_2) + + # Check for convergence + if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: + if verbose: + print("Convergence after ", str(iter_), " iterations") + break + coef_old_ = np.copy(coef_) + + self.n_iter_ = iter_ + 1 + + # return regularization parameters and corresponding posterior mean, + # log marginal likelihood and posterior covariance + self.alpha_ = alpha_ + self.lambda_ = lambda_ + self.coef_, sse_ = self._update_coef_( + X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ + ) + if self.compute_score: + # compute the log marginal likelihood + s = self._log_marginal_likelihood( + n_samples, + n_features, + sw_sum, + eigen_vals_, + alpha_, + lambda_, + coef_, + sse_, + ) + self.scores_.append(s) + self.scores_ = np.array(self.scores_) + + # posterior covariance + self.sigma_ = np.dot( + Vh_full.T, Vh_full / (alpha_ * eigen_vals_full + lambda_)[:, np.newaxis] + ) + + self._set_intercept(X_offset_, y_offset_, X_scale_) + + return self + + def predict(self, X, return_std=False): + """Predict using the linear model. + + In addition to the mean of the predictive distribution, also its + standard deviation can be returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + return_std : bool, default=False + Whether to return the standard deviation of posterior prediction. + + Returns + ------- + y_mean : array-like of shape (n_samples,) + Mean of predictive distribution of query points. + + y_std : array-like of shape (n_samples,) + Standard deviation of predictive distribution of query points. + """ + y_mean = self._decision_function(X) + if not return_std: + return y_mean + else: + sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1) + y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_)) + return y_mean, y_std + + def _update_coef_( + self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_ + ): + """Update posterior mean and compute corresponding sse (sum of squared errors). 
+ + Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where + scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features) + + np.dot(X.T, X))^-1 + """ + + if n_samples > n_features: + coef_ = np.linalg.multi_dot( + [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y] + ) + else: + coef_ = np.linalg.multi_dot( + [X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y] + ) + + # Note: we do not need to explicitly use the weights in this sum because + # y and X were preprocessed by _rescale_data to handle the weights. + sse_ = np.sum((y - np.dot(X, coef_)) ** 2) + + return coef_, sse_ + + def _log_marginal_likelihood( + self, n_samples, n_features, sw_sum, eigen_vals, alpha_, lambda_, coef, sse + ): + """Log marginal likelihood.""" + alpha_1 = self.alpha_1 + alpha_2 = self.alpha_2 + lambda_1 = self.lambda_1 + lambda_2 = self.lambda_2 + + # compute the log of the determinant of the posterior covariance. + # posterior covariance is given by + # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1 + if n_samples > n_features: + logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals)) + else: + logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype) + logdet_sigma[:n_samples] += alpha_ * eigen_vals + logdet_sigma = -np.sum(np.log(logdet_sigma)) + + score = lambda_1 * log(lambda_) - lambda_2 * lambda_ + score += alpha_1 * log(alpha_) - alpha_2 * alpha_ + score += 0.5 * ( + n_features * log(lambda_) + + sw_sum * log(alpha_) + - alpha_ * sse + - lambda_ * np.sum(coef**2) + + logdet_sigma + - sw_sum * log(2 * np.pi) + ) + + return score + + +############################################################################### +# ARD (Automatic Relevance Determination) regression + + +class ARDRegression(RegressorMixin, LinearModel): + """Bayesian ARD regression. + + Fit the weights of a regression model, using an ARD prior. The weights of + the regression model are assumed to be in Gaussian distributions. + Also estimate the parameters lambda (precisions of the distributions of the + weights) and alpha (precision of the distribution of the noise). + The estimation is done by an iterative procedures (Evidence Maximization) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + max_iter : int, default=300 + Maximum number of iterations. + + .. versionchanged:: 1.3 + + tol : float, default=1e-3 + Stop the algorithm if w has converged. + + alpha_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the alpha parameter. + + alpha_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the alpha parameter. + + lambda_1 : float, default=1e-6 + Hyper-parameter : shape parameter for the Gamma distribution prior + over the lambda parameter. + + lambda_2 : float, default=1e-6 + Hyper-parameter : inverse scale parameter (rate parameter) for the + Gamma distribution prior over the lambda parameter. + + compute_score : bool, default=False + If True, compute the objective function at each step of the model. + + threshold_lambda : float, default=10 000 + Threshold for removing (pruning) weights with high precision from + the computation. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. 
+ + verbose : bool, default=False + Verbose mode when fitting the model. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + Coefficients of the regression model (mean of distribution) + + alpha_ : float + estimated precision of the noise. + + lambda_ : array-like of shape (n_features,) + estimated precisions of the weights. + + sigma_ : array-like of shape (n_features, n_features) + estimated variance-covariance matrix of the weights + + scores_ : float + if computed, value of the objective function (to be maximized) + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + .. versionadded:: 1.3 + + intercept_ : float + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + X_offset_ : float + If `fit_intercept=True`, offset subtracted for centering data to a + zero mean. Set to np.zeros(n_features) otherwise. + + X_scale_ : float + Set to np.ones(n_features). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + BayesianRidge : Bayesian ridge regression. + + References + ---------- + D. J. C. MacKay, Bayesian nonlinear modeling for the prediction + competition, ASHRAE Transactions, 1994. + + R. Salakhutdinov, Lecture notes on Statistical Machine Learning, + http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15 + Their beta is our ``self.alpha_`` + Their alpha is our ``self.lambda_`` + ARD is a little different than the slide: only dimensions/features for + which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are + discarded. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.ARDRegression() + >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) + ARDRegression() + >>> clf.predict([[1, 1]]) + array([1.]) + + - :ref:`sphx_glr_auto_examples_linear_model_plot_ard.py` demonstrates ARD + Regression. + - :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` + showcases ARD Regression alongside Lasso and Elastic-Net for sparse, + correlated signals, in the presence of noise. 
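+
+    The predictive standard deviation is also available (editorial example,
+    not part of the upstream docstring):
+
+    >>> y_mean, y_std = clf.predict([[1, 1]], return_std=True)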
+ """ + + _parameter_constraints: dict = { + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "alpha_1": [Interval(Real, 0, None, closed="left")], + "alpha_2": [Interval(Real, 0, None, closed="left")], + "lambda_1": [Interval(Real, 0, None, closed="left")], + "lambda_2": [Interval(Real, 0, None, closed="left")], + "compute_score": ["boolean"], + "threshold_lambda": [Interval(Real, 0, None, closed="left")], + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + max_iter=300, + tol=1.0e-3, + alpha_1=1.0e-6, + alpha_2=1.0e-6, + lambda_1=1.0e-6, + lambda_2=1.0e-6, + compute_score=False, + threshold_lambda=1.0e4, + fit_intercept=True, + copy_X=True, + verbose=False, + ): + self.max_iter = max_iter + self.tol = tol + self.fit_intercept = fit_intercept + self.alpha_1 = alpha_1 + self.alpha_2 = alpha_2 + self.lambda_1 = lambda_1 + self.lambda_2 = lambda_2 + self.compute_score = compute_score + self.threshold_lambda = threshold_lambda + self.copy_X = copy_X + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model according to the given training data and parameters. + + Iterative procedure to maximize the evidence + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + y : array-like of shape (n_samples,) + Target values (integers). Will be cast to X's dtype if necessary. + + Returns + ------- + self : object + Fitted estimator. + """ + X, y = validate_data( + self, + X, + y, + dtype=[np.float64, np.float32], + force_writeable=True, + y_numeric=True, + ensure_min_samples=2, + ) + dtype = X.dtype + + n_samples, n_features = X.shape + coef_ = np.zeros(n_features, dtype=dtype) + + X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=self.copy_X + ) + + self.X_offset_ = X_offset_ + self.X_scale_ = X_scale_ + + # Launch the convergence loop + keep_lambda = np.ones(n_features, dtype=bool) + + lambda_1 = self.lambda_1 + lambda_2 = self.lambda_2 + alpha_1 = self.alpha_1 + alpha_2 = self.alpha_2 + verbose = self.verbose + + # Initialization of the values of the parameters + eps = np.finfo(np.float64).eps + # Add `eps` in the denominator to omit division by zero if `np.var(y)` + # is zero. + # Explicitly set dtype to avoid unintended type promotion with numpy 2. 
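+        # Editorial note: alpha_ is the scalar noise precision, initialized to
+        # 1 / Var(y); lambda_ holds one precision per feature, all initialized to 1.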
+ alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype) + lambda_ = np.ones(n_features, dtype=dtype) + + self.scores_ = list() + coef_old_ = None + + def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): + coef_[keep_lambda] = alpha_ * np.linalg.multi_dot( + [sigma_, X[:, keep_lambda].T, y] + ) + return coef_ + + update_sigma = ( + self._update_sigma + if n_samples >= n_features + else self._update_sigma_woodbury + ) + # Iterative procedure of ARDRegression + for iter_ in range(self.max_iter): + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) + coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + + # Update alpha and lambda + sse_ = np.sum((y - np.dot(X, coef_)) ** 2) + gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_) + lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / ( + (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2 + ) + alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (sse_ + 2.0 * alpha_2) + + # Prune the weights with a precision over a threshold + keep_lambda = lambda_ < self.threshold_lambda + coef_[~keep_lambda] = 0 + + # Compute the objective function + if self.compute_score: + s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum() + s += alpha_1 * log(alpha_) - alpha_2 * alpha_ + s += 0.5 * ( + fast_logdet(sigma_) + + n_samples * log(alpha_) + + np.sum(np.log(lambda_)) + ) + s -= 0.5 * (alpha_ * sse_ + (lambda_ * coef_**2).sum()) + self.scores_.append(s) + + # Check for convergence + if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: + if verbose: + print("Converged after %s iterations" % iter_) + break + coef_old_ = np.copy(coef_) + + if not keep_lambda.any(): + break + + self.n_iter_ = iter_ + 1 + + if keep_lambda.any(): + # update sigma and mu using updated params from the last iteration + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) + coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + else: + sigma_ = np.array([]).reshape(0, 0) + + self.coef_ = coef_ + self.alpha_ = alpha_ + self.sigma_ = sigma_ + self.lambda_ = lambda_ + self._set_intercept(X_offset_, y_offset_, X_scale_) + return self + + def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples < n_features and will invert + # a matrix of shape (n_samples, n_samples) making use of the + # woodbury formula: + # https://en.wikipedia.org/wiki/Woodbury_matrix_identity + n_samples = X.shape[0] + X_keep = X[:, keep_lambda] + inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1) + sigma_ = pinvh( + np.eye(n_samples, dtype=X.dtype) / alpha_ + + np.dot(X_keep * inv_lambda, X_keep.T) + ) + sigma_ = np.dot(sigma_, X_keep * inv_lambda) + sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_) + sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda] + return sigma_ + + def _update_sigma(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples >= n_features and will + # invert a matrix of shape (n_features, n_features) + X_keep = X[:, keep_lambda] + gram = np.dot(X_keep.T, X_keep) + eye = np.eye(gram.shape[0], dtype=X.dtype) + sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram + sigma_ = pinvh(sigma_inv) + return sigma_ + + def predict(self, X, return_std=False): + """Predict using the linear model. + + In addition to the mean of the predictive distribution, also its + standard deviation can be returned. 
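+        Features pruned during fit (those with ``lambda_ >= threshold_lambda``)
+        are excluded when computing this standard deviation.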
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + return_std : bool, default=False + Whether to return the standard deviation of posterior prediction. + + Returns + ------- + y_mean : array-like of shape (n_samples,) + Mean of predictive distribution of query points. + + y_std : array-like of shape (n_samples,) + Standard deviation of predictive distribution of query points. + """ + y_mean = self._decision_function(X) + if return_std is False: + return y_mean + else: + col_index = self.lambda_ < self.threshold_lambda + X = _safe_indexing(X, indices=col_index, axis=1) + sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1) + y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_)) + return y_mean, y_std diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_cd_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_cd_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..ce598ebb011d216ffdbbd70cbed507ad14bdb848 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_cd_fast.pyx @@ -0,0 +1,962 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.math cimport fabs +import numpy as np + +from cython cimport floating +import warnings +from ..exceptions import ConvergenceWarning + +from ..utils._cython_blas cimport ( + _axpy, _dot, _asum, _gemv, _nrm2, _copy, _scal +) +from ..utils._cython_blas cimport ColMajor, Trans, NoTrans +from ..utils._typedefs cimport uint32_t +from ..utils._random cimport our_rand_r + + +# The following two functions are shamelessly copied from the tree code. + +cdef enum: + # Max value for our rand_r replacement (near the bottom). + # We don't use RAND_MAX because it's different across platforms and + # particularly tiny on Windows/MSVC. + # It corresponds to the maximum representable value for + # 32-bit signed integers (i.e. 2^31 - 1). + RAND_R_MAX = 2147483647 + + +cdef inline uint32_t rand_int(uint32_t end, uint32_t* random_state) noexcept nogil: + """Generate a random integer in [0; end).""" + return our_rand_r(random_state) % end + + +cdef inline floating fmax(floating x, floating y) noexcept nogil: + if x > y: + return x + return y + + +cdef inline floating fsign(floating f) noexcept nogil: + if f == 0: + return 0 + elif f > 0: + return 1.0 + else: + return -1.0 + + +cdef floating abs_max(int n, const floating* a) noexcept nogil: + """np.max(np.abs(a))""" + cdef int i + cdef floating m = fabs(a[0]) + cdef floating d + for i in range(1, n): + d = fabs(a[i]) + if d > m: + m = d + return m + + +cdef floating max(int n, floating* a) noexcept nogil: + """np.max(a)""" + cdef int i + cdef floating m = a[0] + cdef floating d + for i in range(1, n): + d = a[i] + if d > m: + m = d + return m + + +cdef floating diff_abs_max(int n, const floating* a, floating* b) noexcept nogil: + """np.max(np.abs(a - b))""" + cdef int i + cdef floating m = fabs(a[0] - b[0]) + cdef floating d + for i in range(1, n): + d = fabs(a[i] - b[i]) + if d > m: + m = d + return m + + +message_conv = ( + "Objective did not converge. You might want to increase " + "the number of iterations, check the scale of the " + "features or consider increasing regularisation." +) + + +message_ridge = ( + "Linear regression models with a zero l1 penalization " + "strength are more efficiently fitted using one of the " + "solvers implemented in " + "sklearn.linear_model.Ridge/RidgeCV instead." 
+) + + +def enet_coordinate_descent( + floating[::1] w, + floating alpha, + floating beta, + const floating[::1, :] X, + const floating[::1] y, + unsigned int max_iter, + floating tol, + object rng, + bint random=0, + bint positive=0 +): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + + We minimize + + (1/2) * norm(y - X w, 2)^2 + alpha norm(w, 1) + (beta/2) norm(w, 2)^2 + + Returns + ------- + w : ndarray of shape (n_features,) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. + """ + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_samples = X.shape[0] + cdef unsigned int n_features = X.shape[1] + + # compute norms of the columns of X + cdef floating[::1] norm_cols_X = np.square(X).sum(axis=0) + + # initial value of the residuals + cdef floating[::1] R = np.empty(n_samples, dtype=dtype) + cdef floating[::1] XtA = np.empty(n_features, dtype=dtype) + + cdef floating tmp + cdef floating w_ii + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating dual_norm_XtA + cdef floating R_norm2 + cdef floating w_norm2 + cdef floating l1_norm + cdef floating const_ + cdef floating A_norm2 + cdef unsigned int ii + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + + if alpha == 0 and beta == 0: + warnings.warn("Coordinate descent with no regularization may lead to " + "unexpected results and is discouraged.") + + with nogil: + # R = y - np.dot(X, w) + _copy(n_samples, &y[0], 1, &R[0], 1) + _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0], + n_samples, &w[0], 1, 1.0, &R[0], 1) + + # tol *= np.dot(y, y) + tol *= _dot(n_samples, &y[0], 1, &y[0], 1) + + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if norm_cols_X[ii] == 0.0: + continue + + w_ii = w[ii] # Store previous value + + if w_ii != 0.0: + # R += w_ii * X[:,ii] + _axpy(n_samples, w_ii, &X[0, ii], 1, &R[0], 1) + + # tmp = (X[:,ii]*R).sum() + tmp = _dot(n_samples, &X[0, ii], 1, &R[0], 1) + + if positive and tmp < 0: + w[ii] = 0.0 + else: + w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0) + / (norm_cols_X[ii] + beta)) + + if w[ii] != 0.0: + # R -= w[ii] * X[:,ii] # Update residual + _axpy(n_samples, -w[ii], &X[0, ii], 1, &R[0], 1) + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + d_w_max = fmax(d_w_max, d_w_ii) + + w_max = fmax(w_max, fabs(w[ii])) + + if ( + w_max == 0.0 + or d_w_max / w_max < d_w_tol + or n_iter == max_iter - 1 + ): + # the biggest coordinate update of this iteration was smaller + # than the tolerance: check the duality gap as ultimate + # stopping criterion + + # XtA = np.dot(X.T, R) - beta * w + _copy(n_features, &w[0], 1, &XtA[0], 1) + _gemv(ColMajor, Trans, + n_samples, n_features, 1.0, &X[0, 0], n_samples, + &R[0], 1, + -beta, &XtA[0], 1) + + if positive: + dual_norm_XtA = max(n_features, &XtA[0]) + else: + dual_norm_XtA = abs_max(n_features, &XtA[0]) + + # R_norm2 = np.dot(R, R) + R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1) + + # 
w_norm2 = np.dot(w, w) + w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1) + + if (dual_norm_XtA > alpha): + const_ = alpha / dual_norm_XtA + A_norm2 = R_norm2 * (const_ ** 2) + gap = 0.5 * (R_norm2 + A_norm2) + else: + const_ = 1.0 + gap = R_norm2 + + l1_norm = _asum(n_features, &w[0], 1) + + gap += (alpha * l1_norm + - const_ * _dot(n_samples, &R[0], 1, &y[0], 1) # np.dot(R.T, y) + + 0.5 * beta * (1 + const_ ** 2) * (w_norm2)) + + if gap < tol: + # return if we reached desired tolerance + break + + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + if alpha < np.finfo(np.float64).eps: + message += "\n" + message_ridge + warnings.warn(message, ConvergenceWarning) + + return np.asarray(w), gap, tol, n_iter + 1 + + +def sparse_enet_coordinate_descent( + floating[::1] w, + floating alpha, + floating beta, + const floating[::1] X_data, + const int[::1] X_indices, + const int[::1] X_indptr, + const floating[::1] y, + const floating[::1] sample_weight, + const floating[::1] X_mean, + unsigned int max_iter, + floating tol, + object rng, + bint random=0, + bint positive=0, +): + """Cython version of the coordinate descent algorithm for Elastic-Net + + We minimize: + + 1/2 * norm(y - Z w, 2)^2 + alpha * norm(w, 1) + (beta/2) * norm(w, 2)^2 + + where Z = X - X_mean. + With sample weights sw, this becomes + + 1/2 * sum(sw * (y - Z w)^2, axis=0) + alpha * norm(w, 1) + + (beta/2) * norm(w, 2)^2 + + and X_mean is the weighted average of X (per column). + + Returns + ------- + w : ndarray of shape (n_features,) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. + """ + # Notes for sample_weight: + # For dense X, one centers X and y and then rescales them by sqrt(sample_weight). + # Here, for sparse X, we get the sample_weight averaged center X_mean. We take care + # that every calculation results as if we had rescaled y and X (and therefore also + # X_mean) by sqrt(sample_weight) without actually calculating the square root. 
+ # We work with: + # yw = sample_weight * y + # R = sample_weight * residual + # norm_cols_X = np.sum(sample_weight * (X - X_mean)**2, axis=0) + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_samples = y.shape[0] + cdef unsigned int n_features = w.shape[0] + + # compute norms of the columns of X + cdef floating[:] norm_cols_X = np.zeros(n_features, dtype=dtype) + + # initial value of the residuals + # R = y - Zw, weighted version R = sample_weight * (y - Zw) + cdef floating[::1] R + cdef floating[::1] XtA = np.empty(n_features, dtype=dtype) + cdef const floating[::1] yw + + cdef floating tmp + cdef floating w_ii + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating dual_norm_XtA + cdef floating X_mean_ii + cdef floating R_sum = 0.0 + cdef floating R_norm2 + cdef floating w_norm2 + cdef floating l1_norm + cdef floating const_ + cdef floating A_norm2 + cdef floating normalize_sum + cdef unsigned int ii + cdef unsigned int jj + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef unsigned int startptr = X_indptr[0] + cdef unsigned int endptr + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + cdef bint center = False + cdef bint no_sample_weights = sample_weight is None + cdef int kk + + if no_sample_weights: + yw = y + R = y.copy() + else: + yw = np.multiply(sample_weight, y) + R = yw.copy() + + with nogil: + # center = (X_mean != 0).any() + for ii in range(n_features): + if X_mean[ii]: + center = True + break + + # R = y - np.dot(X, w) + for ii in range(n_features): + X_mean_ii = X_mean[ii] + endptr = X_indptr[ii + 1] + normalize_sum = 0.0 + w_ii = w[ii] + + if no_sample_weights: + for jj in range(startptr, endptr): + normalize_sum += (X_data[jj] - X_mean_ii) ** 2 + R[X_indices[jj]] -= X_data[jj] * w_ii + norm_cols_X[ii] = normalize_sum + \ + (n_samples - endptr + startptr) * X_mean_ii ** 2 + if center: + for jj in range(n_samples): + R[jj] += X_mean_ii * w_ii + R_sum += R[jj] + else: + # R = sw * (y - np.dot(X, w)) + for jj in range(startptr, endptr): + tmp = sample_weight[X_indices[jj]] + # second term will be subtracted by loop over range(n_samples) + normalize_sum += (tmp * (X_data[jj] - X_mean_ii) ** 2 + - tmp * X_mean_ii ** 2) + R[X_indices[jj]] -= tmp * X_data[jj] * w_ii + if center: + for jj in range(n_samples): + normalize_sum += sample_weight[jj] * X_mean_ii ** 2 + R[jj] += sample_weight[jj] * X_mean_ii * w_ii + R_sum += R[jj] + norm_cols_X[ii] = normalize_sum + startptr = endptr + + # Note: No need to update R_sum from here on because the update terms cancel + # each other: w_ii * np.sum(X[:,ii] - X_mean[ii]) = 0. R_sum is only ever + # needed and calculated if X_mean is provided. 
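+
+        # Editorial cross-reference: when this solver is invoked from enet_path /
+        # ElasticNet.fit (see _coordinate_descent.py below), alpha and beta are
+        # passed as estimator_alpha * l1_ratio * n_samples and
+        # estimator_alpha * (1 - l1_ratio) * n_samples respectively, so the
+        # objective optimized here is n_samples times the estimator's documented
+        # objective.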
+ + # tol *= np.dot(y, y) + # with sample weights: tol *= y @ (sw * y) + tol *= _dot(n_samples, &y[0], 1, &yw[0], 1) + + for n_iter in range(max_iter): + + w_max = 0.0 + d_w_max = 0.0 + + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if norm_cols_X[ii] == 0.0: + continue + + startptr = X_indptr[ii] + endptr = X_indptr[ii + 1] + w_ii = w[ii] # Store previous value + X_mean_ii = X_mean[ii] + + if w_ii != 0.0: + # R += w_ii * X[:,ii] + if no_sample_weights: + for jj in range(startptr, endptr): + R[X_indices[jj]] += X_data[jj] * w_ii + if center: + for jj in range(n_samples): + R[jj] -= X_mean_ii * w_ii + else: + for jj in range(startptr, endptr): + tmp = sample_weight[X_indices[jj]] + R[X_indices[jj]] += tmp * X_data[jj] * w_ii + if center: + for jj in range(n_samples): + R[jj] -= sample_weight[jj] * X_mean_ii * w_ii + + # tmp = (X[:,ii] * R).sum() + tmp = 0.0 + for jj in range(startptr, endptr): + tmp += R[X_indices[jj]] * X_data[jj] + + if center: + tmp -= R_sum * X_mean_ii + + if positive and tmp < 0.0: + w[ii] = 0.0 + else: + w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \ + / (norm_cols_X[ii] + beta) + + if w[ii] != 0.0: + # R -= w[ii] * X[:,ii] # Update residual + if no_sample_weights: + for jj in range(startptr, endptr): + R[X_indices[jj]] -= X_data[jj] * w[ii] + if center: + for jj in range(n_samples): + R[jj] += X_mean_ii * w[ii] + else: + for jj in range(startptr, endptr): + tmp = sample_weight[X_indices[jj]] + R[X_indices[jj]] -= tmp * X_data[jj] * w[ii] + if center: + for jj in range(n_samples): + R[jj] += sample_weight[jj] * X_mean_ii * w[ii] + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + d_w_max = fmax(d_w_max, d_w_ii) + + w_max = fmax(w_max, fabs(w[ii])) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the duality gap as ultimate stopping + # criterion + + # XtA = X.T @ R - beta * w + # sparse X.T / dense R dot product + for ii in range(n_features): + XtA[ii] = 0.0 + for kk in range(X_indptr[ii], X_indptr[ii + 1]): + XtA[ii] += X_data[kk] * R[X_indices[kk]] + + if center: + XtA[ii] -= X_mean[ii] * R_sum + XtA[ii] -= beta * w[ii] + + if positive: + dual_norm_XtA = max(n_features, &XtA[0]) + else: + dual_norm_XtA = abs_max(n_features, &XtA[0]) + + # R_norm2 = np.dot(R, R) + if no_sample_weights: + R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1) + else: + R_norm2 = 0.0 + for jj in range(n_samples): + # R is already multiplied by sample_weight + if sample_weight[jj] != 0: + R_norm2 += (R[jj] ** 2) / sample_weight[jj] + + # w_norm2 = np.dot(w, w) + w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1) + if (dual_norm_XtA > alpha): + const_ = alpha / dual_norm_XtA + A_norm2 = R_norm2 * const_**2 + gap = 0.5 * (R_norm2 + A_norm2) + else: + const_ = 1.0 + gap = R_norm2 + + l1_norm = _asum(n_features, &w[0], 1) + + gap += (alpha * l1_norm + - const_ * _dot(n_samples, &R[0], 1, &y[0], 1) # np.dot(R.T, y) + + 0.5 * beta * (1 + const_ ** 2) * w_norm2) + + if gap < tol: + # return if we reached desired tolerance + break + + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + if alpha < np.finfo(np.float64).eps: + message += "\n" + message_ridge + warnings.warn(message, ConvergenceWarning) + + return np.asarray(w), gap, tol, n_iter + 1 + + +def 
enet_coordinate_descent_gram( + floating[::1] w, + floating alpha, + floating beta, + const floating[:, ::1] Q, + const floating[::1] q, + const floating[:] y, + unsigned int max_iter, + floating tol, + object rng, + bint random=0, + bint positive=0 +): + """Cython version of the coordinate descent algorithm + for Elastic-Net regression + + We minimize + + (1/2) * w^T Q w - q^T w + alpha norm(w, 1) + (beta/2) * norm(w, 2)^2 + + which amount to the Elastic-Net problem when: + Q = X^T X (Gram matrix) + q = X^T y + + Returns + ------- + w : ndarray of shape (n_features,) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. + """ + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_features = Q.shape[0] + + # initial value "Q w" which will be kept of up to date in the iterations + cdef floating[:] H = np.dot(Q, w) + + cdef floating[:] XtA = np.zeros(n_features, dtype=dtype) + cdef floating tmp + cdef floating w_ii + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating q_dot_w + cdef floating w_norm2 + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating dual_norm_XtA + cdef unsigned int ii + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + + cdef floating y_norm2 = np.dot(y, y) + cdef floating* w_ptr = &w[0] + cdef const floating* Q_ptr = &Q[0, 0] + cdef const floating* q_ptr = &q[0] + cdef floating* H_ptr = &H[0] + cdef floating* XtA_ptr = &XtA[0] + tol = tol * y_norm2 + + if alpha == 0: + warnings.warn( + "Coordinate descent without L1 regularization may " + "lead to unexpected results and is discouraged. " + "Set l1_ratio > 0 to add L1 regularization." 
+ ) + + with nogil: + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if Q[ii, ii] == 0.0: + continue + + w_ii = w[ii] # Store previous value + + if w_ii != 0.0: + # H -= w_ii * Q[ii] + _axpy(n_features, -w_ii, Q_ptr + ii * n_features, 1, + H_ptr, 1) + + tmp = q[ii] - H[ii] + + if positive and tmp < 0: + w[ii] = 0.0 + else: + w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \ + / (Q[ii, ii] + beta) + + if w[ii] != 0.0: + # H += w[ii] * Q[ii] # Update H = X.T X w + _axpy(n_features, w[ii], Q_ptr + ii * n_features, 1, + H_ptr, 1) + + # update the maximum absolute coefficient update + d_w_ii = fabs(w[ii] - w_ii) + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + if fabs(w[ii]) > w_max: + w_max = fabs(w[ii]) + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the duality gap as ultimate stopping + # criterion + + # q_dot_w = np.dot(w, q) + q_dot_w = _dot(n_features, w_ptr, 1, q_ptr, 1) + + for ii in range(n_features): + XtA[ii] = q[ii] - H[ii] - beta * w[ii] + if positive: + dual_norm_XtA = max(n_features, XtA_ptr) + else: + dual_norm_XtA = abs_max(n_features, XtA_ptr) + + # temp = np.sum(w * H) + tmp = 0.0 + for ii in range(n_features): + tmp += w[ii] * H[ii] + R_norm2 = y_norm2 + tmp - 2.0 * q_dot_w + + # w_norm2 = np.dot(w, w) + w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1) + + if (dual_norm_XtA > alpha): + const_ = alpha / dual_norm_XtA + A_norm2 = R_norm2 * (const_ ** 2) + gap = 0.5 * (R_norm2 + A_norm2) + else: + const_ = 1.0 + gap = R_norm2 + + # The call to asum is equivalent to the L1 norm of w + gap += ( + alpha * _asum(n_features, &w[0], 1) + - const_ * y_norm2 + + const_ * q_dot_w + + 0.5 * beta * (1 + const_ ** 2) * w_norm2 + ) + + if gap < tol: + # return if we reached desired tolerance + break + + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + warnings.warn(message, ConvergenceWarning) + + return np.asarray(w), gap, tol, n_iter + 1 + + +def enet_coordinate_descent_multi_task( + const floating[::1, :] W, + floating l1_reg, + floating l2_reg, + const floating[::1, :] X, + const floating[::1, :] Y, + unsigned int max_iter, + floating tol, + object rng, + bint random=0 +): + """Cython version of the coordinate descent algorithm + for Elastic-Net multi-task regression + + We minimize + + 0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2 + + Returns + ------- + W : ndarray of shape (n_tasks, n_features) + ElasticNet coefficients. + gap : float + Achieved dual gap. + tol : float + Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap. + n_iter : int + Number of coordinate descent iterations. 
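+
+    Notes
+    -----
+    Editorial note: ``||W.T||_21`` sums, over features, the Euclidean norm of
+    each feature's coefficients across tasks, so a feature is either selected
+    for every task or zeroed out for all of them.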
+ """ + + if floating is float: + dtype = np.float32 + else: + dtype = np.float64 + + # get the data information into easy vars + cdef unsigned int n_samples = X.shape[0] + cdef unsigned int n_features = X.shape[1] + cdef unsigned int n_tasks = Y.shape[1] + + # to store XtA + cdef floating[:, ::1] XtA = np.zeros((n_features, n_tasks), dtype=dtype) + cdef floating XtA_axis1norm + cdef floating dual_norm_XtA + + # initial value of the residuals + cdef floating[::1, :] R = np.zeros((n_samples, n_tasks), dtype=dtype, order='F') + + cdef floating[::1] norm_cols_X = np.zeros(n_features, dtype=dtype) + cdef floating[::1] tmp = np.zeros(n_tasks, dtype=dtype) + cdef floating[::1] w_ii = np.zeros(n_tasks, dtype=dtype) + cdef floating d_w_max + cdef floating w_max + cdef floating d_w_ii + cdef floating nn + cdef floating W_ii_abs_max + cdef floating gap = tol + 1.0 + cdef floating d_w_tol = tol + cdef floating R_norm + cdef floating w_norm + cdef floating ry_sum + cdef floating l21_norm + cdef unsigned int ii + cdef unsigned int jj + cdef unsigned int n_iter = 0 + cdef unsigned int f_iter + cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX) + cdef uint32_t* rand_r_state = &rand_r_state_seed + + cdef const floating* X_ptr = &X[0, 0] + cdef const floating* Y_ptr = &Y[0, 0] + + if l1_reg == 0: + warnings.warn( + "Coordinate descent with l1_reg=0 may lead to unexpected" + " results and is discouraged." + ) + + with nogil: + # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0) + for ii in range(n_features): + norm_cols_X[ii] = _nrm2(n_samples, X_ptr + ii * n_samples, 1) ** 2 + + # R = Y - np.dot(X, W.T) + _copy(n_samples * n_tasks, Y_ptr, 1, &R[0, 0], 1) + for ii in range(n_features): + for jj in range(n_tasks): + if W[jj, ii] != 0: + _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # tol = tol * linalg.norm(Y, ord='fro') ** 2 + tol = tol * _nrm2(n_samples * n_tasks, Y_ptr, 1) ** 2 + + for n_iter in range(max_iter): + w_max = 0.0 + d_w_max = 0.0 + for f_iter in range(n_features): # Loop over coordinates + if random: + ii = rand_int(n_features, rand_r_state) + else: + ii = f_iter + + if norm_cols_X[ii] == 0.0: + continue + + # w_ii = W[:, ii] # Store previous value + _copy(n_tasks, &W[0, ii], 1, &w_ii[0], 1) + + # Using Numpy: + # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update + # Using Blas Level2: + # _ger(RowMajor, n_samples, n_tasks, 1.0, + # &X[0, ii], 1, + # &w_ii[0], 1, &R[0, 0], n_tasks) + # Using Blas Level1 and for loop to avoid slower threads + # for such small vectors + for jj in range(n_tasks): + if w_ii[jj] != 0: + _axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # Using numpy: + # tmp = np.dot(X[:, ii][None, :], R).ravel() + # Using BLAS Level 2: + # _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0], + # n_tasks, &X[0, ii], 1, 0.0, &tmp[0], 1) + # Using BLAS Level 1 (faster for small vectors like here): + for jj in range(n_tasks): + tmp[jj] = _dot(n_samples, X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # nn = sqrt(np.sum(tmp ** 2)) + nn = _nrm2(n_tasks, &tmp[0], 1) + + # W[:, ii] = tmp * fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg) + _copy(n_tasks, &tmp[0], 1, &W[0, ii], 1) + _scal(n_tasks, fmax(1. 
- l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg), + &W[0, ii], 1) + + # Using numpy: + # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :]) + # Using BLAS Level 2: + # Update residual : rank 1 update + # _ger(RowMajor, n_samples, n_tasks, -1.0, + # &X[0, ii], 1, &W[0, ii], 1, + # &R[0, 0], n_tasks) + # Using BLAS Level 1 (faster for small vectors like here): + for jj in range(n_tasks): + if W[jj, ii] != 0: + _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # update the maximum absolute coefficient update + d_w_ii = diff_abs_max(n_tasks, &W[0, ii], &w_ii[0]) + + if d_w_ii > d_w_max: + d_w_max = d_w_ii + + W_ii_abs_max = abs_max(n_tasks, &W[0, ii]) + if W_ii_abs_max > w_max: + w_max = W_ii_abs_max + + if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1: + # the biggest coordinate update of this iteration was smaller than + # the tolerance: check the duality gap as ultimate stopping + # criterion + + # XtA = np.dot(X.T, R) - l2_reg * W.T + for ii in range(n_features): + for jj in range(n_tasks): + XtA[ii, jj] = _dot( + n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1 + ) - l2_reg * W[jj, ii] + + # dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1))) + dual_norm_XtA = 0.0 + for ii in range(n_features): + # np.sqrt(np.sum(XtA ** 2, axis=1)) + XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1) + if XtA_axis1norm > dual_norm_XtA: + dual_norm_XtA = XtA_axis1norm + + # TODO: use squared L2 norm directly + # R_norm = linalg.norm(R, ord='fro') + # w_norm = linalg.norm(W, ord='fro') + R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1) + w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1) + if (dual_norm_XtA > l1_reg): + const_ = l1_reg / dual_norm_XtA + A_norm = R_norm * const_ + gap = 0.5 * (R_norm ** 2 + A_norm ** 2) + else: + const_ = 1.0 + gap = R_norm ** 2 + + # ry_sum = np.sum(R * y) + ry_sum = _dot(n_samples * n_tasks, &R[0, 0], 1, &Y[0, 0], 1) + + # l21_norm = np.sqrt(np.sum(W ** 2, axis=0)).sum() + l21_norm = 0.0 + for ii in range(n_features): + l21_norm += _nrm2(n_tasks, &W[0, ii], 1) + + gap += ( + l1_reg * l21_norm + - const_ * ry_sum + + 0.5 * l2_reg * (1 + const_ ** 2) * (w_norm ** 2) + ) + + if gap <= tol: + # return if we reached desired tolerance + break + else: + # for/else, runs if for doesn't end with a `break` + with gil: + message = ( + message_conv + + f" Duality gap: {gap:.3e}, tolerance: {tol:.3e}" + ) + warnings.warn(message, ConvergenceWarning) + + return np.asarray(W), gap, tol, n_iter + 1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py new file mode 100644 index 0000000000000000000000000000000000000000..940ae6f5e3a3010f7fe2f21d28d68f538d893d8c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py @@ -0,0 +1,3403 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import sys +import warnings +from abc import ABC, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import sparse + +from sklearn.utils import metadata_routing + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..model_selection import check_cv +from ..utils import Bunch, check_array, check_scalar +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + get_routing_for_object, +) +from 
..utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ..utils.extmath import safe_sparse_dot +from ..utils.metadata_routing import ( + _routing_enabled, + process_routing, +) +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_sample_weight, + check_consistent_length, + check_is_fitted, + check_random_state, + column_or_1d, + has_fit_parameter, + validate_data, +) + +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from . import _cd_fast as cd_fast # type: ignore[attr-defined] +from ._base import LinearModel, _pre_fit, _preprocess_data + + +def _set_order(X, y, order="C"): + """Change the order of X and y if necessary. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + order : {None, 'C', 'F'} + If 'C', dense arrays are returned as C-ordered, sparse matrices in csr + format. If 'F', dense arrays are return as F-ordered, sparse matrices + in csc format. + + Returns + ------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data with guaranteed order. + + y : ndarray of shape (n_samples,) + Target values with guaranteed order. + """ + if order not in [None, "C", "F"]: + raise ValueError( + "Unknown value for order. Got {} instead of None, 'C' or 'F'.".format(order) + ) + sparse_X = sparse.issparse(X) + sparse_y = sparse.issparse(y) + if order is not None: + sparse_format = "csc" if order == "F" else "csr" + if sparse_X: + X = X.asformat(sparse_format, copy=False) + else: + X = np.asarray(X, order=order) + if sparse_y: + y = y.asformat(sparse_format) + else: + y = np.asarray(y, order=order) + return X, y + + +############################################################################### +# Paths functions + + +def _alpha_grid( + X, + y, + Xy=None, + l1_ratio=1.0, + fit_intercept=True, + eps=1e-3, + n_alphas=100, + copy_X=True, + sample_weight=None, +): + """Compute the grid of alpha values for elastic net parameter search + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication + + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Target values + + Xy : array-like of shape (n_features,) or (n_features, n_outputs),\ + default=None + Xy = np.dot(X.T, y) that can be precomputed. + + l1_ratio : float, default=1.0 + The elastic net mixing parameter, with ``0 < l1_ratio <= 1``. + For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not + supported) ``For l1_ratio = 1`` it is an L1 penalty. For + ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3`` + + n_alphas : int, default=100 + Number of alphas along the regularization path + + fit_intercept : bool, default=True + Whether to fit an intercept or not + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + sample_weight : ndarray of shape (n_samples,), default=None + """ + if l1_ratio == 0: + raise ValueError( + "Automatic alpha grid generation is not supported for" + " l1_ratio=0. Please supply a grid by providing " + "your estimator with the appropriate `alphas=` " + "argument." 
+ ) + if Xy is not None: + Xyw = Xy + else: + X, y, X_offset, _, _ = _preprocess_data( + X, + y, + fit_intercept=fit_intercept, + copy=copy_X, + sample_weight=sample_weight, + check_input=False, + ) + if sample_weight is not None: + if y.ndim > 1: + yw = y * sample_weight.reshape(-1, 1) + else: + yw = y * sample_weight + else: + yw = y + if sparse.issparse(X): + Xyw = safe_sparse_dot(X.T, yw, dense_output=True) - np.sum(yw) * X_offset + else: + Xyw = np.dot(X.T, yw) + + if Xyw.ndim == 1: + Xyw = Xyw[:, np.newaxis] + if sample_weight is not None: + n_samples = sample_weight.sum() + else: + n_samples = X.shape[0] + alpha_max = np.sqrt(np.sum(Xyw**2, axis=1)).max() / (n_samples * l1_ratio) + + if alpha_max <= np.finfo(np.float64).resolution: + return np.full(n_alphas, np.finfo(np.float64).resolution) + + return np.geomspace(alpha_max, alpha_max * eps, num=n_alphas) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix"], + "eps": [Interval(Real, 0, None, closed="neither")], + "n_alphas": [Interval(Integral, 1, None, closed="left")], + "alphas": ["array-like", None], + "precompute": [StrOptions({"auto"}), "boolean", "array-like"], + "Xy": ["array-like", None], + "copy_X": ["boolean"], + "coef_init": ["array-like", None], + "verbose": ["verbose"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def lasso_path( + X, + y, + *, + eps=1e-3, + n_alphas=100, + alphas=None, + precompute="auto", + Xy=None, + copy_X=True, + coef_init=None, + verbose=False, + return_n_iter=False, + positive=False, + **params, +): + """Compute Lasso path with coordinate descent. + + The Lasso optimization function varies for mono and multi-outputs. + + For mono-output tasks it is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + For multi-output tasks it is:: + + (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. If ``y`` is mono-output then ``X`` + can be sparse. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_targets) + Target values. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + alphas : array-like, default=None + List of alphas where to compute the models. + If ``None`` alphas are set automatically. + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + Xy : array-like of shape (n_features,) or (n_features, n_targets),\ + default=None + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + coef_init : array-like of shape (n_features, ), default=None + The initial values of the coefficients. + + verbose : bool or int, default=False + Amount of verbosity. 
+ + return_n_iter : bool, default=False + Whether to return the number of iterations or not. + + positive : bool, default=False + If set to True, forces coefficients to be positive. + (Only allowed when ``y.ndim == 1``). + + **params : kwargs + Keyword arguments passed to the coordinate descent solver. + + Returns + ------- + alphas : ndarray of shape (n_alphas,) + The alphas along the path where models are computed. + + coefs : ndarray of shape (n_features, n_alphas) or \ + (n_targets, n_features, n_alphas) + Coefficients along the path. + + dual_gaps : ndarray of shape (n_alphas,) + The dual gaps at the end of the optimization for each alpha. + + n_iters : list of int + The number of iterations taken by the coordinate descent optimizer to + reach the specified tolerance for each alpha. + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso path using LARS + algorithm. + Lasso : The Lasso is a linear model that estimates sparse coefficients. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoCV : Lasso linear model with iterative fitting along a regularization + path. + LassoLarsCV : Cross-validated Lasso using the LARS algorithm. + sklearn.decomposition.sparse_encode : Estimator that can be used to + transform signals into sparse linear combination of atoms from a fixed. + + Notes + ----- + For an example, see + :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py + `. + + To avoid unnecessary memory duplication the X argument of the fit method + should be directly passed as a Fortran-contiguous numpy array. + + Note that in certain cases, the Lars solver may be significantly + faster to implement this functionality. In particular, linear + interpolation can be used to retrieve model coefficients between the + values output by lars_path + + Examples + -------- + + Comparing lasso_path and lars_path with interpolation: + + >>> import numpy as np + >>> from sklearn.linear_model import lasso_path + >>> X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T + >>> y = np.array([1, 2, 3.1]) + >>> # Use lasso_path to compute a coefficient path + >>> _, coef_path, _ = lasso_path(X, y, alphas=[5., 1., .5]) + >>> print(coef_path) + [[0. 0. 0.46874778] + [0.2159048 0.4425765 0.23689075]] + + >>> # Now use lars_path and 1D linear interpolation to compute the + >>> # same path + >>> from sklearn.linear_model import lars_path + >>> alphas, active, coef_path_lars = lars_path(X, y, method='lasso') + >>> from scipy import interpolate + >>> coef_path_continuous = interpolate.interp1d(alphas[::-1], + ... coef_path_lars[:, ::-1]) + >>> print(coef_path_continuous([5., 1., .5])) + [[0. 0. 
0.46915237] + [0.2159048 0.4425765 0.23668876]] + """ + return enet_path( + X, + y, + l1_ratio=1.0, + eps=eps, + n_alphas=n_alphas, + alphas=alphas, + precompute=precompute, + Xy=Xy, + copy_X=copy_X, + coef_init=coef_init, + verbose=verbose, + positive=positive, + return_n_iter=return_n_iter, + **params, + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix"], + "l1_ratio": [Interval(Real, 0.0, 1.0, closed="both")], + "eps": [Interval(Real, 0.0, None, closed="neither")], + "n_alphas": [Interval(Integral, 1, None, closed="left")], + "alphas": ["array-like", None], + "precompute": [StrOptions({"auto"}), "boolean", "array-like"], + "Xy": ["array-like", None], + "copy_X": ["boolean"], + "coef_init": ["array-like", None], + "verbose": ["verbose"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + "check_input": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def enet_path( + X, + y, + *, + l1_ratio=0.5, + eps=1e-3, + n_alphas=100, + alphas=None, + precompute="auto", + Xy=None, + copy_X=True, + coef_init=None, + verbose=False, + return_n_iter=False, + positive=False, + check_input=True, + **params, +): + """Compute elastic net path with coordinate descent. + + The elastic net optimization function varies for mono and multi-outputs. + + For mono-output tasks it is:: + + 1 / (2 * n_samples) * ||y - Xw||^2_2 + + alpha * l1_ratio * ||w||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 + + For multi-output tasks it is:: + + (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + + alpha * l1_ratio * ||W||_21 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. If ``y`` is mono-output then ``X`` + can be sparse. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_targets) + Target values. + + l1_ratio : float, default=0.5 + Number between 0 and 1 passed to elastic net (scaling between + l1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + alphas : array-like, default=None + List of alphas where to compute the models. + If None alphas are set automatically. + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + Xy : array-like of shape (n_features,) or (n_features, n_targets),\ + default=None + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + coef_init : array-like of shape (n_features, ), default=None + The initial values of the coefficients. + + verbose : bool or int, default=False + Amount of verbosity. + + return_n_iter : bool, default=False + Whether to return the number of iterations or not. + + positive : bool, default=False + If set to True, forces coefficients to be positive. + (Only allowed when ``y.ndim == 1``). 
+ + check_input : bool, default=True + If set to False, the input validation checks are skipped (including the + Gram matrix when provided). It is assumed that they are handled + by the caller. + + **params : kwargs + Keyword arguments passed to the coordinate descent solver. + + Returns + ------- + alphas : ndarray of shape (n_alphas,) + The alphas along the path where models are computed. + + coefs : ndarray of shape (n_features, n_alphas) or \ + (n_targets, n_features, n_alphas) + Coefficients along the path. + + dual_gaps : ndarray of shape (n_alphas,) + The dual gaps at the end of the optimization for each alpha. + + n_iters : list of int + The number of iterations taken by the coordinate descent optimizer to + reach the specified tolerance for each alpha. + (Is returned when ``return_n_iter`` is set to True). + + See Also + -------- + MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 mixed-norm \ + as regularizer. + MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation. + ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. + ElasticNetCV : Elastic Net model with iterative fitting along a regularization path. + + Notes + ----- + For an example, see + :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py + `. + + Examples + -------- + >>> from sklearn.linear_model import enet_path + >>> from sklearn.datasets import make_regression + >>> X, y, true_coef = make_regression( + ... n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0 + ... ) + >>> true_coef + array([ 0. , 0. , 0. , 97.9, 45.7]) + >>> alphas, estimated_coef, _ = enet_path(X, y, n_alphas=3) + >>> alphas.shape + (3,) + >>> estimated_coef + array([[ 0., 0.787, 0.568], + [ 0., 1.120, 0.620], + [-0., -2.129, -1.128], + [ 0., 23.046, 88.939], + [ 0., 10.637, 41.566]]) + """ + X_offset_param = params.pop("X_offset", None) + X_scale_param = params.pop("X_scale", None) + sample_weight = params.pop("sample_weight", None) + tol = params.pop("tol", 1e-4) + max_iter = params.pop("max_iter", 1000) + random_state = params.pop("random_state", None) + selection = params.pop("selection", "cyclic") + + if len(params) > 0: + raise ValueError("Unexpected parameters in params", params.keys()) + + # We expect X and y to be already Fortran ordered when bypassing + # checks + if check_input: + X = check_array( + X, + accept_sparse="csc", + dtype=[np.float64, np.float32], + order="F", + copy=copy_X, + ) + y = check_array( + y, + accept_sparse="csc", + dtype=X.dtype.type, + order="F", + copy=False, + ensure_2d=False, + ) + if Xy is not None: + # Xy should be a 1d contiguous array or a 2D C ordered array + Xy = check_array( + Xy, dtype=X.dtype.type, order="C", copy=False, ensure_2d=False + ) + + n_samples, n_features = X.shape + + multi_output = False + if y.ndim != 1: + multi_output = True + n_targets = y.shape[1] + + if multi_output and positive: + raise ValueError("positive=True is not allowed for multi-output (y.ndim != 1)") + + # MultiTaskElasticNet does not support sparse matrices + if not multi_output and sparse.issparse(X): + if X_offset_param is not None: + # As sparse matrices are not actually centered we need this to be passed to + # the CD solver. 
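+            # (Editorial note: this ratio is forwarded to
+            # sparse_enet_coordinate_descent as `X_mean`, which works with
+            # Z = X - X_mean implicitly so X itself stays sparse.)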
+ X_sparse_scaling = X_offset_param / X_scale_param + X_sparse_scaling = np.asarray(X_sparse_scaling, dtype=X.dtype) + else: + X_sparse_scaling = np.zeros(n_features, dtype=X.dtype) + + # X should have been passed through _pre_fit already if function is called + # from ElasticNet.fit + if check_input: + X, y, _, _, _, precompute, Xy = _pre_fit( + X, + y, + Xy, + precompute, + fit_intercept=False, + copy=False, + check_input=check_input, + ) + if alphas is None: + # No need to normalize of fit_intercept: it has been done + # above + alphas = _alpha_grid( + X, + y, + Xy=Xy, + l1_ratio=l1_ratio, + fit_intercept=False, + eps=eps, + n_alphas=n_alphas, + copy_X=False, + ) + elif len(alphas) > 1: + alphas = np.sort(alphas)[::-1] # make sure alphas are properly ordered + + n_alphas = len(alphas) + dual_gaps = np.empty(n_alphas) + n_iters = [] + + rng = check_random_state(random_state) + if selection not in ["random", "cyclic"]: + raise ValueError("selection should be either random or cyclic.") + random = selection == "random" + + if not multi_output: + coefs = np.empty((n_features, n_alphas), dtype=X.dtype) + else: + coefs = np.empty((n_targets, n_features, n_alphas), dtype=X.dtype) + + if coef_init is None: + coef_ = np.zeros(coefs.shape[:-1], dtype=X.dtype, order="F") + else: + coef_ = np.asfortranarray(coef_init, dtype=X.dtype) + + for i, alpha in enumerate(alphas): + # account for n_samples scaling in objectives between here and cd_fast + l1_reg = alpha * l1_ratio * n_samples + l2_reg = alpha * (1.0 - l1_ratio) * n_samples + if not multi_output and sparse.issparse(X): + model = cd_fast.sparse_enet_coordinate_descent( + w=coef_, + alpha=l1_reg, + beta=l2_reg, + X_data=X.data, + X_indices=X.indices, + X_indptr=X.indptr, + y=y, + sample_weight=sample_weight, + X_mean=X_sparse_scaling, + max_iter=max_iter, + tol=tol, + rng=rng, + random=random, + positive=positive, + ) + elif multi_output: + model = cd_fast.enet_coordinate_descent_multi_task( + coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random + ) + elif isinstance(precompute, np.ndarray): + # We expect precompute to be already Fortran ordered when bypassing + # checks + if check_input: + precompute = check_array(precompute, dtype=X.dtype.type, order="C") + model = cd_fast.enet_coordinate_descent_gram( + coef_, + l1_reg, + l2_reg, + precompute, + Xy, + y, + max_iter, + tol, + rng, + random, + positive, + ) + elif precompute is False: + model = cd_fast.enet_coordinate_descent( + coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive + ) + else: + raise ValueError( + "Precompute should be one of True, False, 'auto' or array-like. Got %r" + % precompute + ) + coef_, dual_gap_, eps_, n_iter_ = model + coefs[..., i] = coef_ + # we correct the scale of the returned dual gap, as the objective + # in cd_fast is n_samples * the objective in this docstring. + dual_gaps[i] = dual_gap_ / n_samples + n_iters.append(n_iter_) + + if verbose: + if verbose > 2: + print(model) + elif verbose > 1: + print("Path: %03i out of %03i" % (i, n_alphas)) + else: + sys.stderr.write(".") + + if return_n_iter: + return alphas, coefs, dual_gaps, n_iters + return alphas, coefs, dual_gaps + + +############################################################################### +# ElasticNet model + + +class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): + """Linear regression with combined L1 and L2 priors as regularizer. 
+ + Minimizes the objective function:: + + 1 / (2 * n_samples) * ||y - Xw||^2_2 + + alpha * l1_ratio * ||w||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 + + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: + + a * ||w||_1 + 0.5 * b * ||w||_2^2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter l1_ratio corresponds to alpha in the glmnet R package while + alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio + = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable, + unless you supply your own sequence of alpha. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the penalty terms. Defaults to 1.0. + See the notes for the exact mathematical meaning of this + parameter. ``alpha = 0`` is equivalent to an ordinary least square, + solved by the :class:`LinearRegression` object. For numerical + reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. + Given this, you should use the :class:`LinearRegression` object. + + l1_ratio : float, default=0.5 + The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If ``False``, the + data is assumed to be already centered. + + precompute : bool or array-like of shape (n_features, n_features),\ + default=False + Whether to use a precomputed Gram matrix to speed up + calculations. The Gram matrix can also be passed as argument. + For sparse input this option is always ``False`` to preserve sparsity. + Check :ref:`an example on how to use a precomputed Gram Matrix in ElasticNet + ` + for details. + + max_iter : int, default=1000 + The maximum number of iterations. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``, see Notes below. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + sparse_coef_ : sparse matrix of shape (n_features,) or \ + (n_targets, n_features) + Sparse representation of the `coef_`. 
+ + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + n_iter_ : list of int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + dual_gap_ : float or ndarray of shape (n_targets,) + Given param alpha, the dual gaps at the end of the optimization, + same shape as each observation of y. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + ElasticNetCV : Elastic net model with best model selection by + cross-validation. + SGDRegressor : Implements elastic net regression with incremental training. + SGDClassifier : Implements logistic regression with elastic net penalty + (``SGDClassifier(loss="log_loss", penalty="elasticnet")``). + + Notes + ----- + To avoid unnecessary memory duplication the X argument of the fit method + should be directly passed as a Fortran-contiguous numpy array. + + The precise stopping criteria based on `tol` are the following: First, check that + that maximum coordinate update, i.e. :math:`\\max_j |w_j^{new} - w_j^{old}|` + is smaller than `tol` times the maximum absolute coefficient, :math:`\\max_j |w_j|`. + If so, then additionally check whether the dual gap is smaller than `tol` times + :math:`||y||_2^2 / n_{\text{samples}}`. + + Examples + -------- + >>> from sklearn.linear_model import ElasticNet + >>> from sklearn.datasets import make_regression + + >>> X, y = make_regression(n_features=2, random_state=0) + >>> regr = ElasticNet(random_state=0) + >>> regr.fit(X, y) + ElasticNet(random_state=0) + >>> print(regr.coef_) + [18.83816048 64.55968825] + >>> print(regr.intercept_) + 1.451 + >>> print(regr.predict([[0, 0]])) + [1.451] + + - :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` + showcases ElasticNet alongside Lasso and ARD Regression for sparse + signal recovery in the presence of noise and feature correlation. + """ + + # "check_input" is used for optimisation and isn't something to be passed + # around in a pipeline. 
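+ # Declaring it UNUSED below keeps `check_input` out of metadata routing
+ # entirely; it stays an internal fast path (for instance, `fit` forwards
+ # `check_input=False` to `self.path` once the input has been validated).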
+ __metadata_request__fit = {"check_input": metadata_routing.UNUSED} + + _parameter_constraints: dict = { + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both")], + "fit_intercept": ["boolean"], + "precompute": ["boolean", "array-like"], + "max_iter": [Interval(Integral, 1, None, closed="left"), None], + "copy_X": ["boolean"], + "tol": [Interval(Real, 0, None, closed="left")], + "warm_start": ["boolean"], + "positive": ["boolean"], + "random_state": ["random_state"], + "selection": [StrOptions({"cyclic", "random"})], + } + + path = staticmethod(enet_path) + + def __init__( + self, + alpha=1.0, + *, + l1_ratio=0.5, + fit_intercept=True, + precompute=False, + max_iter=1000, + copy_X=True, + tol=1e-4, + warm_start=False, + positive=False, + random_state=None, + selection="cyclic", + ): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.precompute = precompute + self.max_iter = max_iter + self.copy_X = copy_X + self.tol = tol + self.warm_start = warm_start + self.positive = positive + self.random_state = random_state + self.selection = selection + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, check_input=True): + """Fit model with coordinate descent. + + Parameters + ---------- + X : {ndarray, sparse matrix, sparse array} of (n_samples, n_features) + Data. + + Note that large sparse matrices and arrays requiring `int64` + indices are not accepted. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target. Will be cast to X's dtype if necessary. + + sample_weight : float or array-like of shape (n_samples,), default=None + Sample weights. Internally, the `sample_weight` vector will be + rescaled to sum to `n_samples`. + + .. versionadded:: 0.23 + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + Coordinate descent is an algorithm that considers each column of + data at a time hence it will automatically convert the X input + as a Fortran-contiguous numpy array if necessary. + + To avoid memory re-allocation it is advised to allocate the + initial data in memory directly using that format. + """ + if self.alpha == 0: + warnings.warn( + ( + "With alpha=0, this algorithm does not converge " + "well. You are advised to use the LinearRegression " + "estimator" + ), + stacklevel=2, + ) + + # Remember if X is copied + X_copied = False + # We expect X and y to be float64 or float32 Fortran ordered arrays + # when bypassing checks + if check_input: + X_copied = self.copy_X and self.fit_intercept + X, y = validate_data( + self, + X, + y, + accept_sparse="csc", + order="F", + dtype=[np.float64, np.float32], + force_writeable=True, + accept_large_sparse=False, + copy=X_copied, + multi_output=True, + y_numeric=True, + ) + y = check_array( + y, order="F", copy=False, dtype=X.dtype.type, ensure_2d=False + ) + + n_samples, n_features = X.shape + alpha = self.alpha + + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + if check_input: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + # TLDR: Rescale sw to sum up to n_samples. + # Long: The objective function of Enet + # + # 1/2 * np.average(squared error, weights=sw) + # + alpha * penalty (1) + # + # is invariant under rescaling of sw. 
+ # But enet_path coordinate descent minimizes + # + # 1/2 * sum(squared error) + alpha' * penalty (2) + # + # and therefore sets + # + # alpha' = n_samples * alpha (3) + # + # inside its function body, which results in objective (2) being + # equivalent to (1) in case of no sw. + # With sw, however, enet_path should set + # + # alpha' = sum(sw) * alpha (4) + # + # Therefore, we use the freedom of Eq. (1) to rescale sw before + # calling enet_path, i.e. + # + # sw *= n_samples / sum(sw) + # + # such that sum(sw) = n_samples. This way, (3) and (4) are the same. + sample_weight = sample_weight * (n_samples / np.sum(sample_weight)) + # Note: Alternatively, we could also have rescaled alpha instead + # of sample_weight: + # + # alpha *= np.sum(sample_weight) / n_samples + + # Ensure copying happens only once, don't do it again if done above. + # X and y will be rescaled if sample_weight is not None, order='F' + # ensures that the returned X and y are still F-contiguous. + should_copy = self.copy_X and not X_copied + X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( + X, + y, + None, + self.precompute, + fit_intercept=self.fit_intercept, + copy=should_copy, + check_input=check_input, + sample_weight=sample_weight, + ) + # coordinate descent needs F-ordered arrays and _pre_fit might have + # called _rescale_data + if check_input or sample_weight is not None: + X, y = _set_order(X, y, order="F") + if y.ndim == 1: + y = y[:, np.newaxis] + if Xy is not None and Xy.ndim == 1: + Xy = Xy[:, np.newaxis] + + n_targets = y.shape[1] + + if not self.warm_start or not hasattr(self, "coef_"): + coef_ = np.zeros((n_targets, n_features), dtype=X.dtype, order="F") + else: + coef_ = self.coef_ + if coef_.ndim == 1: + coef_ = coef_[np.newaxis, :] + + dual_gaps_ = np.zeros(n_targets, dtype=X.dtype) + self.n_iter_ = [] + + for k in range(n_targets): + if Xy is not None: + this_Xy = Xy[:, k] + else: + this_Xy = None + _, this_coef, this_dual_gap, this_iter = self.path( + X, + y[:, k], + l1_ratio=self.l1_ratio, + eps=None, + n_alphas=None, + alphas=[alpha], + precompute=precompute, + Xy=this_Xy, + copy_X=True, + coef_init=coef_[k], + verbose=False, + return_n_iter=True, + positive=self.positive, + check_input=False, + # from here on **params + tol=self.tol, + X_offset=X_offset, + X_scale=X_scale, + max_iter=self.max_iter, + random_state=self.random_state, + selection=self.selection, + sample_weight=sample_weight, + ) + coef_[k] = this_coef[:, 0] + dual_gaps_[k] = this_dual_gap[0] + self.n_iter_.append(this_iter[0]) + + if n_targets == 1: + self.n_iter_ = self.n_iter_[0] + self.coef_ = coef_[0] + self.dual_gap_ = dual_gaps_[0] + else: + self.coef_ = coef_ + self.dual_gap_ = dual_gaps_ + + self._set_intercept(X_offset, y_offset, X_scale) + + # check for finiteness of coefficients + if not all(np.isfinite(w).all() for w in [self.coef_, self.intercept_]): + raise ValueError( + "Coordinate descent iterations resulted in non-finite parameter" + " values. The input data may contain large values and need to" + " be preprocessed." + ) + + # return self for chaining fit and predict calls + return self + + @property + def sparse_coef_(self): + """Sparse representation of the fitted `coef_`.""" + return sparse.csr_matrix(self.coef_) + + def _decision_function(self, X): + """Decision function of the linear model. + + Parameters + ---------- + X : numpy array or scipy.sparse matrix of shape (n_samples, n_features) + + Returns + ------- + T : ndarray of shape (n_samples,) + The predicted decision function. 
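+
+ Notes
+ -----
+ For sparse ``X`` the product with ``coef_`` is computed with
+ ``safe_sparse_dot``, so the input is never densified.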
+ """ + check_is_fitted(self) + if sparse.issparse(X): + return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ + else: + return super()._decision_function(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +############################################################################### +# Lasso model + + +class Lasso(ElasticNet): + """Linear Model trained with L1 prior as regularizer (aka the Lasso). + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Technically the Lasso model is optimizing the same objective function as + the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the L1 term, controlling regularization + strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. + + When `alpha = 0`, the objective is equivalent to ordinary least + squares, solved by the :class:`LinearRegression` object. For numerical + reasons, using `alpha = 0` with the `Lasso` object is not advised. + Instead, you should use the :class:`LinearRegression` object. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : bool or array-like of shape (n_features, n_features),\ + default=False + Whether to use a precomputed Gram matrix to speed up + calculations. The Gram matrix can also be passed as argument. + For sparse input this option is always ``False`` to preserve sparsity. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``, see Notes below. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + dual_gap_ : float or ndarray of shape (n_targets,) + Given param alpha, the dual gaps at the end of the optimization, + same shape as each observation of y. + + sparse_coef_ : sparse matrix of shape (n_features, 1) or \ + (n_targets, n_features) + Readonly property derived from ``coef_``. + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. 
+ + n_iter_ : int or list of int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Regularization path using LARS. + lasso_path : Regularization path using Lasso. + LassoLars : Lasso Path along the regularization parameter using LARS algorithm. + LassoCV : Lasso alpha parameter by cross-validation. + LassoLarsCV : Lasso least angle parameter algorithm by cross-validation. + sklearn.decomposition.sparse_encode : Sparse coding array estimator. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + To avoid unnecessary memory duplication the X argument of the fit method + should be directly passed as a Fortran-contiguous numpy array. + + Regularization improves the conditioning of the problem and + reduces the variance of the estimates. Larger values specify stronger + regularization. Alpha corresponds to `1 / (2C)` in other linear + models such as :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + + The precise stopping criteria based on `tol` are the following: First, check that + that maximum coordinate update, i.e. :math:`\\max_j |w_j^{new} - w_j^{old}|` + is smaller than `tol` times the maximum absolute coefficient, :math:`\\max_j |w_j|`. + If so, then additionally check whether the dual gap is smaller than `tol` times + :math:`||y||_2^2 / n_{\\text{samples}}`. + + The target can be a 2-dimensional array, resulting in the optimization of the + following objective:: + + (1 / (2 * n_samples)) * ||Y - XW||^2_F + alpha * ||W||_11 + + where :math:`||W||_{1,1}` is the sum of the magnitude of the matrix coefficients. + It should not be confused with :class:`~sklearn.linear_model.MultiTaskLasso` which + instead penalizes the :math:`L_{2,1}` norm of the coefficients, yielding row-wise + sparsity in the coefficients. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.Lasso(alpha=0.1) + >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) + Lasso(alpha=0.1) + >>> print(clf.coef_) + [0.85 0. ] + >>> print(clf.intercept_) + 0.15 + + - :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_and_elasticnet.py` + compares Lasso with other L1-based regression models (ElasticNet and ARD + Regression) for sparse signal recovery in the presence of noise and + feature correlation. 
+ """ + + _parameter_constraints: dict = { + **ElasticNet._parameter_constraints, + } + _parameter_constraints.pop("l1_ratio") + + path = staticmethod(enet_path) + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + precompute=False, + copy_X=True, + max_iter=1000, + tol=1e-4, + warm_start=False, + positive=False, + random_state=None, + selection="cyclic", + ): + super().__init__( + alpha=alpha, + l1_ratio=1.0, + fit_intercept=fit_intercept, + precompute=precompute, + copy_X=copy_X, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + positive=positive, + random_state=random_state, + selection=selection, + ) + + +############################################################################### +# Functions for CV with paths functions + + +def _path_residuals( + X, + y, + sample_weight, + train, + test, + fit_intercept, + path, + path_params, + alphas=None, + l1_ratio=1, + X_order=None, + dtype=None, +): + """Returns the MSE for the models computed by 'path'. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : None or array-like of shape (n_samples,) + Sample weights. + + train : list of indices + The indices of the train set. + + test : list of indices + The indices of the test set. + + path : callable + Function returning a list of models on the path. See + enet_path for an example of signature. + + path_params : dictionary + Parameters passed to the path function. + + alphas : array-like, default=None + Array of float that is used for cross-validation. If not + provided, computed using 'path'. + + l1_ratio : float, default=1 + float between 0 and 1 passed to ElasticNet (scaling between + l1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an + L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0 + < l1_ratio < 1``, the penalty is a combination of L1 and L2. + + X_order : {'F', 'C'}, default=None + The order of the arrays expected by the path function to + avoid memory copies. + + dtype : a numpy dtype, default=None + The dtype of the arrays expected by the path function to + avoid memory copies. + """ + X_train = X[train] + y_train = y[train] + X_test = X[test] + y_test = y[test] + if sample_weight is None: + sw_train, sw_test = None, None + else: + sw_train = sample_weight[train] + sw_test = sample_weight[test] + n_samples = X_train.shape[0] + # TLDR: Rescale sw_train to sum up to n_samples on the training set. + # See TLDR and long comment inside ElasticNet.fit. + sw_train *= n_samples / np.sum(sw_train) + # Note: Alternatively, we could also have rescaled alpha instead + # of sample_weight: + # + # alpha *= np.sum(sample_weight) / n_samples + + if not sparse.issparse(X): + for array, array_input in ( + (X_train, X), + (y_train, y), + (X_test, X), + (y_test, y), + ): + if array.base is not array_input and not array.flags["WRITEABLE"]: + # fancy indexing should create a writable copy but it doesn't + # for read-only memmaps (cf. numpy#14132). + array.setflags(write=True) + + if y.ndim == 1: + precompute = path_params["precompute"] + else: + # No Gram variant of multi-task exists right now. 
+ # Fall back to default enet_multitask + precompute = False + + X_train, y_train, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( + X_train, + y_train, + None, + precompute, + fit_intercept=fit_intercept, + copy=False, + sample_weight=sw_train, + ) + + path_params = path_params.copy() + path_params["Xy"] = Xy + path_params["X_offset"] = X_offset + path_params["X_scale"] = X_scale + path_params["precompute"] = precompute + path_params["copy_X"] = False + path_params["alphas"] = alphas + # needed for sparse cd solver + path_params["sample_weight"] = sw_train + + if "l1_ratio" in path_params: + path_params["l1_ratio"] = l1_ratio + + # Do the ordering and type casting here, as if it is done in the path, + # X is copied and a reference is kept here + X_train = check_array(X_train, accept_sparse="csc", dtype=dtype, order=X_order) + alphas, coefs, _ = path(X_train, y_train, **path_params) + del X_train, y_train + + if y.ndim == 1: + # Doing this so that it becomes coherent with multioutput. + coefs = coefs[np.newaxis, :, :] + y_offset = np.atleast_1d(y_offset) + y_test = y_test[:, np.newaxis] + + intercepts = y_offset[:, np.newaxis] - np.dot(X_offset, coefs) + X_test_coefs = safe_sparse_dot(X_test, coefs) + residues = X_test_coefs - y_test[:, :, np.newaxis] + residues += intercepts + if sample_weight is None: + this_mse = (residues**2).mean(axis=0) + else: + this_mse = np.average(residues**2, weights=sw_test, axis=0) + + return this_mse.mean(axis=0) + + +class LinearModelCV(MultiOutputMixin, LinearModel, ABC): + """Base class for iterative model fitting along a regularization path.""" + + _parameter_constraints: dict = { + "eps": [Interval(Real, 0, None, closed="neither")], + "n_alphas": [ + Interval(Integral, 1, None, closed="left"), + Hidden(StrOptions({"deprecated"})), + ], + # TODO(1.9): remove "warn" and None options. 
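+ # During the deprecation period `alphas` may be an int (a number of alphas
+ # to generate), an array-like of explicit values, None (deprecated) or the
+ # hidden "warn" sentinel used as the default.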
+ "alphas": [ + Interval(Integral, 1, None, closed="left"), + "array-like", + None, + Hidden(StrOptions({"warn"})), + ], + "fit_intercept": ["boolean"], + "precompute": [StrOptions({"auto"}), "array-like", "boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "copy_X": ["boolean"], + "cv": ["cv_object"], + "verbose": ["verbose"], + "n_jobs": [Integral, None], + "positive": ["boolean"], + "random_state": ["random_state"], + "selection": [StrOptions({"cyclic", "random"})], + } + + @abstractmethod + def __init__( + self, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + precompute="auto", + max_iter=1000, + tol=1e-4, + copy_X=True, + cv=None, + verbose=False, + n_jobs=None, + positive=False, + random_state=None, + selection="cyclic", + ): + self.eps = eps + self.n_alphas = n_alphas + self.alphas = alphas + self.fit_intercept = fit_intercept + self.precompute = precompute + self.max_iter = max_iter + self.tol = tol + self.copy_X = copy_X + self.cv = cv + self.verbose = verbose + self.n_jobs = n_jobs + self.positive = positive + self.random_state = random_state + self.selection = selection + + @abstractmethod + def _get_estimator(self): + """Model to be fitted after the best alpha has been determined.""" + + @abstractmethod + def _is_multitask(self): + """Bool indicating if class is meant for multidimensional target.""" + + @staticmethod + @abstractmethod + def path(X, y, **kwargs): + """Compute path with coordinate descent.""" + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit linear model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data + to avoid unnecessary memory duplication. If y is mono-output, + X can be sparse. Note that large sparse matrices and arrays + requiring `int64` indices are not accepted. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : float or array-like of shape (n_samples,), \ + default=None + Sample weights used for fitting and evaluation of the weighted + mean squared error of each cv-fold. Note that the cross validated + MSE that is finally used to find the best model is the unweighted + mean over the (weighted) MSEs of each test fold. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + _raise_for_params(params, self, "fit") + + # TODO(1.9): remove n_alphas and alphas={"warn", None}; set alphas=100 by + # default. Remove these deprecations messages and use self.alphas directly + # instead of self._alphas. + if self.n_alphas == "deprecated": + self._alphas = 100 + else: + warnings.warn( + "'n_alphas' was deprecated in 1.7 and will be removed in 1.9. " + "'alphas' now accepts an integer value which removes the need to pass " + "'n_alphas'. The default value of 'alphas' will change from None to " + "100 in 1.9. 
Pass an explicit value to 'alphas' and leave 'n_alphas' " + "to its default value to silence this warning.", + FutureWarning, + ) + self._alphas = self.n_alphas + + if isinstance(self.alphas, str) and self.alphas == "warn": + # - If self.n_alphas == "deprecated", both are left to their default values + # so we don't warn since the future default behavior will be the same as + # the current default behavior. + # - If self.n_alphas != "deprecated", then we already warned about it + # and the warning message mentions the future self.alphas default, so + # no need to warn a second time. + pass + elif self.alphas is None: + warnings.warn( + "'alphas=None' is deprecated and will be removed in 1.9, at which " + "point the default value will be set to 100. Set 'alphas=100' " + "to silence this warning.", + FutureWarning, + ) + else: + self._alphas = self.alphas + + # This makes sure that there is no duplication in memory. + # Dealing right with copy_X is important in the following: + # Multiple functions touch X and subsamples of X and can induce a + # lot of duplication of memory + copy_X = self.copy_X and self.fit_intercept + + check_y_params = dict( + copy=False, dtype=[np.float64, np.float32], ensure_2d=False + ) + if isinstance(X, np.ndarray) or sparse.issparse(X): + # Keep a reference to X + reference_to_old_X = X + # Let us not impose fortran ordering so far: it is + # not useful for the cross-validation loop and will be done + # by the model fitting itself + + # Need to validate separately here. + # We can't pass multi_output=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. + check_X_params = dict( + accept_sparse="csc", + dtype=[np.float64, np.float32], + force_writeable=True, + copy=False, + accept_large_sparse=False, + ) + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + if sparse.issparse(X): + if hasattr(reference_to_old_X, "data") and not np.may_share_memory( + reference_to_old_X.data, X.data + ): + # X is a sparse matrix and has been copied + copy_X = False + elif not np.may_share_memory(reference_to_old_X, X): + # X has been copied + copy_X = False + del reference_to_old_X + else: + # Need to validate separately here. + # We can't pass multi_output=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. 
+ check_X_params = dict( + accept_sparse="csc", + dtype=[np.float64, np.float32], + order="F", + force_writeable=True, + copy=copy_X, + ) + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + copy_X = False + + check_consistent_length(X, y) + + if not self._is_multitask(): + if y.ndim > 1 and y.shape[1] > 1: + raise ValueError( + "For multi-task outputs, use MultiTask%s" % self.__class__.__name__ + ) + y = column_or_1d(y, warn=True) + else: + if sparse.issparse(X): + raise TypeError("X should be dense but a sparse matrix waspassed") + elif y.ndim == 1: + raise ValueError( + "For mono-task outputs, use %sCV" % self.__class__.__name__[9:] + ) + + if isinstance(sample_weight, numbers.Number): + sample_weight = None + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + model = self._get_estimator() + + # All LinearModelCV parameters except 'cv' are acceptable + path_params = self.get_params() + + # Pop `intercept` that is not parameter of the path function + path_params.pop("fit_intercept", None) + + if "l1_ratio" in path_params: + l1_ratios = np.atleast_1d(path_params["l1_ratio"]) + # For the first path, we need to set l1_ratio + path_params["l1_ratio"] = l1_ratios[0] + else: + l1_ratios = [ + 1, + ] + path_params.pop("cv", None) + path_params.pop("n_jobs", None) + + n_l1_ratio = len(l1_ratios) + + check_scalar_alpha = partial( + check_scalar, + target_type=Real, + min_val=0.0, + include_boundaries="left", + ) + + if isinstance(self._alphas, Integral): + alphas = [ + _alpha_grid( + X, + y, + l1_ratio=l1_ratio, + fit_intercept=self.fit_intercept, + eps=self.eps, + n_alphas=self._alphas, + copy_X=self.copy_X, + sample_weight=sample_weight, + ) + for l1_ratio in l1_ratios + ] + else: + # Making sure alphas entries are scalars. + for index, alpha in enumerate(self._alphas): + check_scalar_alpha(alpha, f"alphas[{index}]") + # Making sure alphas is properly ordered. + alphas = np.tile(np.sort(self._alphas)[::-1], (n_l1_ratio, 1)) + + # We want n_alphas to be the number of alphas used for each l1_ratio. + n_alphas = len(alphas[0]) + path_params.update({"n_alphas": n_alphas}) + + path_params["copy_X"] = copy_X + # We are not computing in parallel, we can modify X + # inplace in the folds + if effective_n_jobs(self.n_jobs) > 1: + path_params["copy_X"] = False + + # init cross-validation generator + cv = check_cv(self.cv) + + if _routing_enabled(): + splitter_supports_sample_weight = get_routing_for_object(cv).consumes( + method="split", params=["sample_weight"] + ) + if ( + sample_weight is not None + and not splitter_supports_sample_weight + and not has_fit_parameter(self, "sample_weight") + ): + raise ValueError( + "The CV splitter and underlying estimator do not support" + " sample weights." 
+ ) + + if splitter_supports_sample_weight: + params["sample_weight"] = sample_weight + + routed_params = process_routing(self, "fit", **params) + + if sample_weight is not None and not has_fit_parameter( + self, "sample_weight" + ): + # MultiTaskElasticNetCV does not (yet) support sample_weight + sample_weight = None + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split=Bunch()) + + # Compute path for all folds and compute MSE to get the best alpha + folds = list(cv.split(X, y, **routed_params.splitter.split)) + best_mse = np.inf + + # We do a double for loop folded in one, in order to be able to + # iterate in parallel on l1_ratio and folds + jobs = ( + delayed(_path_residuals)( + X, + y, + sample_weight, + train, + test, + self.fit_intercept, + self.path, + path_params, + alphas=this_alphas, + l1_ratio=this_l1_ratio, + X_order="F", + dtype=X.dtype.type, + ) + for this_l1_ratio, this_alphas in zip(l1_ratios, alphas) + for train, test in folds + ) + mse_paths = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )(jobs) + mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1)) + # The mean is computed over folds. + mean_mse = np.mean(mse_paths, axis=1) + self.mse_path_ = np.squeeze(np.moveaxis(mse_paths, 2, 1)) + for l1_ratio, l1_alphas, mse_alphas in zip(l1_ratios, alphas, mean_mse): + i_best_alpha = np.argmin(mse_alphas) + this_best_mse = mse_alphas[i_best_alpha] + if this_best_mse < best_mse: + best_alpha = l1_alphas[i_best_alpha] + best_l1_ratio = l1_ratio + best_mse = this_best_mse + + self.l1_ratio_ = best_l1_ratio + self.alpha_ = best_alpha + if isinstance(self._alphas, Integral): + self.alphas_ = np.asarray(alphas) + if n_l1_ratio == 1: + self.alphas_ = self.alphas_[0] + # Remove duplicate alphas in case alphas is provided. + else: + self.alphas_ = np.asarray(alphas[0]) + + # Refit the model with the parameters selected + common_params = { + name: value + for name, value in self.get_params().items() + if name in model.get_params() + } + model.set_params(**common_params) + model.alpha = best_alpha + model.l1_ratio = best_l1_ratio + model.copy_X = copy_X + precompute = getattr(self, "precompute", None) + if isinstance(precompute, str) and precompute == "auto": + model.precompute = False + + if sample_weight is None: + # MultiTaskElasticNetCV does not (yet) support sample_weight, even + # not sample_weight=None. + model.fit(X, y) + else: + model.fit(X, y, sample_weight=sample_weight) + if not hasattr(self, "l1_ratio"): + del self.l1_ratio_ + self.coef_ = model.coef_ + self.intercept_ = model.intercept_ + self.dual_gap_ = model.dual_gap_ + self.n_iter_ = model.n_iter_ + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + multitask = self._is_multitask() + tags.input_tags.sparse = not multitask + tags.target_tags.multi_output = multitask + return tags + + +class LassoCV(RegressorMixin, LinearModelCV): + """Lasso linear model with iterative fitting along a regularization path. + + See glossary entry for :term:`cross-validation estimator`. + + The best model is selected by cross-validation. + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + .. deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : bool or int, default=False + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + positive : bool, default=False + If positive, restrict regression coefficients to be positive. 
+ + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + alpha_ : float + The amount of penalization chosen by cross validation. + + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + mse_path_ : ndarray of shape (n_alphas, n_folds) + Mean square error for the test set on each fold, varying alpha. + + alphas_ : ndarray of shape (n_alphas,) + The grid of alphas used for fitting. + + dual_gap_ : float or ndarray of shape (n_targets,) + The dual gap at the end of the optimization for the optimal alpha + (``alpha_``). + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso path using LARS + algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : The Lasso is a linear model that estimates sparse coefficients. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoCV : Lasso linear model with iterative fitting along a regularization + path. + LassoLarsCV : Cross-validated Lasso using the LARS algorithm. + + Notes + ----- + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` argument of the `fit` + method should be directly passed as a Fortran-contiguous numpy array. + + For an example, see :ref:`examples/linear_model/plot_lasso_model_selection.py + `. + + :class:`LassoCV` leads to different results than a hyperparameter + search using :class:`~sklearn.model_selection.GridSearchCV` with a + :class:`Lasso` model. In :class:`LassoCV`, a model for a given + penalty `alpha` is warm started using the coefficients of the + closest model (trained at the previous iteration) on the + regularization path. It tends to speed up the hyperparameter + search. 
+ + Examples + -------- + >>> from sklearn.linear_model import LassoCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4, random_state=0) + >>> reg = LassoCV(cv=5, random_state=0).fit(X, y) + >>> reg.score(X, y) + 0.9993 + >>> reg.predict(X[:1,]) + array([-78.4951]) + """ + + path = staticmethod(lasso_path) + + def __init__( + self, + *, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + precompute="auto", + max_iter=1000, + tol=1e-4, + copy_X=True, + cv=None, + verbose=False, + n_jobs=None, + positive=False, + random_state=None, + selection="cyclic", + ): + super().__init__( + eps=eps, + n_alphas=n_alphas, + alphas=alphas, + fit_intercept=fit_intercept, + precompute=precompute, + max_iter=max_iter, + tol=tol, + copy_X=copy_X, + cv=cv, + verbose=verbose, + n_jobs=n_jobs, + positive=positive, + random_state=random_state, + selection=selection, + ) + + def _get_estimator(self): + return Lasso() + + def _is_multitask(self): + return False + + def fit(self, X, y, sample_weight=None, **params): + """Fit Lasso model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. Pass directly as Fortran-contiguous data + to avoid unnecessary memory duplication. If y is mono-output, + X can be sparse. Note that large sparse matrices and arrays + requiring `int64` indices are not accepted. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : float or array-like of shape (n_samples,), \ + default=None + Sample weights used for fitting and evaluation of the weighted + mean squared error of each cv-fold. Note that the cross validated + MSE that is finally used to find the best model is the unweighted + mean over the (weighted) MSEs of each test fold. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + return super().fit(X, y, sample_weight=sample_weight, **params) + + +class ElasticNetCV(RegressorMixin, LinearModelCV): + """Elastic Net model with iterative fitting along a regularization path. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + l1_ratio : float or list of float, default=0.5 + Float between 0 and 1 passed to ElasticNet (scaling between + l1 and l2 penalties). For ``l1_ratio = 0`` + the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. + For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2 + This parameter can be a list, in which case the different + values are tested by cross-validation and the one giving the best + prediction score is used. Note that a good choice of list of + values for l1_ratio is often to put more values close to 1 + (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7, + .9, .95, .99, 1]``. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path, used for each l1_ratio. + + .. 
deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path, used for each l1_ratio. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : 'auto', bool or array-like of shape \ + (n_features, n_features), default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + verbose : bool or int, default=0 + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + alpha_ : float + The amount of penalization chosen by cross validation. + + l1_ratio_ : float + The compromise between l1 and l2 penalization chosen by + cross validation. + + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the cost function formula). + + intercept_ : float or ndarray of shape (n_targets, n_features) + Independent term in the decision function. 
+ + mse_path_ : ndarray of shape (n_l1_ratio, n_alpha, n_folds) + Mean square error for the test set on each fold, varying l1_ratio and + alpha. + + alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas) + The grid of alphas used for fitting, for each l1_ratio. + + dual_gap_ : float + The dual gaps at the end of the optimization for the optimal alpha. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + enet_path : Compute elastic net path with coordinate descent. + ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. + + Notes + ----- + In `fit`, once the best parameters `l1_ratio` and `alpha` are found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` argument of the `fit` + method should be directly passed as a Fortran-contiguous numpy array. + + The parameter `l1_ratio` corresponds to alpha in the glmnet R package + while alpha corresponds to the lambda parameter in glmnet. + More specifically, the optimization objective is:: + + 1 / (2 * n_samples) * ||y - Xw||^2_2 + + alpha * l1_ratio * ||w||_1 + + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 + + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + for:: + + alpha = a + b and l1_ratio = a / (a + b). + + For an example, see + :ref:`examples/linear_model/plot_lasso_model_selection.py + `. + + Examples + -------- + >>> from sklearn.linear_model import ElasticNetCV + >>> from sklearn.datasets import make_regression + + >>> X, y = make_regression(n_features=2, random_state=0) + >>> regr = ElasticNetCV(cv=5, random_state=0) + >>> regr.fit(X, y) + ElasticNetCV(cv=5, random_state=0) + >>> print(regr.alpha_) + 0.199 + >>> print(regr.intercept_) + 0.398 + >>> print(regr.predict([[0, 0]])) + [0.398] + """ + + _parameter_constraints: dict = { + **LinearModelCV._parameter_constraints, + "l1_ratio": [Interval(Real, 0, 1, closed="both"), "array-like"], + } + + path = staticmethod(enet_path) + + def __init__( + self, + *, + l1_ratio=0.5, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + precompute="auto", + max_iter=1000, + tol=1e-4, + cv=None, + copy_X=True, + verbose=0, + n_jobs=None, + positive=False, + random_state=None, + selection="cyclic", + ): + self.l1_ratio = l1_ratio + self.eps = eps + self.n_alphas = n_alphas + self.alphas = alphas + self.fit_intercept = fit_intercept + self.precompute = precompute + self.max_iter = max_iter + self.tol = tol + self.cv = cv + self.copy_X = copy_X + self.verbose = verbose + self.n_jobs = n_jobs + self.positive = positive + self.random_state = random_state + self.selection = selection + + def _get_estimator(self): + return ElasticNet() + + def _is_multitask(self): + return False + + def fit(self, X, y, sample_weight=None, **params): + """Fit ElasticNet model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. 
Pass directly as Fortran-contiguous data + to avoid unnecessary memory duplication. If y is mono-output, + X can be sparse. Note that large sparse matrices and arrays + requiring `int64` indices are not accepted. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : float or array-like of shape (n_samples,), \ + default=None + Sample weights used for fitting and evaluation of the weighted + mean squared error of each cv-fold. Note that the cross validated + MSE that is finally used to find the best model is the unweighted + mean over the (weighted) MSEs of each test fold. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + return super().fit(X, y, sample_weight=sample_weight, **params) + + +############################################################################### +# Multi Task ElasticNet and Lasso models (with joint feature selection) + + +class MultiTaskElasticNet(Lasso): + """Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer. + + The optimization objective for MultiTaskElasticNet is:: + + (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + + alpha * l1_ratio * ||W||_21 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + Where:: + + ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2) + + i.e. the sum of norms of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the L1/L2 term. Defaults to 1.0. + + l1_ratio : float, default=0.5 + The ElasticNet mixing parameter, with 0 < l1_ratio <= 1. + For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it + is an L2 penalty. + For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. 
+ + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). If a 1D y is + passed in at fit (non multi-task usage), ``coef_`` is then a 1D array. + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + dual_gap_ : float + The dual gaps at the end of the optimization. + + eps_ : float + The tolerance scaled scaled by the variance of the target `y`. + + sparse_coef_ : sparse matrix of shape (n_features,) or \ + (n_targets, n_features) + Sparse representation of the `coef_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in + cross-validation. + ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. + MultiTaskLasso : Multi-task Lasso model trained with L1/L2 + mixed-norm as regularizer. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.MultiTaskElasticNet(alpha=0.1) + >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]) + MultiTaskElasticNet(alpha=0.1) + >>> print(clf.coef_) + [[0.45663524 0.45612256] + [0.45663524 0.45612256]] + >>> print(clf.intercept_) + [0.0872422 0.0872422] + """ + + _parameter_constraints: dict = { + **ElasticNet._parameter_constraints, + } + for param in ("precompute", "positive"): + _parameter_constraints.pop(param) + + def __init__( + self, + alpha=1.0, + *, + l1_ratio=0.5, + fit_intercept=True, + copy_X=True, + max_iter=1000, + tol=1e-4, + warm_start=False, + random_state=None, + selection="cyclic", + ): + self.l1_ratio = l1_ratio + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.copy_X = copy_X + self.tol = tol + self.warm_start = warm_start + self.random_state = random_state + self.selection = selection + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit MultiTaskElasticNet model with coordinate descent. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data. + y : ndarray of shape (n_samples, n_targets) + Target. Will be cast to X's dtype if necessary. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + Coordinate descent is an algorithm that considers each column of + data at a time hence it will automatically convert the X input + as a Fortran-contiguous numpy array if necessary. + + To avoid memory re-allocation it is advised to allocate the + initial data in memory directly using that format. + """ + # Need to validate separately here. + # We can't pass multi_output=True because that would allow y to be csr. 
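+ # Validating separately keeps y dense (check_y_params below does not accept
+ # sparse input) without forcing it to 2D here; y is cast to X's dtype and
+ # its dimensionality is checked explicitly just after.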
+ check_X_params = dict( + dtype=[np.float64, np.float32], + order="F", + force_writeable=True, + copy=self.copy_X and self.fit_intercept, + ) + check_y_params = dict(ensure_2d=False, order="F") + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + check_consistent_length(X, y) + y = y.astype(X.dtype) + + if hasattr(self, "l1_ratio"): + model_str = "ElasticNet" + else: + model_str = "Lasso" + if y.ndim == 1: + raise ValueError("For mono-task outputs, use %s" % model_str) + + n_samples, n_features = X.shape + n_targets = y.shape[1] + + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=False + ) + + if not self.warm_start or not hasattr(self, "coef_"): + self.coef_ = np.zeros( + (n_targets, n_features), dtype=X.dtype.type, order="F" + ) + + l1_reg = self.alpha * self.l1_ratio * n_samples + l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples + + self.coef_ = np.asfortranarray(self.coef_) # coef contiguous in memory + + random = self.selection == "random" + + ( + self.coef_, + self.dual_gap_, + self.eps_, + self.n_iter_, + ) = cd_fast.enet_coordinate_descent_multi_task( + self.coef_, + l1_reg, + l2_reg, + X, + y, + self.max_iter, + self.tol, + check_random_state(self.random_state), + random, + ) + + # account for different objective scaling here and in cd_fast + self.dual_gap_ /= n_samples + + self._set_intercept(X_offset, y_offset, X_scale) + + # return self for chaining fit and predict calls + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = False + tags.target_tags.multi_output = True + tags.target_tags.single_output = False + return tags + + +class MultiTaskLasso(MultiTaskElasticNet): + """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the L1/L2 term. Defaults to 1.0. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. 
This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance. + + dual_gap_ : ndarray of shape (n_alphas,) + The dual gaps at the end of the optimization for each alpha. + + eps_ : float + The tolerance scaled scaled by the variance of the target `y`. + + sparse_coef_ : sparse matrix of shape (n_features,) or \ + (n_targets, n_features) + Sparse representation of the `coef_`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Lasso: Linear Model trained with L1 prior as regularizer (aka the Lasso). + MultiTaskLassoCV: Multi-task L1 regularized linear model with built-in + cross-validation. + MultiTaskElasticNetCV: Multi-task L1/L2 ElasticNet with built-in cross-validation. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.MultiTaskLasso(alpha=0.1) + >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]]) + MultiTaskLasso(alpha=0.1) + >>> print(clf.coef_) + [[0. 0.60809415] + [0. 0.94592424]] + >>> print(clf.intercept_) + [-0.41888636 -0.87382323] + """ + + _parameter_constraints: dict = { + **MultiTaskElasticNet._parameter_constraints, + } + _parameter_constraints.pop("l1_ratio") + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=1000, + tol=1e-4, + warm_start=False, + random_state=None, + selection="cyclic", + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.copy_X = copy_X + self.tol = tol + self.warm_start = warm_start + self.l1_ratio = 1.0 + self.random_state = random_state + self.selection = selection + + +class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): + """Multi-task L1/L2 ElasticNet with built-in cross-validation. + + See glossary entry for :term:`cross-validation estimator`. + + The optimization objective for MultiTaskElasticNet is:: + + (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + + alpha * l1_ratio * ||W||_21 + + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + l1_ratio : float or list of float, default=0.5 + The ElasticNet mixing parameter, with 0 < l1_ratio <= 1. + For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it + is an L2 penalty. + For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2. + This parameter can be a list, in which case the different + values are tested by cross-validation and the one giving the best + prediction score is used. 
Note that a good choice of list of + values for l1_ratio is often to put more values close to 1 + (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7, + .9, .95, .99, 1]``. + + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + .. deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path, used for each l1_ratio. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + verbose : bool or int, default=0 + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. Note that this is + used only if multiple values for l1_ratio are given. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. + + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + alpha_ : float + The amount of penalization chosen by cross validation. 
+ + mse_path_ : ndarray of shape (n_alphas, n_folds) or \ + (n_l1_ratio, n_alphas, n_folds) + Mean square error for the test set on each fold, varying alpha. + + alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas) + The grid of alphas used for fitting, for each l1_ratio. + + l1_ratio_ : float + Best l1_ratio obtained by cross-validation. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + dual_gap_ : float + The dual gap at the end of the optimization for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MultiTaskElasticNet : Multi-task L1/L2 ElasticNet with built-in cross-validation. + ElasticNetCV : Elastic net model with best model selection by + cross-validation. + MultiTaskLassoCV : Multi-task Lasso model trained with L1 norm + as regularizer and built-in cross-validation. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + In `fit`, once the best parameters `l1_ratio` and `alpha` are found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` and `y` arguments of the + `fit` method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.MultiTaskElasticNetCV(cv=3) + >>> clf.fit([[0,0], [1, 1], [2, 2]], + ... [[0, 0], [1, 1], [2, 2]]) + MultiTaskElasticNetCV(cv=3) + >>> print(clf.coef_) + [[0.52875032 0.46958558] + [0.52875032 0.46958558]] + >>> print(clf.intercept_) + [0.00166409 0.00166409] + """ + + _parameter_constraints: dict = { + **LinearModelCV._parameter_constraints, + "l1_ratio": [Interval(Real, 0, 1, closed="both"), "array-like"], + } + _parameter_constraints.pop("precompute") + _parameter_constraints.pop("positive") + + path = staticmethod(enet_path) + + def __init__( + self, + *, + l1_ratio=0.5, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + max_iter=1000, + tol=1e-4, + cv=None, + copy_X=True, + verbose=0, + n_jobs=None, + random_state=None, + selection="cyclic", + ): + self.l1_ratio = l1_ratio + self.eps = eps + self.n_alphas = n_alphas + self.alphas = alphas + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.tol = tol + self.cv = cv + self.copy_X = copy_X + self.verbose = verbose + self.n_jobs = n_jobs + self.random_state = random_state + self.selection = selection + + def _get_estimator(self): + return MultiTaskElasticNet() + + def _is_multitask(self): + return True + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.single_output = False + return tags + + # This is necessary as LinearModelCV now supports sample_weight while + # MultiTaskElasticNetCV does not (yet). + def fit(self, X, y, **params): + """Fit MultiTaskElasticNet model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples, n_targets) + Training target variable. Will be cast to X's dtype if necessary. 
+ + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns MultiTaskElasticNet instance. + """ + return super().fit(X, y, **params) + + +class MultiTaskLassoCV(RegressorMixin, LinearModelCV): + """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. + + See glossary entry for :term:`cross-validation estimator`. + + The optimization objective for MultiTaskLasso is:: + + (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * ||W||_21 + + Where:: + + ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} + + i.e. the sum of norm of each row. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.15 + + Parameters + ---------- + eps : float, default=1e-3 + Length of the path. ``eps=1e-3`` means that + ``alpha_min / alpha_max = 1e-3``. + + n_alphas : int, default=100 + Number of alphas along the regularization path. + + .. deprecated:: 1.7 + `n_alphas` was deprecated in 1.7 and will be removed in 1.9. Use `alphas` + instead. + + alphas : array-like or int, default=None + Values of alphas to test along the regularization path. + If int, `alphas` values are generated automatically. + If array-like, list of alpha values to use. + + .. versionchanged:: 1.7 + `alphas` accepts an integer value which removes the need to pass + `n_alphas`. + + .. deprecated:: 1.7 + `alphas=None` was deprecated in 1.7 and will be removed in 1.9, at which + point the default value will be set to 100. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=1000 + The maximum number of iterations. + + tol : float, default=1e-4 + The tolerance for the optimization: if the updates are + smaller than ``tol``, the optimization code checks the + dual gap for optimality and continues until it is smaller + than ``tol``. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - int, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : bool or int, default=False + Amount of verbosity. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. Note that this is + used only if multiple values for l1_ratio are given. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + The seed of the pseudo random number generator that selects a random + feature to update. Used when ``selection`` == 'random'. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
+ + selection : {'cyclic', 'random'}, default='cyclic' + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + Attributes + ---------- + intercept_ : ndarray of shape (n_targets,) + Independent term in decision function. + + coef_ : ndarray of shape (n_targets, n_features) + Parameter vector (W in the cost function formula). + Note that ``coef_`` stores the transpose of ``W``, ``W.T``. + + alpha_ : float + The amount of penalization chosen by cross validation. + + mse_path_ : ndarray of shape (n_alphas, n_folds) + Mean square error for the test set on each fold, varying alpha. + + alphas_ : ndarray of shape (n_alphas,) + The grid of alphas used for fitting. + + n_iter_ : int + Number of iterations run by the coordinate descent solver to reach + the specified tolerance for the optimal alpha. + + dual_gap_ : float + The dual gap at the end of the optimization for the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 + mixed-norm as regularizer. + ElasticNetCV : Elastic net model with best model selection by + cross-validation. + MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in + cross-validation. + + Notes + ----- + The algorithm used to fit the model is coordinate descent. + + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + To avoid unnecessary memory duplication the `X` and `y` arguments of the + `fit` method should be directly passed as Fortran-contiguous numpy arrays. + + Examples + -------- + >>> from sklearn.linear_model import MultiTaskLassoCV + >>> from sklearn.datasets import make_regression + >>> from sklearn.metrics import r2_score + >>> X, y = make_regression(n_targets=2, noise=4, random_state=0) + >>> reg = MultiTaskLassoCV(cv=5, random_state=0).fit(X, y) + >>> r2_score(y, reg.predict(X)) + 0.9994 + >>> reg.alpha_ + np.float64(0.5713) + >>> reg.predict(X[:1,]) + array([[153.7971, 94.9015]]) + """ + + _parameter_constraints: dict = { + **LinearModelCV._parameter_constraints, + } + _parameter_constraints.pop("precompute") + _parameter_constraints.pop("positive") + + path = staticmethod(lasso_path) + + def __init__( + self, + *, + eps=1e-3, + n_alphas="deprecated", + alphas="warn", + fit_intercept=True, + max_iter=1000, + tol=1e-4, + copy_X=True, + cv=None, + verbose=False, + n_jobs=None, + random_state=None, + selection="cyclic", + ): + super().__init__( + eps=eps, + n_alphas=n_alphas, + alphas=alphas, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + copy_X=copy_X, + cv=cv, + verbose=verbose, + n_jobs=n_jobs, + random_state=random_state, + selection=selection, + ) + + def _get_estimator(self): + return MultiTaskLasso() + + def _is_multitask(self): + return True + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.single_output = False + return tags + + # This is necessary as LinearModelCV now supports sample_weight while + # MultiTaskLassoCV does not (yet). 
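+    # The override below therefore only exposes the reduced signature
+    # (X, y, **params) and forwards everything unchanged to LinearModelCV.fit.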
+ def fit(self, X, y, **params): + """Fit MultiTaskLasso model with coordinate descent. + + Fit is on grid of alphas and best alpha estimated by cross-validation. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data. + y : ndarray of shape (n_samples, n_targets) + Target. Will be cast to X's dtype if necessary. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of fitted model. + """ + return super().fit(X, y, **params) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c471c35096f8ab59d042ea4c0758d88d8819282 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/__init__.py @@ -0,0 +1,16 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from .glm import ( + GammaRegressor, + PoissonRegressor, + TweedieRegressor, + _GeneralizedLinearRegressor, +) + +__all__ = [ + "GammaRegressor", + "PoissonRegressor", + "TweedieRegressor", + "_GeneralizedLinearRegressor", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/_newton_solver.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/_newton_solver.py new file mode 100644 index 0000000000000000000000000000000000000000..cfef023692d68102d6aee9602f31fd90854bb89d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/_newton_solver.py @@ -0,0 +1,631 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +Newton solver for Generalized Linear Models +""" + +import warnings +from abc import ABC, abstractmethod + +import numpy as np +import scipy.linalg +import scipy.optimize + +from ..._loss.loss import HalfSquaredError +from ...exceptions import ConvergenceWarning +from ...utils.fixes import _get_additional_lbfgs_options_dict +from ...utils.optimize import _check_optimize_result +from .._linear_loss import LinearModelLoss + + +class NewtonSolver(ABC): + """Newton solver for GLMs. + + This class implements Newton/2nd-order optimization routines for GLMs. Each Newton + iteration aims at finding the Newton step which is done by the inner solver. With + Hessian H, gradient g and coefficients coef, one step solves: + + H @ coef_newton = -g + + For our GLM / LinearModelLoss, we have gradient g and Hessian H: + + g = X.T @ loss.gradient + l2_reg_strength * coef + H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + + Backtracking line search updates coef = coef_old + t * coef_newton for some t in + (0, 1]. + + This is a base class, actual implementations (child classes) may deviate from the + above pattern and use structure specific tricks. + + Usage pattern: + - initialize solver: sol = NewtonSolver(...) + - solve the problem: sol.solve(X, y, sample_weight) + + References + ---------- + - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization" + 2nd edition + https://doi.org/10.1007/978-0-387-40065-5 + + - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization." + Cambridge University Press, 2004. 
+ https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Initial coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + + linear_loss : LinearModelLoss + The loss to be minimized. + + l2_reg_strength : float, default=0.0 + L2 regularization strength. + + tol : float, default=1e-4 + The optimization problem is solved when each of the following condition is + fulfilled: + 1. maximum |gradient| <= tol + 2. Newton decrement d: 1/2 * d^2 <= tol + + max_iter : int, default=100 + Maximum number of Newton steps allowed. + + n_threads : int, default=1 + Number of OpenMP threads to use for the computation of the Hessian and gradient + of the loss function. + + Attributes + ---------- + coef_old : ndarray of shape coef.shape + Coefficient of previous iteration. + + coef_newton : ndarray of shape coef.shape + Newton step. + + gradient : ndarray of shape coef.shape + Gradient of the loss w.r.t. the coefficients. + + gradient_old : ndarray of shape coef.shape + Gradient of previous iteration. + + loss_value : float + Value of objective function = loss + penalty. + + loss_value_old : float + Value of objective function of previous itertion. + + raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes) + + converged : bool + Indicator for convergence of the solver. + + iteration : int + Number of Newton steps, i.e. calls to inner_solve + + use_fallback_lbfgs_solve : bool + If set to True, the solver will resort to call LBFGS to finish the optimisation + procedure in case of convergence issues. + + gradient_times_newton : float + gradient @ coef_newton, set in inner_solve and used by line_search. If the + Newton step is a descent direction, this is negative. + """ + + def __init__( + self, + *, + coef, + linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True), + l2_reg_strength=0.0, + tol=1e-4, + max_iter=100, + n_threads=1, + verbose=0, + ): + self.coef = coef + self.linear_loss = linear_loss + self.l2_reg_strength = l2_reg_strength + self.tol = tol + self.max_iter = max_iter + self.n_threads = n_threads + self.verbose = verbose + + def setup(self, X, y, sample_weight): + """Precomputations + + If None, initializes: + - self.coef + Sets: + - self.raw_prediction + - self.loss_value + """ + _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) + self.loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=self.raw_prediction, + ) + + @abstractmethod + def update_gradient_hessian(self, X, y, sample_weight): + """Update gradient and Hessian.""" + + @abstractmethod + def inner_solve(self, X, y, sample_weight): + """Compute Newton step. + + Sets: + - self.coef_newton + - self.gradient_times_newton + """ + + def fallback_lbfgs_solve(self, X, y, sample_weight): + """Fallback solver in case of emergency. + + If a solver detects convergence problems, it may fall back to this methods in + the hope to exit with success instead of raising an error. 
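+        Only the remaining iteration budget, i.e. ``max_iter - iteration``, is
+        handed over to scipy's L-BFGS-B.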
+ + Sets: + - self.coef + - self.converged + """ + max_iter = self.max_iter - self.iteration + opt_res = scipy.optimize.minimize( + self.linear_loss.loss_gradient, + self.coef, + method="L-BFGS-B", + jac=True, + options={ + "maxiter": max_iter, + "maxls": 50, # default is 20 + "gtol": self.tol, + "ftol": 64 * np.finfo(np.float64).eps, + **_get_additional_lbfgs_options_dict("iprint", self.verbose - 1), + }, + args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads), + ) + self.iteration += _check_optimize_result("lbfgs", opt_res, max_iter=max_iter) + self.coef = opt_res.x + self.converged = opt_res.status == 0 + + def line_search(self, X, y, sample_weight): + """Backtracking line search. + + Sets: + - self.coef_old + - self.coef + - self.loss_value_old + - self.loss_value + - self.gradient_old + - self.gradient + - self.raw_prediction + """ + # line search parameters + beta, sigma = 0.5, 0.00048828125 # 1/2, 1/2**11 + eps = 16 * np.finfo(self.loss_value.dtype).eps + t = 1 # step size + + # gradient_times_newton = self.gradient @ self.coef_newton + # was computed in inner_solve. + armijo_term = sigma * self.gradient_times_newton + _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw( + self.coef_newton, X + ) + + self.coef_old = self.coef + self.loss_value_old = self.loss_value + self.gradient_old = self.gradient + + # np.sum(np.abs(self.gradient_old)) + sum_abs_grad_old = -1 + + is_verbose = self.verbose >= 2 + if is_verbose: + print(" Backtracking Line Search") + print(f" eps=16 * finfo.eps={eps}") + + for i in range(21): # until and including t = beta**20 ~ 1e-6 + self.coef = self.coef_old + t * self.coef_newton + raw = self.raw_prediction + t * raw_prediction_newton + self.loss_value, self.gradient = self.linear_loss.loss_gradient( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=raw, + ) + # Note: If coef_newton is too large, loss_gradient may produce inf values, + # potentially accompanied by a RuntimeWarning. + # This case will be captured by the Armijo condition. + + # 1. Check Armijo / sufficient decrease condition. + # The smaller (more negative) the better. + loss_improvement = self.loss_value - self.loss_value_old + check = loss_improvement <= t * armijo_term + if is_verbose: + print( + f" line search iteration={i + 1}, step size={t}\n" + f" check loss improvement <= armijo term: {loss_improvement} " + f"<= {t * armijo_term} {check}" + ) + if check: + break + # 2. Deal with relative loss differences around machine precision. + tiny_loss = np.abs(self.loss_value_old * eps) + check = np.abs(loss_improvement) <= tiny_loss + if is_verbose: + print( + " check loss |improvement| <= eps * |loss_old|:" + f" {np.abs(loss_improvement)} <= {tiny_loss} {check}" + ) + if check: + if sum_abs_grad_old < 0: + sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1) + # 2.1 Check sum of absolute gradients as alternative condition. + sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1) + check = sum_abs_grad < sum_abs_grad_old + if is_verbose: + print( + " check sum(|gradient|) < sum(|gradient_old|): " + f"{sum_abs_grad} < {sum_abs_grad_old} {check}" + ) + if check: + break + + t *= beta + else: + warnings.warn( + ( + f"Line search of Newton solver {self.__class__.__name__} at" + f" iteration #{self.iteration} did no converge after 21 line search" + " refinement iterations. It will now resort to lbfgs instead." 
+ ), + ConvergenceWarning, + ) + if self.verbose: + print(" Line search did not converge and resorts to lbfgs instead.") + self.use_fallback_lbfgs_solve = True + return + + self.raw_prediction = raw + if is_verbose: + print( + f" line search successful after {i + 1} iterations with " + f"loss={self.loss_value}." + ) + + def check_convergence(self, X, y, sample_weight): + """Check for convergence. + + Sets self.converged. + """ + if self.verbose: + print(" Check Convergence") + # Note: Checking maximum relative change of coefficient <= tol is a bad + # convergence criterion because even a large step could have brought us close + # to the true minimum. + # coef_step = self.coef - self.coef_old + # change = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old))) + # check = change <= tol + + # 1. Criterion: maximum |gradient| <= tol + # The gradient was already updated in line_search() + g_max_abs = np.max(np.abs(self.gradient)) + check = g_max_abs <= self.tol + if self.verbose: + print(f" 1. max |gradient| {g_max_abs} <= {self.tol} {check}") + if not check: + return + + # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol + # d = sqrt(grad @ hessian^-1 @ grad) + # = sqrt(coef_newton @ hessian @ coef_newton) + # See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1. + d2 = self.coef_newton @ self.hessian @ self.coef_newton + check = 0.5 * d2 <= self.tol + if self.verbose: + print(f" 2. Newton decrement {0.5 * d2} <= {self.tol} {check}") + if not check: + return + + if self.verbose: + loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + ) + print(f" Solver did converge at loss = {loss_value}.") + self.converged = True + + def finalize(self, X, y, sample_weight): + """Finalize the solvers results. + + Some solvers may need this, others not. + """ + pass + + def solve(self, X, y, sample_weight): + """Solve the optimization problem. + + This is the main routine. + + Order of calls: + self.setup() + while iteration: + self.update_gradient_hessian() + self.inner_solve() + self.line_search() + self.check_convergence() + self.finalize() + + Returns + ------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Solution of the optimization problem. + """ + # setup usually: + # - initializes self.coef if needed + # - initializes and calculates self.raw_predictions, self.loss_value + self.setup(X=X, y=y, sample_weight=sample_weight) + + self.iteration = 1 + self.converged = False + self.use_fallback_lbfgs_solve = False + + while self.iteration <= self.max_iter and not self.converged: + if self.verbose: + print(f"Newton iter={self.iteration}") + + self.use_fallback_lbfgs_solve = False # Fallback solver. + + # 1. Update Hessian and gradient + self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) + + # TODO: + # if iteration == 1: + # We might stop early, e.g. we already are close to the optimum, + # usually detected by zero gradients at this stage. + + # 2. Inner solver + # Calculate Newton step/direction + # This usually sets self.coef_newton and self.gradient_times_newton. + self.inner_solve(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break + + # 3. Backtracking line search + # This usually sets self.coef_old, self.coef, self.loss_value_old + # self.loss_value, self.gradient_old, self.gradient, + # self.raw_prediction. 
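+            # line_search may itself request the lbfgs fallback (e.g. when no
+            # sufficient decrease is found within 21 step size reductions),
+            # hence the check right below.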
+ self.line_search(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break + + # 4. Check convergence + # Sets self.converged. + self.check_convergence(X=X, y=y, sample_weight=sample_weight) + + # 5. Next iteration + self.iteration += 1 + + if not self.converged: + if self.use_fallback_lbfgs_solve: + # Note: The fallback solver circumvents check_convergence and relies on + # the convergence checks of lbfgs instead. Enough warnings have been + # raised on the way. + self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight) + else: + warnings.warn( + ( + f"Newton solver did not converge after {self.iteration - 1} " + "iterations." + ), + ConvergenceWarning, + ) + + self.iteration -= 1 + self.finalize(X=X, y=y, sample_weight=sample_weight) + return self.coef + + +class NewtonCholeskySolver(NewtonSolver): + """Cholesky based Newton solver. + + Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear + solver. + """ + + def setup(self, X, y, sample_weight): + super().setup(X=X, y=y, sample_weight=sample_weight) + if self.linear_loss.base_loss.is_multiclass: + # Easier with ravelled arrays, e.g., for scipy.linalg.solve. + # As with LinearModelLoss, we always are contiguous in n_classes. + self.coef = self.coef.ravel(order="F") + # Note that the computation of gradient in LinearModelLoss follows the shape of + # coef. + self.gradient = np.empty_like(self.coef) + # But the hessian is always 2d. + n = self.coef.size + self.hessian = np.empty_like(self.coef, shape=(n, n)) + # To help case distinctions. + self.is_multinomial_with_intercept = ( + self.linear_loss.base_loss.is_multiclass and self.linear_loss.fit_intercept + ) + self.is_multinomial_no_penalty = ( + self.linear_loss.base_loss.is_multiclass and self.l2_reg_strength == 0 + ) + if self.is_multinomial_no_penalty: + # See inner_solve. The provided coef might not adhere to the convention + # that the last class is set to zero. + # This is done by the usual freedom of a (overparametrized) multinomial to + # add a constant to all classes which doesn't change predictions. + n_classes = self.linear_loss.base_loss.n_classes + coef = self.coef.reshape(n_classes, -1, order="F") # easier as 2d + coef -= coef[-1, :] # coef -= coef of last class + elif self.is_multinomial_with_intercept: + # See inner_solve. Same as above, but only for the intercept. + n_classes = self.linear_loss.base_loss.n_classes + # intercept -= intercept of last class + self.coef[-n_classes:] -= self.coef[-1] + + def update_gradient_hessian(self, X, y, sample_weight): + _, _, self.hessian_warning = self.linear_loss.gradient_hessian( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + gradient_out=self.gradient, + hessian_out=self.hessian, + raw_prediction=self.raw_prediction, # this was updated in line_search + ) + + def inner_solve(self, X, y, sample_weight): + if self.hessian_warning: + warnings.warn( + ( + f"The inner solver of {self.__class__.__name__} detected a " + "pointwise hessian with many negative values at iteration " + f"#{self.iteration}. It will now resort to lbfgs instead." + ), + ConvergenceWarning, + ) + if self.verbose: + print( + " The inner solver detected a pointwise Hessian with many " + "negative values and resorts to lbfgs instead." 
+ ) + self.use_fallback_lbfgs_solve = True + return + + # Note: The following case distinction could also be shifted to the + # implementation of HalfMultinomialLoss instead of here within the solver. + if self.is_multinomial_no_penalty: + # The multinomial loss is overparametrized for each unpenalized feature, so + # at least the intercepts. This can be seen by noting that predicted + # probabilities are invariant under shifting all coefficients of a single + # feature j for all classes by the same amount c: + # coef[k, :] -> coef[k, :] + c => proba stays the same + # where we have assumed coef.shape = (n_classes, n_features). + # Therefore, also the loss (-log-likelihood), gradient and hessian stay the + # same, see + # Noah Simon and Jerome Friedman and Trevor Hastie. (2013) "A Blockwise + # Descent Algorithm for Group-penalized Multiresponse and Multinomial + # Regression". https://doi.org/10.48550/arXiv.1311.6529 + # + # We choose the standard approach and set all the coefficients of the last + # class to zero, for all features including the intercept. + # Note that coef was already dealt with in setup. + n_classes = self.linear_loss.base_loss.n_classes + n_dof = self.coef.size // n_classes # degree of freedom per class + n = self.coef.size - n_dof # effective size + self.gradient[n_classes - 1 :: n_classes] = 0 + self.hessian[n_classes - 1 :: n_classes, :] = 0 + self.hessian[:, n_classes - 1 :: n_classes] = 0 + # We also need the reduced variants of gradient and hessian where the + # entries set to zero are removed. For 2 features and 3 classes with + # arbitrary values, "x" means removed: + # gradient = [0, 1, x, 3, 4, x] + # + # hessian = [0, 1, x, 3, 4, x] + # [1, 7, x, 9, 10, x] + # [x, x, x, x, x, x] + # [3, 9, x, 21, 22, x] + # [4, 10, x, 22, 28, x] + # [x, x, x, x, x, x] + # The following slicing triggers copies of gradient and hessian. + gradient = self.gradient.reshape(-1, n_classes)[:, :-1].flatten() + hessian = self.hessian.reshape(n_dof, n_classes, n_dof, n_classes)[ + :, :-1, :, :-1 + ].reshape(n, n) + elif self.is_multinomial_with_intercept: + # Here, only intercepts are unpenalized. We again choose the last class and + # set its intercept to zero. + # Note that coef was already dealt with in setup. + self.gradient[-1] = 0 + self.hessian[-1, :] = 0 + self.hessian[:, -1] = 0 + gradient, hessian = self.gradient[:-1], self.hessian[:-1, :-1] + else: + gradient, hessian = self.gradient, self.hessian + + try: + with warnings.catch_warnings(): + warnings.simplefilter("error", scipy.linalg.LinAlgWarning) + self.coef_newton = scipy.linalg.solve( + hessian, -gradient, check_finite=False, assume_a="sym" + ) + if self.is_multinomial_no_penalty: + self.coef_newton = np.c_[ + self.coef_newton.reshape(n_dof, n_classes - 1), np.zeros(n_dof) + ].reshape(-1) + assert self.coef_newton.flags.f_contiguous + elif self.is_multinomial_with_intercept: + self.coef_newton = np.r_[self.coef_newton, 0] + self.gradient_times_newton = self.gradient @ self.coef_newton + if self.gradient_times_newton > 0: + if self.verbose: + print( + " The inner solver found a Newton step that is not a " + "descent direction and resorts to LBFGS steps instead." + ) + self.use_fallback_lbfgs_solve = True + return + except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: + warnings.warn( + f"The inner solver of {self.__class__.__name__} stumbled upon a " + "singular or very ill-conditioned Hessian matrix at iteration " + f"{self.iteration}. 
It will now resort to lbfgs instead.\n" + "Further options are to use another solver or to avoid such situation " + "in the first place. Possible remedies are removing collinear features" + " of X or increasing the penalization strengths.\n" + "The original Linear Algebra message was:\n" + str(e), + scipy.linalg.LinAlgWarning, + ) + # Possible causes: + # 1. hess_pointwise is negative. But this is already taken care in + # LinearModelLoss.gradient_hessian. + # 2. X is singular or ill-conditioned + # This might be the most probable cause. + # + # There are many possible ways to deal with this situation. Most of them + # add, explicitly or implicitly, a matrix to the hessian to make it + # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. + # Instead, we resort to lbfgs. + if self.verbose: + print( + " The inner solver stumbled upon an singular or ill-conditioned " + "Hessian matrix and resorts to LBFGS instead." + ) + self.use_fallback_lbfgs_solve = True + return + + def finalize(self, X, y, sample_weight): + if self.is_multinomial_no_penalty: + # Our convention is usually the symmetric parametrization where + # sum(coef[classes, features], axis=0) = 0. + # We convert now to this convention. Note that it does not change + # the predicted probabilities. + n_classes = self.linear_loss.base_loss.n_classes + self.coef = self.coef.reshape(n_classes, -1, order="F") + self.coef -= np.mean(self.coef, axis=0) + elif self.is_multinomial_with_intercept: + # Only the intercept needs an update to the symmetric parametrization. + n_classes = self.linear_loss.base_loss.n_classes + self.coef[-n_classes:] -= np.mean(self.coef[-n_classes:]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/glm.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/glm.py new file mode 100644 index 0000000000000000000000000000000000000000..8ba24878b95b2dedbbc7bee89be5616fb5928359 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_glm/glm.py @@ -0,0 +1,911 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +from numbers import Integral, Real + +import numpy as np +import scipy.optimize + +from ..._loss.loss import ( + HalfGammaLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + HalfTweedieLossIdentity, +) +from ...base import BaseEstimator, RegressorMixin, _fit_context +from ...utils import check_array +from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils._param_validation import Hidden, Interval, StrOptions +from ...utils.fixes import _get_additional_lbfgs_options_dict +from ...utils.optimize import _check_optimize_result +from ...utils.validation import _check_sample_weight, check_is_fitted, validate_data +from .._linear_loss import LinearModelLoss +from ._newton_solver import NewtonCholeskySolver, NewtonSolver + + +class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): + """Regression via a penalized Generalized Linear Model (GLM). + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and + predicting the mean of the target y as y_pred=h(X*w) with coefficients w. + Therefore, the fit minimizes the following objective function with L2 priors as + regularizer:: + + 1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w)) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h, s=sample_weight and per observation (unit) deviance + deviance(y_i, h(x_i*w)). 
Note that for an EDM, 1/2 * deviance is the negative + log-likelihood up to a constant (in w) term. + The parameter ``alpha`` corresponds to the lambda parameter in glmnet. + + Instead of implementing the EDM family and a link function separately, we directly + use the loss functions `from sklearn._loss` which have the link functions included + in them for performance reasons. We pick the loss functions that implement + (1/2 times) EDM deviances. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + Values must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X @ coef + intercept). + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_``. + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + + _base_loss : BaseLoss, default=HalfSquaredError() + This is set during fit via `self._get_loss()`. + A `_base_loss` contains a specific loss function as well as the link + function. The loss to be minimized specifies the distributional assumption of + the GLM, i.e. the distribution from the EDM. Here are some examples: + + ======================= ======== ========================== + _base_loss Link Target Domain + ======================= ======== ========================== + HalfSquaredError identity y any real number + HalfPoissonLoss log 0 <= y + HalfGammaLoss log 0 < y + HalfTweedieLoss log dependent on tweedie power + HalfTweedieLossIdentity identity dependent on tweedie power + ======================= ======== ========================== + + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. 
For instance, with a log link, + we have `y_pred = exp(X @ coeff + intercept)`. + """ + + # We allow for NewtonSolver classes for the "solver" parameter but do not + # make them public in the docstrings. This facilitates testing and + # benchmarking. + _parameter_constraints: dict = { + "alpha": [Interval(Real, 0.0, None, closed="left")], + "fit_intercept": ["boolean"], + "solver": [ + StrOptions({"lbfgs", "newton-cholesky"}), + Hidden(type), + ], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "warm_start": ["boolean"], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + alpha=1.0, + fit_intercept=True, + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : object + Fitted model. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csc", "csr"], + dtype=[np.float64, np.float32], + y_numeric=True, + multi_output=False, + ) + + # required by losses + if self.solver == "lbfgs": + # lbfgs will force coef and therefore raw_prediction to be float64. The + # base_loss needs y, X @ coef and sample_weight all of same dtype + # (and contiguous). + loss_dtype = np.float64 + else: + loss_dtype = min(max(y.dtype, X.dtype), np.float64) + y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False) + + if sample_weight is not None: + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. + sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) + + n_samples, n_features = X.shape + self._base_loss = self._get_loss() + + linear_loss = LinearModelLoss( + base_loss=self._base_loss, + fit_intercept=self.fit_intercept, + ) + + if not linear_loss.base_loss.in_y_true_range(y): + raise ValueError( + "Some value(s) of y are out of the valid range of the loss" + f" {self._base_loss.__class__.__name__!r}." + ) + + # TODO: if alpha=0 check that X is not rank deficient + + # NOTE: Rescaling of sample_weight: + # We want to minimize + # obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance) + # + 1/2 * alpha * L2, + # with + # deviance = 2 * loss. + # The objective is invariant to multiplying sample_weight by a constant. We + # could choose this constant such that sum(sample_weight) = 1 in order to end + # up with + # obj = sum(sample_weight * loss) + 1/2 * alpha * L2. + # But LinearModelLoss.loss() already computes + # average(loss, weights=sample_weight) + # Thus, without rescaling, we have + # obj = LinearModelLoss.loss(...) + + if self.warm_start and hasattr(self, "coef_"): + if self.fit_intercept: + # LinearModelLoss needs intercept at the end of coefficient array. 
+ coef = np.concatenate((self.coef_, np.array([self.intercept_]))) + else: + coef = self.coef_ + coef = coef.astype(loss_dtype, copy=False) + else: + coef = linear_loss.init_zero_coef(X, dtype=loss_dtype) + if self.fit_intercept: + coef[-1] = linear_loss.base_loss.link.link( + np.average(y, weights=sample_weight) + ) + + l2_reg_strength = self.alpha + n_threads = _openmp_effective_n_threads() + + # Algorithms for optimization: + # Note again that our losses implement 1/2 * deviance. + if self.solver == "lbfgs": + func = linear_loss.loss_gradient + + opt_res = scipy.optimize.minimize( + func, + coef, + method="L-BFGS-B", + jac=True, + options={ + "maxiter": self.max_iter, + "maxls": 50, # default is 20 + "gtol": self.tol, + # The constant 64 was found empirically to pass the test suite. + # The point is that ftol is very small, but a bit larger than + # machine precision for float64, which is the dtype used by lbfgs. + "ftol": 64 * np.finfo(float).eps, + **_get_additional_lbfgs_options_dict("iprint", self.verbose - 1), + }, + args=(X, y, sample_weight, l2_reg_strength, n_threads), + ) + self.n_iter_ = _check_optimize_result( + "lbfgs", opt_res, max_iter=self.max_iter + ) + coef = opt_res.x + elif self.solver == "newton-cholesky": + sol = NewtonCholeskySolver( + coef=coef, + linear_loss=linear_loss, + l2_reg_strength=l2_reg_strength, + tol=self.tol, + max_iter=self.max_iter, + n_threads=n_threads, + verbose=self.verbose, + ) + coef = sol.solve(X, y, sample_weight) + self.n_iter_ = sol.iteration + elif issubclass(self.solver, NewtonSolver): + sol = self.solver( + coef=coef, + linear_loss=linear_loss, + l2_reg_strength=l2_reg_strength, + tol=self.tol, + max_iter=self.max_iter, + n_threads=n_threads, + ) + coef = sol.solve(X, y, sample_weight) + self.n_iter_ = sol.iteration + else: + raise ValueError(f"Invalid solver={self.solver}.") + + if self.fit_intercept: + self.intercept_ = coef[-1] + self.coef_ = coef[:-1] + else: + # set intercept to zero as the other linear models do + self.intercept_ = 0.0 + self.coef_ = coef + + return self + + def _linear_predictor(self, X): + """Compute the linear_predictor = `X @ coef_ + intercept_`. + + Note that we often use the term raw_prediction instead of linear predictor. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values of linear predictor. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float64, np.float32], + ensure_2d=True, + allow_nd=False, + reset=False, + ) + return X @ self.coef_ + self.intercept_ + + def predict(self, X): + """Predict using GLM with feature matrix X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Samples. + + Returns + ------- + y_pred : array of shape (n_samples,) + Returns predicted values. + """ + # check_array is done in _linear_predictor + raw_prediction = self._linear_predictor(X) + y_pred = self._base_loss.link.inverse(raw_prediction) + return y_pred + + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 uses the deviance of this GLM, see the + :ref:`User Guide `. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. 
the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True values of target. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # TODO: Adapt link to User Guide in the docstring, once + # https://github.com/scikit-learn/scikit-learn/pull/22118 is merged. + # + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + raw_prediction = self._linear_predictor(X) # validates X + # required by losses + y = check_array(y, dtype=raw_prediction.dtype, order="C", ensure_2d=False) + + if sample_weight is not None: + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. + sample_weight = _check_sample_weight(sample_weight, X, dtype=y.dtype) + + base_loss = self._base_loss + + if not base_loss.in_y_true_range(y): + raise ValueError( + "Some value(s) of y are out of the valid range of the loss" + f" {base_loss.__name__}." + ) + + constant = np.average( + base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None), + weights=sample_weight, + ) + + # Missing factor of 2 in deviance cancels out. + deviance = base_loss( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=1, + ) + y_mean = base_loss.link.link(np.average(y, weights=sample_weight)) + deviance_null = base_loss( + y_true=y, + raw_prediction=np.tile(y_mean, y.shape[0]), + sample_weight=sample_weight, + n_threads=1, + ) + return 1 - (deviance + constant) / (deviance_null + constant) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + try: + # Create instance of BaseLoss if fit wasn't called yet. This is necessary as + # TweedieRegressor might set the used loss during fit different from + # self._base_loss. + base_loss = self._get_loss() + tags.target_tags.positive_only = not base_loss.in_y_true_range(-1.0) + except (ValueError, AttributeError, TypeError): + # This happens when the link or power parameter of TweedieRegressor is + # invalid. We fallback on the default tags in that case. + pass # pragma: no cover + return tags + + def _get_loss(self): + """This is only necessary because of the link and power arguments of the + TweedieRegressor. + + Note that we do not need to pass sample_weight to the loss class as this is + only needed to set loss.constant_hessian on which GLMs do not rely. + """ + return HalfSquaredError() + + +class PoissonRegressor(_GeneralizedLinearRegressor): + """Generalized Linear Model with a Poisson distribution. + + This regressor uses the 'log' link function. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the L2 penalty term and determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + Values of `alpha` must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (`X @ coef + intercept`). + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Actual number of iterations used in the solver. + + See Also + -------- + TweedieRegressor : Generalized Linear Model with a Tweedie distribution. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.PoissonRegressor() + >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]] + >>> y = [12, 17, 22, 21] + >>> clf.fit(X, y) + PoissonRegressor() + >>> clf.score(X, y) + np.float64(0.990) + >>> clf.coef_ + array([0.121, 0.158]) + >>> clf.intercept_ + np.float64(2.088) + >>> clf.predict([[1, 1], [3, 4]]) + array([10.676, 21.875]) + """ + + _parameter_constraints: dict = { + **_GeneralizedLinearRegressor._parameter_constraints + } + + def __init__( + self, + *, + alpha=1.0, + fit_intercept=True, + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + verbose=verbose, + ) + + def _get_loss(self): + return HalfPoissonLoss() + + +class GammaRegressor(_GeneralizedLinearRegressor): + """Generalized Linear Model with a Gamma distribution. + + This regressor uses the 'log' link function. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.23 + + Parameters + ---------- + alpha : float, default=1 + Constant that multiplies the L2 penalty term and determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). 
+ Values of `alpha` must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor `X @ coef_ + intercept_`. + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for `coef_` and `intercept_`. + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_iter_ : int + Actual number of iterations used in the solver. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + PoissonRegressor : Generalized Linear Model with a Poisson distribution. + TweedieRegressor : Generalized Linear Model with a Tweedie distribution. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.GammaRegressor() + >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]] + >>> y = [19, 26, 33, 30] + >>> clf.fit(X, y) + GammaRegressor() + >>> clf.score(X, y) + np.float64(0.773) + >>> clf.coef_ + array([0.073, 0.067]) + >>> clf.intercept_ + np.float64(2.896) + >>> clf.predict([[1, 0], [2, 8]]) + array([19.483, 35.795]) + """ + + _parameter_constraints: dict = { + **_GeneralizedLinearRegressor._parameter_constraints + } + + def __init__( + self, + *, + alpha=1.0, + fit_intercept=True, + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + verbose=verbose, + ) + + def _get_loss(self): + return HalfGammaLoss() + + +class TweedieRegressor(_GeneralizedLinearRegressor): + """Generalized Linear Model with a Tweedie distribution. + + This estimator can be used to model different GLMs depending on the + ``power`` parameter, which determines the underlying distribution. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 0.23 + + Parameters + ---------- + power : float, default=0 + The power determines the underlying target distribution according + to the following table: + + +-------+------------------------+ + | Power | Distribution | + +=======+========================+ + | 0 | Normal | + +-------+------------------------+ + | 1 | Poisson | + +-------+------------------------+ + | (1,2) | Compound Poisson Gamma | + +-------+------------------------+ + | 2 | Gamma | + +-------+------------------------+ + | 3 | Inverse Gaussian | + +-------+------------------------+ + + For ``0 < power < 1``, no distribution exists. + + alpha : float, default=1 + Constant that multiplies the L2 penalty term and determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank + (no collinearities). + Values of `alpha` must be in the range `[0.0, inf)`. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (`X @ coef + intercept`). + + link : {'auto', 'identity', 'log'}, default='auto' + The link function of the GLM, i.e. mapping from linear predictor + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen `power` parameter as follows: + + - 'identity' for ``power <= 0``, e.g. for the Normal distribution + - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian + distributions + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is a good choice for `n_samples` >> `n_features`, especially + with one-hot encoded categorical features with rare categories. Be aware + that the memory usage of this solver has a quadratic dependency on + `n_features` because it explicitly computes the Hessian matrix. + + .. versionadded:: 1.2 + + max_iter : int, default=100 + The maximal number of iterations for the solver. + Values must be in the range `[1, inf)`. + + tol : float, default=1e-4 + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of + the objective function. + Values must be in the range `(0.0, inf)`. + + warm_start : bool, default=False + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + verbose : int, default=0 + For the lbfgs solver set verbose to any positive number for verbosity. + Values must be in the range `[0, inf)`. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + n_iter_ : int + Actual number of iterations used in the solver. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + PoissonRegressor : Generalized Linear Model with a Poisson distribution. + GammaRegressor : Generalized Linear Model with a Gamma distribution. + + Examples + -------- + >>> from sklearn import linear_model + >>> clf = linear_model.TweedieRegressor() + >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]] + >>> y = [2, 3.5, 5, 5.5] + >>> clf.fit(X, y) + TweedieRegressor() + >>> clf.score(X, y) + np.float64(0.839) + >>> clf.coef_ + array([0.599, 0.299]) + >>> clf.intercept_ + np.float64(1.600) + >>> clf.predict([[1, 1], [3, 4]]) + array([2.500, 4.599]) + """ + + _parameter_constraints: dict = { + **_GeneralizedLinearRegressor._parameter_constraints, + "power": [Interval(Real, None, None, closed="neither")], + "link": [StrOptions({"auto", "identity", "log"})], + } + + def __init__( + self, + *, + power=0.0, + alpha=1.0, + fit_intercept=True, + link="auto", + solver="lbfgs", + max_iter=100, + tol=1e-4, + warm_start=False, + verbose=0, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + tol=tol, + warm_start=warm_start, + verbose=verbose, + ) + self.link = link + self.power = power + + def _get_loss(self): + if self.link == "auto": + if self.power <= 0: + # identity link + return HalfTweedieLossIdentity(power=self.power) + else: + # log link + return HalfTweedieLoss(power=self.power) + + if self.link == "log": + return HalfTweedieLoss(power=self.power) + + if self.link == "identity": + return HalfTweedieLossIdentity(power=self.power) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_huber.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_huber.py new file mode 100644 index 0000000000000000000000000000000000000000..87e735ec998db226235f22f33477f50dc9e4152e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_huber.py @@ -0,0 +1,363 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +from scipy import optimize + +from ..base import BaseEstimator, RegressorMixin, _fit_context +from ..utils._mask import axis0_safe_slice +from ..utils._param_validation import Interval +from ..utils.extmath import safe_sparse_dot +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.optimize import _check_optimize_result +from ..utils.validation import _check_sample_weight, validate_data +from ._base import LinearModel + + +def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None): + """Returns the Huber loss and the gradient. + + Parameters + ---------- + w : ndarray, shape (n_features + 1,) or (n_features + 2,) + Feature vector. + w[:n_features] gives the coefficients + w[-1] gives the scale factor and if the intercept is fit w[-2] + gives the intercept factor. + + X : ndarray of shape (n_samples, n_features) + Input data. + + y : ndarray of shape (n_samples,) + Target vector. + + epsilon : float + Robustness of the Huber estimator. + + alpha : float + Regularization parameter. + + sample_weight : ndarray of shape (n_samples,), default=None + Weight assigned to each sample. + + Returns + ------- + loss : float + Huber loss. + + gradient : ndarray, shape (len(w)) + Returns the derivative of the Huber loss with respect to each + coefficient, intercept and the scale as a vector. 
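+
+    Notes
+    -----
+    A minimal calling sketch (all names below are illustrative, assuming a
+    dense ``X``, explicit unit sample weights and a fitted intercept)::
+
+        w = np.zeros(X.shape[1] + 2)
+        w[-1] = 1.0  # the scale sigma must start strictly positive
+        loss, grad = _huber_loss_and_gradient(
+            w, X, y, epsilon=1.35, alpha=1e-4, sample_weight=np.ones(len(y))
+        )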
+ """ + _, n_features = X.shape + fit_intercept = n_features + 2 == w.shape[0] + if fit_intercept: + intercept = w[-2] + sigma = w[-1] + w = w[:n_features] + n_samples = np.sum(sample_weight) + + # Calculate the values where |y - X'w -c / sigma| > epsilon + # The values above this threshold are outliers. + linear_loss = y - safe_sparse_dot(X, w) + if fit_intercept: + linear_loss -= intercept + abs_linear_loss = np.abs(linear_loss) + outliers_mask = abs_linear_loss > epsilon * sigma + + # Calculate the linear loss due to the outliers. + # This is equal to (2 * M * |y - X'w -c / sigma| - M**2) * sigma + outliers = abs_linear_loss[outliers_mask] + num_outliers = np.count_nonzero(outliers_mask) + n_non_outliers = X.shape[0] - num_outliers + + # n_sq_outliers includes the weight give to the outliers while + # num_outliers is just the number of outliers. + outliers_sw = sample_weight[outliers_mask] + n_sw_outliers = np.sum(outliers_sw) + outlier_loss = ( + 2.0 * epsilon * np.sum(outliers_sw * outliers) + - sigma * n_sw_outliers * epsilon**2 + ) + + # Calculate the quadratic loss due to the non-outliers.- + # This is equal to |(y - X'w - c)**2 / sigma**2| * sigma + non_outliers = linear_loss[~outliers_mask] + weighted_non_outliers = sample_weight[~outliers_mask] * non_outliers + weighted_loss = np.dot(weighted_non_outliers.T, non_outliers) + squared_loss = weighted_loss / sigma + + if fit_intercept: + grad = np.zeros(n_features + 2) + else: + grad = np.zeros(n_features + 1) + + # Gradient due to the squared loss. + X_non_outliers = -axis0_safe_slice(X, ~outliers_mask, n_non_outliers) + grad[:n_features] = ( + 2.0 / sigma * safe_sparse_dot(weighted_non_outliers, X_non_outliers) + ) + + # Gradient due to the linear loss. + signed_outliers = np.ones_like(outliers) + signed_outliers_mask = linear_loss[outliers_mask] < 0 + signed_outliers[signed_outliers_mask] = -1.0 + X_outliers = axis0_safe_slice(X, outliers_mask, num_outliers) + sw_outliers = sample_weight[outliers_mask] * signed_outliers + grad[:n_features] -= 2.0 * epsilon * (safe_sparse_dot(sw_outliers, X_outliers)) + + # Gradient due to the penalty. + grad[:n_features] += alpha * 2.0 * w + + # Gradient due to sigma. + grad[-1] = n_samples + grad[-1] -= n_sw_outliers * epsilon**2 + grad[-1] -= squared_loss / sigma + + # Gradient due to the intercept. + if fit_intercept: + grad[-2] = -2.0 * np.sum(weighted_non_outliers) / sigma + grad[-2] -= 2.0 * epsilon * np.sum(sw_outliers) + + loss = n_samples * sigma + squared_loss + outlier_loss + loss += alpha * np.dot(w, w) + return loss, grad + + +class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): + """L2-regularized linear regression model that is robust to outliers. + + The Huber Regressor optimizes the squared loss for the samples where + ``|(y - Xw - c) / sigma| < epsilon`` and the absolute loss for the samples + where ``|(y - Xw - c) / sigma| > epsilon``, where the model coefficients + ``w``, the intercept ``c`` and the scale ``sigma`` are parameters + to be optimized. The parameter `sigma` makes sure that if `y` is scaled up + or down by a certain factor, one does not need to rescale `epsilon` to + achieve the same robustness. Note that this does not take into account + the fact that the different features of `X` may be of different scales. + + The Huber loss function has the advantage of not being heavily influenced + by the outliers while not completely ignoring their effect. + + Read more in the :ref:`User Guide ` + + .. 
versionadded:: 0.18 + + Parameters + ---------- + epsilon : float, default=1.35 + The parameter epsilon controls the number of samples that should be + classified as outliers. The smaller the epsilon, the more robust it is + to outliers. Epsilon must be in the range `[1, inf)`. + + max_iter : int, default=100 + Maximum number of iterations that + ``scipy.optimize.minimize(method="L-BFGS-B")`` should run for. + + alpha : float, default=0.0001 + Strength of the squared L2 regularization. Note that the penalty is + equal to ``alpha * ||w||^2``. + Must be in the range `[0, inf)`. + + warm_start : bool, default=False + This is useful if the stored attributes of a previously used model + has to be reused. If set to False, then the coefficients will + be rewritten for every call to fit. + See :term:`the Glossary `. + + fit_intercept : bool, default=True + Whether or not to fit the intercept. This can be set to False + if the data is already centered around the origin. + + tol : float, default=1e-05 + The iteration will stop when + ``max{|proj g_i | i = 1, ..., n}`` <= ``tol`` + where pg_i is the i-th component of the projected gradient. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Features got by optimizing the L2-regularized Huber loss. + + intercept_ : float + Bias. + + scale_ : float + The value by which ``|y - Xw - c|`` is scaled down. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations that + ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for. + + .. versionchanged:: 0.20 + + In SciPy <= 1.0.0 the number of lbfgs iterations may exceed + ``max_iter``. ``n_iter_`` will now report at most ``max_iter``. + + outliers_ : array, shape (n_samples,) + A boolean mask which is set to True where the samples are identified + as outliers. + + See Also + -------- + RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm. + TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model. + SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD. + + References + ---------- + .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics + Concomitant scale estimates, p. 172 + .. [2] Art B. Owen (2006), `A robust hybrid of lasso and ridge regression. + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import HuberRegressor, LinearRegression + >>> from sklearn.datasets import make_regression + >>> rng = np.random.RandomState(0) + >>> X, y, coef = make_regression( + ... n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0) + >>> X[:4] = rng.uniform(10, 20, (4, 2)) + >>> y[:4] = rng.uniform(10, 20, 4) + >>> huber = HuberRegressor().fit(X, y) + >>> huber.score(X, y) + -7.284 + >>> huber.predict(X[:1,]) + array([806.7200]) + >>> linear = LinearRegression().fit(X, y) + >>> print("True coefficients:", coef) + True coefficients: [20.4923... 34.1698...] + >>> print("Huber coefficients:", huber.coef_) + Huber coefficients: [17.7906... 31.0106...] + >>> print("Linear Regression coefficients:", linear.coef_) + Linear Regression coefficients: [-1.9221... 7.0226...] 
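+
+    The robust scale ``sigma`` only rescales the residuals, not the features,
+    so a rough sketch of a common pairing with feature scaling (using the
+    standard scikit-learn pipeline utilities) is::
+
+        from sklearn.pipeline import make_pipeline
+        from sklearn.preprocessing import StandardScaler
+
+        model = make_pipeline(StandardScaler(), HuberRegressor(epsilon=1.35))
+        model.fit(X, y)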
+ """ + + _parameter_constraints: dict = { + "epsilon": [Interval(Real, 1.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "alpha": [Interval(Real, 0, None, closed="left")], + "warm_start": ["boolean"], + "fit_intercept": ["boolean"], + "tol": [Interval(Real, 0.0, None, closed="left")], + } + + def __init__( + self, + *, + epsilon=1.35, + max_iter=100, + alpha=0.0001, + warm_start=False, + fit_intercept=True, + tol=1e-05, + ): + self.epsilon = epsilon + self.max_iter = max_iter + self.alpha = alpha + self.warm_start = warm_start + self.fit_intercept = fit_intercept + self.tol = tol + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like, shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like, shape (n_samples,) + Weight given to each sample. + + Returns + ------- + self : object + Fitted `HuberRegressor` estimator. + """ + X, y = validate_data( + self, + X, + y, + copy=False, + accept_sparse=["csr"], + y_numeric=True, + dtype=[np.float64, np.float32], + ) + + sample_weight = _check_sample_weight(sample_weight, X) + + if self.warm_start and hasattr(self, "coef_"): + parameters = np.concatenate((self.coef_, [self.intercept_, self.scale_])) + else: + if self.fit_intercept: + parameters = np.zeros(X.shape[1] + 2) + else: + parameters = np.zeros(X.shape[1] + 1) + # Make sure to initialize the scale parameter to a strictly + # positive value: + parameters[-1] = 1 + + # Sigma or the scale factor should be non-negative. + # Setting it to be zero might cause undefined bounds hence we set it + # to a value close to zero. + bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1)) + bounds[-1][0] = np.finfo(np.float64).eps * 10 + + opt_res = optimize.minimize( + _huber_loss_and_gradient, + parameters, + method="L-BFGS-B", + jac=True, + args=(X, y, self.epsilon, self.alpha, sample_weight), + options={ + "maxiter": self.max_iter, + "gtol": self.tol, + **_get_additional_lbfgs_options_dict("iprint", -1), + }, + bounds=bounds, + ) + + parameters = opt_res.x + + if opt_res.status == 2: + raise ValueError( + "HuberRegressor convergence failed: l-BFGS-b solver terminated with %s" + % opt_res.message + ) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter) + self.scale_ = parameters[-1] + if self.fit_intercept: + self.intercept_ = parameters[-2] + else: + self.intercept_ = 0.0 + self.coef_ = parameters[: X.shape[1]] + + residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_) + self.outliers_ = residual > self.scale_ * self.epsilon + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_least_angle.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_least_angle.py new file mode 100644 index 0000000000000000000000000000000000000000..4bffe5f6e8c0d2d6fbc05821d6553e436758c86b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_least_angle.py @@ -0,0 +1,2346 @@ +""" +Least Angle Regression algorithm. See the documentation on the +Generalized Linear Model for a complete discussion. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import sys +import warnings +from math import log +from numbers import Integral, Real + +import numpy as np +from scipy import interpolate, linalg +from scipy.linalg.lapack import get_lapack_funcs + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..model_selection import check_cv + +# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' +from ..utils import ( + Bunch, + arrayfuncs, + as_float_array, + check_random_state, +) +from ..utils._metadata_requests import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data +from ._base import LinearModel, LinearRegression, _preprocess_data + +SOLVE_TRIANGULAR_ARGS = {"check_finite": False} + + +@validate_params( + { + "X": [np.ndarray, None], + "y": [np.ndarray, None], + "Xy": [np.ndarray, None], + "Gram": [StrOptions({"auto"}), "boolean", np.ndarray, None], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "alpha_min": [Interval(Real, 0, None, closed="left")], + "method": [StrOptions({"lar", "lasso"})], + "copy_X": ["boolean"], + "eps": [Interval(Real, 0, None, closed="neither"), None], + "copy_Gram": ["boolean"], + "verbose": ["verbose"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def lars_path( + X, + y, + Xy=None, + *, + Gram=None, + max_iter=500, + alpha_min=0, + method="lar", + copy_X=True, + eps=np.finfo(float).eps, + copy_Gram=True, + verbose=0, + return_path=True, + return_n_iter=False, + positive=False, +): + """Compute Least Angle Regression or Lasso path using the LARS algorithm. + + The optimization objective for the case method='lasso' is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + in the case of method='lar', the objective function is only known in + the form of an implicit equation (see discussion in [1]_). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : None or ndarray of shape (n_samples, n_features) + Input data. If X is `None`, Gram must also be `None`. + If only the Gram matrix is available, use `lars_path_gram` instead. + + y : None or ndarray of shape (n_samples,) + Input targets. + + Xy : array-like of shape (n_features,), default=None + `Xy = X.T @ y` that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + Gram : None, 'auto', bool, ndarray of shape (n_features, n_features), \ + default=None + Precomputed Gram matrix `X.T @ X`, if `'auto'`, the Gram + matrix is precomputed from the given X, if there are more samples + than features. + + max_iter : int, default=500 + Maximum number of iterations to perform, set to infinity for no limit. + + alpha_min : float, default=0 + Minimum correlation along the path. It corresponds to the + regularization parameter `alpha` in the Lasso. + + method : {'lar', 'lasso'}, default='lar' + Specifies the returned model. Select `'lar'` for Least Angle + Regression, `'lasso'` for the Lasso. + + copy_X : bool, default=True + If `False`, `X` is overwritten. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. 
Increase this for very ill-conditioned + systems. Unlike the `tol` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_Gram : bool, default=True + If `False`, `Gram` is overwritten. + + verbose : int, default=0 + Controls output verbosity. + + return_path : bool, default=True + If `True`, returns the entire path, else returns only the + last point of the path. + + return_n_iter : bool, default=False + Whether to return the number of iterations. + + positive : bool, default=False + Restrict coefficients to be >= 0. + This option is only allowed with method 'lasso'. Note that the model + coefficients will not converge to the ordinary-least-squares solution + for small values of alpha. Only coefficients up to the smallest alpha + value (`alphas_[alphas_ > 0.].min()` when fit_path=True) reached by + the stepwise Lars-Lasso algorithm are typically in congruence with the + solution of the coordinate descent `lasso_path` function. + + Returns + ------- + alphas : ndarray of shape (n_alphas + 1,) + Maximum of covariances (in absolute value) at each iteration. + `n_alphas` is either `max_iter`, `n_features`, or the + number of nodes in the path with `alpha >= alpha_min`, whichever + is smaller. + + active : ndarray of shape (n_alphas,) + Indices of active variables at the end of the path. + + coefs : ndarray of shape (n_features, n_alphas + 1) + Coefficients along the path. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is set + to True. + + See Also + -------- + lars_path_gram : Compute LARS path in the sufficient stats mode. + lasso_path : Compute Lasso path with coordinate descent. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLarsCV : Cross-validated Lasso, using the LARS algorithm. + LarsCV : Cross-validated Least Angle Regression model. + sklearn.decomposition.sparse_encode : Sparse coding. + + References + ---------- + .. [1] "Least Angle Regression", Efron et al. + http://statweb.stanford.edu/~tibs/ftp/lars.pdf + + .. [2] `Wikipedia entry on the Least-angle regression + `_ + + .. [3] `Wikipedia entry on the Lasso + `_ + + Examples + -------- + >>> from sklearn.linear_model import lars_path + >>> from sklearn.datasets import make_regression + >>> X, y, true_coef = make_regression( + ... n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0 + ... ) + >>> true_coef + array([ 0. , 0. , 0. , 97.9, 45.7]) + >>> alphas, _, estimated_coef = lars_path(X, y) + >>> alphas.shape + (3,) + >>> estimated_coef + array([[ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 46.96, 97.99], + [ 0. , 0. , 45.70]]) + """ + if X is None and Gram is not None: + raise ValueError( + "X cannot be None if Gram is not None" + "Use lars_path_gram to avoid passing X and y." 
+ ) + return _lars_path_solver( + X=X, + y=y, + Xy=Xy, + Gram=Gram, + n_samples=None, + max_iter=max_iter, + alpha_min=alpha_min, + method=method, + copy_X=copy_X, + eps=eps, + copy_Gram=copy_Gram, + verbose=verbose, + return_path=return_path, + return_n_iter=return_n_iter, + positive=positive, + ) + + +@validate_params( + { + "Xy": [np.ndarray], + "Gram": [np.ndarray], + "n_samples": [Interval(Integral, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "alpha_min": [Interval(Real, 0, None, closed="left")], + "method": [StrOptions({"lar", "lasso"})], + "copy_X": ["boolean"], + "eps": [Interval(Real, 0, None, closed="neither"), None], + "copy_Gram": ["boolean"], + "verbose": ["verbose"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + "positive": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def lars_path_gram( + Xy, + Gram, + *, + n_samples, + max_iter=500, + alpha_min=0, + method="lar", + copy_X=True, + eps=np.finfo(float).eps, + copy_Gram=True, + verbose=0, + return_path=True, + return_n_iter=False, + positive=False, +): + """The lars_path in the sufficient stats mode. + + The optimization objective for the case method='lasso' is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + in the case of method='lar', the objective function is only known in + the form of an implicit equation (see discussion in [1]_). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + Xy : ndarray of shape (n_features,) + `Xy = X.T @ y`. + + Gram : ndarray of shape (n_features, n_features) + `Gram = X.T @ X`. + + n_samples : int + Equivalent size of sample. + + max_iter : int, default=500 + Maximum number of iterations to perform, set to infinity for no limit. + + alpha_min : float, default=0 + Minimum correlation along the path. It corresponds to the + regularization parameter alpha parameter in the Lasso. + + method : {'lar', 'lasso'}, default='lar' + Specifies the returned model. Select `'lar'` for Least Angle + Regression, ``'lasso'`` for the Lasso. + + copy_X : bool, default=True + If `False`, `X` is overwritten. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the `tol` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_Gram : bool, default=True + If `False`, `Gram` is overwritten. + + verbose : int, default=0 + Controls output verbosity. + + return_path : bool, default=True + If `return_path==True` returns the entire path, else returns only the + last point of the path. + + return_n_iter : bool, default=False + Whether to return the number of iterations. + + positive : bool, default=False + Restrict coefficients to be >= 0. + This option is only allowed with method 'lasso'. Note that the model + coefficients will not converge to the ordinary-least-squares solution + for small values of alpha. Only coefficients up to the smallest alpha + value (`alphas_[alphas_ > 0.].min()` when `fit_path=True`) reached by + the stepwise Lars-Lasso algorithm are typically in congruence with the + solution of the coordinate descent lasso_path function. + + Returns + ------- + alphas : ndarray of shape (n_alphas + 1,) + Maximum of covariances (in absolute value) at each iteration. 
+ `n_alphas` is either `max_iter`, `n_features` or the + number of nodes in the path with `alpha >= alpha_min`, whichever + is smaller. + + active : ndarray of shape (n_alphas,) + Indices of active variables at the end of the path. + + coefs : ndarray of shape (n_features, n_alphas + 1) + Coefficients along the path. + + n_iter : int + Number of iterations run. Returned only if `return_n_iter` is set + to True. + + See Also + -------- + lars_path_gram : Compute LARS path. + lasso_path : Compute Lasso path with coordinate descent. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLarsCV : Cross-validated Lasso, using the LARS algorithm. + LarsCV : Cross-validated Least Angle Regression model. + sklearn.decomposition.sparse_encode : Sparse coding. + + References + ---------- + .. [1] "Least Angle Regression", Efron et al. + http://statweb.stanford.edu/~tibs/ftp/lars.pdf + + .. [2] `Wikipedia entry on the Least-angle regression + `_ + + .. [3] `Wikipedia entry on the Lasso + `_ + + Examples + -------- + >>> from sklearn.linear_model import lars_path_gram + >>> from sklearn.datasets import make_regression + >>> X, y, true_coef = make_regression( + ... n_samples=100, n_features=5, n_informative=2, coef=True, random_state=0 + ... ) + >>> true_coef + array([ 0. , 0. , 0. , 97.9, 45.7]) + >>> alphas, _, estimated_coef = lars_path_gram(X.T @ y, X.T @ X, n_samples=100) + >>> alphas.shape + (3,) + >>> estimated_coef + array([[ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 0. , 0. ], + [ 0. , 46.96, 97.99], + [ 0. , 0. , 45.70]]) + """ + return _lars_path_solver( + X=None, + y=None, + Xy=Xy, + Gram=Gram, + n_samples=n_samples, + max_iter=max_iter, + alpha_min=alpha_min, + method=method, + copy_X=copy_X, + eps=eps, + copy_Gram=copy_Gram, + verbose=verbose, + return_path=return_path, + return_n_iter=return_n_iter, + positive=positive, + ) + + +def _lars_path_solver( + X, + y, + Xy=None, + Gram=None, + n_samples=None, + max_iter=500, + alpha_min=0, + method="lar", + copy_X=True, + eps=np.finfo(float).eps, + copy_Gram=True, + verbose=0, + return_path=True, + return_n_iter=False, + positive=False, +): + """Compute Least Angle Regression or Lasso path using LARS algorithm [1] + + The optimization objective for the case method='lasso' is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + in the case of method='lar', the objective function is only known in + the form of an implicit equation (see discussion in [1]) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : None or ndarray of shape (n_samples, n_features) + Input data. Note that if X is None then Gram must be specified, + i.e., cannot be None or False. + + y : None or ndarray of shape (n_samples,) + Input targets. + + Xy : array-like of shape (n_features,), default=None + `Xy = np.dot(X.T, y)` that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + Gram : None, 'auto' or array-like of shape (n_features, n_features), \ + default=None + Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram + matrix is precomputed from the given X, if there are more samples + than features. + + n_samples : int or float, default=None + Equivalent size of sample. If `None`, it will be `n_samples`. + + max_iter : int, default=500 + Maximum number of iterations to perform, set to infinity for no limit. + + alpha_min : float, default=0 + Minimum correlation along the path. 
It corresponds to the + regularization parameter alpha parameter in the Lasso. + + method : {'lar', 'lasso'}, default='lar' + Specifies the returned model. Select ``'lar'`` for Least Angle + Regression, ``'lasso'`` for the Lasso. + + copy_X : bool, default=True + If ``False``, ``X`` is overwritten. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_Gram : bool, default=True + If ``False``, ``Gram`` is overwritten. + + verbose : int, default=0 + Controls output verbosity. + + return_path : bool, default=True + If ``return_path==True`` returns the entire path, else returns only the + last point of the path. + + return_n_iter : bool, default=False + Whether to return the number of iterations. + + positive : bool, default=False + Restrict coefficients to be >= 0. + This option is only allowed with method 'lasso'. Note that the model + coefficients will not converge to the ordinary-least-squares solution + for small values of alpha. Only coefficients up to the smallest alpha + value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by + the stepwise Lars-Lasso algorithm are typically in congruence with the + solution of the coordinate descent lasso_path function. + + Returns + ------- + alphas : array-like of shape (n_alphas + 1,) + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. + + active : array-like of shape (n_alphas,) + Indices of active variables at the end of the path. + + coefs : array-like of shape (n_features, n_alphas + 1) + Coefficients along the path + + n_iter : int + Number of iterations run. Returned only if return_n_iter is set + to True. + + See Also + -------- + lasso_path + LassoLars + Lars + LassoLarsCV + LarsCV + sklearn.decomposition.sparse_encode + + References + ---------- + .. [1] "Least Angle Regression", Efron et al. + http://statweb.stanford.edu/~tibs/ftp/lars.pdf + + .. [2] `Wikipedia entry on the Least-angle regression + `_ + + .. [3] `Wikipedia entry on the Lasso + `_ + + """ + if method == "lar" and positive: + raise ValueError("Positive constraint not supported for 'lar' coding method.") + + n_samples = n_samples if n_samples is not None else y.size + + if Xy is None: + Cov = np.dot(X.T, y) + else: + Cov = Xy.copy() + + if Gram is None or Gram is False: + Gram = None + if X is None: + raise ValueError("X and Gram cannot both be unspecified.") + elif (isinstance(Gram, str) and Gram == "auto") or Gram is True: + if Gram is True or X.shape[0] > X.shape[1]: + Gram = np.dot(X.T, X) + else: + Gram = None + elif copy_Gram: + Gram = Gram.copy() + + if Gram is None: + n_features = X.shape[1] + else: + n_features = Cov.shape[0] + if Gram.shape != (n_features, n_features): + raise ValueError("The shapes of the inputs Gram and Xy do not match.") + + if copy_X and X is not None and Gram is None: + # force copy. 
setting the array to be fortran-ordered + # speeds up the calculation of the (partial) Gram matrix + # and allows to easily swap columns + X = X.copy("F") + + max_features = min(max_iter, n_features) + + dtypes = set(a.dtype for a in (X, y, Xy, Gram) if a is not None) + if len(dtypes) == 1: + # use the precision level of input data if it is consistent + return_dtype = next(iter(dtypes)) + else: + # fallback to double precision otherwise + return_dtype = np.float64 + + if return_path: + coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype) + alphas = np.zeros(max_features + 1, dtype=return_dtype) + else: + coef, prev_coef = ( + np.zeros(n_features, dtype=return_dtype), + np.zeros(n_features, dtype=return_dtype), + ) + alpha, prev_alpha = ( + np.array([0.0], dtype=return_dtype), + np.array([0.0], dtype=return_dtype), + ) + # above better ideas? + + n_iter, n_active = 0, 0 + active, indices = list(), np.arange(n_features) + # holds the sign of covariance + sign_active = np.empty(max_features, dtype=np.int8) + drop = False + + # will hold the cholesky factorization. Only lower part is + # referenced. + if Gram is None: + L = np.empty((max_features, max_features), dtype=X.dtype) + swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (X,)) + else: + L = np.empty((max_features, max_features), dtype=Gram.dtype) + swap, nrm2 = linalg.get_blas_funcs(("swap", "nrm2"), (Cov,)) + (solve_cholesky,) = get_lapack_funcs(("potrs",), (L,)) + + if verbose: + if verbose > 1: + print("Step\t\tAdded\t\tDropped\t\tActive set size\t\tC") + else: + sys.stdout.write(".") + sys.stdout.flush() + + tiny32 = np.finfo(np.float32).tiny # to avoid division by 0 warning + cov_precision = np.finfo(Cov.dtype).precision + equality_tolerance = np.finfo(np.float32).eps + + if Gram is not None: + Gram_copy = Gram.copy() + Cov_copy = Cov.copy() + + while True: + if Cov.size: + if positive: + C_idx = np.argmax(Cov) + else: + C_idx = np.argmax(np.abs(Cov)) + + C_ = Cov[C_idx] + + if positive: + C = C_ + else: + C = np.fabs(C_) + else: + C = 0.0 + + if return_path: + alpha = alphas[n_iter, np.newaxis] + coef = coefs[n_iter] + prev_alpha = alphas[n_iter - 1, np.newaxis] + prev_coef = coefs[n_iter - 1] + + alpha[0] = C / n_samples + if alpha[0] <= alpha_min + equality_tolerance: # early stopping + if abs(alpha[0] - alpha_min) > equality_tolerance: + # interpolation factor 0 <= ss < 1 + if n_iter > 0: + # In the first iteration, all alphas are zero, the formula + # below would make ss a NaN + ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0]) + coef[:] = prev_coef + ss * (coef - prev_coef) + alpha[0] = alpha_min + if return_path: + coefs[n_iter] = coef + break + + if n_iter >= max_iter or n_active >= n_features: + break + if not drop: + ########################################################## + # Append x_j to the Cholesky factorization of (Xa * Xa') # + # # + # ( L 0 ) # + # L -> ( ) , where L * w = Xa' x_j # + # ( w z ) and z = ||x_j|| # + # # + ########################################################## + + if positive: + sign_active[n_active] = np.ones_like(C_) + else: + sign_active[n_active] = np.sign(C_) + m, n = n_active, C_idx + n_active + + Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0]) + indices[n], indices[m] = indices[m], indices[n] + Cov_not_shortened = Cov + Cov = Cov[1:] # remove Cov[0] + + if Gram is None: + X.T[n], X.T[m] = swap(X.T[n], X.T[m]) + c = nrm2(X.T[n_active]) ** 2 + L[n_active, :n_active] = np.dot(X.T[n_active], X.T[:n_active].T) + else: + # swap does only work inplace if matrix is 
fortran + # contiguous ... + Gram[m], Gram[n] = swap(Gram[m], Gram[n]) + Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n]) + c = Gram[n_active, n_active] + L[n_active, :n_active] = Gram[n_active, :n_active] + + # Update the cholesky decomposition for the Gram matrix + if n_active: + linalg.solve_triangular( + L[:n_active, :n_active], + L[n_active, :n_active], + trans=0, + lower=1, + overwrite_b=True, + **SOLVE_TRIANGULAR_ARGS, + ) + + v = np.dot(L[n_active, :n_active], L[n_active, :n_active]) + diag = max(np.sqrt(np.abs(c - v)), eps) + L[n_active, n_active] = diag + + if diag < 1e-7: + # The system is becoming too ill-conditioned. + # We have degenerate vectors in our active set. + # We'll 'drop for good' the last regressor added. + warnings.warn( + "Regressors in active set degenerate. " + "Dropping a regressor, after %i iterations, " + "i.e. alpha=%.3e, " + "with an active set of %i regressors, and " + "the smallest cholesky pivot element being %.3e." + " Reduce max_iter or increase eps parameters." + % (n_iter, alpha.item(), n_active, diag), + ConvergenceWarning, + ) + + # XXX: need to figure a 'drop for good' way + Cov = Cov_not_shortened + Cov[0] = 0 + Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0]) + continue + + active.append(indices[n_active]) + n_active += 1 + + if verbose > 1: + print( + "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], "", n_active, C) + ) + + if method == "lasso" and n_iter > 0 and prev_alpha[0] < alpha[0]: + # alpha is increasing. This is because the updates of Cov are + # bringing in too much numerical error that is greater than + # than the remaining correlation with the + # regressors. Time to bail out + warnings.warn( + "Early stopping the lars path, as the residues " + "are small and the current value of alpha is no " + "longer well controlled. %i iterations, alpha=%.3e, " + "previous alpha=%.3e, with an active set of %i " + "regressors." % (n_iter, alpha.item(), prev_alpha.item(), n_active), + ConvergenceWarning, + ) + break + + # least squares solution + least_squares, _ = solve_cholesky( + L[:n_active, :n_active], sign_active[:n_active], lower=True + ) + + if least_squares.size == 1 and least_squares == 0: + # This happens because sign_active[:n_active] = 0 + least_squares[...] = 1 + AA = 1.0 + else: + # is this really needed ? + AA = 1.0 / np.sqrt(np.sum(least_squares * sign_active[:n_active])) + + if not np.isfinite(AA): + # L is too ill-conditioned + i = 0 + L_ = L[:n_active, :n_active].copy() + while not np.isfinite(AA): + L_.flat[:: n_active + 1] += (2**i) * eps + least_squares, _ = solve_cholesky( + L_, sign_active[:n_active], lower=True + ) + tmp = max(np.sum(least_squares * sign_active[:n_active]), eps) + AA = 1.0 / np.sqrt(tmp) + i += 1 + least_squares *= AA + + if Gram is None: + # equiangular direction of variables in the active set + eq_dir = np.dot(X.T[:n_active].T, least_squares) + # correlation between each unactive variables and + # eqiangular vector + corr_eq_dir = np.dot(X.T[n_active:], eq_dir) + else: + # if huge number of features, this takes 50% of time, I + # think could be avoided if we just update it using an + # orthogonal (QR) decomposition of X + corr_eq_dir = np.dot(Gram[:n_active, n_active:].T, least_squares) + + # Explicit rounding can be necessary to avoid `np.argmax(Cov)` yielding + # unstable results because of rounding errors. 
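+        # cov_precision was computed above as np.finfo(Cov.dtype).precision
+        # (15 decimal digits for float64), so the rounding below only trims
+        # trailing digits that are already dominated by floating-point error.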
+ np.around(corr_eq_dir, decimals=cov_precision, out=corr_eq_dir) + + g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny32)) + if positive: + gamma_ = min(g1, C / AA) + else: + g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny32)) + gamma_ = min(g1, g2, C / AA) + + # TODO: better names for these variables: z + drop = False + z = -coef[active] / (least_squares + tiny32) + z_pos = arrayfuncs.min_pos(z) + if z_pos < gamma_: + # some coefficients have changed sign + idx = np.where(z == z_pos)[0][::-1] + + # update the sign, important for LAR + sign_active[idx] = -sign_active[idx] + + if method == "lasso": + gamma_ = z_pos + drop = True + + n_iter += 1 + + if return_path: + if n_iter >= coefs.shape[0]: + del coef, alpha, prev_alpha, prev_coef + # resize the coefs and alphas array + add_features = 2 * max(1, (max_features - n_active)) + coefs = np.resize(coefs, (n_iter + add_features, n_features)) + coefs[-add_features:] = 0 + alphas = np.resize(alphas, n_iter + add_features) + alphas[-add_features:] = 0 + coef = coefs[n_iter] + prev_coef = coefs[n_iter - 1] + else: + # mimic the effect of incrementing n_iter on the array references + prev_coef = coef + prev_alpha[0] = alpha[0] + coef = np.zeros_like(coef) + + coef[active] = prev_coef[active] + gamma_ * least_squares + + # update correlations + Cov -= gamma_ * corr_eq_dir + + # See if any coefficient has changed sign + if drop and method == "lasso": + # handle the case when idx is not length of 1 + for ii in idx: + arrayfuncs.cholesky_delete(L[:n_active, :n_active], ii) + + n_active -= 1 + # handle the case when idx is not length of 1 + drop_idx = [active.pop(ii) for ii in idx] + + if Gram is None: + # propagate dropped variable + for ii in idx: + for i in range(ii, n_active): + X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1]) + # yeah this is stupid + indices[i], indices[i + 1] = indices[i + 1], indices[i] + + # TODO: this could be updated + residual = y - np.dot(X[:, :n_active], coef[active]) + temp = np.dot(X.T[n_active], residual) + + Cov = np.r_[temp, Cov] + else: + for ii in idx: + for i in range(ii, n_active): + indices[i], indices[i + 1] = indices[i + 1], indices[i] + Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1]) + Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i], Gram[:, i + 1]) + + # Cov_n = Cov_j + x_j * X + increment(betas) TODO: + # will this still work with multiple drops ? + + # recompute covariance. Probably could be done better + # wrong as Xy is not swapped with the rest of variables + + # TODO: this could be updated + temp = Cov_copy[drop_idx] - np.dot(Gram_copy[drop_idx], coef) + Cov = np.r_[temp, Cov] + + sign_active = np.delete(sign_active, idx) + sign_active = np.append(sign_active, 0.0) # just to maintain size + if verbose > 1: + print( + "%s\t\t%s\t\t%s\t\t%s\t\t%s" + % (n_iter, "", drop_idx, n_active, abs(temp)) + ) + + if return_path: + # resize coefs in case of early stop + alphas = alphas[: n_iter + 1] + coefs = coefs[: n_iter + 1] + + if return_n_iter: + return alphas, active, coefs.T, n_iter + else: + return alphas, active, coefs.T + else: + if return_n_iter: + return alpha, active, coef, n_iter + else: + return alpha, active, coef + + +############################################################################### +# Estimator classes + + +class Lars(MultiOutputMixin, RegressorMixin, LinearModel): + """Least Angle Regression model a.k.a. LAR. + + Read more in the :ref:`User Guide `. 
+ + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + precompute : bool, 'auto' or array-like , default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + n_nonzero_coefs : int, default=500 + Target number of non-zero coefficients. Use ``np.inf`` for no limit. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + fit_path : bool, default=True + If True the full path is stored in the ``coef_path_`` attribute. + If you compute the solution for a large problem or many targets, + setting ``fit_path`` to ``False`` will lead to a speedup, especially + with a small alpha. + + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + .. versionadded:: 0.23 + + random_state : int, RandomState instance or None, default=None + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. + + .. versionadded:: 0.23 + + Attributes + ---------- + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. If this is a list of array-like, the length of the outer + list is `n_targets`. + + active_ : list of shape (n_alphas,) or list of such lists + Indices of active variables at the end of the path. + If this is a list of list, the length of the outer list is `n_targets`. + + coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ + of such arrays + The varying values of the coefficients along the path. It is not + present if the ``fit_path`` parameter is ``False``. If this is a list + of array-like, the length of the outer list is `n_targets`. + + coef_ : array-like of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the formulation formula). + + intercept_ : float or array-like of shape (n_targets,) + Independent term in decision function. + + n_iter_ : array-like or int + The number of iterations taken by lars_path to find the + grid of alphas for each target. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path: Compute Least Angle Regression or Lasso + path using LARS algorithm. + LarsCV : Cross-validated Least Angle Regression model. + sklearn.decomposition.sparse_encode : Sparse coding. 
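+
+    Notes
+    -----
+    A rough usage sketch for some training arrays ``X`` and ``y`` (attribute
+    shapes assume a single target and the default ``fit_path=True``)::
+
+        reg = Lars(n_nonzero_coefs=2).fit(X, y)
+        reg.coef_path_.shape   # (n_features, n_alphas + 1)
+        reg.alphas_            # maxima of covariances along the path
+        reg.active_            # indices of the selected features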
+ + Examples + -------- + >>> from sklearn import linear_model + >>> reg = linear_model.Lars(n_nonzero_coefs=1) + >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111]) + Lars(n_nonzero_coefs=1) + >>> print(reg.coef_) + [ 0. -1.11] + """ + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "verbose": ["verbose"], + "precompute": ["boolean", StrOptions({"auto"}), np.ndarray, Hidden(None)], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left")], + "eps": [Interval(Real, 0, None, closed="left")], + "copy_X": ["boolean"], + "fit_path": ["boolean"], + "jitter": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + } + + method = "lar" + positive = False + + def __init__( + self, + *, + fit_intercept=True, + verbose=False, + precompute="auto", + n_nonzero_coefs=500, + eps=np.finfo(float).eps, + copy_X=True, + fit_path=True, + jitter=None, + random_state=None, + ): + self.fit_intercept = fit_intercept + self.verbose = verbose + self.precompute = precompute + self.n_nonzero_coefs = n_nonzero_coefs + self.eps = eps + self.copy_X = copy_X + self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state + + @staticmethod + def _get_gram(precompute, X, y): + if (not hasattr(precompute, "__array__")) and ( + (precompute is True) + or (precompute == "auto" and X.shape[0] > X.shape[1]) + or (precompute == "auto" and y.shape[1] > 1) + ): + precompute = np.dot(X.T, X) + + return precompute + + def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None): + """Auxiliary method to fit the model using X, y as training data""" + n_features = X.shape[1] + + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=self.copy_X + ) + + if y.ndim == 1: + y = y[:, np.newaxis] + + n_targets = y.shape[1] + + Gram = self._get_gram(self.precompute, X, y) + + self.alphas_ = [] + self.n_iter_ = [] + self.coef_ = np.empty((n_targets, n_features), dtype=X.dtype) + + if fit_path: + self.active_ = [] + self.coef_path_ = [] + for k in range(n_targets): + this_Xy = None if Xy is None else Xy[:, k] + alphas, active, coef_path, n_iter_ = lars_path( + X, + y[:, k], + Gram=Gram, + Xy=this_Xy, + copy_X=self.copy_X, + copy_Gram=True, + alpha_min=alpha, + method=self.method, + verbose=max(0, self.verbose - 1), + max_iter=max_iter, + eps=self.eps, + return_path=True, + return_n_iter=True, + positive=self.positive, + ) + self.alphas_.append(alphas) + self.active_.append(active) + self.n_iter_.append(n_iter_) + self.coef_path_.append(coef_path) + self.coef_[k] = coef_path[:, -1] + + if n_targets == 1: + self.alphas_, self.active_, self.coef_path_, self.coef_ = [ + a[0] + for a in (self.alphas_, self.active_, self.coef_path_, self.coef_) + ] + self.n_iter_ = self.n_iter_[0] + else: + for k in range(n_targets): + this_Xy = None if Xy is None else Xy[:, k] + alphas, _, self.coef_[k], n_iter_ = lars_path( + X, + y[:, k], + Gram=Gram, + Xy=this_Xy, + copy_X=self.copy_X, + copy_Gram=True, + alpha_min=alpha, + method=self.method, + verbose=max(0, self.verbose - 1), + max_iter=max_iter, + eps=self.eps, + return_path=False, + return_n_iter=True, + positive=self.positive, + ) + self.alphas_.append(alphas) + self.n_iter_.append(n_iter_) + if n_targets == 1: + self.alphas_ = self.alphas_[0] + self.n_iter_ = self.n_iter_[0] + + self._set_intercept(X_offset, y_offset, X_scale) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, Xy=None): + """Fit the model using X, y as training data. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + Xy : array-like of shape (n_features,) or (n_features, n_targets), \ + default=None + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomputed. + + Returns + ------- + self : object + Returns an instance of self. + """ + X, y = validate_data( + self, X, y, force_writeable=True, y_numeric=True, multi_output=True + ) + + alpha = getattr(self, "alpha", 0.0) + if hasattr(self, "n_nonzero_coefs"): + alpha = 0.0 # n_nonzero_coefs parametrization takes priority + max_iter = self.n_nonzero_coefs + else: + max_iter = self.max_iter + + if self.jitter is not None: + rng = check_random_state(self.random_state) + + noise = rng.uniform(high=self.jitter, size=len(y)) + y = y + noise + + self._fit( + X, + y, + max_iter=max_iter, + alpha=alpha, + fit_path=self.fit_path, + Xy=Xy, + ) + + return self + + +class LassoLars(Lars): + """Lasso model fit with Least Angle Regression a.k.a. Lars. + + It is a Linear Model trained with an L1 prior as regularizer. + + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the penalty term. Defaults to 1.0. + ``alpha = 0`` is equivalent to an ordinary least square, solved + by :class:`LinearRegression`. For numerical reasons, using + ``alpha = 0`` with the LassoLars object is not advised and you + should prefer the LinearRegression object. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + precompute : bool, 'auto' or array-like, default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + fit_path : bool, default=True + If ``True`` the full path is stored in the ``coef_path_`` attribute. + If you compute the solution for a large problem or many targets, + setting ``fit_path`` to ``False`` will lead to a speedup, especially + with a small alpha. + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients will not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. 
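Since `LassoLars` and the coordinate-descent `Lasso` estimator minimize the same objective shown above, their coefficients should agree closely on a well-conditioned problem. A quick sketch with synthetic data (illustrative only):

    import numpy as np
    from sklearn.linear_model import Lasso, LassoLars

    rng = np.random.RandomState(0)
    X = rng.randn(100, 8)
    y = X[:, 1] - 2.0 * X[:, 5] + 0.1 * rng.randn(100)

    lars_lasso = LassoLars(alpha=0.05).fit(X, y)
    cd_lasso = Lasso(alpha=0.05).fit(X, y)
    # Both solvers target the same penalized least-squares objective, so the
    # largest coefficient difference should be small here.
    print(np.max(np.abs(lars_lasso.coef_ - cd_lasso.coef_)))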
+ + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + .. versionadded:: 0.23 + + random_state : int, RandomState instance or None, default=None + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. + + .. versionadded:: 0.23 + + Attributes + ---------- + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. If this is a list of array-like, the length of the outer + list is `n_targets`. + + active_ : list of length n_alphas or list of such lists + Indices of active variables at the end of the path. + If this is a list of list, the length of the outer list is `n_targets`. + + coef_path_ : array-like of shape (n_features, n_alphas + 1) or list \ + of such arrays + If a list is passed it's expected to be one of n_targets such arrays. + The varying values of the coefficients along the path. It is not + present if the ``fit_path`` parameter is ``False``. If this is a list + of array-like, the length of the outer list is `n_targets`. + + coef_ : array-like of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the formulation formula). + + intercept_ : float or array-like of shape (n_targets,) + Independent term in decision function. + + n_iter_ : array-like or int + The number of iterations taken by lars_path to find the + grid of alphas for each target. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLarsCV: Cross-validated Lasso, using the LARS algorithm. + LassoLarsIC : Lasso model fit with Lars using BIC + or AIC for model selection. + sklearn.decomposition.sparse_encode : Sparse coding. + + Examples + -------- + >>> from sklearn import linear_model + >>> reg = linear_model.LassoLars(alpha=0.01) + >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) + LassoLars(alpha=0.01) + >>> print(reg.coef_) + [ 0. 
-0.955] + """ + + _parameter_constraints: dict = { + **Lars._parameter_constraints, + "alpha": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "positive": ["boolean"], + } + _parameter_constraints.pop("n_nonzero_coefs") + + method = "lasso" + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + verbose=False, + precompute="auto", + max_iter=500, + eps=np.finfo(float).eps, + copy_X=True, + fit_path=True, + positive=False, + jitter=None, + random_state=None, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.verbose = verbose + self.positive = positive + self.precompute = precompute + self.copy_X = copy_X + self.eps = eps + self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state + + +############################################################################### +# Cross-validated estimator classes + + +def _check_copy_and_writeable(array, copy=False): + if copy or not array.flags.writeable: + return array.copy() + return array + + +def _lars_path_residues( + X_train, + y_train, + X_test, + y_test, + Gram=None, + copy=True, + method="lar", + verbose=False, + fit_intercept=True, + max_iter=500, + eps=np.finfo(float).eps, + positive=False, +): + """Compute the residues on left-out data for a full LARS path + + Parameters + ----------- + X_train : array-like of shape (n_samples, n_features) + The data to fit the LARS on + + y_train : array-like of shape (n_samples,) + The target variable to fit LARS on + + X_test : array-like of shape (n_samples, n_features) + The data to compute the residues on + + y_test : array-like of shape (n_samples,) + The target variable to compute the residues on + + Gram : None, 'auto' or array-like of shape (n_features, n_features), \ + default=None + Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram + matrix is precomputed from the given X, if there are more samples + than features + + copy : bool, default=True + Whether X_train, X_test, y_train and y_test should be copied; + if False, they may be overwritten. + + method : {'lar' , 'lasso'}, default='lar' + Specifies the returned model. Select ``'lar'`` for Least Angle + Regression, ``'lasso'`` for the Lasso. + + verbose : bool or int, default=False + Sets the amount of verbosity + + fit_intercept : bool, default=True + whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + See reservations for using this option in combination with method + 'lasso' for expected small values of alpha in the doc of LassoLarsCV + and LassoLarsIC. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + Returns + -------- + alphas : array-like of shape (n_alphas,) + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever + is smaller. + + active : list + Indices of active variables at the end of the path. 
+ + coefs : array-like of shape (n_features, n_alphas) + Coefficients along the path + + residues : array-like of shape (n_alphas, n_samples) + Residues of the prediction on the test data + """ + X_train = _check_copy_and_writeable(X_train, copy) + y_train = _check_copy_and_writeable(y_train, copy) + X_test = _check_copy_and_writeable(X_test, copy) + y_test = _check_copy_and_writeable(y_test, copy) + + if fit_intercept: + X_mean = X_train.mean(axis=0) + X_train -= X_mean + X_test -= X_mean + y_mean = y_train.mean(axis=0) + y_train = as_float_array(y_train, copy=False) + y_train -= y_mean + y_test = as_float_array(y_test, copy=False) + y_test -= y_mean + + alphas, active, coefs = lars_path( + X_train, + y_train, + Gram=Gram, + copy_X=False, + copy_Gram=False, + method=method, + verbose=max(0, verbose - 1), + max_iter=max_iter, + eps=eps, + positive=positive, + ) + residues = np.dot(X_test, coefs) - y_test[:, np.newaxis] + return alphas, active, coefs, residues.T + + +class LarsCV(Lars): + """Cross-validated Least Angle Regression model. + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + precompute : bool, 'auto' or array-like , default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram matrix + cannot be passed as argument since we will use only subsets of X. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + max_n_alphas : int, default=1000 + The maximum number of points on the path used to compute the + residuals in the cross-validation. + + n_jobs : int or None, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If ``True``, X will be copied; else, it may be overwritten. + + Attributes + ---------- + active_ : list of length n_alphas or list of such lists + Indices of active variables at the end of the path. + If this is a list of lists, the outer list length is `n_targets`. 
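The per-fold computation carried out by `_lars_path_residues` above can be sketched roughly as follows (synthetic data; the centering, copying and transposition details of the real helper are omitted):

    import numpy as np
    from sklearn.linear_model import lars_path

    rng = np.random.RandomState(0)
    X = rng.randn(60, 4)
    y = X[:, 0] - 0.5 * X[:, 3] + 0.1 * rng.randn(60)
    X_train, X_test = X[:40], X[40:]
    y_train, y_test = y[:40], y[40:]

    alphas, active, coefs = lars_path(X_train, y_train, method="lar")
    # One column of held-out residuals per alpha value on the training path.
    residues = X_test @ coefs - y_test[:, np.newaxis]
    print(alphas.shape, residues.shape)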
+ + coef_ : array-like of shape (n_features,) + parameter vector (w in the formulation formula) + + intercept_ : float + independent term in decision function + + coef_path_ : array-like of shape (n_features, n_alphas) + the varying values of the coefficients along the path + + alpha_ : float + the estimated regularization parameter alpha + + alphas_ : array-like of shape (n_alphas,) + the different values of alpha along the path + + cv_alphas_ : array-like of shape (n_cv_alphas,) + all the values of alpha along the path for the different folds + + mse_path_ : array-like of shape (n_folds, n_cv_alphas) + the mean square error on left-out for each fold along the path + (alpha values given by ``cv_alphas``) + + n_iter_ : array-like or int + the number of iterations run by Lars with the optimal alpha. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoLarsIC : Lasso model fit with Lars using BIC + or AIC for model selection. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> from sklearn.linear_model import LarsCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0) + >>> reg = LarsCV(cv=5).fit(X, y) + >>> reg.score(X, y) + 0.9996 + >>> reg.alpha_ + np.float64(0.2961) + >>> reg.predict(X[:1,]) + array([154.3996]) + """ + + _parameter_constraints: dict = { + **Lars._parameter_constraints, + "max_iter": [Interval(Integral, 0, None, closed="left")], + "cv": ["cv_object"], + "max_n_alphas": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + } + + for parameter in ["n_nonzero_coefs", "jitter", "fit_path", "random_state"]: + _parameter_constraints.pop(parameter) + + method = "lar" + + def __init__( + self, + *, + fit_intercept=True, + verbose=False, + max_iter=500, + precompute="auto", + cv=None, + max_n_alphas=1000, + n_jobs=None, + eps=np.finfo(float).eps, + copy_X=True, + ): + self.max_iter = max_iter + self.cv = cv + self.max_n_alphas = max_n_alphas + self.n_jobs = n_jobs + super().__init__( + fit_intercept=fit_intercept, + verbose=verbose, + precompute=precompute, + n_nonzero_coefs=500, + eps=eps, + copy_X=copy_X, + fit_path=True, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, **params): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict, default=None + Parameters to be passed to the CV splitter. + + .. 
versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(params, self, "fit") + + X, y = validate_data(self, X, y, force_writeable=True, y_numeric=True) + X = as_float_array(X, copy=self.copy_X) + y = as_float_array(y, copy=self.copy_X) + + # init cross-validation generator + cv = check_cv(self.cv, classifier=False) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch(splitter=Bunch(split={})) + + # As we use cross-validation, the Gram matrix is not precomputed here + Gram = self.precompute + if hasattr(Gram, "__array__"): + warnings.warn( + 'Parameter "precompute" cannot be an array in ' + '%s. Automatically switch to "auto" instead.' % self.__class__.__name__ + ) + Gram = "auto" + + cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( + delayed(_lars_path_residues)( + X[train], + y[train], + X[test], + y[test], + Gram=Gram, + copy=False, + method=self.method, + verbose=max(0, self.verbose - 1), + fit_intercept=self.fit_intercept, + max_iter=self.max_iter, + eps=self.eps, + positive=self.positive, + ) + for train, test in cv.split(X, y, **routed_params.splitter.split) + ) + all_alphas = np.concatenate(next(zip(*cv_paths))) + # Unique also sorts + all_alphas = np.unique(all_alphas) + # Take at most max_n_alphas values + stride = int(max(1, int(len(all_alphas) / float(self.max_n_alphas)))) + all_alphas = all_alphas[::stride] + + mse_path = np.empty((len(all_alphas), len(cv_paths))) + for index, (alphas, _, _, residues) in enumerate(cv_paths): + alphas = alphas[::-1] + residues = residues[::-1] + if alphas[0] != 0: + alphas = np.r_[0, alphas] + residues = np.r_[residues[0, np.newaxis], residues] + if alphas[-1] != all_alphas[-1]: + alphas = np.r_[alphas, all_alphas[-1]] + residues = np.r_[residues, residues[-1, np.newaxis]] + this_residues = interpolate.interp1d(alphas, residues, axis=0)(all_alphas) + this_residues **= 2 + mse_path[:, index] = np.mean(this_residues, axis=-1) + + mask = np.all(np.isfinite(mse_path), axis=-1) + all_alphas = all_alphas[mask] + mse_path = mse_path[mask] + # Select the alpha that minimizes left-out error + i_best_alpha = np.argmin(mse_path.mean(axis=-1)) + best_alpha = all_alphas[i_best_alpha] + + # Store our parameters + self.alpha_ = best_alpha + self.cv_alphas_ = all_alphas + self.mse_path_ = mse_path + + # Now compute the full model using best_alpha + # it will call a lasso internally when self if LassoLarsCV + # as self.method == 'lasso' + self._fit( + X, + y, + max_iter=self.max_iter, + alpha=best_alpha, + Xy=None, + fit_path=True, + ) + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + splitter=check_cv(self.cv), + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router + + +class LassoLarsCV(LarsCV): + """Cross-validated Lasso, using the LARS algorithm. + + See glossary entry for :term:`cross-validation estimator`. 
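Conceptually, the alpha selection in `LarsCV.fit` above interpolates each fold's held-out error onto a common alpha grid and keeps the alpha with the smallest mean error. A toy sketch of that idea with made-up numbers (not the actual implementation, which interpolates residuals and squares them afterwards):

    import numpy as np
    from scipy import interpolate

    # Two hypothetical folds, each with its own alpha grid and held-out MSE.
    fold_alphas = [np.array([0.0, 0.2, 0.8]), np.array([0.0, 0.3, 0.7, 1.0])]
    fold_mse = [np.array([1.5, 0.9, 1.2]), np.array([1.4, 1.0, 0.8, 1.3])]

    all_alphas = np.unique(np.concatenate(fold_alphas))
    mse_path = np.column_stack([
        interpolate.interp1d(a, m, bounds_error=False, fill_value=(m[0], m[-1]))(all_alphas)
        for a, m in zip(fold_alphas, fold_mse)
    ])
    best_alpha = all_alphas[np.argmin(mse_path.mean(axis=1))]
    print(best_alpha)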
+ + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + max_iter : int, default=500 + Maximum number of iterations to perform. + + precompute : bool or 'auto' , default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram matrix + cannot be passed as argument since we will use only subsets of X. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + max_n_alphas : int, default=1000 + The maximum number of points on the path used to compute the + residuals in the cross-validation. + + n_jobs : int or None, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients do not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. + As a consequence using LassoLarsCV only makes sense for problems where + a sparse solution is expected and/or reached. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + parameter vector (w in the formulation formula) + + intercept_ : float + independent term in decision function. 
+ + coef_path_ : array-like of shape (n_features, n_alphas) + the varying values of the coefficients along the path + + alpha_ : float + the estimated regularization parameter alpha + + alphas_ : array-like of shape (n_alphas,) + the different values of alpha along the path + + cv_alphas_ : array-like of shape (n_cv_alphas,) + all the values of alpha along the path for the different folds + + mse_path_ : array-like of shape (n_folds, n_cv_alphas) + the mean square error on left-out for each fold along the path + (alpha values given by ``cv_alphas``) + + n_iter_ : array-like or int + the number of iterations run by Lars with the optimal alpha. + + active_ : list of int + Indices of active variables at the end of the path. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoLarsIC : Lasso model fit with Lars using BIC + or AIC for model selection. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + The object solves the same problem as the + :class:`~sklearn.linear_model.LassoCV` object. However, unlike the + :class:`~sklearn.linear_model.LassoCV`, it find the relevant alphas values + by itself. In general, because of this property, it will be more stable. + However, it is more fragile to heavily multicollinear datasets. + + It is more efficient than the :class:`~sklearn.linear_model.LassoCV` if + only a small number of features are selected compared to the total number, + for instance if there are very few samples compared to the number of + features. + + In `fit`, once the best parameter `alpha` is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> from sklearn.linear_model import LassoLarsCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4.0, random_state=0) + >>> reg = LassoLarsCV(cv=5).fit(X, y) + >>> reg.score(X, y) + 0.9993 + >>> reg.alpha_ + np.float64(0.3972) + >>> reg.predict(X[:1,]) + array([-78.4831]) + """ + + _parameter_constraints = { + **LarsCV._parameter_constraints, + "positive": ["boolean"], + } + + method = "lasso" + + def __init__( + self, + *, + fit_intercept=True, + verbose=False, + max_iter=500, + precompute="auto", + cv=None, + max_n_alphas=1000, + n_jobs=None, + eps=np.finfo(float).eps, + copy_X=True, + positive=False, + ): + self.fit_intercept = fit_intercept + self.verbose = verbose + self.max_iter = max_iter + self.precompute = precompute + self.cv = cv + self.max_n_alphas = max_n_alphas + self.n_jobs = n_jobs + self.eps = eps + self.copy_X = copy_X + self.positive = positive + # XXX : we don't use super().__init__ + # to avoid setting n_nonzero_coefs + + +class LassoLarsIC(LassoLars): + """Lasso model fit with Lars using BIC or AIC for model selection. 
+ + The optimization objective for Lasso is:: + + (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 + + AIC is the Akaike information criterion [2]_ and BIC is the Bayes + Information criterion [3]_. Such criteria are useful to select the value + of the regularization parameter by making a trade-off between the + goodness of fit and the complexity of the model. A good model should + explain well the data while being simple. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + criterion : {'aic', 'bic'}, default='aic' + The type of criterion to use. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + verbose : bool or int, default=False + Sets the verbosity amount. + + precompute : bool, 'auto' or array-like, default='auto' + Whether to use a precomputed Gram matrix to speed up + calculations. If set to ``'auto'`` let us decide. The Gram + matrix can also be passed as argument. + + max_iter : int, default=500 + Maximum number of iterations to perform. Can be used for + early stopping. + + eps : float, default=np.finfo(float).eps + The machine-precision regularization in the computation of the + Cholesky diagonal factors. Increase this for very ill-conditioned + systems. Unlike the ``tol`` parameter in some iterative + optimization-based algorithms, this parameter does not control + the tolerance of the optimization. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + positive : bool, default=False + Restrict coefficients to be >= 0. Be aware that you might want to + remove fit_intercept which is set True by default. + Under the positive restriction the model coefficients do not converge + to the ordinary-least-squares solution for small values of alpha. + Only coefficients up to the smallest alpha value (``alphas_[alphas_ > + 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso + algorithm are typically in congruence with the solution of the + coordinate descent Lasso estimator. + As a consequence using LassoLarsIC only makes sense for problems where + a sparse solution is expected and/or reached. + + noise_variance : float, default=None + The estimated noise variance of the data. If `None`, an unbiased + estimate is computed by an OLS model. However, it is only possible + in the case where `n_samples > n_features + fit_intercept`. + + .. versionadded:: 1.1 + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + parameter vector (w in the formulation formula) + + intercept_ : float + independent term in decision function. + + alpha_ : float + the alpha parameter chosen by the information criterion + + alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays + Maximum of covariances (in absolute value) at each iteration. + ``n_alphas`` is either ``max_iter``, ``n_features`` or the + number of nodes in the path with ``alpha >= alpha_min``, whichever + is smaller. If a list, it will be of length `n_targets`. + + n_iter_ : int + number of iterations run by lars_path to find the grid of + alphas. + + criterion_ : array-like of shape (n_alphas,) + The value of the information criteria ('aic', 'bic') across all + alphas. The alpha which has the smallest information criterion is + chosen, as specified in [1]_. + + noise_variance_ : float + The estimated noise variance from the data used to compute the + criterion. + + .. 
versionadded:: 1.1 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + lars_path : Compute Least Angle Regression or Lasso + path using LARS algorithm. + lasso_path : Compute Lasso path with coordinate descent. + Lasso : Linear Model trained with L1 prior as + regularizer (aka the Lasso). + LassoCV : Lasso linear model with iterative fitting + along a regularization path. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + LassoLarsCV: Cross-validated Lasso, using the LARS algorithm. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + The number of degrees of freedom is computed as in [1]_. + + To have more details regarding the mathematical formulation of the + AIC and BIC criteria, please refer to :ref:`User Guide `. + + References + ---------- + .. [1] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani. + "On the degrees of freedom of the lasso." + The Annals of Statistics 35.5 (2007): 2173-2192. + <0712.0881>` + + .. [2] `Wikipedia entry on the Akaike information criterion + `_ + + .. [3] `Wikipedia entry on the Bayesian information criterion + `_ + + Examples + -------- + >>> from sklearn import linear_model + >>> reg = linear_model.LassoLarsIC(criterion='bic') + >>> X = [[-2, 2], [-1, 1], [0, 0], [1, 1], [2, 2]] + >>> y = [-2.2222, -1.1111, 0, -1.1111, -2.2222] + >>> reg.fit(X, y) + LassoLarsIC(criterion='bic') + >>> print(reg.coef_) + [ 0. -1.11] + """ + + _parameter_constraints: dict = { + **LassoLars._parameter_constraints, + "criterion": [StrOptions({"aic", "bic"})], + "noise_variance": [Interval(Real, 0, None, closed="left"), None], + } + + for parameter in ["jitter", "fit_path", "alpha", "random_state"]: + _parameter_constraints.pop(parameter) + + def __init__( + self, + criterion="aic", + *, + fit_intercept=True, + verbose=False, + precompute="auto", + max_iter=500, + eps=np.finfo(float).eps, + copy_X=True, + positive=False, + noise_variance=None, + ): + self.criterion = criterion + self.fit_intercept = fit_intercept + self.positive = positive + self.max_iter = max_iter + self.verbose = verbose + self.copy_X = copy_X + self.precompute = precompute + self.eps = eps + self.fit_path = True + self.noise_variance = noise_variance + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.multi_output = False + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, copy_X=None): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + copy_X : bool, default=None + If provided, this parameter will override the choice + of copy_X made at instance creation. + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + self : object + Returns an instance of self. 
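The criterion computed in `fit` below has the standard AIC/BIC form described in this class docstring. A hand-worked sketch with made-up numbers (the noise variance is assumed known here, whereas the estimator can also estimate it via OLS):

    import numpy as np

    n_samples = 50
    residuals_sum_squares = np.array([40.0, 12.0, 10.5])  # one entry per alpha on the path
    degrees_of_freedom = np.array([0, 1, 2])              # number of non-zero coefficients
    noise_variance = 0.25                                 # assumed known in this sketch

    criterion_factor = np.log(n_samples)  # BIC; AIC would use 2 instead
    criterion = (
        n_samples * np.log(2 * np.pi * noise_variance)
        + residuals_sum_squares / noise_variance
        + criterion_factor * degrees_of_freedom
    )
    print(np.argmin(criterion))  # index of the alpha selected on the path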
+ """ + if copy_X is None: + copy_X = self.copy_X + X, y = validate_data(self, X, y, force_writeable=True, y_numeric=True) + + X, y, Xmean, ymean, Xstd = _preprocess_data( + X, y, fit_intercept=self.fit_intercept, copy=copy_X + ) + + Gram = self.precompute + + alphas_, _, coef_path_, self.n_iter_ = lars_path( + X, + y, + Gram=Gram, + copy_X=copy_X, + copy_Gram=True, + alpha_min=0.0, + method="lasso", + verbose=self.verbose, + max_iter=self.max_iter, + eps=self.eps, + return_n_iter=True, + positive=self.positive, + ) + + n_samples = X.shape[0] + + if self.criterion == "aic": + criterion_factor = 2 + elif self.criterion == "bic": + criterion_factor = log(n_samples) + else: + raise ValueError( + f"criterion should be either bic or aic, got {self.criterion!r}" + ) + + residuals = y[:, np.newaxis] - np.dot(X, coef_path_) + residuals_sum_squares = np.sum(residuals**2, axis=0) + degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int) + for k, coef in enumerate(coef_path_.T): + mask = np.abs(coef) > np.finfo(coef.dtype).eps + if not np.any(mask): + continue + # get the number of degrees of freedom equal to: + # Xc = X[:, mask] + # Trace(Xc * inv(Xc.T, Xc) * Xc.T) ie the number of non-zero coefs + degrees_of_freedom[k] = np.sum(mask) + + self.alphas_ = alphas_ + + if self.noise_variance is None: + self.noise_variance_ = self._estimate_noise_variance( + X, y, positive=self.positive + ) + else: + self.noise_variance_ = self.noise_variance + + self.criterion_ = ( + n_samples * np.log(2 * np.pi * self.noise_variance_) + + residuals_sum_squares / self.noise_variance_ + + criterion_factor * degrees_of_freedom + ) + n_best = np.argmin(self.criterion_) + + self.alpha_ = alphas_[n_best] + self.coef_ = coef_path_[:, n_best] + self._set_intercept(Xmean, ymean, Xstd) + return self + + def _estimate_noise_variance(self, X, y, positive): + """Compute an estimate of the variance with an OLS model. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data to be fitted by the OLS model. We expect the data to be + centered. + + y : ndarray of shape (n_samples,) + Associated target. + + positive : bool, default=False + Restrict coefficients to be >= 0. This should be inline with + the `positive` parameter from `LassoLarsIC`. + + Returns + ------- + noise_variance : float + An estimator of the noise variance of an OLS model. + """ + if X.shape[0] <= X.shape[1] + self.fit_intercept: + raise ValueError( + f"You are using {self.__class__.__name__} in the case where the number " + "of samples is smaller than the number of features. In this setting, " + "getting a good estimate for the variance of the noise is not " + "possible. Provide an estimate of the noise variance in the " + "constructor." 
+ ) + # X and y are already centered and we don't need to fit with an intercept + ols_model = LinearRegression(positive=positive, fit_intercept=False) + y_pred = ols_model.fit(X, y).predict(X) + return np.sum((y - y_pred) ** 2) / ( + X.shape[0] - X.shape[1] - self.fit_intercept + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9213008a19841f1707b57e3e47b7887ea29da4da --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py @@ -0,0 +1,825 @@ +""" +Loss functions for linear models with raw_prediction = X @ coef +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy import sparse + +from ..utils.extmath import squared_norm + + +def sandwich_dot(X, W): + """Compute the sandwich product X.T @ diag(W) @ X.""" + # TODO: This "sandwich product" is the main computational bottleneck for solvers + # that use the full hessian matrix. Here, thread parallelism would pay-off the + # most. + # While a dedicated Cython routine could exploit the symmetry, it is very hard to + # beat BLAS GEMM, even thought the latter cannot exploit the symmetry, unless one + # pays the price of taking square roots and implements + # sqrtWX = sqrt(W)[: None] * X + # return sqrtWX.T @ sqrtWX + # which (might) detect the symmetry and use BLAS SYRK under the hood. + n_samples = X.shape[0] + if sparse.issparse(X): + return ( + X.T @ sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)) @ X + ).toarray() + else: + # np.einsum may use less memory but the following, using BLAS matrix + # multiplication (gemm), is by far faster. + WX = W[:, None] * X + return X.T @ WX + + +class LinearModelLoss: + """General class for loss functions with raw_prediction = X @ coef + intercept. + + Note that raw_prediction is also known as linear predictor. + + The loss is the average of per sample losses and includes a term for L2 + regularization:: + + loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept) + + 1/2 * l2_reg_strength * ||coef||_2^2 + + with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. + + Gradient and hessian, for simplicity without intercept, are:: + + gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef + hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X + + l2_reg_strength * identity + + Conventions: + if fit_intercept: + n_dof = n_features + 1 + else: + n_dof = n_features + + if base_loss.is_multiclass: + coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,) + else: + coef.shape = (n_dof,) + + The intercept term is at the end of the coef array: + if base_loss.is_multiclass: + if coef.shape (n_classes, n_dof): + intercept = coef[:, -1] + if coef.shape (n_classes * n_dof,) + intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof] + intercept.shape = (n_classes,) + else: + intercept = coef[-1] + + Shape of gradient follows shape of coef. + gradient.shape = coef.shape + + But hessian (to make our lives simpler) are always 2-d: + if base_loss.is_multiclass: + hessian.shape = (n_classes * n_dof, n_classes * n_dof) + else: + hessian.shape = (n_dof, n_dof) + + Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as + + coef.reshape((n_classes, -1), order="F") + + The option order="F" makes coef[:, i] contiguous. 
This, in turn, makes the + coefficients without intercept, coef[:, :-1], contiguous and speeds up + matrix-vector computations. + + Note: If the average loss per sample is wanted instead of the sum of the loss per + sample, one can simply use a rescaled sample_weight such that + sum(sample_weight) = 1. + + Parameters + ---------- + base_loss : instance of class BaseLoss from sklearn._loss. + fit_intercept : bool + """ + + def __init__(self, base_loss, fit_intercept): + self.base_loss = base_loss + self.fit_intercept = fit_intercept + + def init_zero_coef(self, X, dtype=None): + """Allocate coef of correct shape with zeros. + + Parameters: + ----------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + dtype : data-type, default=None + Overrides the data type of coef. With dtype=None, coef will have the same + dtype as X. + + Returns + ------- + coef : ndarray of shape (n_dof,) or (n_classes, n_dof) + Coefficients of a linear model. + """ + n_features = X.shape[1] + n_classes = self.base_loss.n_classes + if self.fit_intercept: + n_dof = n_features + 1 + else: + n_dof = n_features + if self.base_loss.is_multiclass: + coef = np.zeros_like(X, shape=(n_classes, n_dof), dtype=dtype, order="F") + else: + coef = np.zeros_like(X, shape=n_dof, dtype=dtype) + return coef + + def weight_intercept(self, coef): + """Helper function to get coefficients and intercept. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + + Returns + ------- + weights : ndarray of shape (n_features,) or (n_classes, n_features) + Coefficients without intercept term. + intercept : float or ndarray of shape (n_classes,) + Intercept terms. + """ + if not self.base_loss.is_multiclass: + if self.fit_intercept: + intercept = coef[-1] + weights = coef[:-1] + else: + intercept = 0.0 + weights = coef + else: + # reshape to (n_classes, n_dof) + if coef.ndim == 1: + weights = coef.reshape((self.base_loss.n_classes, -1), order="F") + else: + weights = coef + if self.fit_intercept: + intercept = weights[:, -1] + weights = weights[:, :-1] + else: + intercept = 0.0 + + return weights, intercept + + def weight_intercept_raw(self, coef, X): + """Helper function to get coefficients, intercept and raw_prediction. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + Returns + ------- + weights : ndarray of shape (n_features,) or (n_classes, n_features) + Coefficients without intercept term. + intercept : float or ndarray of shape (n_classes,) + Intercept terms. 
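A tiny sketch of the multiclass coefficient layout handled by `weight_intercept` above (made-up shapes; the ravelled vector uses Fortran order, as described in the class docstring):

    import numpy as np

    n_classes, n_features = 3, 2
    n_dof = n_features + 1  # with an intercept
    coef_2d = np.arange(n_classes * n_dof, dtype=float).reshape(n_classes, n_dof)

    coef_1d = coef_2d.ravel(order="F")        # classes of one feature are contiguous
    weights = coef_1d.reshape((n_classes, -1), order="F")
    print(np.array_equal(weights, coef_2d))   # the 2d layout is recovered
    print(weights[:, -1])                     # intercept terms, one per class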
+ raw_prediction : ndarray of shape (n_samples,) or \ + (n_samples, n_classes) + """ + weights, intercept = self.weight_intercept(coef) + + if not self.base_loss.is_multiclass: + raw_prediction = X @ weights + intercept + else: + # weights has shape (n_classes, n_dof) + raw_prediction = X @ weights.T + intercept # ndarray, likely C-contiguous + + return weights, intercept, raw_prediction + + def l2_penalty(self, weights, l2_reg_strength): + """Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.""" + norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights) + return 0.5 * l2_reg_strength * norm2_w + + def loss( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, + ): + """Compute the loss as weighted average over point-wise losses. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + loss : float + Weighted average of losses per sample, plus penalty. + """ + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + loss = self.base_loss.loss( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=None, + n_threads=n_threads, + ) + loss = np.average(loss, weights=sample_weight) + + return loss + self.l2_penalty(weights, l2_reg_strength) + + def loss_gradient( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, + ): + """Computes the sum of loss and gradient w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + loss : float + Weighted average of losses per sample, plus penalty. + + gradient : ndarray of shape coef.shape + The gradient of the loss. 
+ """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + loss, grad_pointwise = self.base_loss.loss_gradient( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + loss = loss.sum() / sw_sum + loss += self.l2_penalty(weights, l2_reg_strength) + + grad_pointwise /= sw_sum + + if not self.base_loss.is_multiclass: + grad = np.empty_like(coef, dtype=weights.dtype) + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + else: + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + # grad_pointwise.shape = (n_samples, n_classes) + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + if coef.ndim == 1: + grad = grad.ravel(order="F") + + return loss, grad + + def gradient( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, + ): + """Computes the gradient w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + gradient : ndarray of shape coef.shape + The gradient of the loss. 
+ """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + grad_pointwise = self.base_loss.gradient( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + grad_pointwise /= sw_sum + + if not self.base_loss.is_multiclass: + grad = np.empty_like(coef, dtype=weights.dtype) + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + return grad + else: + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + # gradient.shape = (n_samples, n_classes) + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + if coef.ndim == 1: + return grad.ravel(order="F") + else: + return grad + + def gradient_hessian( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + gradient_out=None, + hessian_out=None, + raw_prediction=None, + ): + """Computes gradient and hessian w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + gradient_out : None or ndarray of shape coef.shape + A location into which the gradient is stored. If None, a new array + might be created. + hessian_out : None or ndarray of shape (n_dof, n_dof) or \ + (n_classes * n_dof, n_classes * n_dof) + A location into which the hessian is stored. If None, a new array + might be created. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + gradient : ndarray of shape coef.shape + The gradient of the loss. + + hessian : ndarray of shape (n_dof, n_dof) or \ + (n_classes, n_dof, n_dof, n_classes) + Hessian matrix. + + hessian_warning : bool + True if pointwise hessian has more than 25% of its elements non-positive. + """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + + # Allocate gradient. + if gradient_out is None: + grad = np.empty_like(coef, dtype=weights.dtype, order="F") + elif gradient_out.shape != coef.shape: + raise ValueError( + f"gradient_out is required to have shape coef.shape = {coef.shape}; " + f"got {gradient_out.shape}." 
+ ) + elif self.base_loss.is_multiclass and not gradient_out.flags.f_contiguous: + raise ValueError("gradient_out must be F-contiguous.") + else: + grad = gradient_out + # Allocate hessian. + n = coef.size # for multinomial this equals n_dof * n_classes + if hessian_out is None: + hess = np.empty((n, n), dtype=weights.dtype) + elif hessian_out.shape != (n, n): + raise ValueError( + f"hessian_out is required to have shape ({n, n}); got " + f"{hessian_out.shape=}." + ) + elif self.base_loss.is_multiclass and ( + not hessian_out.flags.c_contiguous and not hessian_out.flags.f_contiguous + ): + raise ValueError("hessian_out must be contiguous.") + else: + hess = hessian_out + + if not self.base_loss.is_multiclass: + grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum + + # For non-canonical link functions and far away from the optimum, the + # pointwise hessian can be negative. We take care that 75% of the hessian + # entries are positive. + hessian_warning = ( + np.average(hess_pointwise <= 0, weights=sample_weight) > 0.25 + ) + hess_pointwise = np.abs(hess_pointwise) + + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + + if hessian_warning: + # Exit early without computing the hessian. + return grad, hess, hessian_warning + + hess[:n_features, :n_features] = sandwich_dot(X, hess_pointwise) + + if l2_reg_strength > 0: + # The L2 penalty enters the Hessian on the diagonal only. To add those + # terms, we use a flattened view of the array. + order = "C" if hess.flags.c_contiguous else "F" + hess.reshape(-1, order=order)[: (n_features * n_dof) : (n_dof + 1)] += ( + l2_reg_strength + ) + + if self.fit_intercept: + # With intercept included as added column to X, the hessian becomes + # hess = (X, 1)' @ diag(h) @ (X, 1) + # = (X' @ diag(h) @ X, X' @ h) + # ( h @ X, sum(h)) + # The left upper part has already been filled, it remains to compute + # the last row and the last column. + Xh = X.T @ hess_pointwise + hess[:-1, -1] = Xh + hess[-1, :-1] = Xh + hess[-1, -1] = hess_pointwise.sum() + else: + # Here we may safely assume HalfMultinomialLoss aka categorical + # cross-entropy. + # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e. + # diagonal in the classes. Here, we want the full hessian. Therefore, we + # call gradient_proba. + grad_pointwise, proba = self.base_loss.gradient_proba( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + grad = grad.reshape((n_classes, n_dof), order="F") + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + if coef.ndim == 1: + grad = grad.ravel(order="F") + + # The full hessian matrix, i.e. not only the diagonal part, dropping most + # indices, is given by: + # + # hess = X' @ h @ X + # + # Here, h is a priori a 4-dimensional matrix of shape + # (n_samples, n_samples, n_classes, n_classes). It is diagonal its first + # two dimensions (the ones with n_samples), i.e. it is + # effectively a 3-dimensional matrix (n_samples, n_classes, n_classes). + # + # h = diag(p) - p' p + # + # or with indices k and l for classes + # + # h_kl = p_k * delta_kl - p_k * p_l + # + # with p_k the (predicted) probability for class k. 
Only the dimension in + # n_samples multiplies with X. + # For 3 classes and n_samples = 1, this looks like ("@" is a bit misused + # here): + # + # hess = X' @ (h00 h10 h20) @ X + # (h10 h11 h12) + # (h20 h12 h22) + # = (X' @ diag(h00) @ X, X' @ diag(h10), X' @ diag(h20)) + # (X' @ diag(h10) @ X, X' @ diag(h11), X' @ diag(h12)) + # (X' @ diag(h20) @ X, X' @ diag(h12), X' @ diag(h22)) + # + # Now coef of shape (n_classes * n_dof) is contiguous in n_classes. + # Therefore, we want the hessian to follow this convention, too, i.e. + # hess[:n_classes, :n_classes] = (x0' @ h00 @ x0, x0' @ h10 @ x0, ..) + # (x0' @ h10 @ x0, x0' @ h11 @ x0, ..) + # (x0' @ h20 @ x0, x0' @ h12 @ x0, ..) + # is the first feature, x0, for all classes. In our implementation, we + # still want to take advantage of BLAS "X.T @ X". Therefore, we have some + # index/slicing battle to fight. + if sample_weight is not None: + sw = sample_weight / sw_sum + else: + sw = 1.0 / sw_sum + + for k in range(n_classes): + # Diagonal terms (in classes) hess_kk. + # Note that this also writes to some of the lower triangular part. + h = proba[:, k] * (1 - proba[:, k]) * sw + hess[ + k : n_classes * n_features : n_classes, + k : n_classes * n_features : n_classes, + ] = sandwich_dot(X, h) + if self.fit_intercept: + # See above in the non multiclass case. + Xh = X.T @ h + hess[ + k : n_classes * n_features : n_classes, + n_classes * n_features + k, + ] = Xh + hess[ + n_classes * n_features + k, + k : n_classes * n_features : n_classes, + ] = Xh + hess[n_classes * n_features + k, n_classes * n_features + k] = ( + h.sum() + ) + # Off diagonal terms (in classes) hess_kl. + for l in range(k + 1, n_classes): + # Upper triangle (in classes). + h = -proba[:, k] * proba[:, l] * sw + hess[ + k : n_classes * n_features : n_classes, + l : n_classes * n_features : n_classes, + ] = sandwich_dot(X, h) + if self.fit_intercept: + Xh = X.T @ h + hess[ + k : n_classes * n_features : n_classes, + n_classes * n_features + l, + ] = Xh + hess[ + n_classes * n_features + k, + l : n_classes * n_features : n_classes, + ] = Xh + hess[n_classes * n_features + k, n_classes * n_features + l] = ( + h.sum() + ) + # Fill lower triangle (in classes). + hess[l::n_classes, k::n_classes] = hess[k::n_classes, l::n_classes] + + if l2_reg_strength > 0: + # See above in the non multiclass case. + order = "C" if hess.flags.c_contiguous else "F" + hess.reshape(-1, order=order)[ + : (n_classes**2 * n_features * n_dof) : (n_classes * n_dof + 1) + ] += l2_reg_strength + + # The pointwise hessian is always non-negative for the multinomial loss. + hessian_warning = False + + return grad, hess, hessian_warning + + def gradient_hessian_product( + self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1 + ): + """Computes gradient and hessp (hessian product function) w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. 
+ + Returns + ------- + gradient : ndarray of shape coef.shape + The gradient of the loss. + + hessp : callable + Function that takes in a vector input of shape of gradient and + and returns matrix-vector product with hessian. + """ + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes + n_dof = n_features + int(self.fit_intercept) + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + + if not self.base_loss.is_multiclass: + grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum + grad = np.empty_like(coef, dtype=weights.dtype) + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + + # Precompute as much as possible: hX, hX_sum and hessian_sum + hessian_sum = hess_pointwise.sum() + if sparse.issparse(X): + hX = ( + sparse.dia_matrix((hess_pointwise, 0), shape=(n_samples, n_samples)) + @ X + ) + else: + hX = hess_pointwise[:, np.newaxis] * X + + if self.fit_intercept: + # Calculate the double derivative with respect to intercept. + # Note: In case hX is sparse, hX.sum is a matrix object. + hX_sum = np.squeeze(np.asarray(hX.sum(axis=0))) + # prevent squeezing to zero-dim array if n_features == 1 + hX_sum = np.atleast_1d(hX_sum) + + # With intercept included and l2_reg_strength = 0, hessp returns + # res = (X, 1)' @ diag(h) @ (X, 1) @ s + # = (X, 1)' @ (hX @ s[:n_features], sum(h) * s[-1]) + # res[:n_features] = X' @ hX @ s[:n_features] + sum(h) * s[-1] + # res[-1] = 1' @ hX @ s[:n_features] + sum(h) * s[-1] + def hessp(s): + ret = np.empty_like(s) + if sparse.issparse(X): + ret[:n_features] = X.T @ (hX @ s[:n_features]) + else: + ret[:n_features] = np.linalg.multi_dot([X.T, hX, s[:n_features]]) + ret[:n_features] += l2_reg_strength * s[:n_features] + + if self.fit_intercept: + ret[:n_features] += s[-1] * hX_sum + ret[-1] = hX_sum @ s[:n_features] + hessian_sum * s[-1] + return ret + + else: + # Here we may safely assume HalfMultinomialLoss aka categorical + # cross-entropy. + # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e. + # diagonal in the classes. Here, we want the matrix-vector product of the + # full hessian. Therefore, we call gradient_proba. + grad_pointwise, proba = self.base_loss.gradient_proba( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + grad_pointwise /= sw_sum + grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights + if self.fit_intercept: + grad[:, -1] = grad_pointwise.sum(axis=0) + + # Full hessian-vector product, i.e. not only the diagonal part of the + # hessian. 
Derivation with some index battle for input vector s: + # - sample index i + # - feature indices j, m + # - class indices k, l + # - 1_{k=l} is one if k=l else 0 + # - p_i_k is the (predicted) probability that sample i belongs to class k + # for all i: sum_k p_i_k = 1 + # - s_l_m is input vector for class l and feature m + # - X' = X transposed + # + # Note: Hessian with dropping most indices is just: + # X' @ p_k (1(k=l) - p_l) @ X + # + # result_{k j} = sum_{i, l, m} Hessian_{i, k j, m l} * s_l_m + # = sum_{i, l, m} (X')_{ji} * p_i_k * (1_{k=l} - p_i_l) + # * X_{im} s_l_m + # = sum_{i, m} (X')_{ji} * p_i_k + # * (X_{im} * s_k_m - sum_l p_i_l * X_{im} * s_l_m) + # + # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411 + def hessp(s): + s = s.reshape((n_classes, -1), order="F") # shape = (n_classes, n_dof) + if self.fit_intercept: + s_intercept = s[:, -1] + s = s[:, :-1] # shape = (n_classes, n_features) + else: + s_intercept = 0 + tmp = X @ s.T + s_intercept # X_{im} * s_k_m + tmp += (-proba * tmp).sum(axis=1)[:, np.newaxis] # - sum_l .. + tmp *= proba # * p_i_k + if sample_weight is not None: + tmp *= sample_weight[:, np.newaxis] + # hess_prod = empty_like(grad), but we ravel grad below and this + # function is run after that. + hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + hess_prod[:, :n_features] = (tmp.T @ X) / sw_sum + l2_reg_strength * s + if self.fit_intercept: + hess_prod[:, -1] = tmp.sum(axis=0) / sw_sum + if coef.ndim == 1: + return hess_prod.ravel(order="F") + else: + return hess_prod + + if coef.ndim == 1: + return grad.ravel(order="F"), hessp + + return grad, hessp diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py new file mode 100644 index 0000000000000000000000000000000000000000..35cfcee7ce7d16e4dcd57ada0d51db87bfa8c69f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py @@ -0,0 +1,2327 @@ +""" +Logistic Regression +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import optimize + +from sklearn.metrics import get_scorer_names + +from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss +from ..base import _fit_context +from ..metrics import get_scorer +from ..model_selection import check_cv +from ..preprocessing import LabelBinarizer, LabelEncoder +from ..svm._base import _fit_liblinear +from ..utils import ( + Bunch, + check_array, + check_consistent_length, + check_random_state, + compute_class_weight, +) +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.extmath import row_norms, softmax +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.multiclass import check_classification_targets +from ..utils.optimize import _check_optimize_result, _newton_cg +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + check_is_fitted, + validate_data, +) +from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin +from ._glm.glm import NewtonCholeskySolver +from ._linear_loss import LinearModelLoss +from ._sag import 
sag_solver + +_LOGISTIC_SOLVER_CONVERGENCE_MSG = ( + "Please also refer to the documentation for alternative solver options:\n" + " https://scikit-learn.org/stable/modules/linear_model.html" + "#logistic-regression" +) + + +def _check_solver(solver, penalty, dual): + if solver not in ["liblinear", "saga"] and penalty not in ("l2", None): + raise ValueError( + f"Solver {solver} supports only 'l2' or None penalties, got {penalty} " + "penalty." + ) + if solver != "liblinear" and dual: + raise ValueError(f"Solver {solver} supports only dual=False, got dual={dual}") + + if penalty == "elasticnet" and solver != "saga": + raise ValueError( + f"Only 'saga' solver supports elasticnet penalty, got solver={solver}." + ) + + if solver == "liblinear" and penalty is None: + raise ValueError("penalty=None is not supported for the liblinear solver") + + return solver + + +def _check_multi_class(multi_class, solver, n_classes): + """Computes the multi class type, either "multinomial" or "ovr". + + For `n_classes` > 2 and a solver that supports it, returns "multinomial". + For all other cases, in particular binary classification, return "ovr". + """ + if multi_class == "auto": + if solver in ("liblinear",): + multi_class = "ovr" + elif n_classes > 2: + multi_class = "multinomial" + else: + multi_class = "ovr" + if multi_class == "multinomial" and solver in ("liblinear",): + raise ValueError("Solver %s does not support a multinomial backend." % solver) + return multi_class + + +def _logistic_regression_path( + X, + y, + pos_class=None, + Cs=10, + fit_intercept=True, + max_iter=100, + tol=1e-4, + verbose=0, + solver="lbfgs", + coef=None, + class_weight=None, + dual=False, + penalty="l2", + intercept_scaling=1.0, + multi_class="auto", + random_state=None, + check_input=True, + max_squared_sum=None, + sample_weight=None, + l1_ratio=None, + n_threads=1, +): + """Compute a Logistic Regression model for a list of regularization + parameters. + + This is an implementation that uses the result of the previous model + to speed up computations along the set of solutions, making it faster + than sequentially calling LogisticRegression for the different parameters. + Note that there will be no speedup with liblinear solver, since it does + not handle warm-starting. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Input data, target values. + + pos_class : int, default=None + The class with respect to which we perform a one-vs-all fit. + If None, then it is assumed that the given problem is binary. + + Cs : int or array-like of shape (n_cs,), default=10 + List of values for the regularization parameter or integer specifying + the number of regularization parameters that should be used. In this + case, the parameters will be chosen in a logarithmic scale between + 1e-4 and 1e4. + + fit_intercept : bool, default=True + Whether to fit an intercept for the model. In this case the shape of + the returned array is (n_cs, n_features + 1). + + max_iter : int, default=100 + Maximum number of iterations for the solver. + + tol : float, default=1e-4 + Stopping criterion. For the newton-cg and lbfgs solvers, the iteration + will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient. + + verbose : int, default=0 + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. 
+ + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ + default='lbfgs' + Numerical solver to use. + + coef : array-like of shape (n_features,), default=None + Initialization value for coefficients of logistic regression. + Useless for liblinear solver. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + dual : bool, default=False + Dual or primal formulation. Dual formulation is only implemented for + l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' + Used to specify the norm used in the penalization. The 'newton-cg', + 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is + only supported by the 'saga' solver. + + intercept_scaling : float, default=1. + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + multi_class : {'ovr', 'multinomial', 'auto'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + check_input : bool, default=True + If False, the input arrays X and y will not be checked. + + max_squared_sum : float, default=None + Maximum squared sum of X over samples. Used only in SAG solver. + If None, it will be computed, going through all the samples. + The value should be precomputed to speed up cross validation. + + sample_weight : array-like of shape(n_samples,), default=None + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + l1_ratio : float, default=None + The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only + used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent + to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent + to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a + combination of L1 and L2. 
+ + n_threads : int, default=1 + Number of OpenMP threads to use. + + Returns + ------- + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) + List of coefficients for the Logistic Regression model. If + fit_intercept is set to True then the second dimension will be + n_features + 1, where the last item represents the intercept. For + ``multiclass='multinomial'``, the shape is (n_classes, n_cs, + n_features) or (n_classes, n_cs, n_features + 1). + + Cs : ndarray + Grid of Cs used for cross-validation. + + n_iter : array of shape (n_cs,) + Actual number of iteration for each Cs. + + Notes + ----- + You might get slightly different results with the solver liblinear than + with the others since this uses LIBLINEAR which penalizes the intercept. + + .. versionchanged:: 0.19 + The "copy" parameter was removed. + """ + if isinstance(Cs, numbers.Integral): + Cs = np.logspace(-4, 4, Cs) + + solver = _check_solver(solver, penalty, dual) + + # Preprocessing. + if check_input: + X = check_array( + X, + accept_sparse="csr", + dtype=np.float64, + accept_large_sparse=solver not in ["liblinear", "sag", "saga"], + ) + y = check_array(y, ensure_2d=False, dtype=None) + check_consistent_length(X, y) + n_samples, n_features = X.shape + + classes = np.unique(y) + random_state = check_random_state(random_state) + + multi_class = _check_multi_class(multi_class, solver, len(classes)) + if pos_class is None and multi_class != "multinomial": + if classes.size > 2: + raise ValueError("To fit OvR, use the pos_class argument") + # np.unique(y) gives labels in sorted order. + pos_class = classes[1] + + if sample_weight is not None or class_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) + + # If class_weights is a dict (provided by the user), the weights + # are assigned to the original labels. If it is "balanced", then + # the class_weights are assigned after masking the labels with a OvR. + le = LabelEncoder() + if isinstance(class_weight, dict) or ( + multi_class == "multinomial" and class_weight is not None + ): + class_weight_ = compute_class_weight( + class_weight, classes=classes, y=y, sample_weight=sample_weight + ) + sample_weight *= class_weight_[le.fit_transform(y)] + + # For doing a ovr, we need to mask the labels first. For the + # multinomial case this is not necessary. + if multi_class == "ovr": + w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype) + mask = y == pos_class + y_bin = np.ones(y.shape, dtype=X.dtype) + if solver == "liblinear": + mask_classes = np.array([-1, 1]) + y_bin[~mask] = -1.0 + else: + # HalfBinomialLoss, used for those solvers, represents y in [0, 1] instead + # of in [-1, 1]. + mask_classes = np.array([0, 1]) + y_bin[~mask] = 0.0 + + # for compute_class_weight + if class_weight == "balanced": + class_weight_ = compute_class_weight( + class_weight, + classes=mask_classes, + y=y_bin, + sample_weight=sample_weight, + ) + sample_weight *= class_weight_[le.fit_transform(y_bin)] + + else: + if solver in ["sag", "saga", "lbfgs", "newton-cg", "newton-cholesky"]: + # SAG, lbfgs, newton-cg and newton-cholesky multinomial solvers need + # LabelEncoder, not LabelBinarizer, i.e. y as a 1d-array of integers. + # LabelEncoder also saves memory compared to LabelBinarizer, especially + # when n_classes is large. + le = LabelEncoder() + Y_multi = le.fit_transform(y).astype(X.dtype, copy=False) + else: + # For liblinear solver, apply LabelBinarizer, i.e. y is one-hot encoded. 
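+            # A hedged illustration (not executed here): LabelBinarizer turns
+            # class labels into one-hot rows, e.g.
+            #
+            #   from sklearn.preprocessing import LabelBinarizer
+            #   LabelBinarizer().fit_transform([0, 1, 2, 1])
+            #   # -> [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]
+            #
+            # For a binary y it returns a single 0/1 column, which is why the
+            # hstack below rebuilds the two-column encoding.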
+ lbin = LabelBinarizer() + Y_multi = lbin.fit_transform(y) + if Y_multi.shape[1] == 1: + Y_multi = np.hstack([1 - Y_multi, Y_multi]) + + w0 = np.zeros( + (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype + ) + + # IMPORTANT NOTE: + # All solvers relying on LinearModelLoss need to scale the penalty with n_samples + # or the sum of sample weights because the implemented logistic regression + # objective here is (unfortunately) + # C * sum(pointwise_loss) + penalty + # instead of (as LinearModelLoss does) + # mean(pointwise_loss) + 1/C * penalty + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # This needs to be calculated after sample_weight is multiplied by + # class_weight. It is even tested that passing class_weight is equivalent to + # passing sample_weights according to class_weight. + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + + if coef is not None: + # it must work both giving the bias term and not + if multi_class == "ovr": + if coef.size not in (n_features, w0.size): + raise ValueError( + "Initialization coef is of shape %d, expected shape %d or %d" + % (coef.size, n_features, w0.size) + ) + w0[: coef.size] = coef + else: + # For binary problems coef.shape[0] should be 1, otherwise it + # should be classes.size. + n_classes = classes.size + if n_classes == 2: + n_classes = 1 + + if coef.shape[0] != n_classes or coef.shape[1] not in ( + n_features, + n_features + 1, + ): + raise ValueError( + "Initialization coef is of shape (%d, %d), expected " + "shape (%d, %d) or (%d, %d)" + % ( + coef.shape[0], + coef.shape[1], + classes.size, + n_features, + classes.size, + n_features + 1, + ) + ) + + if n_classes == 1: + w0[0, : coef.shape[1]] = -coef + w0[1, : coef.shape[1]] = coef + else: + w0[:, : coef.shape[1]] = coef + + if multi_class == "multinomial": + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # scipy.optimize.minimize and newton-cg accept only ravelled parameters, + # i.e. 1d-arrays. LinearModelLoss expects classes to be contiguous and + # reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F"). + # As w0 is F-contiguous, ravel(order="F") also avoids a copy. 
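+            # A minimal sketch of this layout (illustrative values only): with
+            # n_classes=3 and n_dof=2,
+            #
+            #   w0 = np.arange(6.0).reshape(3, 2)   # [[0, 1], [2, 3], [4, 5]]
+            #   w0.ravel(order="F")                 # [0, 2, 4, 1, 3, 5]
+            #   w0.ravel(order="F").reshape((3, -1), order="F")  # recovers w0
+            #
+            # i.e. the three class coefficients of the first column come first,
+            # then those of the next column, matching the class-contiguous
+            # layout LinearModelLoss expects.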
+ w0 = w0.ravel(order="F") + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=classes.size), + fit_intercept=fit_intercept, + ) + target = Y_multi + if solver == "lbfgs": + func = loss.loss_gradient + elif solver == "newton-cg": + func = loss.loss + grad = loss.gradient + hess = loss.gradient_hessian_product # hess = [gradient, hessp] + warm_start_sag = {"coef": w0.T} + else: + target = y_bin + if solver == "lbfgs": + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept + ) + func = loss.loss_gradient + elif solver == "newton-cg": + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept + ) + func = loss.loss + grad = loss.gradient + hess = loss.gradient_hessian_product # hess = [gradient, hessp] + elif solver == "newton-cholesky": + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept + ) + warm_start_sag = {"coef": np.expand_dims(w0, axis=1)} + + coefs = list() + n_iter = np.zeros(len(Cs), dtype=np.int32) + for i, C in enumerate(Cs): + if solver == "lbfgs": + l2_reg_strength = 1.0 / (C * sw_sum) + iprint = [-1, 50, 1, 100, 101][ + np.searchsorted(np.array([0, 1, 2, 3]), verbose) + ] + opt_res = optimize.minimize( + func, + w0, + method="L-BFGS-B", + jac=True, + args=(X, target, sample_weight, l2_reg_strength, n_threads), + options={ + "maxiter": max_iter, + "maxls": 50, # default is 20 + "gtol": tol, + "ftol": 64 * np.finfo(float).eps, + **_get_additional_lbfgs_options_dict("iprint", iprint), + }, + ) + n_iter_i = _check_optimize_result( + solver, + opt_res, + max_iter, + extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG, + ) + w0, loss = opt_res.x, opt_res.fun + elif solver == "newton-cg": + l2_reg_strength = 1.0 / (C * sw_sum) + args = (X, target, sample_weight, l2_reg_strength, n_threads) + w0, n_iter_i = _newton_cg( + grad_hess=hess, + func=func, + grad=grad, + x0=w0, + args=args, + maxiter=max_iter, + tol=tol, + verbose=verbose, + ) + elif solver == "newton-cholesky": + l2_reg_strength = 1.0 / (C * sw_sum) + sol = NewtonCholeskySolver( + coef=w0, + linear_loss=loss, + l2_reg_strength=l2_reg_strength, + tol=tol, + max_iter=max_iter, + n_threads=n_threads, + verbose=verbose, + ) + w0 = sol.solve(X=X, y=target, sample_weight=sample_weight) + n_iter_i = sol.iteration + elif solver == "liblinear": + if len(classes) > 2: + warnings.warn( + "Using the 'liblinear' solver for multiclass classification is " + "deprecated. An error will be raised in 1.8. Either use another " + "solver which supports the multinomial loss or wrap the estimator " + "in a OneVsRestClassifier to keep applying a one-versus-rest " + "scheme.", + FutureWarning, + ) + ( + coef_, + intercept_, + n_iter_i, + ) = _fit_liblinear( + X, + target, + C, + fit_intercept, + intercept_scaling, + None, + penalty, + dual, + verbose, + max_iter, + tol, + random_state, + sample_weight=sample_weight, + ) + if fit_intercept: + w0 = np.concatenate([coef_.ravel(), intercept_]) + else: + w0 = coef_.ravel() + # n_iter_i is an array for each class. However, `target` is always encoded + # in {-1, 1}, so we only take the first element of n_iter_i. 
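+            # Hedged illustration: with the binary-encoded target, n_iter_i is
+            # a one-element array, e.g. np.array([37], dtype=np.int32), and
+            # .item() below unwraps it to the plain Python int 37.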
+ n_iter_i = n_iter_i.item() + + elif solver in ["sag", "saga"]: + if multi_class == "multinomial": + target = target.astype(X.dtype, copy=False) + loss = "multinomial" + else: + loss = "log" + # alpha is for L2-norm, beta is for L1-norm + if penalty == "l1": + alpha = 0.0 + beta = 1.0 / C + elif penalty == "l2": + alpha = 1.0 / C + beta = 0.0 + else: # Elastic-Net penalty + alpha = (1.0 / C) * (1 - l1_ratio) + beta = (1.0 / C) * l1_ratio + + w0, n_iter_i, warm_start_sag = sag_solver( + X, + target, + sample_weight, + loss, + alpha, + beta, + max_iter, + tol, + verbose, + random_state, + False, + max_squared_sum, + warm_start_sag, + is_saga=(solver == "saga"), + ) + + else: + raise ValueError( + "solver must be one of {'liblinear', 'lbfgs', " + "'newton-cg', 'sag'}, got '%s' instead" % solver + ) + + if multi_class == "multinomial": + n_classes = max(2, classes.size) + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + multi_w0 = np.reshape(w0, (n_classes, -1), order="F") + else: + multi_w0 = w0 + if n_classes == 2: + multi_w0 = multi_w0[1][np.newaxis, :] + coefs.append(multi_w0.copy()) + else: + coefs.append(w0.copy()) + + n_iter[i] = n_iter_i + + return np.array(coefs), np.array(Cs), n_iter + + +# helper function for LogisticCV +def _log_reg_scoring_path( + X, + y, + train, + test, + *, + pos_class, + Cs, + scoring, + fit_intercept, + max_iter, + tol, + class_weight, + verbose, + solver, + penalty, + dual, + intercept_scaling, + multi_class, + random_state, + max_squared_sum, + sample_weight, + l1_ratio, + score_params, +): + """Computes scores across logistic_regression_path + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target labels. + + train : list of indices + The indices of the train set. + + test : list of indices + The indices of the test set. + + pos_class : int + The class with respect to which we perform a one-vs-all fit. + If None, then it is assumed that the given problem is binary. + + Cs : int or list of floats + Each of the values in Cs describes the inverse of + regularization strength. If Cs is as an int, then a grid of Cs + values are chosen in a logarithmic scale between 1e-4 and 1e4. + + scoring : str, callable or None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: :ref:`accuracy ` is used. + + fit_intercept : bool + If False, then the bias term is set to zero. Else the last + term of each coef_ gives us the intercept. + + max_iter : int + Maximum number of iterations for the solver. + + tol : float + Tolerance for stopping criteria. + + class_weight : dict or 'balanced' + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + verbose : int + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. 
+ + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'} + Decides which solver to use. + + penalty : {'l1', 'l2', 'elasticnet'} + Used to specify the norm used in the penalization. The 'newton-cg', + 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is + only supported by the 'saga' solver. + + dual : bool + Dual or primal formulation. Dual formulation is only implemented for + l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + intercept_scaling : float + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + multi_class : {'auto', 'ovr', 'multinomial'} + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + + random_state : int, RandomState instance + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + max_squared_sum : float + Maximum squared sum of X over samples. Used only in SAG solver. + If None, it will be computed, going through all the samples. + The value should be precomputed to speed up cross validation. + + sample_weight : array-like of shape(n_samples,) + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + l1_ratio : float + The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only + used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent + to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent + to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a + combination of L1 and L2. + + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + Returns + ------- + coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1) + List of coefficients for the Logistic Regression model. If + fit_intercept is set to True then the second dimension will be + n_features + 1, where the last item represents the intercept. + + Cs : ndarray + Grid of Cs used for cross-validation. + + scores : ndarray of shape (n_cs,) + Scores obtained for each Cs. + + n_iter : ndarray of shape(n_cs,) + Actual number of iteration for each Cs. 
+ """ + X_train = X[train] + X_test = X[test] + y_train = y[train] + y_test = y[test] + + sw_train, sw_test = None, None + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + sw_train = sample_weight[train] + sw_test = sample_weight[test] + + coefs, Cs, n_iter = _logistic_regression_path( + X_train, + y_train, + Cs=Cs, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + solver=solver, + max_iter=max_iter, + class_weight=class_weight, + pos_class=pos_class, + multi_class=multi_class, + tol=tol, + verbose=verbose, + dual=dual, + penalty=penalty, + intercept_scaling=intercept_scaling, + random_state=random_state, + check_input=False, + max_squared_sum=max_squared_sum, + sample_weight=sw_train, + ) + + log_reg = LogisticRegression(solver=solver, multi_class=multi_class) + + # The score method of Logistic Regression has a classes_ attribute. + if multi_class == "ovr": + log_reg.classes_ = np.array([-1, 1]) + elif multi_class == "multinomial": + log_reg.classes_ = np.unique(y_train) + else: + raise ValueError( + "multi_class should be either multinomial or ovr, got %d" % multi_class + ) + + if pos_class is not None: + mask = y_test == pos_class + y_test = np.ones(y_test.shape, dtype=np.float64) + y_test[~mask] = -1.0 + + scores = list() + + scoring = get_scorer(scoring) + for w in coefs: + if multi_class == "ovr": + w = w[np.newaxis, :] + if fit_intercept: + log_reg.coef_ = w[:, :-1] + log_reg.intercept_ = w[:, -1] + else: + log_reg.coef_ = w + log_reg.intercept_ = 0.0 + + if scoring is None: + scores.append(log_reg.score(X_test, y_test, sample_weight=sw_test)) + else: + score_params = score_params or {} + score_params = _check_method_params(X=X, params=score_params, indices=test) + scores.append(scoring(log_reg, X_test, y_test, **score_params)) + return coefs, Cs, np.array(scores), n_iter + + +class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): + """ + Logistic Regression (aka logit, MaxEnt) classifier. + + This class implements regularized logistic regression using the + 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note + that regularization is applied by default**. It can handle both dense + and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit + floats for optimal performance; any other input format will be converted + (and copied). + + The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization + with primal formulation, or no regularization. The 'liblinear' solver + supports both L1 and L2 regularization, with a dual formulation only for + the L2 penalty. The Elastic-Net regularization is only supported by the + 'saga' solver. + + For :term:`multiclass` problems, all solvers but 'liblinear' optimize the + (penalized) multinomial loss. 'liblinear' only handle binary classification but can + be extended to handle multiclass by using + :class:`~sklearn.multiclass.OneVsRestClassifier`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + penalty : {'l1', 'l2', 'elasticnet', None}, default='l2' + Specify the norm of the penalty: + + - `None`: no penalty is added; + - `'l2'`: add a L2 penalty term and it is the default choice; + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. + + .. 
versionadded:: 0.19 + l1 penalty with SAGA solver (allowing 'multinomial' + L1) + + dual : bool, default=False + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Inverse of regularization strength; must be a positive float. + Like in support vector machines, smaller values specify stronger + regularization. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the decision function. + + intercept_scaling : float, default=1 + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + .. versionadded:: 0.17 + *class_weight='balanced'* + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the + data. See :term:`Glossary ` for details. + + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ + default='lbfgs' + + Algorithm to use in the optimization problem. Default is 'lbfgs'. + To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For :term:`multiclass` problems, all solvers except 'liblinear' minimize the + full multinomial loss; + - 'liblinear' can only handle binary classification by default. To apply a + one-versus-rest scheme for the multiclass setting one can wrap it with the + :class:`~sklearn.multiclass.OneVsRestClassifier`. + - 'newton-cholesky' is a good choice for + `n_samples` >> `n_features * n_classes`, especially with one-hot encoded + categorical features with rare categories. Be aware that the memory usage + of this solver has a quadratic dependency on `n_features * n_classes` + because it explicitly computes the full Hessian matrix. + + .. 
warning:: + The choice of the algorithm depends on the penalty chosen and on + (multinomial) multiclass support: + + ================= ============================== ====================== + solver penalty multinomial multiclass + ================= ============================== ====================== + 'lbfgs' 'l2', None yes + 'liblinear' 'l1', 'l2' no + 'newton-cg' 'l2', None yes + 'newton-cholesky' 'l2', None yes + 'sag' 'l2', None yes + 'saga' 'elasticnet', 'l1', 'l2', None yes + ================= ============================== ====================== + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on features + with approximately the same scale. You can preprocess the data with + a scaler from :mod:`sklearn.preprocessing`. + + .. seealso:: + Refer to the :ref:`User Guide ` for more + information regarding :class:`LogisticRegression` and more specifically the + :ref:`Table ` + summarizing solver/penalty supports. + + .. versionadded:: 0.17 + Stochastic Average Gradient (SAG) descent solver. Multinomial support in + version 0.18. + .. versionadded:: 0.19 + SAGA solver. + .. versionchanged:: 0.22 + The default solver changed from 'liblinear' to 'lbfgs' in 0.22. + .. versionadded:: 1.2 + newton-cholesky solver. Multinomial support in version 1.6. + + max_iter : int, default=100 + Maximum number of iterations taken for the solvers to converge. + + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + .. deprecated:: 1.5 + ``multi_class`` was deprecated in version 1.5 and will be removed in 1.8. + From then on, the recommended 'multinomial' will always be used for + `n_classes >= 3`. + Solvers that do not support 'multinomial' will raise an error. + Use `sklearn.multiclass.OneVsRestClassifier(LogisticRegression())` if you + still want to use OvR. + + verbose : int, default=0 + For the liblinear and lbfgs solvers set verbose to any positive + number for verbosity. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + Useless for liblinear solver. See :term:`the Glossary `. + + .. versionadded:: 0.17 + *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. + + n_jobs : int, default=None + Number of CPU cores used when parallelizing over classes if + multi_class='ovr'". This parameter is ignored when the ``solver`` is + set to 'liblinear' regardless of whether 'multi_class' is specified or + not. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. + See :term:`Glossary ` for more details. + + l1_ratio : float, default=None + The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only + used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent + to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent + to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a + combination of L1 and L2. 
+ + Attributes + ---------- + + classes_ : ndarray of shape (n_classes, ) + A list of class labels known to the classifier. + + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. + + `coef_` is of shape (1, n_features) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `coef_` corresponds + to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). + + intercept_ : ndarray of shape (1,) or (n_classes,) + Intercept (a.k.a. bias) added to the decision function. + + If `fit_intercept` is set to False, the intercept is set to zero. + `intercept_` is of shape (1,) when the given problem is binary. + In particular, when `multi_class='multinomial'`, `intercept_` + corresponds to outcome 1 (True) and `-intercept_` corresponds to + outcome 0 (False). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes,) or (1, ) + Actual number of iterations for all classes. If binary or multinomial, + it returns only 1 element. For liblinear solver, only the maximum + number of iteration across all classes is given. + + .. versionchanged:: 0.20 + + In SciPy <= 1.0.0 the number of lbfgs iterations may exceed + ``max_iter``. ``n_iter_`` will now report at most ``max_iter``. + + See Also + -------- + SGDClassifier : Incrementally trained logistic regression (when given + the parameter ``loss="log_loss"``). + LogisticRegressionCV : Logistic regression with built-in cross validation. + + Notes + ----- + The underlying C implementation uses a random number generator to + select features when fitting the model. It is thus not uncommon, + to have slightly different results for the same input data. If + that happens, try with a smaller tol parameter. + + Predict output may not match that of standalone liblinear in certain + cases. See :ref:`differences from liblinear ` + in the narrative documentation. + + References + ---------- + + L-BFGS-B -- Software for Large-scale Bound-constrained Optimization + Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales. + http://users.iems.northwestern.edu/~nocedal/lbfgsb.html + + LIBLINEAR -- A Library for Large Linear Classification + https://www.csie.ntu.edu.tw/~cjlin/liblinear/ + + SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach + Minimizing Finite Sums with the Stochastic Average Gradient + https://hal.inria.fr/hal-00860051/document + + SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014). + :arxiv:`"SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + + Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent + methods for logistic regression and maximum entropy models. + Machine Learning 85(1-2):41-75. 
+ https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(random_state=0).fit(X, y) + >>> clf.predict(X[:2, :]) + array([0, 0]) + >>> clf.predict_proba(X[:2, :]) + array([[9.82e-01, 1.82e-02, 1.44e-08], + [9.72e-01, 2.82e-02, 3.02e-08]]) + >>> clf.score(X, y) + 0.97 + + For a comparison of the LogisticRegression with other classifiers see: + :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py`. + """ + + _parameter_constraints: dict = { + "penalty": [StrOptions({"l1", "l2", "elasticnet"}), None], + "dual": ["boolean"], + "tol": [Interval(Real, 0, None, closed="left")], + "C": [Interval(Real, 0, None, closed="right")], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "class_weight": [dict, StrOptions({"balanced"}), None], + "random_state": ["random_state"], + "solver": [ + StrOptions( + {"lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"} + ) + ], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "n_jobs": [None, Integral], + "l1_ratio": [Interval(Real, 0, 1, closed="both"), None], + "multi_class": [ + StrOptions({"auto", "ovr", "multinomial"}), + Hidden(StrOptions({"deprecated"})), + ], + } + + def __init__( + self, + penalty="l2", + *, + dual=False, + tol=1e-4, + C=1.0, + fit_intercept=True, + intercept_scaling=1, + class_weight=None, + random_state=None, + solver="lbfgs", + max_iter=100, + multi_class="deprecated", + verbose=0, + warm_start=False, + n_jobs=None, + l1_ratio=None, + ): + self.penalty = penalty + self.dual = dual + self.tol = tol + self.C = C + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.class_weight = class_weight + self.random_state = random_state + self.solver = solver + self.max_iter = max_iter + self.multi_class = multi_class + self.verbose = verbose + self.warm_start = warm_start + self.n_jobs = n_jobs + self.l1_ratio = l1_ratio + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """ + Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,) default=None + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + .. versionadded:: 0.17 + *sample_weight* support to LogisticRegression. + + Returns + ------- + self + Fitted estimator. + + Notes + ----- + The SAGA solver supports both float64 and float32 bit arrays. + """ + solver = _check_solver(self.solver, self.penalty, self.dual) + + if self.penalty != "elasticnet" and self.l1_ratio is not None: + warnings.warn( + "l1_ratio parameter is only used when penalty is " + "'elasticnet'. 
Got " + "(penalty={})".format(self.penalty) + ) + + if self.penalty == "elasticnet" and self.l1_ratio is None: + raise ValueError("l1_ratio must be specified when penalty is elasticnet.") + + if self.penalty is None: + if self.C != 1.0: # default values + warnings.warn( + "Setting penalty=None will ignore the C and l1_ratio parameters" + ) + # Note that check for l1_ratio is done right above + C_ = np.inf + penalty = "l2" + else: + C_ = self.C + penalty = self.penalty + + if solver == "lbfgs": + _dtype = np.float64 + else: + _dtype = [np.float64, np.float32] + + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=_dtype, + order="C", + accept_large_sparse=solver not in ["liblinear", "sag", "saga"], + ) + check_classification_targets(y) + self.classes_ = np.unique(y) + + # TODO(1.8) remove multi_class + multi_class = self.multi_class + if self.multi_class == "multinomial" and len(self.classes_) == 2: + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, binary problems will be fit as proper binary " + " logistic regression models (as if multi_class='ovr' were set)." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class in ("multinomial", "auto"): + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, it will always use 'multinomial'." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class == "ovr": + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. Use OneVsRestClassifier(LogisticRegression(..)) instead." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + else: + # Set to old default value. + multi_class = "auto" + multi_class = _check_multi_class(multi_class, solver, len(self.classes_)) + + if solver == "liblinear": + if len(self.classes_) > 2: + warnings.warn( + "Using the 'liblinear' solver for multiclass classification is " + "deprecated. An error will be raised in 1.8. Either use another " + "solver which supports the multinomial loss or wrap the estimator " + "in a OneVsRestClassifier to keep applying a one-versus-rest " + "scheme.", + FutureWarning, + ) + if effective_n_jobs(self.n_jobs) != 1: + warnings.warn( + "'n_jobs' > 1 does not have any effect when" + " 'solver' is set to 'liblinear'. 
Got 'n_jobs'" + " = {}.".format(effective_n_jobs(self.n_jobs)) + ) + self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + self.class_weight, + self.penalty, + self.dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + sample_weight=sample_weight, + ) + return self + + if solver in ["sag", "saga"]: + max_squared_sum = row_norms(X, squared=True).max() + else: + max_squared_sum = None + + n_classes = len(self.classes_) + classes_ = self.classes_ + if n_classes < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: %r" % classes_[0] + ) + + if len(self.classes_) == 2: + n_classes = 1 + classes_ = classes_[1:] + + if self.warm_start: + warm_start_coef = getattr(self, "coef_", None) + else: + warm_start_coef = None + if warm_start_coef is not None and self.fit_intercept: + warm_start_coef = np.append( + warm_start_coef, self.intercept_[:, np.newaxis], axis=1 + ) + + # Hack so that we iterate only once for the multinomial case. + if multi_class == "multinomial": + classes_ = [None] + warm_start_coef = [warm_start_coef] + if warm_start_coef is None: + warm_start_coef = [None] * n_classes + + path_func = delayed(_logistic_regression_path) + + # The SAG solver releases the GIL so it's more efficient to use + # threads for this solver. + if solver in ["sag", "saga"]: + prefer = "threads" + else: + prefer = "processes" + + # TODO: Refactor this to avoid joblib parallelism entirely when doing binary + # and multinomial multiclass classification and use joblib only for the + # one-vs-rest multiclass case. + if ( + solver in ["lbfgs", "newton-cg", "newton-cholesky"] + and len(classes_) == 1 + and effective_n_jobs(self.n_jobs) == 1 + ): + # In the future, we would like n_threads = _openmp_effective_n_threads() + # For the time being, we just do + n_threads = 1 + else: + n_threads = 1 + + fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)( + path_func( + X, + y, + pos_class=class_, + Cs=[C_], + l1_ratio=self.l1_ratio, + fit_intercept=self.fit_intercept, + tol=self.tol, + verbose=self.verbose, + solver=solver, + multi_class=multi_class, + max_iter=self.max_iter, + class_weight=self.class_weight, + check_input=False, + random_state=self.random_state, + coef=warm_start_coef_, + penalty=penalty, + max_squared_sum=max_squared_sum, + sample_weight=sample_weight, + n_threads=n_threads, + ) + for class_, warm_start_coef_ in zip(classes_, warm_start_coef) + ) + + fold_coefs_, _, n_iter_ = zip(*fold_coefs_) + self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] + + n_features = X.shape[1] + if multi_class == "multinomial": + self.coef_ = fold_coefs_[0][0] + else: + self.coef_ = np.asarray(fold_coefs_) + self.coef_ = self.coef_.reshape( + n_classes, n_features + int(self.fit_intercept) + ) + + if self.fit_intercept: + self.intercept_ = self.coef_[:, -1] + self.coef_ = self.coef_[:, :-1] + else: + self.intercept_ = np.zeros(n_classes) + + return self + + def predict_proba(self, X): + """ + Probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + For a multi_class problem, if multi_class is set to be "multinomial" + the softmax function is used to find the predicted probability of + each class. + Else use a one-vs-rest approach, i.e. 
calculate the probability + of each class assuming it to be positive using the logistic function + and normalize these values across all the classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in ``self.classes_``. + """ + check_is_fitted(self) + + ovr = self.multi_class in ["ovr", "warn"] or ( + self.multi_class in ["auto", "deprecated"] + and (self.classes_.size <= 2 or self.solver == "liblinear") + ) + if ovr: + return super()._predict_proba_lr(X) + else: + decision = self.decision_function(X) + if decision.ndim == 1: + # Workaround for multi_class="multinomial" and binary outcomes + # which requires softmax prediction with only a 1D decision. + decision_2d = np.c_[-decision, decision] + else: + decision_2d = decision + return softmax(decision_2d, copy=False) + + def predict_log_proba(self, X): + """ + Predict logarithm of probability estimates. + + The returned estimates for all classes are ordered by the + label of classes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Vector to be scored, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + T : array-like of shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in ``self.classes_``. + """ + return np.log(self.predict_proba(X)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstimator): + """Logistic Regression CV (aka logit, MaxEnt) classifier. + + See glossary entry for :term:`cross-validation estimator`. + + This class implements logistic regression using liblinear, newton-cg, sag + or lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2 + regularization with primal formulation. The liblinear solver supports both + L1 and L2 regularization, with a dual formulation only for the L2 penalty. + Elastic-Net penalty is only supported by the saga solver. + + For the grid of `Cs` values and `l1_ratios` values, the best hyperparameter + is selected by the cross-validator + :class:`~sklearn.model_selection.StratifiedKFold`, but it can be changed + using the :term:`cv` parameter. The 'newton-cg', 'sag', 'saga' and 'lbfgs' + solvers can warm-start the coefficients (see :term:`Glossary`). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + Cs : int or list of floats, default=10 + Each of the values in Cs describes the inverse of regularization + strength. If Cs is as an int, then a grid of Cs values are chosen + in a logarithmic scale between 1e-4 and 1e4. + Like in support vector machines, smaller values specify stronger + regularization. + + fit_intercept : bool, default=True + Specifies if a constant (a.k.a. bias or intercept) should be + added to the decision function. + + cv : int or cross-validation generator, default=None + The default cross-validation generator used is Stratified K-Folds. + If an integer is provided, then it is the number of folds used. + See the module :mod:`sklearn.model_selection` module for the + list of possible cross-validation objects. 
+ + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + dual : bool, default=False + Dual (constrained) or primal (regularized, see also + :ref:`this equation `) formulation. Dual formulation + is only implemented for l2 penalty with liblinear solver. Prefer dual=False when + n_samples > n_features. + + penalty : {'l1', 'l2', 'elasticnet'}, default='l2' + Specify the norm of the penalty: + + - `'l2'`: add a L2 penalty term (used by default); + - `'l1'`: add a L1 penalty term; + - `'elasticnet'`: both L1 and L2 penalty terms are added. + + .. warning:: + Some penalties may not work with some solvers. See the parameter + `solver` below, to know the compatibility between the penalty and + solver. + + scoring : str or callable, default=None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: :ref:`accuracy ` is used. + + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ + default='lbfgs' + + Algorithm to use in the optimization problem. Default is 'lbfgs'. + To choose a solver, you might want to consider the following aspects: + + - For small datasets, 'liblinear' is a good choice, whereas 'sag' + and 'saga' are faster for large ones; + - For multiclass problems, all solvers except 'liblinear' minimize the full + multinomial loss; + - 'liblinear' might be slower in :class:`LogisticRegressionCV` + because it does not handle warm-starting. + - 'liblinear' can only handle binary classification by default. To apply a + one-versus-rest scheme for the multiclass setting one can wrap it with the + :class:`~sklearn.multiclass.OneVsRestClassifier`. + - 'newton-cholesky' is a good choice for + `n_samples` >> `n_features * n_classes`, especially with one-hot encoded + categorical features with rare categories. Be aware that the memory usage + of this solver has a quadratic dependency on `n_features * n_classes` + because it explicitly computes the full Hessian matrix. + + .. warning:: + The choice of the algorithm depends on the penalty chosen and on + (multinomial) multiclass support: + + ================= ============================== ====================== + solver penalty multinomial multiclass + ================= ============================== ====================== + 'lbfgs' 'l2' yes + 'liblinear' 'l1', 'l2' no + 'newton-cg' 'l2' yes + 'newton-cholesky' 'l2', yes + 'sag' 'l2', yes + 'saga' 'elasticnet', 'l1', 'l2' yes + ================= ============================== ====================== + + .. note:: + 'sag' and 'saga' fast convergence is only guaranteed on features + with approximately the same scale. You can preprocess the data with + a scaler from :mod:`sklearn.preprocessing`. + + .. versionadded:: 0.17 + Stochastic Average Gradient (SAG) descent solver. Multinomial support in + version 0.18. + .. versionadded:: 0.19 + SAGA solver. + .. versionadded:: 1.2 + newton-cholesky solver. Multinomial support in version 1.6. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + max_iter : int, default=100 + Maximum number of iterations of the optimization algorithm. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. 
+ + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + .. versionadded:: 0.17 + class_weight == 'balanced' + + n_jobs : int, default=None + Number of CPU cores used during the cross-validation loop. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any + positive number for verbosity. + + refit : bool, default=True + If set to True, the scores are averaged across all folds, and the + coefs and the C that corresponds to the best score is taken, and a + final refit is done using these parameters. + Otherwise the coefs, intercepts and C that correspond to the + best scores across folds are averaged. + + intercept_scaling : float, default=1 + Useful only when the solver `liblinear` is used + and `self.fit_intercept` is set to `True`. In this case, `x` becomes + `[x, self.intercept_scaling]`, + i.e. a "synthetic" feature with constant value equal to + `intercept_scaling` is appended to the instance vector. + The intercept becomes + ``intercept_scaling * synthetic_feature_weight``. + + .. note:: + The synthetic feature weight is subject to L1 or L2 + regularization as all other features. + To lessen the effect of regularization on synthetic feature weight + (and therefore on the intercept) `intercept_scaling` has to be increased. + + multi_class : {'auto, 'ovr', 'multinomial'}, default='auto' + If the option chosen is 'ovr', then a binary problem is fit for each + label. For 'multinomial' the loss minimised is the multinomial loss fit + across the entire probability distribution, *even when the data is + binary*. 'multinomial' is unavailable when solver='liblinear'. + 'auto' selects 'ovr' if the data is binary, or if solver='liblinear', + and otherwise selects 'multinomial'. + + .. versionadded:: 0.18 + Stochastic Average Gradient descent solver for 'multinomial' case. + .. versionchanged:: 0.22 + Default changed from 'ovr' to 'auto' in 0.22. + .. deprecated:: 1.5 + ``multi_class`` was deprecated in version 1.5 and will be removed in 1.8. + From then on, the recommended 'multinomial' will always be used for + `n_classes >= 3`. + Solvers that do not support 'multinomial' will raise an error. + Use `sklearn.multiclass.OneVsRestClassifier(LogisticRegressionCV())` if you + still want to use OvR. + + random_state : int, RandomState instance, default=None + Used when `solver='sag'`, 'saga' or 'liblinear' to shuffle the data. + Note that this only applies to the solver and not the cross-validation + generator. See :term:`Glossary ` for details. + + l1_ratios : list of float, default=None + The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. + Only used if ``penalty='elasticnet'``. A value of 0 is equivalent to + using ``penalty='l2'``, while 1 is equivalent to using + ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination + of L1 and L2. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes, ) + A list of class labels known to the classifier. + + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. 
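A small sketch of inspecting the fitted coefficients and the selected regularization (the exact shapes are spelled out just below):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegressionCV

    X, y = load_iris(return_X_y=True)
    clf = LogisticRegressionCV(cv=3).fit(X, y)
    print(clf.coef_.shape)  # (n_classes, n_features), i.e. (3, 4) for iris
    print(clf.C_)           # best C per class, see `C_` below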
+ + `coef_` is of shape (1, n_features) when the given problem + is binary. + + intercept_ : ndarray of shape (1,) or (n_classes,) + Intercept (a.k.a. bias) added to the decision function. + + If `fit_intercept` is set to False, the intercept is set to zero. + `intercept_` is of shape(1,) when the problem is binary. + + Cs_ : ndarray of shape (n_cs) + Array of C i.e. inverse of regularization parameter values used + for cross-validation. + + l1_ratios_ : ndarray of shape (n_l1_ratios) + Array of l1_ratios used for cross-validation. If no l1_ratio is used + (i.e. penalty is not 'elasticnet'), this is set to ``[None]`` + + coefs_paths_ : ndarray of shape (n_folds, n_cs, n_features) or \ + (n_folds, n_cs, n_features + 1) + dict with classes as the keys, and the path of coefficients obtained + during cross-validating across each fold and then across each Cs + after doing an OvR for the corresponding class as values. + If the 'multi_class' option is set to 'multinomial', then + the coefs_paths are the coefficients corresponding to each class. + Each dict value has shape ``(n_folds, n_cs, n_features)`` or + ``(n_folds, n_cs, n_features + 1)`` depending on whether the + intercept is fit or not. If ``penalty='elasticnet'``, the shape is + ``(n_folds, n_cs, n_l1_ratios_, n_features)`` or + ``(n_folds, n_cs, n_l1_ratios_, n_features + 1)``. + + scores_ : dict + dict with classes as the keys, and the values as the + grid of scores obtained during cross-validating each fold, after doing + an OvR for the corresponding class. If the 'multi_class' option + given is 'multinomial' then the same scores are repeated across + all classes, since this is the multinomial class. Each dict value + has shape ``(n_folds, n_cs)`` or ``(n_folds, n_cs, n_l1_ratios)`` if + ``penalty='elasticnet'``. + + C_ : ndarray of shape (n_classes,) or (n_classes - 1,) + Array of C that maps to the best scores across every class. If refit is + set to False, then for each class, the best C is the average of the + C's that correspond to the best scores for each fold. + `C_` is of shape(n_classes,) when the problem is binary. + + l1_ratio_ : ndarray of shape (n_classes,) or (n_classes - 1,) + Array of l1_ratio that maps to the best scores across every class. If + refit is set to False, then for each class, the best l1_ratio is the + average of the l1_ratio's that correspond to the best scores for each + fold. `l1_ratio_` is of shape(n_classes,) when the problem is binary. + + n_iter_ : ndarray of shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs) + Actual number of iterations for all classes, folds and Cs. + In the binary or multinomial cases, the first dimension is equal to 1. + If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds, + n_cs, n_l1_ratios)`` or ``(1, n_folds, n_cs, n_l1_ratios)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + LogisticRegression : Logistic regression without tuning the + hyperparameter `C`. 
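As a rough analogy (not an exact equivalence), tuning `C` here plays the same role as grid-searching a plain `LogisticRegression`; the CV estimator can additionally warm-start coefficients along the `Cs` path with the solvers noted above:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    # A comparable, but typically more expensive, explicit grid search over C.
    grid = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={"C": np.logspace(-4, 4, 10)},
        cv=5,
    )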
+ + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegressionCV + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y) + >>> clf.predict(X[:2, :]) + array([0, 0]) + >>> clf.predict_proba(X[:2, :]).shape + (2, 3) + >>> clf.score(X, y) + 0.98... + """ + + _parameter_constraints: dict = {**LogisticRegression._parameter_constraints} + + for param in ["C", "warm_start", "l1_ratio"]: + _parameter_constraints.pop(param) + + _parameter_constraints.update( + { + "Cs": [Interval(Integral, 1, None, closed="left"), "array-like"], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "l1_ratios": ["array-like", None], + "refit": ["boolean"], + "penalty": [StrOptions({"l1", "l2", "elasticnet"})], + } + ) + + def __init__( + self, + *, + Cs=10, + fit_intercept=True, + cv=None, + dual=False, + penalty="l2", + scoring=None, + solver="lbfgs", + tol=1e-4, + max_iter=100, + class_weight=None, + n_jobs=None, + verbose=0, + refit=True, + intercept_scaling=1.0, + multi_class="deprecated", + random_state=None, + l1_ratios=None, + ): + self.Cs = Cs + self.fit_intercept = fit_intercept + self.cv = cv + self.dual = dual + self.penalty = penalty + self.scoring = scoring + self.tol = tol + self.max_iter = max_iter + self.class_weight = class_weight + self.n_jobs = n_jobs + self.verbose = verbose + self.solver = solver + self.refit = refit + self.intercept_scaling = intercept_scaling + self.multi_class = multi_class + self.random_state = random_state + self.l1_ratios = l1_ratios + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,) default=None + Array of weights that are assigned to individual samples. + If not provided, then each sample is given unit weight. + + **params : dict + Parameters to pass to the underlying splitter and scorer. + + .. versionadded:: 1.4 + + Returns + ------- + self : object + Fitted LogisticRegressionCV estimator. + """ + _raise_for_params(params, self, "fit") + + solver = _check_solver(self.solver, self.penalty, self.dual) + + if self.penalty == "elasticnet": + if ( + self.l1_ratios is None + or len(self.l1_ratios) == 0 + or any( + ( + not isinstance(l1_ratio, numbers.Number) + or l1_ratio < 0 + or l1_ratio > 1 + ) + for l1_ratio in self.l1_ratios + ) + ): + raise ValueError( + "l1_ratios must be a list of numbers between " + "0 and 1; got (l1_ratios=%r)" % self.l1_ratios + ) + l1_ratios_ = self.l1_ratios + else: + if self.l1_ratios is not None: + warnings.warn( + "l1_ratios parameter is only used when penalty " + "is 'elasticnet'. 
Got (penalty={})".format(self.penalty) + ) + + l1_ratios_ = [None] + + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=solver not in ["liblinear", "sag", "saga"], + ) + check_classification_targets(y) + + class_weight = self.class_weight + + # Encode for string labels + label_encoder = LabelEncoder().fit(y) + y = label_encoder.transform(y) + if isinstance(class_weight, dict): + class_weight = { + label_encoder.transform([cls])[0]: v for cls, v in class_weight.items() + } + + # The original class labels + classes = self.classes_ = label_encoder.classes_ + encoded_labels = label_encoder.transform(label_encoder.classes_) + + # TODO(1.8) remove multi_class + multi_class = self.multi_class + if self.multi_class == "multinomial" and len(self.classes_) == 2: + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, binary problems will be fit as proper binary " + " logistic regression models (as if multi_class='ovr' were set)." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class in ("multinomial", "auto"): + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. From then on, it will always use 'multinomial'." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + elif self.multi_class == "ovr": + warnings.warn( + ( + "'multi_class' was deprecated in version 1.5 and will be removed in" + " 1.8. Use OneVsRestClassifier(LogisticRegressionCV(..)) instead." + " Leave it to its default value to avoid this warning." + ), + FutureWarning, + ) + else: + # Set to old default value. + multi_class = "auto" + multi_class = _check_multi_class(multi_class, solver, len(classes)) + + if solver in ["sag", "saga"]: + max_squared_sum = row_norms(X, squared=True).max() + else: + max_squared_sum = None + + if _routing_enabled(): + routed_params = process_routing( + self, + "fit", + sample_weight=sample_weight, + **params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={}) + routed_params.scorer = Bunch(score=params) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight + + # init cross-validation generator + cv = check_cv(self.cv, y, classifier=True) + folds = list(cv.split(X, y, **routed_params.splitter.split)) + + # Use the label encoded classes + n_classes = len(encoded_labels) + + if n_classes < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: %r" % classes[0] + ) + + if n_classes == 2: + # OvR in case of binary problems is as good as fitting + # the higher label + n_classes = 1 + encoded_labels = encoded_labels[1:] + classes = classes[1:] + + # We need this hack to iterate only once over labels, in the case of + # multi_class = multinomial, without changing the value of the labels. 
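        # (With 'multinomial', a single joint path is fit over all classes, so
        # the per-label loop below collapses to one iteration with pos_class=None.)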
+ if multi_class == "multinomial": + iter_encoded_labels = iter_classes = [None] + else: + iter_encoded_labels = encoded_labels + iter_classes = classes + + # compute the class weights for the entire dataset y + if class_weight == "balanced": + class_weight = compute_class_weight( + class_weight, + classes=np.arange(len(self.classes_)), + y=y, + sample_weight=sample_weight, + ) + class_weight = dict(enumerate(class_weight)) + + path_func = delayed(_log_reg_scoring_path) + + # The SAG solver releases the GIL so it's more efficient to use + # threads for this solver. + if self.solver in ["sag", "saga"]: + prefer = "threads" + else: + prefer = "processes" + + fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)( + path_func( + X, + y, + train, + test, + pos_class=label, + Cs=self.Cs, + fit_intercept=self.fit_intercept, + penalty=self.penalty, + dual=self.dual, + solver=solver, + tol=self.tol, + max_iter=self.max_iter, + verbose=self.verbose, + class_weight=class_weight, + scoring=self.scoring, + multi_class=multi_class, + intercept_scaling=self.intercept_scaling, + random_state=self.random_state, + max_squared_sum=max_squared_sum, + sample_weight=sample_weight, + l1_ratio=l1_ratio, + score_params=routed_params.scorer.score, + ) + for label in iter_encoded_labels + for train, test in folds + for l1_ratio in l1_ratios_ + ) + + # _log_reg_scoring_path will output different shapes depending on the + # multi_class param, so we need to reshape the outputs accordingly. + # Cs is of shape (n_classes . n_folds . n_l1_ratios, n_Cs) and all the + # rows are equal, so we just take the first one. + # After reshaping, + # - scores is of shape (n_classes, n_folds, n_Cs . n_l1_ratios) + # - coefs_paths is of shape + # (n_classes, n_folds, n_Cs . n_l1_ratios, n_features) + # - n_iter is of shape + # (n_classes, n_folds, n_Cs . n_l1_ratios) or + # (1, n_folds, n_Cs . n_l1_ratios) + coefs_paths, Cs, scores, n_iter_ = zip(*fold_coefs_) + self.Cs_ = Cs[0] + if multi_class == "multinomial": + coefs_paths = np.reshape( + coefs_paths, + (len(folds), len(l1_ratios_) * len(self.Cs_), n_classes, -1), + ) + # equiv to coefs_paths = np.moveaxis(coefs_paths, (0, 1, 2, 3), + # (1, 2, 0, 3)) + coefs_paths = np.swapaxes(coefs_paths, 0, 1) + coefs_paths = np.swapaxes(coefs_paths, 0, 2) + self.n_iter_ = np.reshape( + n_iter_, (1, len(folds), len(self.Cs_) * len(l1_ratios_)) + ) + # repeat same scores across all classes + scores = np.tile(scores, (n_classes, 1, 1)) + else: + coefs_paths = np.reshape( + coefs_paths, + (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_), -1), + ) + self.n_iter_ = np.reshape( + n_iter_, (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_)) + ) + scores = np.reshape(scores, (n_classes, len(folds), -1)) + self.scores_ = dict(zip(classes, scores)) + self.coefs_paths_ = dict(zip(classes, coefs_paths)) + + self.C_ = list() + self.l1_ratio_ = list() + self.coef_ = np.empty((n_classes, X.shape[1])) + self.intercept_ = np.zeros(n_classes) + for index, (cls, encoded_label) in enumerate( + zip(iter_classes, iter_encoded_labels) + ): + if multi_class == "ovr": + scores = self.scores_[cls] + coefs_paths = self.coefs_paths_[cls] + else: + # For multinomial, all scores are the same across classes + scores = scores[0] + # coefs_paths will keep its original shape because + # logistic_regression_path expects it this way + + if self.refit: + # best_index is between 0 and (n_Cs . 
n_l1_ratios - 1) + # for example, with n_cs=2 and n_l1_ratios=3 + # the layout of scores is + # [c1, c2, c1, c2, c1, c2] + # l1_1 , l1_2 , l1_3 + best_index = scores.sum(axis=0).argmax() + + best_index_C = best_index % len(self.Cs_) + C_ = self.Cs_[best_index_C] + self.C_.append(C_) + + best_index_l1 = best_index // len(self.Cs_) + l1_ratio_ = l1_ratios_[best_index_l1] + self.l1_ratio_.append(l1_ratio_) + + if multi_class == "multinomial": + coef_init = np.mean(coefs_paths[:, :, best_index, :], axis=1) + else: + coef_init = np.mean(coefs_paths[:, best_index, :], axis=0) + + # Note that y is label encoded and hence pos_class must be + # the encoded label / None (for 'multinomial') + w, _, _ = _logistic_regression_path( + X, + y, + pos_class=encoded_label, + Cs=[C_], + solver=solver, + fit_intercept=self.fit_intercept, + coef=coef_init, + max_iter=self.max_iter, + tol=self.tol, + penalty=self.penalty, + class_weight=class_weight, + multi_class=multi_class, + verbose=max(0, self.verbose - 1), + random_state=self.random_state, + check_input=False, + max_squared_sum=max_squared_sum, + sample_weight=sample_weight, + l1_ratio=l1_ratio_, + ) + w = w[0] + + else: + # Take the best scores across every fold and the average of + # all coefficients corresponding to the best scores. + best_indices = np.argmax(scores, axis=1) + if multi_class == "ovr": + w = np.mean( + [coefs_paths[i, best_indices[i], :] for i in range(len(folds))], + axis=0, + ) + else: + w = np.mean( + [ + coefs_paths[:, i, best_indices[i], :] + for i in range(len(folds)) + ], + axis=0, + ) + + best_indices_C = best_indices % len(self.Cs_) + self.C_.append(np.mean(self.Cs_[best_indices_C])) + + if self.penalty == "elasticnet": + best_indices_l1 = best_indices // len(self.Cs_) + self.l1_ratio_.append(np.mean(l1_ratios_[best_indices_l1])) + else: + self.l1_ratio_.append(None) + + if multi_class == "multinomial": + self.C_ = np.tile(self.C_, n_classes) + self.l1_ratio_ = np.tile(self.l1_ratio_, n_classes) + self.coef_ = w[:, : X.shape[1]] + if self.fit_intercept: + self.intercept_ = w[:, -1] + else: + self.coef_[index] = w[: X.shape[1]] + if self.fit_intercept: + self.intercept_[index] = w[-1] + + self.C_ = np.asarray(self.C_) + self.l1_ratio_ = np.asarray(self.l1_ratio_) + self.l1_ratios_ = np.asarray(l1_ratios_) + # if elasticnet was used, add the l1_ratios dimension to some + # attributes + if self.l1_ratios is not None: + # with n_cs=2 and n_l1_ratios=3 + # the layout of scores is + # [c1, c2, c1, c2, c1, c2] + # l1_1 , l1_2 , l1_3 + # To get a 2d array with the following layout + # l1_1, l1_2, l1_3 + # c1 [[ . , . , . ], + # c2 [ . , . , . ]] + # We need to first reshape and then transpose. + # The same goes for the other arrays + for cls, coefs_path in self.coefs_paths_.items(): + self.coefs_paths_[cls] = coefs_path.reshape( + (len(folds), self.l1_ratios_.size, self.Cs_.size, -1) + ) + self.coefs_paths_[cls] = np.transpose( + self.coefs_paths_[cls], (0, 2, 1, 3) + ) + for cls, score in self.scores_.items(): + self.scores_[cls] = score.reshape( + (len(folds), self.l1_ratios_.size, self.Cs_.size) + ) + self.scores_[cls] = np.transpose(self.scores_[cls], (0, 2, 1)) + + self.n_iter_ = self.n_iter_.reshape( + (-1, len(folds), self.l1_ratios_.size, self.Cs_.size) + ) + self.n_iter_ = np.transpose(self.n_iter_, (0, 1, 3, 2)) + + return self + + def score(self, X, y, sample_weight=None, **score_params): + """Score using the `scoring` option on the given test data and labels. 
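When `scoring` was left as `None` at construction, this falls back to accuracy (see `_get_scorer` below). A minimal sketch of the interplay:

    from sklearn.linear_model import LogisticRegressionCV

    # The metric chosen for model selection is also the one `score` reports.
    clf = LogisticRegressionCV(scoring="neg_log_loss")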
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + + y : array-like of shape (n_samples,) + True labels for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + **score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + .. versionadded:: 1.4 + + Returns + ------- + score : float + Score of self.predict(X) w.r.t. y. + """ + _raise_for_params(score_params, self, "score") + + scoring = self._get_scorer() + if _routing_enabled(): + routed_params = process_routing( + self, + "score", + sample_weight=sample_weight, + **score_params, + ) + else: + routed_params = Bunch() + routed_params.scorer = Bunch(score={}) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight + + return scoring( + self, + X, + y, + **routed_params.scorer.score, + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=self._get_scorer(), + method_mapping=MethodMapping() + .add(caller="score", callee="score") + .add(caller="fit", callee="score"), + ) + ) + return router + + def _get_scorer(self): + """Get the scorer based on the scoring method specified. + The default scoring method is `accuracy`. + """ + scoring = self.scoring or "accuracy" + return get_scorer(scoring) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4dbac2d7634b0fe4e6a02771e64f80adcf490b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py @@ -0,0 +1,1121 @@ +"""Orthogonal matching pursuit algorithms""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import sqrt +from numbers import Integral, Real + +import numpy as np +from scipy import linalg +from scipy.linalg.lapack import get_lapack_funcs + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context +from ..model_selection import check_cv +from ..utils import Bunch, as_float_array, check_array +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data +from ._base import LinearModel, _pre_fit + +premature = ( + "Orthogonal matching pursuit ended prematurely due to linear" + " dependence in the dictionary. The requested precision might" + " not have been met." +) + + +def _cholesky_omp(X, y, n_nonzero_coefs, tol=None, copy_X=True, return_path=False): + """Orthogonal Matching Pursuit step using the Cholesky decomposition. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input dictionary. Columns are assumed to have unit norm. 
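As a sketch of the unit-norm assumption (shown with the public `orthogonal_mp` wrapper rather than this private helper), dictionary columns can be normalized up front:

    import numpy as np
    from sklearn.linear_model import orthogonal_mp

    rng = np.random.RandomState(0)
    X = rng.randn(50, 20)
    X /= np.linalg.norm(X, axis=0)              # unit-norm columns
    y = X[:, :3] @ np.array([1.0, -2.0, 0.5])   # sparse ground truth
    coef = orthogonal_mp(X, y, n_nonzero_coefs=3)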
+ + y : ndarray of shape (n_samples,) + Input targets. + + n_nonzero_coefs : int + Targeted number of non-zero elements. + + tol : float, default=None + Targeted squared error, if not None overrides n_nonzero_coefs. + + copy_X : bool, default=True + Whether the design matrix X must be copied by the algorithm. A false + value is only helpful if X is already Fortran-ordered, otherwise a + copy is made anyway. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + Returns + ------- + gamma : ndarray of shape (n_nonzero_coefs,) + Non-zero elements of the solution. + + idx : ndarray of shape (n_nonzero_coefs,) + Indices of the positions of the elements in gamma within the solution + vector. + + coef : ndarray of shape (n_features, n_nonzero_coefs) + The first k values of column k correspond to the coefficient value + for the active features at that step. The lower left triangle contains + garbage. Only returned if ``return_path=True``. + + n_active : int + Number of active features at convergence. + """ + if copy_X: + X = X.copy("F") + else: # even if we are allowed to overwrite, still copy it if bad order + X = np.asfortranarray(X) + + min_float = np.finfo(X.dtype).eps + nrm2, swap = linalg.get_blas_funcs(("nrm2", "swap"), (X,)) + (potrs,) = get_lapack_funcs(("potrs",), (X,)) + + alpha = np.dot(X.T, y) + residual = y + gamma = np.empty(0) + n_active = 0 + indices = np.arange(X.shape[1]) # keeping track of swapping + + max_features = X.shape[1] if tol is not None else n_nonzero_coefs + + L = np.empty((max_features, max_features), dtype=X.dtype) + + if return_path: + coefs = np.empty_like(L) + + while True: + lam = np.argmax(np.abs(np.dot(X.T, residual))) + if lam < n_active or alpha[lam] ** 2 < min_float: + # atom already selected or inner product too small + warnings.warn(premature, RuntimeWarning, stacklevel=2) + break + + if n_active > 0: + # Updates the Cholesky decomposition of X' X + L[n_active, :n_active] = np.dot(X[:, :n_active].T, X[:, lam]) + linalg.solve_triangular( + L[:n_active, :n_active], + L[n_active, :n_active], + trans=0, + lower=1, + overwrite_b=True, + check_finite=False, + ) + v = nrm2(L[n_active, :n_active]) ** 2 + Lkk = linalg.norm(X[:, lam]) ** 2 - v + if Lkk <= min_float: # selected atoms are dependent + warnings.warn(premature, RuntimeWarning, stacklevel=2) + break + L[n_active, n_active] = sqrt(Lkk) + else: + L[0, 0] = linalg.norm(X[:, lam]) + + X.T[n_active], X.T[lam] = swap(X.T[n_active], X.T[lam]) + alpha[n_active], alpha[lam] = alpha[lam], alpha[n_active] + indices[n_active], indices[lam] = indices[lam], indices[n_active] + n_active += 1 + + # solves LL'x = X'y as a composition of two triangular systems + gamma, _ = potrs( + L[:n_active, :n_active], alpha[:n_active], lower=True, overwrite_b=False + ) + + if return_path: + coefs[:n_active, n_active - 1] = gamma + residual = y - np.dot(X[:, :n_active], gamma) + if tol is not None and nrm2(residual) ** 2 <= tol: + break + elif n_active == max_features: + break + + if return_path: + return gamma, indices[:n_active], coefs[:, :n_active], n_active + else: + return gamma, indices[:n_active], n_active + + +def _gram_omp( + Gram, + Xy, + n_nonzero_coefs, + tol_0=None, + tol=None, + copy_Gram=True, + copy_Xy=True, + return_path=False, +): + """Orthogonal Matching Pursuit step on a precomputed Gram matrix. + + This function uses the Cholesky decomposition method. 
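A minimal sketch of the precomputed inputs this Gram-based variant operates on (shown with the public `orthogonal_mp_gram` wrapper):

    import numpy as np
    from sklearn.linear_model import orthogonal_mp_gram

    rng = np.random.RandomState(0)
    X = rng.randn(50, 20)
    y = rng.randn(50)
    G = X.T @ X    # Gram matrix, shape (n_features, n_features)
    Xy = X.T @ y   # X.T * y, shape (n_features,)
    coef = orthogonal_mp_gram(G, Xy, n_nonzero_coefs=3)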
+ + Parameters + ---------- + Gram : ndarray of shape (n_features, n_features) + Gram matrix of the input data matrix. + + Xy : ndarray of shape (n_features,) + Input targets. + + n_nonzero_coefs : int + Targeted number of non-zero elements. + + tol_0 : float, default=None + Squared norm of y, required if tol is not None. + + tol : float, default=None + Targeted squared error, if not None overrides n_nonzero_coefs. + + copy_Gram : bool, default=True + Whether the gram matrix must be copied by the algorithm. A false + value is only helpful if it is already Fortran-ordered, otherwise a + copy is made anyway. + + copy_Xy : bool, default=True + Whether the covariance vector Xy must be copied by the algorithm. + If False, it may be overwritten. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + Returns + ------- + gamma : ndarray of shape (n_nonzero_coefs,) + Non-zero elements of the solution. + + idx : ndarray of shape (n_nonzero_coefs,) + Indices of the positions of the elements in gamma within the solution + vector. + + coefs : ndarray of shape (n_features, n_nonzero_coefs) + The first k values of column k correspond to the coefficient value + for the active features at that step. The lower left triangle contains + garbage. Only returned if ``return_path=True``. + + n_active : int + Number of active features at convergence. + """ + Gram = Gram.copy("F") if copy_Gram else np.asfortranarray(Gram) + + if copy_Xy or not Xy.flags.writeable: + Xy = Xy.copy() + + min_float = np.finfo(Gram.dtype).eps + nrm2, swap = linalg.get_blas_funcs(("nrm2", "swap"), (Gram,)) + (potrs,) = get_lapack_funcs(("potrs",), (Gram,)) + + indices = np.arange(len(Gram)) # keeping track of swapping + alpha = Xy + tol_curr = tol_0 + delta = 0 + gamma = np.empty(0) + n_active = 0 + + max_features = len(Gram) if tol is not None else n_nonzero_coefs + + L = np.empty((max_features, max_features), dtype=Gram.dtype) + + L[0, 0] = 1.0 + if return_path: + coefs = np.empty_like(L) + + while True: + lam = np.argmax(np.abs(alpha)) + if lam < n_active or alpha[lam] ** 2 < min_float: + # selected same atom twice, or inner product too small + warnings.warn(premature, RuntimeWarning, stacklevel=3) + break + if n_active > 0: + L[n_active, :n_active] = Gram[lam, :n_active] + linalg.solve_triangular( + L[:n_active, :n_active], + L[n_active, :n_active], + trans=0, + lower=1, + overwrite_b=True, + check_finite=False, + ) + v = nrm2(L[n_active, :n_active]) ** 2 + Lkk = Gram[lam, lam] - v + if Lkk <= min_float: # selected atoms are dependent + warnings.warn(premature, RuntimeWarning, stacklevel=3) + break + L[n_active, n_active] = sqrt(Lkk) + else: + L[0, 0] = sqrt(Gram[lam, lam]) + + Gram[n_active], Gram[lam] = swap(Gram[n_active], Gram[lam]) + Gram.T[n_active], Gram.T[lam] = swap(Gram.T[n_active], Gram.T[lam]) + indices[n_active], indices[lam] = indices[lam], indices[n_active] + Xy[n_active], Xy[lam] = Xy[lam], Xy[n_active] + n_active += 1 + # solves LL'x = X'y as a composition of two triangular systems + gamma, _ = potrs( + L[:n_active, :n_active], Xy[:n_active], lower=True, overwrite_b=False + ) + if return_path: + coefs[:n_active, n_active - 1] = gamma + beta = np.dot(Gram[:, :n_active], gamma) + alpha = Xy - beta + if tol is not None: + tol_curr += delta + delta = np.inner(gamma, beta[:n_active]) + tol_curr -= delta + if abs(tol_curr) <= tol: + break + elif n_active == max_features: + break + + if return_path: + return gamma, 
indices[:n_active], coefs[:, :n_active], n_active + else: + return gamma, indices[:n_active], n_active + + +@validate_params( + { + "X": ["array-like"], + "y": [np.ndarray], + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left"), None], + "precompute": ["boolean", StrOptions({"auto"})], + "copy_X": ["boolean"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def orthogonal_mp( + X, + y, + *, + n_nonzero_coefs=None, + tol=None, + precompute=False, + copy_X=True, + return_path=False, + return_n_iter=False, +): + r"""Orthogonal Matching Pursuit (OMP). + + Solves n_targets Orthogonal Matching Pursuit problems. + An instance of the problem has the form: + + When parametrized by the number of non-zero coefficients using + `n_nonzero_coefs`: + argmin ||y - X\gamma||^2 subject to ||\gamma||_0 <= n_{nonzero coefs} + + When parametrized by error using the parameter `tol`: + argmin ||\gamma||_0 subject to ||y - X\gamma||^2 <= tol + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data. Columns are assumed to have unit norm. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Input targets. + + n_nonzero_coefs : int, default=None + Desired number of non-zero entries in the solution. If None (by + default) this value is set to 10% of n_features. + + tol : float, default=None + Maximum squared norm of the residual. If not None, overrides n_nonzero_coefs. + + precompute : 'auto' or bool, default=False + Whether to perform precomputations. Improves performance when n_targets + or n_samples is very large. + + copy_X : bool, default=True + Whether the design matrix X must be copied by the algorithm. A false + value is only helpful if X is already Fortran-ordered, otherwise a + copy is made anyway. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + coef : ndarray of shape (n_features,) or (n_features, n_targets) + Coefficients of the OMP solution. If `return_path=True`, this contains + the whole coefficient path. In this case its shape is + (n_features, n_features) or (n_features, n_targets, n_features) and + iterating over the last axis generates coefficients in increasing order + of active features. + + n_iters : array-like or int + Number of active features across every target. Returned only if + `return_n_iter` is set to True. + + See Also + -------- + OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model. + orthogonal_mp_gram : Solve OMP problems using Gram matrix and the product X.T * y. + lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. + sklearn.decomposition.sparse_encode : Sparse coding. + + Notes + ----- + Orthogonal matching pursuit was introduced in S. Mallat, Z. Zhang, + Matching pursuits with time-frequency dictionaries, IEEE Transactions on + Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415. + (https://www.di.ens.fr/~mallat/papiers/MallatPursuit93.pdf) + + This implementation is based on Rubinstein, R., Zibulevsky, M. and Elad, + M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal + Matching Pursuit Technical Report - CS Technion, April 2008. 
+ https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.linear_model import orthogonal_mp + >>> X, y = make_regression(noise=4, random_state=0) + >>> coef = orthogonal_mp(X, y) + >>> coef.shape + (100,) + >>> X[:1,] @ coef + array([-78.68]) + """ + X = check_array(X, order="F", copy=copy_X) + copy_X = False + if y.ndim == 1: + y = y.reshape(-1, 1) + y = check_array(y) + if y.shape[1] > 1: # subsequent targets will be affected + copy_X = True + if n_nonzero_coefs is None and tol is None: + # default for n_nonzero_coefs is 0.1 * n_features + # but at least one. + n_nonzero_coefs = max(int(0.1 * X.shape[1]), 1) + if tol is None and n_nonzero_coefs > X.shape[1]: + raise ValueError( + "The number of atoms cannot be more than the number of features" + ) + if precompute == "auto": + precompute = X.shape[0] > X.shape[1] + if precompute: + G = np.dot(X.T, X) + G = np.asfortranarray(G) + Xy = np.dot(X.T, y) + if tol is not None: + norms_squared = np.sum((y**2), axis=0) + else: + norms_squared = None + return orthogonal_mp_gram( + G, + Xy, + n_nonzero_coefs=n_nonzero_coefs, + tol=tol, + norms_squared=norms_squared, + copy_Gram=copy_X, + copy_Xy=False, + return_path=return_path, + ) + + if return_path: + coef = np.zeros((X.shape[1], y.shape[1], X.shape[1])) + else: + coef = np.zeros((X.shape[1], y.shape[1])) + n_iters = [] + + for k in range(y.shape[1]): + out = _cholesky_omp( + X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path + ) + if return_path: + _, idx, coefs, n_iter = out + coef = coef[:, :, : len(idx)] + for n_active, x in enumerate(coefs.T): + coef[idx[: n_active + 1], k, n_active] = x[: n_active + 1] + else: + x, idx, n_iter = out + coef[idx, k] = x + n_iters.append(n_iter) + + if y.shape[1] == 1: + n_iters = n_iters[0] + + if return_n_iter: + return np.squeeze(coef), n_iters + else: + return np.squeeze(coef) + + +@validate_params( + { + "Gram": ["array-like"], + "Xy": ["array-like"], + "n_nonzero_coefs": [Interval(Integral, 0, None, closed="neither"), None], + "tol": [Interval(Real, 0, None, closed="left"), None], + "norms_squared": ["array-like", None], + "copy_Gram": ["boolean"], + "copy_Xy": ["boolean"], + "return_path": ["boolean"], + "return_n_iter": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def orthogonal_mp_gram( + Gram, + Xy, + *, + n_nonzero_coefs=None, + tol=None, + norms_squared=None, + copy_Gram=True, + copy_Xy=True, + return_path=False, + return_n_iter=False, +): + """Gram Orthogonal Matching Pursuit (OMP). + + Solves n_targets Orthogonal Matching Pursuit problems using only + the Gram matrix X.T * X and the product X.T * y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + Gram : array-like of shape (n_features, n_features) + Gram matrix of the input data: `X.T * X`. + + Xy : array-like of shape (n_features,) or (n_features, n_targets) + Input targets multiplied by `X`: `X.T * y`. + + n_nonzero_coefs : int, default=None + Desired number of non-zero entries in the solution. If `None` (by + default) this value is set to 10% of n_features. + + tol : float, default=None + Maximum squared norm of the residual. If not `None`, + overrides `n_nonzero_coefs`. + + norms_squared : array-like of shape (n_targets,), default=None + Squared L2 norms of the lines of `y`. Required if `tol` is not None. + + copy_Gram : bool, default=True + Whether the gram matrix must be copied by the algorithm. 
A `False` + value is only helpful if it is already Fortran-ordered, otherwise a + copy is made anyway. + + copy_Xy : bool, default=True + Whether the covariance vector `Xy` must be copied by the algorithm. + If `False`, it may be overwritten. + + return_path : bool, default=False + Whether to return every value of the nonzero coefficients along the + forward path. Useful for cross-validation. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + Returns + ------- + coef : ndarray of shape (n_features,) or (n_features, n_targets) + Coefficients of the OMP solution. If `return_path=True`, this contains + the whole coefficient path. In this case its shape is + `(n_features, n_features)` or `(n_features, n_targets, n_features)` and + iterating over the last axis yields coefficients in increasing order + of active features. + + n_iters : list or int + Number of active features across every target. Returned only if + `return_n_iter` is set to True. + + See Also + -------- + OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model (OMP). + orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems. + lars_path : Compute Least Angle Regression or Lasso path using + LARS algorithm. + sklearn.decomposition.sparse_encode : Generic sparse coding. + Each column of the result is the solution to a Lasso problem. + + Notes + ----- + Orthogonal matching pursuit was introduced in G. Mallat, Z. Zhang, + Matching pursuits with time-frequency dictionaries, IEEE Transactions on + Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415. + (https://www.di.ens.fr/~mallat/papiers/MallatPursuit93.pdf) + + This implementation is based on Rubinstein, R., Zibulevsky, M. and Elad, + M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal + Matching Pursuit Technical Report - CS Technion, April 2008. + https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.linear_model import orthogonal_mp_gram + >>> X, y = make_regression(noise=4, random_state=0) + >>> coef = orthogonal_mp_gram(X.T @ X, X.T @ y) + >>> coef.shape + (100,) + >>> X[:1,] @ coef + array([-78.68]) + """ + Gram = check_array(Gram, order="F", copy=copy_Gram) + Xy = np.asarray(Xy) + if Xy.ndim > 1 and Xy.shape[1] > 1: + # or subsequent target will be affected + copy_Gram = True + if Xy.ndim == 1: + Xy = Xy[:, np.newaxis] + if tol is not None: + norms_squared = [norms_squared] + if copy_Xy or not Xy.flags.writeable: + # Make the copy once instead of many times in _gram_omp itself. + Xy = Xy.copy() + + if n_nonzero_coefs is None and tol is None: + n_nonzero_coefs = int(0.1 * len(Gram)) + if tol is not None and norms_squared is None: + raise ValueError( + "Gram OMP needs the precomputed norms in order " + "to evaluate the error sum of squares." 
+ ) + if tol is not None and tol < 0: + raise ValueError("Epsilon cannot be negative") + if tol is None and n_nonzero_coefs <= 0: + raise ValueError("The number of atoms must be positive") + if tol is None and n_nonzero_coefs > len(Gram): + raise ValueError( + "The number of atoms cannot be more than the number of features" + ) + + if return_path: + coef = np.zeros((len(Gram), Xy.shape[1], len(Gram)), dtype=Gram.dtype) + else: + coef = np.zeros((len(Gram), Xy.shape[1]), dtype=Gram.dtype) + + n_iters = [] + for k in range(Xy.shape[1]): + out = _gram_omp( + Gram, + Xy[:, k], + n_nonzero_coefs, + norms_squared[k] if tol is not None else None, + tol, + copy_Gram=copy_Gram, + copy_Xy=False, + return_path=return_path, + ) + if return_path: + _, idx, coefs, n_iter = out + coef = coef[:, :, : len(idx)] + for n_active, x in enumerate(coefs.T): + coef[idx[: n_active + 1], k, n_active] = x[: n_active + 1] + else: + x, idx, n_iter = out + coef[idx, k] = x + n_iters.append(n_iter) + + if Xy.shape[1] == 1: + n_iters = n_iters[0] + + if return_n_iter: + return np.squeeze(coef), n_iters + else: + return np.squeeze(coef) + + +class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): + """Orthogonal Matching Pursuit model (OMP). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_nonzero_coefs : int, default=None + Desired number of non-zero entries in the solution. Ignored if `tol` is set. + When `None` and `tol` is also `None`, this value is either set to 10% of + `n_features` or 1, whichever is greater. + + tol : float, default=None + Maximum squared norm of the residual. If not None, overrides n_nonzero_coefs. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + precompute : 'auto' or bool, default='auto' + Whether to use a precomputed Gram and Xy matrix to speed up + calculations. Improves performance when :term:`n_targets` or + :term:`n_samples` is very large. Note that if you already have such + matrices, you can pass them directly to the fit method. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the formula). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + n_iter_ : int or array-like + Number of active features across every target. + + n_nonzero_coefs_ : int or None + The number of non-zero coefficients in the solution or `None` when `tol` is + set. If `n_nonzero_coefs` is None and `tol` is None this value is either set + to 10% of `n_features` or 1, whichever is greater. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems. + orthogonal_mp_gram : Solves n_targets Orthogonal Matching Pursuit + problems using only the Gram matrix X.T * X and the product X.T * y. + lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + sklearn.decomposition.sparse_encode : Generic sparse coding. 
+ Each column of the result is the solution to a Lasso problem. + OrthogonalMatchingPursuitCV : Cross-validated + Orthogonal Matching Pursuit model (OMP). + + Notes + ----- + Orthogonal matching pursuit was introduced in G. Mallat, Z. Zhang, + Matching pursuits with time-frequency dictionaries, IEEE Transactions on + Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415. + (https://www.di.ens.fr/~mallat/papiers/MallatPursuit93.pdf) + + This implementation is based on Rubinstein, R., Zibulevsky, M. and Elad, + M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal + Matching Pursuit Technical Report - CS Technion, April 2008. + https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf + + Examples + -------- + >>> from sklearn.linear_model import OrthogonalMatchingPursuit + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(noise=4, random_state=0) + >>> reg = OrthogonalMatchingPursuit().fit(X, y) + >>> reg.score(X, y) + 0.9991 + >>> reg.predict(X[:1,]) + array([-78.3854]) + """ + + _parameter_constraints: dict = { + "n_nonzero_coefs": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left"), None], + "fit_intercept": ["boolean"], + "precompute": [StrOptions({"auto"}), "boolean"], + } + + def __init__( + self, + *, + n_nonzero_coefs=None, + tol=None, + fit_intercept=True, + precompute="auto", + ): + self.n_nonzero_coefs = n_nonzero_coefs + self.tol = tol + self.fit_intercept = fit_intercept + self.precompute = precompute + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + Returns + ------- + self : object + Returns an instance of self. + """ + X, y = validate_data(self, X, y, multi_output=True, y_numeric=True) + n_features = X.shape[1] + + X, y, X_offset, y_offset, X_scale, Gram, Xy = _pre_fit( + X, y, None, self.precompute, self.fit_intercept, copy=True + ) + + if y.ndim == 1: + y = y[:, np.newaxis] + + if self.n_nonzero_coefs is None and self.tol is None: + # default for n_nonzero_coefs is 0.1 * n_features + # but at least one. + self.n_nonzero_coefs_ = max(int(0.1 * n_features), 1) + elif self.tol is not None: + self.n_nonzero_coefs_ = None + else: + self.n_nonzero_coefs_ = self.n_nonzero_coefs + + if Gram is False: + coef_, self.n_iter_ = orthogonal_mp( + X, + y, + n_nonzero_coefs=self.n_nonzero_coefs_, + tol=self.tol, + precompute=False, + copy_X=True, + return_n_iter=True, + ) + else: + norms_sq = np.sum(y**2, axis=0) if self.tol is not None else None + + coef_, self.n_iter_ = orthogonal_mp_gram( + Gram, + Xy=Xy, + n_nonzero_coefs=self.n_nonzero_coefs_, + tol=self.tol, + norms_squared=norms_sq, + copy_Gram=True, + copy_Xy=True, + return_n_iter=True, + ) + self.coef_ = coef_.T + self._set_intercept(X_offset, y_offset, X_scale) + return self + + +def _omp_path_residues( + X_train, + y_train, + X_test, + y_test, + copy=True, + fit_intercept=True, + max_iter=100, +): + """Compute the residues on left-out data for a full LARS path. + + Parameters + ---------- + X_train : ndarray of shape (n_samples, n_features) + The data to fit the LARS on. + + y_train : ndarray of shape (n_samples) + The target variable to fit LARS on. 
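A rough sketch of how these per-fold residue paths are reduced to a sparsity choice; it mirrors the selection step in `OrthogonalMatchingPursuitCV.fit` further below:

    import numpy as np

    def pick_n_nonzero_coefs(cv_paths):
        # Truncate every fold to the shortest path, then average squared
        # residues over held-out samples to get one MSE per sparsity level.
        min_len = min(fold.shape[0] for fold in cv_paths)
        mse = np.array([(fold[:min_len] ** 2).mean(axis=1) for fold in cv_paths])
        return int(np.argmin(mse.mean(axis=0)) + 1)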
+ + X_test : ndarray of shape (n_samples, n_features) + The data to compute the residues on. + + y_test : ndarray of shape (n_samples) + The target variable to compute the residues on. + + copy : bool, default=True + Whether X_train, X_test, y_train and y_test should be copied. If + False, they may be overwritten. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=100 + Maximum numbers of iterations to perform, therefore maximum features + to include. 100 by default. + + Returns + ------- + residues : ndarray of shape (n_samples, max_features) + Residues of the prediction on the test data. + """ + + if copy: + X_train = X_train.copy() + y_train = y_train.copy() + X_test = X_test.copy() + y_test = y_test.copy() + + if fit_intercept: + X_mean = X_train.mean(axis=0) + X_train -= X_mean + X_test -= X_mean + y_mean = y_train.mean(axis=0) + y_train = as_float_array(y_train, copy=False) + y_train -= y_mean + y_test = as_float_array(y_test, copy=False) + y_test -= y_mean + + coefs = orthogonal_mp( + X_train, + y_train, + n_nonzero_coefs=max_iter, + tol=None, + precompute=False, + copy_X=False, + return_path=True, + ) + if coefs.ndim == 1: + coefs = coefs[:, np.newaxis] + + return np.dot(coefs.T, X_test.T) - y_test + + +class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): + """Cross-validated Orthogonal Matching Pursuit model (OMP). + + See glossary entry for :term:`cross-validation estimator`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + copy : bool, default=True + Whether the design matrix X must be copied by the algorithm. A false + value is only helpful if X is already Fortran-ordered, otherwise a + copy is made anyway. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + max_iter : int, default=None + Maximum numbers of iterations to perform, therefore maximum features + to include. 10% of ``n_features`` but at least 5 if available. + + cv : int, cross-validation generator or iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool or int, default=False + Sets the verbosity amount. + + Attributes + ---------- + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. + + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Parameter vector (w in the problem formulation). + + n_nonzero_coefs_ : int + Estimated number of non-zero coefficients giving the best mean squared + error over the cross-validation folds. 
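As the Notes section below spells out, once cross-validation has chosen `n_nonzero_coefs_`, the final model is a plain OMP refit on the full training set; a sketch of that equivalence:

    from sklearn.datasets import make_regression
    from sklearn.linear_model import (
        OrthogonalMatchingPursuit,
        OrthogonalMatchingPursuitCV,
    )

    X, y = make_regression(n_features=100, n_informative=10, noise=4,
                           random_state=0)
    reg = OrthogonalMatchingPursuitCV(cv=5).fit(X, y)
    # Essentially the refit the CV estimator performs internally:
    final = OrthogonalMatchingPursuit(
        n_nonzero_coefs=int(reg.n_nonzero_coefs_)
    ).fit(X, y)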
+ + n_iter_ : int or array-like + Number of active features across every target for the model refit with + the best hyperparameters got by cross-validating across all folds. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems. + orthogonal_mp_gram : Solves n_targets Orthogonal Matching Pursuit + problems using only the Gram matrix X.T * X and the product X.T * y. + lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. + Lars : Least Angle Regression model a.k.a. LAR. + LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. + OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model (OMP). + LarsCV : Cross-validated Least Angle Regression model. + LassoLarsCV : Cross-validated Lasso model fit with Least Angle Regression. + sklearn.decomposition.sparse_encode : Generic sparse coding. + Each column of the result is the solution to a Lasso problem. + + Notes + ----- + In `fit`, once the optimal number of non-zero coefficients is found through + cross-validation, the model is fit again using the entire training set. + + Examples + -------- + >>> from sklearn.linear_model import OrthogonalMatchingPursuitCV + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=100, n_informative=10, + ... noise=4, random_state=0) + >>> reg = OrthogonalMatchingPursuitCV(cv=5).fit(X, y) + >>> reg.score(X, y) + 0.9991 + >>> reg.n_nonzero_coefs_ + np.int64(10) + >>> reg.predict(X[:1,]) + array([-78.3854]) + """ + + _parameter_constraints: dict = { + "copy": ["boolean"], + "fit_intercept": ["boolean"], + "max_iter": [Interval(Integral, 0, None, closed="left"), None], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + copy=True, + fit_intercept=True, + max_iter=None, + cv=None, + n_jobs=None, + verbose=False, + ): + self.copy = copy + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.cv = cv + self.n_jobs = n_jobs + self.verbose = verbose + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, **fit_params): + """Fit the model using X, y as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + **fit_params : dict + Parameters to pass to the underlying splitter. + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(fit_params, self, "fit") + + X, y = validate_data(self, X, y, y_numeric=True, ensure_min_features=2) + X = as_float_array(X, copy=False, ensure_all_finite=False) + cv = check_cv(self.cv, classifier=False) + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. 
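            # Without metadata routing enabled, no extra parameters are
            # forwarded to the CV splitter.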
+ routed_params = Bunch() + routed_params.splitter = Bunch(split={}) + max_iter = ( + min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) + if not self.max_iter + else self.max_iter + ) + cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( + delayed(_omp_path_residues)( + X[train], + y[train], + X[test], + y[test], + self.copy, + self.fit_intercept, + max_iter, + ) + for train, test in cv.split(X, **routed_params.splitter.split) + ) + + min_early_stop = min(fold.shape[0] for fold in cv_paths) + mse_folds = np.array( + [(fold[:min_early_stop] ** 2).mean(axis=1) for fold in cv_paths] + ) + best_n_nonzero_coefs = np.argmin(mse_folds.mean(axis=0)) + 1 + self.n_nonzero_coefs_ = best_n_nonzero_coefs + omp = OrthogonalMatchingPursuit( + n_nonzero_coefs=best_n_nonzero_coefs, + fit_intercept=self.fit_intercept, + ).fit(X, y) + + self.coef_ = omp.coef_ + self.intercept_ = omp.intercept_ + self.n_iter_ = omp.n_iter_ + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + + router = MetadataRouter(owner=self.__class__.__name__).add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_passive_aggressive.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_passive_aggressive.py new file mode 100644 index 0000000000000000000000000000000000000000..61eb06edae85f9c6d04a94c070cd71c1bbbcaa3b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_passive_aggressive.py @@ -0,0 +1,573 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +from ..base import _fit_context +from ..utils._param_validation import Interval, StrOptions +from ._stochastic_gradient import DEFAULT_EPSILON, BaseSGDClassifier, BaseSGDRegressor + + +class PassiveAggressiveClassifier(BaseSGDClassifier): + """Passive Aggressive Classifier. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + C : float, default=1.0 + Maximum step size (regularization). Defaults to 1.0. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`~sklearn.linear_model.PassiveAggressiveClassifier.partial_fit` method. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). + + .. versionadded:: 0.19 + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score is not improving by at least `tol` for + `n_iter_no_change` consecutive epochs. + + .. versionadded:: 0.20 + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. 
+ Only used if early_stopping is True. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before early stopping. + + .. versionadded:: 0.20 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + + loss : str, default="hinge" + The loss function to be used: + hinge: equivalent to PA-I in the reference paper. + squared_hinge: equivalent to PA-II in the reference paper. + + n_jobs : int or None, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + + class_weight : dict, {class_label: weight} or "balanced" or None, \ + default=None + Preset for the class_weight fit parameter. + + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + .. versionadded:: 0.17 + parameter *class_weight* to automatically weight samples. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So average=10 will begin averaging after seeing 10 samples. + + .. versionadded:: 0.19 + parameter *average* to use weights averaging in SGD. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + See Also + -------- + SGDClassifier : Incrementally trained logistic regression. + Perceptron : Linear perceptron classifier. + + References + ---------- + Online Passive-Aggressive Algorithms + + K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. 
Singer - JMLR (2006) + + Examples + -------- + >>> from sklearn.linear_model import PassiveAggressiveClassifier + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_features=4, random_state=0) + >>> clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, + ... tol=1e-3) + >>> clf.fit(X, y) + PassiveAggressiveClassifier(random_state=0) + >>> print(clf.coef_) + [[0.26642044 0.45070924 0.67251877 0.64185414]] + >>> print(clf.intercept_) + [1.84127814] + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + """ + + _parameter_constraints: dict = { + **BaseSGDClassifier._parameter_constraints, + "loss": [StrOptions({"hinge", "squared_hinge"})], + "C": [Interval(Real, 0, None, closed="right")], + } + + def __init__( + self, + *, + C=1.0, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + shuffle=True, + verbose=0, + loss="hinge", + n_jobs=None, + random_state=None, + warm_start=False, + class_weight=None, + average=False, + ): + super().__init__( + penalty=None, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + shuffle=shuffle, + verbose=verbose, + random_state=random_state, + eta0=1.0, + warm_start=warm_start, + class_weight=class_weight, + average=average, + n_jobs=n_jobs, + ) + + self.C = C + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, classes=None): + """Fit linear model with Passive Aggressive algorithm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Subset of the training data. + + y : array-like of shape (n_samples,) + Subset of the target values. + + classes : ndarray of shape (n_classes,) + Classes across all calls to partial_fit. + Can be obtained by via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + + Returns + ------- + self : object + Fitted estimator. + """ + if not hasattr(self, "classes_"): + self._more_validate_params(for_partial_fit=True) + + if self.class_weight == "balanced": + raise ValueError( + "class_weight 'balanced' is not supported for " + "partial_fit. For 'balanced' weights, use " + "`sklearn.utils.compute_class_weight` with " + "`class_weight='balanced'`. In place of y you " + "can use a large enough subset of the full " + "training set target to properly estimate the " + "class frequency distributions. Pass the " + "resulting weights as the class_weight " + "parameter." + ) + + lr = "pa1" if self.loss == "hinge" else "pa2" + return self._partial_fit( + X, + y, + alpha=1.0, + C=self.C, + loss="hinge", + learning_rate=lr, + max_iter=1, + classes=classes, + sample_weight=None, + coef_init=None, + intercept_init=None, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None): + """Fit linear model with Passive Aggressive algorithm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + coef_init : ndarray of shape (n_classes, n_features) + The initial coefficients to warm-start the optimization. 
+ + intercept_init : ndarray of shape (n_classes,) + The initial intercept to warm-start the optimization. + + Returns + ------- + self : object + Fitted estimator. + """ + self._more_validate_params() + + lr = "pa1" if self.loss == "hinge" else "pa2" + return self._fit( + X, + y, + alpha=1.0, + C=self.C, + loss="hinge", + learning_rate=lr, + coef_init=coef_init, + intercept_init=intercept_init, + ) + + +class PassiveAggressiveRegressor(BaseSGDRegressor): + """Passive Aggressive Regressor. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + C : float, default=1.0 + Maximum step size (regularization). Defaults to 1.0. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. Defaults to True. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`~sklearn.linear_model.PassiveAggressiveRegressor.partial_fit` method. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). + + .. versionadded:: 0.19 + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation. + score is not improving. If set to True, it will automatically set aside + a fraction of training data as validation and terminate + training when validation score is not improving by at least tol for + n_iter_no_change consecutive epochs. + + .. versionadded:: 0.20 + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before early stopping. + + .. versionadded:: 0.20 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + + loss : str, default="epsilon_insensitive" + The loss function to be used: + epsilon_insensitive: equivalent to PA-I in the reference paper. + squared_epsilon_insensitive: equivalent to PA-II in the reference + paper. + + epsilon : float, default=0.1 + If the difference between the current prediction and the correct label + is below this threshold, the model is not updated. + + random_state : int, RandomState instance, default=None + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So average=10 will begin averaging after seeing 10 samples. + + .. versionadded:: 0.19 + parameter *average* to use weights averaging in SGD. 
+ + Attributes + ---------- + coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\ + n_features] + Weights assigned to the features. + + intercept_ : array, shape = [1] if n_classes == 2 else [n_classes] + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + See Also + -------- + SGDRegressor : Linear model fitted by minimizing a regularized + empirical loss with SGD. + + References + ---------- + Online Passive-Aggressive Algorithms + + K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006). + + Examples + -------- + >>> from sklearn.linear_model import PassiveAggressiveRegressor + >>> from sklearn.datasets import make_regression + + >>> X, y = make_regression(n_features=4, random_state=0) + >>> regr = PassiveAggressiveRegressor(max_iter=100, random_state=0, + ... tol=1e-3) + >>> regr.fit(X, y) + PassiveAggressiveRegressor(max_iter=100, random_state=0) + >>> print(regr.coef_) + [20.48736655 34.18818427 67.59122734 87.94731329] + >>> print(regr.intercept_) + [-0.02306214] + >>> print(regr.predict([[0, 0, 0, 0]])) + [-0.02306214] + """ + + _parameter_constraints: dict = { + **BaseSGDRegressor._parameter_constraints, + "loss": [StrOptions({"epsilon_insensitive", "squared_epsilon_insensitive"})], + "C": [Interval(Real, 0, None, closed="right")], + "epsilon": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + *, + C=1.0, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + shuffle=True, + verbose=0, + loss="epsilon_insensitive", + epsilon=DEFAULT_EPSILON, + random_state=None, + warm_start=False, + average=False, + ): + super().__init__( + penalty=None, + l1_ratio=0, + epsilon=epsilon, + eta0=1.0, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + shuffle=shuffle, + verbose=verbose, + random_state=random_state, + warm_start=warm_start, + average=average, + ) + self.C = C + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y): + """Fit linear model with Passive Aggressive algorithm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Subset of training data. + + y : numpy array of shape [n_samples] + Subset of target values. + + Returns + ------- + self : object + Fitted estimator. + """ + if not hasattr(self, "coef_"): + self._more_validate_params(for_partial_fit=True) + + lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" + return self._partial_fit( + X, + y, + alpha=1.0, + C=self.C, + loss="epsilon_insensitive", + learning_rate=lr, + max_iter=1, + sample_weight=None, + coef_init=None, + intercept_init=None, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None): + """Fit linear model with Passive Aggressive algorithm. 
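+
+        The update rule is selected by ``loss``: ``"epsilon_insensitive"``
+        corresponds to the PA-I step, ``"squared_epsilon_insensitive"`` to
+        PA-II.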
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : numpy array of shape [n_samples] + Target values. + + coef_init : array, shape = [n_features] + The initial coefficients to warm-start the optimization. + + intercept_init : array, shape = [1] + The initial intercept to warm-start the optimization. + + Returns + ------- + self : object + Fitted estimator. + """ + self._more_validate_params() + + lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" + return self._fit( + X, + y, + alpha=1.0, + C=self.C, + loss="epsilon_insensitive", + learning_rate=lr, + coef_init=coef_init, + intercept_init=intercept_init, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_perceptron.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_perceptron.py new file mode 100644 index 0000000000000000000000000000000000000000..e93200ba385faf037be75654061932ee6e886b7b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_perceptron.py @@ -0,0 +1,226 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +from ..utils._param_validation import Interval, StrOptions +from ._stochastic_gradient import BaseSGDClassifier + + +class Perceptron(BaseSGDClassifier): + """Linear perceptron classifier. + + The implementation is a wrapper around :class:`~sklearn.linear_model.SGDClassifier` + by fixing the `loss` and `learning_rate` parameters as:: + + SGDClassifier(loss="perceptron", learning_rate="constant") + + Other available parameters are described below and are forwarded to + :class:`~sklearn.linear_model.SGDClassifier`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + penalty : {'l2','l1','elasticnet'}, default=None + The penalty (aka regularization term) to be used. + + alpha : float, default=0.0001 + Constant that multiplies the regularization term if regularization is + used. + + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`. + `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1. + Only used if `penalty='elasticnet'`. + + .. versionadded:: 0.24 + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). + + .. versionadded:: 0.19 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + + eta0 : float, default=1 + Constant by which the updates are multiplied. + + n_jobs : int, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance or None, default=0 + Used to shuffle the training data, when ``shuffle`` is set to + ``True``. Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. 
+ + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score is not improving by at least `tol` for + `n_iter_no_change` consecutive epochs. + + .. versionadded:: 0.20 + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + .. versionadded:: 0.20 + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before early stopping. + + .. versionadded:: 0.20 + + class_weight : dict, {class_label: weight} or "balanced", default=None + Preset for the class_weight fit parameter. + + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. See + :term:`the Glossary `. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The unique classes labels. + + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + See Also + -------- + sklearn.linear_model.SGDClassifier : Linear classifiers + (SVM, logistic regression, etc.) with SGD training. + + Notes + ----- + ``Perceptron`` is a classification algorithm which shares the same + underlying implementation with ``SGDClassifier``. In fact, + ``Perceptron()`` is equivalent to `SGDClassifier(loss="perceptron", + eta0=1, learning_rate="constant", penalty=None)`. + + References + ---------- + https://en.wikipedia.org/wiki/Perceptron and references therein. + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.linear_model import Perceptron + >>> X, y = load_digits(return_X_y=True) + >>> clf = Perceptron(tol=1e-3, random_state=0) + >>> clf.fit(X, y) + Perceptron() + >>> clf.score(X, y) + 0.939... 
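+
+    As a sketch of the equivalence described in the Notes above (an
+    illustration, not an additional API guarantee), fitting the corresponding
+    ``SGDClassifier`` configuration on the same data should reproduce this
+    score:
+
+    >>> from sklearn.linear_model import SGDClassifier
+    >>> sgd = SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant",
+    ...                     penalty=None, tol=1e-3, random_state=0)
+    >>> sgd.fit(X, y).score(X, y)
+    0.939...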
+ """ + + _parameter_constraints: dict = {**BaseSGDClassifier._parameter_constraints} + _parameter_constraints.pop("loss") + _parameter_constraints.pop("average") + _parameter_constraints.update( + { + "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both")], + "eta0": [Interval(Real, 0, None, closed="left")], + } + ) + + def __init__( + self, + *, + penalty=None, + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + eta0=1.0, + n_jobs=None, + random_state=0, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + class_weight=None, + warm_start=False, + ): + super().__init__( + loss="perceptron", + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + random_state=random_state, + learning_rate="constant", + eta0=eta0, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + power_t=0.5, + warm_start=warm_start, + class_weight=class_weight, + n_jobs=n_jobs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_quantile.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..446d232958e8dbe3fec247ab37c05b39469160e8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_quantile.py @@ -0,0 +1,301 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np +from scipy import sparse +from scipy.optimize import linprog + +from ..base import BaseEstimator, RegressorMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..utils import _safe_indexing +from ..utils._param_validation import Interval, StrOptions +from ..utils.fixes import parse_version, sp_version +from ..utils.validation import _check_sample_weight, validate_data +from ._base import LinearModel + + +class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator): + """Linear regression model that predicts conditional quantiles. + + The linear :class:`QuantileRegressor` optimizes the pinball loss for a + desired `quantile` and is robust to outliers. + + This model uses an L1 regularization like + :class:`~sklearn.linear_model.Lasso`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + quantile : float, default=0.5 + The quantile that the model tries to predict. It must be strictly + between 0 and 1. If 0.5 (default), the model predicts the 50% + quantile, i.e. the median. + + alpha : float, default=1.0 + Regularization constant that multiplies the L1 penalty term. + + fit_intercept : bool, default=True + Whether or not to fit the intercept. + + solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', \ + 'revised simplex'}, default='highs' + Method used by :func:`scipy.optimize.linprog` to solve the linear + programming formulation. + + It is recommended to use the highs methods because + they are the fastest ones. Solvers "highs-ds", "highs-ipm" and "highs" + support sparse input data and, in fact, always convert to sparse csc. + + From `scipy>=1.11.0`, "interior-point" is not available anymore. + + .. versionchanged:: 1.4 + The default of `solver` changed to `"highs"` in version 1.4. 
+ + solver_options : dict, default=None + Additional parameters passed to :func:`scipy.optimize.linprog` as + options. If `None` and if `solver='interior-point'`, then + `{"lstsq": True}` is passed to :func:`scipy.optimize.linprog` for the + sake of stability. + + Attributes + ---------- + coef_ : array of shape (n_features,) + Estimated coefficients for the features. + + intercept_ : float + The intercept of the model, aka bias term. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The actual number of iterations performed by the solver. + + See Also + -------- + Lasso : The Lasso is a linear model that estimates sparse coefficients + with l1 regularization. + HuberRegressor : Linear regression model that is robust to outliers. + + Examples + -------- + >>> from sklearn.linear_model import QuantileRegressor + >>> import numpy as np + >>> n_samples, n_features = 10, 2 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> # the two following lines are optional in practice + >>> from sklearn.utils.fixes import sp_version, parse_version + >>> reg = QuantileRegressor(quantile=0.8).fit(X, y) + >>> np.mean(y <= reg.predict(X)) + np.float64(0.8) + """ + + _parameter_constraints: dict = { + "quantile": [Interval(Real, 0, 1, closed="neither")], + "alpha": [Interval(Real, 0, None, closed="left")], + "fit_intercept": ["boolean"], + "solver": [ + StrOptions( + { + "highs-ds", + "highs-ipm", + "highs", + "interior-point", + "revised simplex", + } + ), + ], + "solver_options": [dict, None], + } + + def __init__( + self, + *, + quantile=0.5, + alpha=1.0, + fit_intercept=True, + solver="highs", + solver_options=None, + ): + self.quantile = quantile + self.alpha = alpha + self.fit_intercept = fit_intercept + self.solver = solver + self.solver_options = solver_options + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + self : object + Returns self. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csc", "csr", "coo"], + y_numeric=True, + multi_output=False, + ) + sample_weight = _check_sample_weight(sample_weight, X) + + n_features = X.shape[1] + n_params = n_features + + if self.fit_intercept: + n_params += 1 + # Note that centering y and X with _preprocess_data does not work + # for quantile regression. + + # The objective is defined as 1/n * sum(pinball loss) + alpha * L1. + # So we rescale the penalty term, which is equivalent. + alpha = np.sum(sample_weight) * self.alpha + + if self.solver == "interior-point" and sp_version >= parse_version("1.11.0"): + raise ValueError( + f"Solver {self.solver} is not anymore available in SciPy >= 1.11.0." + ) + + if sparse.issparse(X) and self.solver not in ["highs", "highs-ds", "highs-ipm"]: + raise ValueError( + f"Solver {self.solver} does not support sparse X. " + "Use solver 'highs' for example." 
+ ) + # make default solver more stable + if self.solver_options is None and self.solver == "interior-point": + solver_options = {"lstsq": True} + else: + solver_options = self.solver_options + + # After rescaling alpha, the minimization problem is + # min sum(pinball loss) + alpha * L1 + # Use linear programming formulation of quantile regression + # min_x c x + # A_eq x = b_eq + # 0 <= x + # x = (s0, s, t0, t, u, v) = slack variables >= 0 + # intercept = s0 - t0 + # coef = s - t + # c = (0, alpha * 1_p, 0, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n) + # residual = y - X@coef - intercept = u - v + # A_eq = (1_n, X, -1_n, -X, diag(1_n), -diag(1_n)) + # b_eq = y + # p = n_features + # n = n_samples + # 1_n = vector of length n with entries equal one + # see https://stats.stackexchange.com/questions/384909/ + # + # Filtering out zero sample weights from the beginning makes life + # easier for the linprog solver. + indices = np.nonzero(sample_weight)[0] + n_indices = len(indices) # use n_mask instead of n_samples + if n_indices < len(sample_weight): + sample_weight = sample_weight[indices] + X = _safe_indexing(X, indices) + y = _safe_indexing(y, indices) + c = np.concatenate( + [ + np.full(2 * n_params, fill_value=alpha), + sample_weight * self.quantile, + sample_weight * (1 - self.quantile), + ] + ) + if self.fit_intercept: + # do not penalize the intercept + c[0] = 0 + c[n_params] = 0 + + if self.solver in ["highs", "highs-ds", "highs-ipm"]: + # Note that highs methods always use a sparse CSC memory layout internally, + # even for optimization problems parametrized using dense numpy arrays. + # Therefore, we work with CSC matrices as early as possible to limit + # unnecessary repeated memory copies. + eye = sparse.eye(n_indices, dtype=X.dtype, format="csc") + if self.fit_intercept: + ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype)) + A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc") + else: + A_eq = sparse.hstack([X, -X, eye, -eye], format="csc") + else: + eye = np.eye(n_indices) + if self.fit_intercept: + ones = np.ones((n_indices, 1)) + A_eq = np.concatenate([ones, X, -ones, -X, eye, -eye], axis=1) + else: + A_eq = np.concatenate([X, -X, eye, -eye], axis=1) + + b_eq = y + + result = linprog( + c=c, + A_eq=A_eq, + b_eq=b_eq, + method=self.solver, + options=solver_options, + ) + solution = result.x + if not result.success: + failure = { + 1: "Iteration limit reached.", + 2: "Problem appears to be infeasible.", + 3: "Problem appears to be unbounded.", + 4: "Numerical difficulties encountered.", + } + warnings.warn( + "Linear programming for QuantileRegressor did not succeed.\n" + f"Status is {result.status}: " + + failure.setdefault(result.status, "unknown reason") + + "\n" + + "Result message of linprog:\n" + + result.message, + ConvergenceWarning, + ) + + # positive slack - negative slack + # solution is an array with (params_pos, params_neg, u, v) + params = solution[:n_params] - solution[n_params : 2 * n_params] + + self.n_iter_ = result.nit + + if self.fit_intercept: + self.coef_ = params[1:] + self.intercept_ = params[0] + else: + self.coef_ = params + self.intercept_ = 0.0 + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ransac.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ransac.py new file mode 100644 index 
0000000000000000000000000000000000000000..c18065436dc3518ccb4a2359480cf7db7f36cd7e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ransac.py @@ -0,0 +1,726 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + MetaEstimatorMixin, + MultiOutputMixin, + RegressorMixin, + _fit_context, + clone, +) +from ..exceptions import ConvergenceWarning +from ..utils import check_consistent_length, check_random_state, get_tags +from ..utils._bunch import Bunch +from ..utils._param_validation import ( + HasMethods, + Interval, + Options, + RealNotInt, + StrOptions, +) +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.random import sample_without_replacement +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + check_is_fitted, + has_fit_parameter, + validate_data, +) +from ._base import LinearRegression + +_EPSILON = np.spacing(1) + + +def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability): + """Determine number trials such that at least one outlier-free subset is + sampled for the given inlier/outlier ratio. + + Parameters + ---------- + n_inliers : int + Number of inliers in the data. + + n_samples : int + Total number of samples in the data. + + min_samples : int + Minimum number of samples chosen randomly from original data. + + probability : float + Probability (confidence) that one outlier-free sample is generated. + + Returns + ------- + trials : int + Number of trials. + + """ + inlier_ratio = n_inliers / float(n_samples) + nom = max(_EPSILON, 1 - probability) + denom = max(_EPSILON, 1 - inlier_ratio**min_samples) + if nom == 1: + return 0 + if denom == 1: + return float("inf") + return abs(float(np.ceil(np.log(nom) / np.log(denom)))) + + +class RANSACRegressor( + MetaEstimatorMixin, + RegressorMixin, + MultiOutputMixin, + BaseEstimator, +): + """RANSAC (RANdom SAmple Consensus) algorithm. + + RANSAC is an iterative algorithm for the robust estimation of parameters + from a subset of inliers from the complete data set. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : object, default=None + Base estimator object which implements the following methods: + + * `fit(X, y)`: Fit model to given training data and target values. + * `score(X, y)`: Returns the mean accuracy on the given test data, + which is used for the stop criterion defined by `stop_score`. + Additionally, the score is used to decide which of two equally + large consensus sets is chosen as the better one. + * `predict(X)`: Returns predicted values using the linear model, + which is used to compute residual error using loss function. + + If `estimator` is None, then + :class:`~sklearn.linear_model.LinearRegression` is used for + target values of dtype float. + + Note that the current implementation only supports regression + estimators. + + min_samples : int (>= 1) or float ([0, 1]), default=None + Minimum number of samples chosen randomly from original data. Treated + as an absolute number of samples for `min_samples >= 1`, treated as a + relative number `ceil(min_samples * X.shape[0])` for + `min_samples < 1`. This is typically chosen as the minimal number of + samples necessary to estimate the given `estimator`. 
By default a + :class:`~sklearn.linear_model.LinearRegression` estimator is assumed and + `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly + dependent upon the model, so if a `estimator` other than + :class:`~sklearn.linear_model.LinearRegression` is used, the user must + provide a value. + + residual_threshold : float, default=None + Maximum residual for a data sample to be classified as an inlier. + By default the threshold is chosen as the MAD (median absolute + deviation) of the target values `y`. Points whose residuals are + strictly equal to the threshold are considered as inliers. + + is_data_valid : callable, default=None + This function is called with the randomly selected data before the + model is fitted to it: `is_data_valid(X, y)`. If its return value is + False the current randomly chosen sub-sample is skipped. + + is_model_valid : callable, default=None + This function is called with the estimated model and the randomly + selected data: `is_model_valid(model, X, y)`. If its return value is + False the current randomly chosen sub-sample is skipped. + Rejecting samples with this function is computationally costlier than + with `is_data_valid`. `is_model_valid` should therefore only be used if + the estimated model is needed for making the rejection decision. + + max_trials : int, default=100 + Maximum number of iterations for random sample selection. + + max_skips : int, default=np.inf + Maximum number of iterations that can be skipped due to finding zero + inliers or invalid data defined by ``is_data_valid`` or invalid models + defined by ``is_model_valid``. + + .. versionadded:: 0.19 + + stop_n_inliers : int, default=np.inf + Stop iteration if at least this number of inliers are found. + + stop_score : float, default=np.inf + Stop iteration if score is greater equal than this threshold. + + stop_probability : float in range [0, 1], default=0.99 + RANSAC iteration stops if at least one outlier-free set of the training + data is sampled in RANSAC. This requires to generate at least N + samples (iterations):: + + N >= log(1 - probability) / log(1 - e**m) + + where the probability (confidence) is typically set to high value such + as 0.99 (the default) and e is the current fraction of inliers w.r.t. + the total number of samples. + + loss : str, callable, default='absolute_error' + String inputs, 'absolute_error' and 'squared_error' are supported which + find the absolute error and squared error per sample respectively. + + If ``loss`` is a callable, then it should be a function that takes + two arrays as inputs, the true and predicted value and returns a 1-D + array with the i-th value of the array corresponding to the loss + on ``X[i]``. + + If the loss on a sample is greater than the ``residual_threshold``, + then this sample is classified as an outlier. + + .. versionadded:: 0.18 + + random_state : int, RandomState instance, default=None + The generator used to initialize the centers. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + estimator_ : object + Final model fitted on the inliers predicted by the "best" model found + during RANSAC sampling (copy of the `estimator` object). + + n_trials_ : int + Number of random selection trials until one of the stop criteria is + met. It is always ``<= max_trials``. + + inlier_mask_ : bool array of shape [n_samples] + Boolean mask of inliers classified as ``True``. 
+ + n_skips_no_inliers_ : int + Number of iterations skipped due to finding zero inliers. + + .. versionadded:: 0.19 + + n_skips_invalid_data_ : int + Number of iterations skipped due to invalid data defined by + ``is_data_valid``. + + .. versionadded:: 0.19 + + n_skips_invalid_model_ : int + Number of iterations skipped due to an invalid model defined by + ``is_model_valid``. + + .. versionadded:: 0.19 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + HuberRegressor : Linear regression model that is robust to outliers. + TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model. + SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/RANSAC + .. [2] https://www.sri.com/wp-content/uploads/2021/12/ransac-publication.pdf + .. [3] https://bmva-archive.org.uk/bmvc/2009/Papers/Paper355/Paper355.pdf + + Examples + -------- + >>> from sklearn.linear_model import RANSACRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression( + ... n_samples=200, n_features=2, noise=4.0, random_state=0) + >>> reg = RANSACRegressor(random_state=0).fit(X, y) + >>> reg.score(X, y) + 0.9885 + >>> reg.predict(X[:1,]) + array([-31.9417]) + + For a more detailed example, see + :ref:`sphx_glr_auto_examples_linear_model_plot_ransac.py` + """ + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit", "score", "predict"]), None], + "min_samples": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="both"), + None, + ], + "residual_threshold": [Interval(Real, 0, None, closed="left"), None], + "is_data_valid": [callable, None], + "is_model_valid": [callable, None], + "max_trials": [ + Interval(Integral, 0, None, closed="left"), + Options(Real, {np.inf}), + ], + "max_skips": [ + Interval(Integral, 0, None, closed="left"), + Options(Real, {np.inf}), + ], + "stop_n_inliers": [ + Interval(Integral, 0, None, closed="left"), + Options(Real, {np.inf}), + ], + "stop_score": [Interval(Real, None, None, closed="both")], + "stop_probability": [Interval(Real, 0, 1, closed="both")], + "loss": [StrOptions({"absolute_error", "squared_error"}), callable], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator=None, + *, + min_samples=None, + residual_threshold=None, + is_data_valid=None, + is_model_valid=None, + max_trials=100, + max_skips=np.inf, + stop_n_inliers=np.inf, + stop_score=np.inf, + stop_probability=0.99, + loss="absolute_error", + random_state=None, + ): + self.estimator = estimator + self.min_samples = min_samples + self.residual_threshold = residual_threshold + self.is_data_valid = is_data_valid + self.is_model_valid = is_model_valid + self.max_trials = max_trials + self.max_skips = max_skips + self.stop_n_inliers = stop_n_inliers + self.stop_score = stop_score + self.stop_probability = stop_probability + self.random_state = random_state + self.loss = loss + + @_fit_context( + # RansacRegressor.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, sample_weight=None, **fit_params): + """Fit estimator using RANSAC algorithm. 
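+
+        Each trial draws a random subset of ``min_samples`` samples, fits the
+        base estimator on it, and counts as inliers all samples whose residual
+        is at most ``residual_threshold``. The trial with the largest consensus
+        set (ties broken by ``score``) is kept, and the final model is refitted
+        on that inlier set.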
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample + raises error if sample_weight is passed and estimator + fit method does not support it. + + .. versionadded:: 0.18 + + **fit_params : dict + Parameters routed to the `fit` method of the sub-estimator via the + metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + self : object + Fitted `RANSACRegressor` estimator. + + Raises + ------ + ValueError + If no valid consensus set could be found. This occurs if + `is_data_valid` and `is_model_valid` return False for all + `max_trials` randomly chosen sub-samples. + """ + # Need to validate separately here. We can't pass multi_output=True + # because that would allow y to be csr. Delay expensive finiteness + # check to the estimator's own input validation. + _raise_for_params(fit_params, self, "fit") + check_X_params = dict(accept_sparse="csr", ensure_all_finite=False) + check_y_params = dict(ensure_2d=False) + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + check_consistent_length(X, y) + + if self.estimator is not None: + estimator = clone(self.estimator) + else: + estimator = LinearRegression() + + if self.min_samples is None: + if not isinstance(estimator, LinearRegression): + raise ValueError( + "`min_samples` needs to be explicitly set when estimator " + "is not a LinearRegression." + ) + min_samples = X.shape[1] + 1 + elif 0 < self.min_samples < 1: + min_samples = np.ceil(self.min_samples * X.shape[0]) + elif self.min_samples >= 1: + min_samples = self.min_samples + if min_samples > X.shape[0]: + raise ValueError( + "`min_samples` may not be larger than number " + "of samples: n_samples = %d." % (X.shape[0]) + ) + + if self.residual_threshold is None: + # MAD (median absolute deviation) + residual_threshold = np.median(np.abs(y - np.median(y))) + else: + residual_threshold = self.residual_threshold + + if self.loss == "absolute_error": + if y.ndim == 1: + loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred) + else: + loss_function = lambda y_true, y_pred: np.sum( + np.abs(y_true - y_pred), axis=1 + ) + elif self.loss == "squared_error": + if y.ndim == 1: + loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2 + else: + loss_function = lambda y_true, y_pred: np.sum( + (y_true - y_pred) ** 2, axis=1 + ) + + elif callable(self.loss): + loss_function = self.loss + + random_state = check_random_state(self.random_state) + + try: # Not all estimator accept a random_state + estimator.set_params(random_state=random_state) + except ValueError: + pass + + estimator_fit_has_sample_weight = has_fit_parameter(estimator, "sample_weight") + estimator_name = type(estimator).__name__ + if sample_weight is not None and not estimator_fit_has_sample_weight: + raise ValueError( + "%s does not support sample_weight. Sample" + " weights are only used for the calibration" + " itself." 
% estimator_name + ) + + if sample_weight is not None: + fit_params["sample_weight"] = sample_weight + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **fit_params) + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit={}, predict={}, score={}) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + routed_params.estimator.fit = {"sample_weight": sample_weight} + + n_inliers_best = 1 + score_best = -np.inf + inlier_mask_best = None + X_inlier_best = None + y_inlier_best = None + inlier_best_idxs_subset = None + self.n_skips_no_inliers_ = 0 + self.n_skips_invalid_data_ = 0 + self.n_skips_invalid_model_ = 0 + + # number of data samples + n_samples = X.shape[0] + sample_idxs = np.arange(n_samples) + + self.n_trials_ = 0 + max_trials = self.max_trials + while self.n_trials_ < max_trials: + self.n_trials_ += 1 + + if ( + self.n_skips_no_inliers_ + + self.n_skips_invalid_data_ + + self.n_skips_invalid_model_ + ) > self.max_skips: + break + + # choose random sample set + subset_idxs = sample_without_replacement( + n_samples, min_samples, random_state=random_state + ) + X_subset = X[subset_idxs] + y_subset = y[subset_idxs] + + # check if random sample set is valid + if self.is_data_valid is not None and not self.is_data_valid( + X_subset, y_subset + ): + self.n_skips_invalid_data_ += 1 + continue + + # cut `fit_params` down to `subset_idxs` + fit_params_subset = _check_method_params( + X, params=routed_params.estimator.fit, indices=subset_idxs + ) + + # fit model for current random sample set + estimator.fit(X_subset, y_subset, **fit_params_subset) + + # check if estimated model is valid + if self.is_model_valid is not None and not self.is_model_valid( + estimator, X_subset, y_subset + ): + self.n_skips_invalid_model_ += 1 + continue + + # residuals of all data for current random sample model + y_pred = estimator.predict(X) + residuals_subset = loss_function(y, y_pred) + + # classify data into inliers and outliers + inlier_mask_subset = residuals_subset <= residual_threshold + n_inliers_subset = np.sum(inlier_mask_subset) + + # less inliers -> skip current random sample + if n_inliers_subset < n_inliers_best: + self.n_skips_no_inliers_ += 1 + continue + + # extract inlier data set + inlier_idxs_subset = sample_idxs[inlier_mask_subset] + X_inlier_subset = X[inlier_idxs_subset] + y_inlier_subset = y[inlier_idxs_subset] + + # cut `fit_params` down to `inlier_idxs_subset` + score_params_inlier_subset = _check_method_params( + X, params=routed_params.estimator.score, indices=inlier_idxs_subset + ) + + # score of inlier data set + score_subset = estimator.score( + X_inlier_subset, + y_inlier_subset, + **score_params_inlier_subset, + ) + + # same number of inliers but worse score -> skip current random + # sample + if n_inliers_subset == n_inliers_best and score_subset < score_best: + continue + + # save current random sample as best sample + n_inliers_best = n_inliers_subset + score_best = score_subset + inlier_mask_best = inlier_mask_subset + X_inlier_best = X_inlier_subset + y_inlier_best = y_inlier_subset + inlier_best_idxs_subset = inlier_idxs_subset + + max_trials = min( + max_trials, + _dynamic_max_trials( + n_inliers_best, n_samples, min_samples, self.stop_probability + ), + ) + + # break if sufficient number of inliers or score is reached + if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score: + break + + # if none of the iterations met the required criteria + if inlier_mask_best is None: + if ( + 
self.n_skips_no_inliers_ + + self.n_skips_invalid_data_ + + self.n_skips_invalid_model_ + ) > self.max_skips: + raise ValueError( + "RANSAC skipped more iterations than `max_skips` without" + " finding a valid consensus set. Iterations were skipped" + " because each randomly chosen sub-sample failed the" + " passing criteria. See estimator attributes for" + " diagnostics (n_skips*)." + ) + else: + raise ValueError( + "RANSAC could not find a valid consensus set. All" + " `max_trials` iterations were skipped because each" + " randomly chosen sub-sample failed the passing criteria." + " See estimator attributes for diagnostics (n_skips*)." + ) + else: + if ( + self.n_skips_no_inliers_ + + self.n_skips_invalid_data_ + + self.n_skips_invalid_model_ + ) > self.max_skips: + warnings.warn( + ( + "RANSAC found a valid consensus set but exited" + " early due to skipping more iterations than" + " `max_skips`. See estimator attributes for" + " diagnostics (n_skips*)." + ), + ConvergenceWarning, + ) + + # estimate final model using all inliers + fit_params_best_idxs_subset = _check_method_params( + X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset + ) + + estimator.fit(X_inlier_best, y_inlier_best, **fit_params_best_idxs_subset) + + self.estimator_ = estimator + self.inlier_mask_ = inlier_mask_best + return self + + def predict(self, X, **params): + """Predict using the estimated model. + + This is a wrapper for `estimator_.predict(X)`. + + Parameters + ---------- + X : {array-like or sparse matrix} of shape (n_samples, n_features) + Input data. + + **params : dict + Parameters routed to the `predict` method of the sub-estimator via + the metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + y : array, shape = [n_samples] or [n_samples, n_targets] + Returns predicted values. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + ensure_all_finite=False, + accept_sparse=True, + reset=False, + ) + + _raise_for_params(params, self, "predict") + + if _routing_enabled(): + predict_params = process_routing(self, "predict", **params).estimator[ + "predict" + ] + else: + predict_params = {} + + return self.estimator_.predict(X, **predict_params) + + def score(self, X, y, **params): + """Return the score of the prediction. + + This is a wrapper for `estimator_.score(X, y)`. + + Parameters + ---------- + X : (array-like or sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + **params : dict + Parameters routed to the `score` method of the sub-estimator via + the metadata routing API. + + .. versionadded:: 1.5 + + Only available if + `sklearn.set_config(enable_metadata_routing=True)` is set. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + z : float + Score of the prediction. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + ensure_all_finite=False, + accept_sparse=True, + reset=False, + ) + + _raise_for_params(params, self, "score") + if _routing_enabled(): + score_params = process_routing(self, "score", **params).estimator["score"] + else: + score_params = {} + + return self.estimator_.score(X, y, **score_params) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. 
versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="fit", callee="score") + .add(caller="score", callee="score") + .add(caller="predict", callee="predict"), + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + if self.estimator is None: + tags.input_tags.sparse = True # default estimator is LinearRegression + else: + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py new file mode 100644 index 0000000000000000000000000000000000000000..0a55291a70ace22716d07fecffc931c8dadb093e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py @@ -0,0 +1,2899 @@ +""" +Ridge regression +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from scipy import linalg, optimize, sparse +from scipy.sparse import linalg as sp_linalg + +from sklearn.base import BaseEstimator + +from ..base import MultiOutputMixin, RegressorMixin, _fit_context, is_classifier +from ..exceptions import ConvergenceWarning +from ..metrics import check_scoring, get_scorer_names +from ..model_selection import GridSearchCV +from ..preprocessing import LabelBinarizer +from ..utils import ( + Bunch, + check_array, + check_consistent_length, + check_scalar, + column_or_1d, + compute_sample_weight, +) +from ..utils._array_api import ( + _is_numpy_namespace, + _ravel, + device, + get_namespace, + get_namespace_and_device, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils.fixes import _sparse_linalg_cg +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._base import LinearClassifierMixin, LinearModel, _preprocess_data, _rescale_data +from ._sag import sag_solver + + +def _get_rescaled_operator(X, X_offset, sample_weight_sqrt): + """Create LinearOperator for matrix products with implicit centering. + + Matrix product `LinearOperator @ coef` returns `(X - X_offset) @ coef`. 
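+
+    This keeps a sparse ``X`` sparse: rather than materializing the centered
+    matrix ``X - X_offset`` (which would be dense), the centering correction
+    is applied on the fly inside ``matvec`` and ``rmatvec``.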
+ """ + + def matvec(b): + return X.dot(b) - sample_weight_sqrt * b.dot(X_offset) + + def rmatvec(b): + return X.T.dot(b) - X_offset * b.dot(sample_weight_sqrt) + + X1 = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec) + return X1 + + +def _solve_sparse_cg( + X, + y, + alpha, + max_iter=None, + tol=1e-4, + verbose=0, + X_offset=None, + X_scale=None, + sample_weight_sqrt=None, +): + if sample_weight_sqrt is None: + sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) + + n_samples, n_features = X.shape + + if X_offset is None or X_scale is None: + X1 = sp_linalg.aslinearoperator(X) + else: + X_offset_scale = X_offset / X_scale + X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt) + + coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) + + if n_features > n_samples: + + def create_mv(curr_alpha): + def _mv(x): + return X1.matvec(X1.rmatvec(x)) + curr_alpha * x + + return _mv + + else: + + def create_mv(curr_alpha): + def _mv(x): + return X1.rmatvec(X1.matvec(x)) + curr_alpha * x + + return _mv + + for i in range(y.shape[1]): + y_column = y[:, i] + + mv = create_mv(alpha[i]) + if n_features > n_samples: + # kernel ridge + # w = X.T * inv(X X^t + alpha*Id) y + C = sp_linalg.LinearOperator( + (n_samples, n_samples), matvec=mv, dtype=X.dtype + ) + coef, info = _sparse_linalg_cg(C, y_column, rtol=tol) + coefs[i] = X1.rmatvec(coef) + else: + # linear ridge + # w = inv(X^t X + alpha*Id) * X.T y + y_column = X1.rmatvec(y_column) + C = sp_linalg.LinearOperator( + (n_features, n_features), matvec=mv, dtype=X.dtype + ) + coefs[i], info = _sparse_linalg_cg(C, y_column, maxiter=max_iter, rtol=tol) + + if info < 0: + raise ValueError("Failed with error code %d" % info) + + if max_iter is None and info > 0 and verbose: + warnings.warn( + "sparse_cg did not converge after %d iterations." % info, + ConvergenceWarning, + ) + + return coefs + + +def _solve_lsqr( + X, + y, + *, + alpha, + fit_intercept=True, + max_iter=None, + tol=1e-4, + X_offset=None, + X_scale=None, + sample_weight_sqrt=None, +): + """Solve Ridge regression via LSQR. + + We expect that y is always mean centered. + If X is dense, we expect it to be mean centered such that we can solve + ||y - Xw||_2^2 + alpha * ||w||_2^2 + + If X is sparse, we expect X_offset to be given such that we can solve + ||y - (X - X_offset)w||_2^2 + alpha * ||w||_2^2 + + With sample weights S=diag(sample_weight), this becomes + ||sqrt(S) (y - (X - X_offset) w)||_2^2 + alpha * ||w||_2^2 + and we expect y and X to already be rescaled, i.e. sqrt(S) @ y, sqrt(S) @ X. In + this case, X_offset is the sample_weight weighted mean of X before scaling by + sqrt(S). The objective then reads + ||y - (X - sqrt(S) X_offset) w)||_2^2 + alpha * ||w||_2^2 + """ + if sample_weight_sqrt is None: + sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) + + if sparse.issparse(X) and fit_intercept: + X_offset_scale = X_offset / X_scale + X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt) + else: + # No need to touch anything + X1 = X + + n_samples, n_features = X.shape + coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) + n_iter = np.empty(y.shape[1], dtype=np.int32) + + # According to the lsqr documentation, alpha = damp^2. 
+ sqrt_alpha = np.sqrt(alpha) + + for i in range(y.shape[1]): + y_column = y[:, i] + info = sp_linalg.lsqr( + X1, y_column, damp=sqrt_alpha[i], atol=tol, btol=tol, iter_lim=max_iter + ) + coefs[i] = info[0] + n_iter[i] = info[2] + + return coefs, n_iter + + +def _solve_cholesky(X, y, alpha): + # w = inv(X^t X + alpha*Id) * X.T y + n_features = X.shape[1] + n_targets = y.shape[1] + + A = safe_sparse_dot(X.T, X, dense_output=True) + Xy = safe_sparse_dot(X.T, y, dense_output=True) + + one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]]) + + if one_alpha: + A.flat[:: n_features + 1] += alpha[0] + return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T + else: + coefs = np.empty([n_targets, n_features], dtype=X.dtype) + for coef, target, current_alpha in zip(coefs, Xy.T, alpha): + A.flat[:: n_features + 1] += current_alpha + coef[:] = linalg.solve(A, target, assume_a="pos", overwrite_a=False).ravel() + A.flat[:: n_features + 1] -= current_alpha + return coefs + + +def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False): + # dual_coef = inv(X X^t + alpha*Id) y + n_samples = K.shape[0] + n_targets = y.shape[1] + + if copy: + K = K.copy() + + alpha = np.atleast_1d(alpha) + one_alpha = (alpha == alpha[0]).all() + has_sw = isinstance(sample_weight, np.ndarray) or sample_weight not in [1.0, None] + + if has_sw: + # Unlike other solvers, we need to support sample_weight directly + # because K might be a pre-computed kernel. + sw = np.sqrt(np.atleast_1d(sample_weight)) + y = y * sw[:, np.newaxis] + K *= np.outer(sw, sw) + + if one_alpha: + # Only one penalty, we can solve multi-target problems in one time. + K.flat[:: n_samples + 1] += alpha[0] + + try: + # Note: we must use overwrite_a=False in order to be able to + # use the fall-back solution below in case a LinAlgError + # is raised + dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False) + except np.linalg.LinAlgError: + warnings.warn( + "Singular matrix in solving dual problem. Using " + "least-squares solution instead." + ) + dual_coef = linalg.lstsq(K, y)[0] + + # K is expensive to compute and store in memory so change it back in + # case it was user-given. + K.flat[:: n_samples + 1] -= alpha[0] + + if has_sw: + dual_coef *= sw[:, np.newaxis] + + return dual_coef + else: + # One penalty per target. We need to solve each target separately. + dual_coefs = np.empty([n_targets, n_samples], K.dtype) + + for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha): + K.flat[:: n_samples + 1] += current_alpha + + dual_coef[:] = linalg.solve( + K, target, assume_a="pos", overwrite_a=False + ).ravel() + + K.flat[:: n_samples + 1] -= current_alpha + + if has_sw: + dual_coefs *= sw[np.newaxis, :] + + return dual_coefs.T + + +def _solve_svd(X, y, alpha, xp=None): + xp, _ = get_namespace(X, xp=xp) + U, s, Vt = xp.linalg.svd(X, full_matrices=False) + idx = s > 1e-15 # same default value as scipy.linalg.pinv + s_nnz = s[idx][:, None] + UTy = U.T @ y + d = xp.zeros((s.shape[0], alpha.shape[0]), dtype=X.dtype, device=device(X)) + d[idx] = s_nnz / (s_nnz**2 + alpha) + d_UT_y = d * UTy + return (Vt.T @ d_UT_y).T + + +def _solve_lbfgs( + X, + y, + alpha, + positive=True, + max_iter=None, + tol=1e-4, + X_offset=None, + X_scale=None, + sample_weight_sqrt=None, +): + """Solve ridge regression with LBFGS. + + The main purpose is fitting with forcing coefficients to be positive. + For unconstrained ridge regression, there are faster dedicated solver methods. 
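Editorial sketch of how this constraint surfaces in the public API (assuming a scikit-learn release that ships this module):

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X @ np.array([2.0, 0.5, -1.0]) + 0.1 * rng.randn(50)

    # positive=True makes solver='auto' resolve to 'lbfgs', the only solver that
    # supports the bound; the third (truly negative) coefficient is pushed to the
    # zero boundary.
    reg = Ridge(alpha=1.0, positive=True).fit(X, y)
    assert (reg.coef_ >= 0).all()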
+ Note that with positive bounds on the coefficients, LBFGS seems faster + than scipy.optimize.lsq_linear. + """ + n_samples, n_features = X.shape + + options = {} + if max_iter is not None: + options["maxiter"] = max_iter + config = { + "method": "L-BFGS-B", + "tol": tol, + "jac": True, + "options": options, + } + if positive: + config["bounds"] = [(0, np.inf)] * n_features + + if X_offset is not None and X_scale is not None: + X_offset_scale = X_offset / X_scale + else: + X_offset_scale = None + + if sample_weight_sqrt is None: + sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype) + + coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) + + for i in range(y.shape[1]): + x0 = np.zeros((n_features,)) + y_column = y[:, i] + + def func(w): + residual = X.dot(w) - y_column + if X_offset_scale is not None: + residual -= sample_weight_sqrt * w.dot(X_offset_scale) + f = 0.5 * residual.dot(residual) + 0.5 * alpha[i] * w.dot(w) + grad = X.T @ residual + alpha[i] * w + if X_offset_scale is not None: + grad -= X_offset_scale * residual.dot(sample_weight_sqrt) + + return f, grad + + result = optimize.minimize(func, x0, **config) + if not result["success"]: + warnings.warn( + ( + "The lbfgs solver did not converge. Try increasing max_iter " + f"or tol. Currently: max_iter={max_iter} and tol={tol}" + ), + ConvergenceWarning, + ) + coefs[i] = result["x"] + + return coefs + + +def _get_valid_accept_sparse(is_X_sparse, solver): + if is_X_sparse and solver in ["auto", "sag", "saga"]: + return "csr" + else: + return ["csr", "csc", "coo"] + + +@validate_params( + { + "X": ["array-like", "sparse matrix", sp_linalg.LinearOperator], + "y": ["array-like"], + "alpha": [Interval(Real, 0, None, closed="left"), "array-like"], + "sample_weight": [ + Interval(Real, None, None, closed="neither"), + "array-like", + None, + ], + "solver": [ + StrOptions( + {"auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"} + ) + ], + "max_iter": [Interval(Integral, 0, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left")], + "verbose": ["verbose"], + "positive": ["boolean"], + "random_state": ["random_state"], + "return_n_iter": ["boolean"], + "return_intercept": ["boolean"], + "check_input": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def ridge_regression( + X, + y, + alpha, + *, + sample_weight=None, + solver="auto", + max_iter=None, + tol=1e-4, + verbose=0, + positive=False, + random_state=None, + return_n_iter=False, + return_intercept=False, + check_input=True, +): + """Solve the ridge equation by the method of normal equations. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix, LinearOperator} of shape \ + (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + alpha : float or array-like of shape (n_targets,) + Constant that multiplies the L2 term, controlling regularization + strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. + + When `alpha = 0`, the objective is equivalent to ordinary least + squares, solved by the :class:`LinearRegression` object. For numerical + reasons, using `alpha = 0` with the `Ridge` object is not advised. + Instead, you should use the :class:`LinearRegression` object. + + If an array is passed, penalties are assumed to be specific to the + targets. Hence they must correspond in number. 
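For example (an illustrative aside, not part of this docstring), an array-valued `alpha` assigns one penalty per column of a 2-D `y`:

    import numpy as np
    from sklearn.linear_model import ridge_regression

    rng = np.random.RandomState(0)
    X, Y = rng.randn(30, 4), rng.randn(30, 2)                    # two targets
    coef = ridge_regression(X, Y, alpha=np.array([0.5, 10.0]))   # one penalty per target
    print(coef.shape)                                            # (2, 4)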
+ + sample_weight : float or array-like of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. If sample_weight is not None and + solver='auto', the solver will be set to 'cholesky'. + + .. versionadded:: 0.17 + + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ + 'sag', 'saga', 'lbfgs'}, default='auto' + Solver to use in the computational routines: + + - 'auto' chooses the solver automatically based on the type of data. + + - 'svd' uses a Singular Value Decomposition of X to compute the Ridge + coefficients. It is the most stable solver, in particular more stable + for singular matrices than 'cholesky' at the cost of being slower. + + - 'cholesky' uses the standard scipy.linalg.solve function to + obtain a closed-form solution via a Cholesky decomposition of + dot(X.T, X) + + - 'sparse_cg' uses the conjugate gradient solver as found in + scipy.sparse.linalg.cg. As an iterative algorithm, this solver is + more appropriate than 'cholesky' for large-scale data + (possibility to set `tol` and `max_iter`). + + - 'lsqr' uses the dedicated regularized least-squares routine + scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative + procedure. + + - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses + its improved, unbiased version named SAGA. Both methods also use an + iterative procedure, and are often faster than other solvers when + both n_samples and n_features are large. Note that 'sag' and + 'saga' fast convergence is only guaranteed on features with + approximately the same scale. You can preprocess the data with a + scaler from sklearn.preprocessing. + + - 'lbfgs' uses L-BFGS-B algorithm implemented in + `scipy.optimize.minimize`. It can be used only when `positive` + is True. + + All solvers except 'svd' support both dense and sparse data. However, only + 'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when + `fit_intercept` is True. + + .. versionadded:: 0.17 + Stochastic Average Gradient descent solver. + .. versionadded:: 0.19 + SAGA solver. + + max_iter : int, default=None + Maximum number of iterations for conjugate gradient solver. + For the 'sparse_cg' and 'lsqr' solvers, the default value is determined + by scipy.sparse.linalg. For 'sag' and saga solver, the default value is + 1000. For 'lbfgs' solver, the default value is 15000. + + tol : float, default=1e-4 + Precision of the solution. Note that `tol` has no effect for solvers 'svd' and + 'cholesky'. + + .. versionchanged:: 1.2 + Default value changed from 1e-3 to 1e-4 for consistency with other linear + models. + + verbose : int, default=0 + Verbosity level. Setting verbose > 0 will display additional + information depending on the solver used. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + Only 'lbfgs' solver is supported in this case. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. + + return_n_iter : bool, default=False + If True, the method also returns `n_iter`, the actual number of + iteration performed by the solver. + + .. versionadded:: 0.17 + + return_intercept : bool, default=False + If True and if X is sparse, the method also returns the intercept, + and the solver is automatically changed to 'sag'. This is only a + temporary fix for fitting the intercept with sparse data. 
For dense + data, use sklearn.linear_model._preprocess_data before your regression. + + .. versionadded:: 0.17 + + check_input : bool, default=True + If False, the input arrays X and y will not be checked. + + .. versionadded:: 0.21 + + Returns + ------- + coef : ndarray of shape (n_features,) or (n_targets, n_features) + Weight vector(s). + + n_iter : int, optional + The actual number of iteration performed by the solver. + Only returned if `return_n_iter` is True. + + intercept : float or ndarray of shape (n_targets,) + The intercept of the model. Only returned if `return_intercept` + is True and if X is a scipy sparse array. + + Notes + ----- + This function won't compute the intercept. + + Regularization improves the conditioning of the problem and + reduces the variance of the estimates. Larger values specify stronger + regularization. Alpha corresponds to ``1 / (2C)`` in other linear + models such as :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are + assumed to be specific to the targets. Hence they must correspond in + number. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_regression + >>> from sklearn.linear_model import ridge_regression + >>> rng = np.random.RandomState(0) + >>> X = rng.randn(100, 4) + >>> y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + 0.1 * rng.standard_normal(100) + >>> coef, intercept = ridge_regression(X, y, alpha=1.0, return_intercept=True, + ... random_state=0) + >>> coef + array([ 1.97, -1., -2.69e-3, -9.27e-4 ]) + >>> intercept + np.float64(-.0012) + """ + return _ridge_regression( + X, + y, + alpha, + sample_weight=sample_weight, + solver=solver, + max_iter=max_iter, + tol=tol, + verbose=verbose, + positive=positive, + random_state=random_state, + return_n_iter=return_n_iter, + return_intercept=return_intercept, + X_scale=None, + X_offset=None, + check_input=check_input, + ) + + +def _ridge_regression( + X, + y, + alpha, + sample_weight=None, + solver="auto", + max_iter=None, + tol=1e-4, + verbose=0, + positive=False, + random_state=None, + return_n_iter=False, + return_intercept=False, + return_solver=False, + X_scale=None, + X_offset=None, + check_input=True, + fit_intercept=False, +): + xp, is_array_api_compliant, device_ = get_namespace_and_device( + X, y, sample_weight, X_scale, X_offset + ) + is_numpy_namespace = _is_numpy_namespace(xp) + X_is_sparse = sparse.issparse(X) + + has_sw = sample_weight is not None + + solver = resolve_solver(solver, positive, return_intercept, X_is_sparse, xp) + + if is_numpy_namespace and not X_is_sparse: + X = np.asarray(X) + + if not is_numpy_namespace and solver != "svd": + raise ValueError( + f"Array API dispatch to namespace {xp.__name__} only supports " + f"solver 'svd'. Got '{solver}'." + ) + + if positive and solver != "lbfgs": + raise ValueError( + "When positive=True, only 'lbfgs' solver can be used. " + f"Please change solver {solver} to 'lbfgs' " + "or set positive=False." + ) + + if solver == "lbfgs" and not positive: + raise ValueError( + "'lbfgs' solver can be used only when positive=True. " + "Please use another solver." + ) + + if return_intercept and solver != "sag": + raise ValueError( + "In Ridge, only 'sag' solver can directly fit the " + "intercept. Please change solver to 'sag' or set " + "return_intercept=False." 
+ ) + + if check_input: + _dtype = [xp.float64, xp.float32] + _accept_sparse = _get_valid_accept_sparse(X_is_sparse, solver) + X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, order="C") + y = check_array(y, dtype=X.dtype, ensure_2d=False, order=None) + check_consistent_length(X, y) + + n_samples, n_features = X.shape + + if y.ndim > 2: + raise ValueError("Target y has the wrong shape %s" % str(y.shape)) + + if y.ndim == 1: + y = xp.reshape(y, (-1, 1)) + + n_samples_, n_targets = y.shape + + if n_samples != n_samples_: + raise ValueError( + "Number of samples in X and y does not correspond: %d != %d" + % (n_samples, n_samples_) + ) + + if has_sw: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if solver not in ["sag", "saga"]: + # SAG supports sample_weight directly. For other solvers, + # we implement sample_weight via a simple rescaling. + X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight) + + # Some callers of this method might pass alpha as single + # element array which already has been validated. + if alpha is not None and not isinstance(alpha, type(xp.asarray([0.0]))): + alpha = check_scalar( + alpha, + "alpha", + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) + + # There should be either 1 or n_targets penalties + alpha = _ravel(xp.asarray(alpha, device=device_, dtype=X.dtype), xp=xp) + if alpha.shape[0] not in [1, n_targets]: + raise ValueError( + "Number of targets and number of penalties do not correspond: %d != %d" + % (alpha.shape[0], n_targets) + ) + + if alpha.shape[0] == 1 and n_targets > 1: + alpha = xp.full( + shape=(n_targets,), fill_value=alpha[0], dtype=alpha.dtype, device=device_ + ) + + n_iter = None + if solver == "sparse_cg": + coef = _solve_sparse_cg( + X, + y, + alpha, + max_iter=max_iter, + tol=tol, + verbose=verbose, + X_offset=X_offset, + X_scale=X_scale, + sample_weight_sqrt=sample_weight_sqrt if has_sw else None, + ) + + elif solver == "lsqr": + coef, n_iter = _solve_lsqr( + X, + y, + alpha=alpha, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + X_offset=X_offset, + X_scale=X_scale, + sample_weight_sqrt=sample_weight_sqrt if has_sw else None, + ) + + elif solver == "cholesky": + if n_features > n_samples: + K = safe_sparse_dot(X, X.T, dense_output=True) + try: + dual_coef = _solve_cholesky_kernel(K, y, alpha) + + coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T + except linalg.LinAlgError: + # use SVD solver if matrix is singular + solver = "svd" + else: + try: + coef = _solve_cholesky(X, y, alpha) + except linalg.LinAlgError: + # use SVD solver if matrix is singular + solver = "svd" + + elif solver in ["sag", "saga"]: + # precompute max_squared_sum for all targets + max_squared_sum = row_norms(X, squared=True).max() + + coef = np.empty((y.shape[1], n_features), dtype=X.dtype) + n_iter = np.empty(y.shape[1], dtype=np.int32) + intercept = np.zeros((y.shape[1],), dtype=X.dtype) + for i, (alpha_i, target) in enumerate(zip(alpha, y.T)): + init = { + "coef": np.zeros((n_features + int(return_intercept), 1), dtype=X.dtype) + } + coef_, n_iter_, _ = sag_solver( + X, + target.ravel(), + sample_weight, + "squared", + alpha_i, + 0, + max_iter, + tol, + verbose, + random_state, + False, + max_squared_sum, + init, + is_saga=solver == "saga", + ) + if return_intercept: + coef[i] = coef_[:-1] + intercept[i] = coef_[-1] + else: + coef[i] = coef_ + n_iter[i] = n_iter_ + + if intercept.shape[0] == 1: + intercept = intercept[0] + + elif solver == "lbfgs": + coef = 
_solve_lbfgs( + X, + y, + alpha, + positive=positive, + tol=tol, + max_iter=max_iter, + X_offset=X_offset, + X_scale=X_scale, + sample_weight_sqrt=sample_weight_sqrt if has_sw else None, + ) + + if solver == "svd": + if X_is_sparse: + raise TypeError("SVD solver does not support sparse inputs currently") + coef = _solve_svd(X, y, alpha, xp) + + if n_targets == 1: + coef = _ravel(coef) + + coef = xp.asarray(coef) + + if return_n_iter and return_intercept: + res = coef, n_iter, intercept + elif return_intercept: + res = coef, intercept + elif return_n_iter: + res = coef, n_iter + else: + res = coef + + return (*res, solver) if return_solver else res + + +def resolve_solver(solver, positive, return_intercept, is_sparse, xp): + if solver != "auto": + return solver + + is_numpy_namespace = _is_numpy_namespace(xp) + + auto_solver_np = resolve_solver_for_numpy(positive, return_intercept, is_sparse) + if is_numpy_namespace: + return auto_solver_np + + if positive: + raise ValueError( + "The solvers that support positive fitting do not support " + f"Array API dispatch to namespace {xp.__name__}. Please " + "either disable Array API dispatch, or use a numpy-like " + "namespace, or set `positive=False`." + ) + + # At the moment, Array API dispatch only supports the "svd" solver. + solver = "svd" + if solver != auto_solver_np: + warnings.warn( + f"Using Array API dispatch to namespace {xp.__name__} with " + f"`solver='auto'` will result in using the solver '{solver}'. " + "The results may differ from those when using a Numpy array, " + f"because in that case the preferred solver would be {auto_solver_np}. " + f"Set `solver='{solver}'` to suppress this warning." + ) + + return solver + + +def resolve_solver_for_numpy(positive, return_intercept, is_sparse): + if positive: + return "lbfgs" + + if return_intercept: + # sag supports fitting intercept directly + return "sag" + + if not is_sparse: + return "cholesky" + + return "sparse_cg" + + +class _BaseRidge(LinearModel, metaclass=ABCMeta): + _parameter_constraints: dict = { + "alpha": [Interval(Real, 0, None, closed="left"), np.ndarray], + "fit_intercept": ["boolean"], + "copy_X": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left"), None], + "tol": [Interval(Real, 0, None, closed="left")], + "solver": [ + StrOptions( + {"auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"} + ) + ], + "positive": ["boolean"], + "random_state": ["random_state"], + } + + @abstractmethod + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=None, + tol=1e-4, + solver="auto", + positive=False, + random_state=None, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.max_iter = max_iter + self.tol = tol + self.solver = solver + self.positive = positive + self.random_state = random_state + + def fit(self, X, y, sample_weight=None): + xp, is_array_api_compliant = get_namespace(X, y, sample_weight) + + if self.solver == "lbfgs" and not self.positive: + raise ValueError( + "'lbfgs' solver can be used only when positive=True. " + "Please use another solver." + ) + + if self.positive: + if self.solver not in ["auto", "lbfgs"]: + raise ValueError( + f"solver='{self.solver}' does not support positive fitting. 
Please" + " set the solver to 'auto' or 'lbfgs', or set `positive=False`" + ) + else: + solver = self.solver + elif sparse.issparse(X) and self.fit_intercept: + if self.solver not in ["auto", "lbfgs", "lsqr", "sag", "sparse_cg"]: + raise ValueError( + "solver='{}' does not support fitting the intercept " + "on sparse data. Please set the solver to 'auto' or " + "'lsqr', 'sparse_cg', 'sag', 'lbfgs' " + "or set `fit_intercept=False`".format(self.solver) + ) + if self.solver in ["lsqr", "lbfgs"]: + solver = self.solver + elif self.solver == "sag" and self.max_iter is None and self.tol > 1e-4: + warnings.warn( + '"sag" solver requires many iterations to fit ' + "an intercept with sparse inputs. Either set the " + 'solver to "auto" or "sparse_cg", or set a low ' + '"tol" and a high "max_iter" (especially if inputs are ' + "not standardized)." + ) + solver = "sag" + else: + solver = "sparse_cg" + else: + solver = self.solver + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # when X is sparse we only remove offset from y + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=self.copy_X, + sample_weight=sample_weight, + ) + + if solver == "sag" and sparse.issparse(X) and self.fit_intercept: + self.coef_, self.n_iter_, self.intercept_, self.solver_ = _ridge_regression( + X, + y, + alpha=self.alpha, + sample_weight=sample_weight, + max_iter=self.max_iter, + tol=self.tol, + solver="sag", + positive=self.positive, + random_state=self.random_state, + return_n_iter=True, + return_intercept=True, + return_solver=True, + check_input=False, + ) + # add the offset which was subtracted by _preprocess_data + self.intercept_ += y_offset + + else: + if sparse.issparse(X) and self.fit_intercept: + # required to fit intercept with sparse_cg and lbfgs solver + params = {"X_offset": X_offset, "X_scale": X_scale} + else: + # for dense matrices or when intercept is set to 0 + params = {} + + self.coef_, self.n_iter_, self.solver_ = _ridge_regression( + X, + y, + alpha=self.alpha, + sample_weight=sample_weight, + max_iter=self.max_iter, + tol=self.tol, + solver=solver, + positive=self.positive, + random_state=self.random_state, + return_n_iter=True, + return_intercept=False, + return_solver=True, + check_input=False, + fit_intercept=self.fit_intercept, + **params, + ) + self._set_intercept(X_offset, y_offset, X_scale) + + return self + + +class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): + """Linear least squares with l2 regularization. + + Minimizes the objective function:: + + ||y - Xw||^2_2 + alpha * ||w||^2_2 + + This model solves a regression model where the loss function is + the linear least squares function and regularization is given by + the l2-norm. Also known as Ridge Regression or Tikhonov regularization. + This estimator has built-in support for multi-variate regression + (i.e., when y is a 2d-array of shape (n_samples, n_targets)). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : {float, ndarray of shape (n_targets,)}, default=1.0 + Constant that multiplies the L2 term, controlling regularization + strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. + + When `alpha = 0`, the objective is equivalent to ordinary least + squares, solved by the :class:`LinearRegression` object. For numerical + reasons, using `alpha = 0` with the `Ridge` object is not advised. + Instead, you should use the :class:`LinearRegression` object. 
+ + If an array is passed, penalties are assumed to be specific to the + targets. Hence they must correspond in number. + + fit_intercept : bool, default=True + Whether to fit the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. ``X`` and ``y`` are expected to be centered). + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + max_iter : int, default=None + Maximum number of iterations for conjugate gradient solver. + For 'sparse_cg' and 'lsqr' solvers, the default value is determined + by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. + For 'lbfgs' solver, the default value is 15000. + + tol : float, default=1e-4 + The precision of the solution (`coef_`) is determined by `tol` which + specifies a different convergence criterion for each solver: + + - 'svd': `tol` has no impact. + + - 'cholesky': `tol` has no impact. + + - 'sparse_cg': norm of residuals smaller than `tol`. + + - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr, + which control the norm of the residual vector in terms of the norms of + matrix and coefficients. + + - 'sag' and 'saga': relative change of coef smaller than `tol`. + + - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals| + smaller than `tol`. + + .. versionchanged:: 1.2 + Default value changed from 1e-3 to 1e-4 for consistency with other linear + models. + + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ + 'sag', 'saga', 'lbfgs'}, default='auto' + Solver to use in the computational routines: + + - 'auto' chooses the solver automatically based on the type of data. + + - 'svd' uses a Singular Value Decomposition of X to compute the Ridge + coefficients. It is the most stable solver, in particular more stable + for singular matrices than 'cholesky' at the cost of being slower. + + - 'cholesky' uses the standard scipy.linalg.solve function to + obtain a closed-form solution. + + - 'sparse_cg' uses the conjugate gradient solver as found in + scipy.sparse.linalg.cg. As an iterative algorithm, this solver is + more appropriate than 'cholesky' for large-scale data + (possibility to set `tol` and `max_iter`). + + - 'lsqr' uses the dedicated regularized least-squares routine + scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative + procedure. + + - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses + its improved, unbiased version named SAGA. Both methods also use an + iterative procedure, and are often faster than other solvers when + both n_samples and n_features are large. Note that 'sag' and + 'saga' fast convergence is only guaranteed on features with + approximately the same scale. You can preprocess the data with a + scaler from sklearn.preprocessing. + + - 'lbfgs' uses L-BFGS-B algorithm implemented in + `scipy.optimize.minimize`. It can be used only when `positive` + is True. + + All solvers except 'svd' support both dense and sparse data. However, only + 'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when + `fit_intercept` is True. + + .. versionadded:: 0.17 + Stochastic Average Gradient descent solver. + .. versionadded:: 0.19 + SAGA solver. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + Only 'lbfgs' solver is supported in this case. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. + + .. 
versionadded:: 0.17 + `random_state` to support Stochastic Average Gradient. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) or (n_targets, n_features) + Weight vector(s). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + n_iter_ : None or ndarray of shape (n_targets,) + Actual number of iterations for each target. Available only for + sag and lsqr solvers. Other solvers will return None. + + .. versionadded:: 0.17 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + solver_ : str + The solver that was used at fit time by the computational + routines. + + .. versionadded:: 1.5 + + See Also + -------- + RidgeClassifier : Ridge classifier. + RidgeCV : Ridge regression with built-in cross validation. + :class:`~sklearn.kernel_ridge.KernelRidge` : Kernel ridge regression + combines ridge regression with the kernel trick. + + Notes + ----- + Regularization improves the conditioning of the problem and + reduces the variance of the estimates. Larger values specify stronger + regularization. Alpha corresponds to ``1 / (2C)`` in other linear + models such as :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + + Examples + -------- + >>> from sklearn.linear_model import Ridge + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> clf = Ridge(alpha=1.0) + >>> clf.fit(X, y) + Ridge() + """ + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=None, + tol=1e-4, + solver="auto", + positive=False, + random_state=None, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + copy_X=copy_X, + max_iter=max_iter, + tol=tol, + solver=solver, + positive=positive, + random_state=random_state, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit Ridge regression model. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + Returns + ------- + self : object + Fitted estimator. + """ + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) + xp, _ = get_namespace(X, y, sample_weight) + X, y = validate_data( + self, + X, + y, + accept_sparse=_accept_sparse, + dtype=[xp.float64, xp.float32], + force_writeable=True, + multi_output=True, + y_numeric=True, + ) + return super().fit(X, y, sample_weight=sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.array_api_support = True + tags.input_tags.sparse = (self.solver != "svd") and ( + self.solver != "cholesky" or not self.fit_intercept + ) + return tags + + +class _RidgeClassifierMixin(LinearClassifierMixin): + def _prepare_data(self, X, y, sample_weight, solver): + """Validate `X` and `y` and binarize `y`. 
+ + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + solver : str + The solver used in `Ridge` to know which sparse format to support. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Validated training data. + + y : ndarray of shape (n_samples,) + Validated target values. + + sample_weight : ndarray of shape (n_samples,) + Validated sample weights. + + Y : ndarray of shape (n_samples, n_classes) + The binarized version of `y`. + """ + accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) + X, y = validate_data( + self, + X, + y, + accept_sparse=accept_sparse, + multi_output=True, + y_numeric=False, + force_writeable=True, + ) + + self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) + Y = self._label_binarizer.fit_transform(y) + if not self._label_binarizer.y_type_.startswith("multilabel"): + y = column_or_1d(y, warn=True) + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + if self.class_weight: + sample_weight = sample_weight * compute_sample_weight(self.class_weight, y) + return X, y, sample_weight, Y + + def predict(self, X): + """Predict class labels for samples in `X`. + + Parameters + ---------- + X : {array-like, spare matrix} of shape (n_samples, n_features) + The data matrix for which we want to predict the targets. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) + Vector or matrix containing the predictions. In binary and + multiclass problems, this is a vector containing `n_samples`. In + a multilabel problem, it returns a matrix of shape + `(n_samples, n_outputs)`. + """ + check_is_fitted(self, attributes=["_label_binarizer"]) + if self._label_binarizer.y_type_.startswith("multilabel"): + # Threshold such that the negative label is -1 and positive label + # is 1 to use the inverse transform of the label binarizer fitted + # during fit. + scores = 2 * (self.decision_function(X) > 0) - 1 + return self._label_binarizer.inverse_transform(scores) + return super().predict(X) + + @property + def classes_(self): + """Classes labels.""" + return self._label_binarizer.classes_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags + + +class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge): + """Classifier using Ridge regression. + + This classifier first converts the target values into ``{-1, 1}`` and + then treats the problem as a regression task (multi-output regression in + the multiclass case). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, default=1.0 + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set to false, no + intercept will be used in calculations (e.g. data is expected to be + already centered). 
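Editorial sketch of the {-1, 1} reduction described in the class summary above: on a binary problem, the classifier agrees with thresholding a plain Ridge regression on targets encoded as -1/+1.

    import numpy as np
    from sklearn.linear_model import Ridge, RidgeClassifier

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = (X[:, 0] + 0.3 * rng.randn(100) > 0).astype(int)          # labels in {0, 1}

    clf = RidgeClassifier(alpha=1.0).fit(X, y)
    reg = Ridge(alpha=1.0).fit(X, np.where(y == 1, 1.0, -1.0))    # regress on {-1, +1}
    assert np.array_equal(clf.predict(X), (reg.predict(X) > 0).astype(int))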
+ + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + max_iter : int, default=None + Maximum number of iterations for conjugate gradient solver. + The default value is determined by scipy.sparse.linalg. + + tol : float, default=1e-4 + The precision of the solution (`coef_`) is determined by `tol` which + specifies a different convergence criterion for each solver: + + - 'svd': `tol` has no impact. + + - 'cholesky': `tol` has no impact. + + - 'sparse_cg': norm of residuals smaller than `tol`. + + - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr, + which control the norm of the residual vector in terms of the norms of + matrix and coefficients. + + - 'sag' and 'saga': relative change of coef smaller than `tol`. + + - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals| + smaller than `tol`. + + .. versionchanged:: 1.2 + Default value changed from 1e-3 to 1e-4 for consistency with other linear + models. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \ + 'sag', 'saga', 'lbfgs'}, default='auto' + Solver to use in the computational routines: + + - 'auto' chooses the solver automatically based on the type of data. + + - 'svd' uses a Singular Value Decomposition of X to compute the Ridge + coefficients. It is the most stable solver, in particular more stable + for singular matrices than 'cholesky' at the cost of being slower. + + - 'cholesky' uses the standard scipy.linalg.solve function to + obtain a closed-form solution. + + - 'sparse_cg' uses the conjugate gradient solver as found in + scipy.sparse.linalg.cg. As an iterative algorithm, this solver is + more appropriate than 'cholesky' for large-scale data + (possibility to set `tol` and `max_iter`). + + - 'lsqr' uses the dedicated regularized least-squares routine + scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative + procedure. + + - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses + its unbiased and more flexible version named SAGA. Both methods + use an iterative procedure, and are often faster than other solvers + when both n_samples and n_features are large. Note that 'sag' and + 'saga' fast convergence is only guaranteed on features with + approximately the same scale. You can preprocess the data with a + scaler from sklearn.preprocessing. + + .. versionadded:: 0.17 + Stochastic Average Gradient descent solver. + .. versionadded:: 0.19 + SAGA solver. + + - 'lbfgs' uses L-BFGS-B algorithm implemented in + `scipy.optimize.minimize`. It can be used only when `positive` + is True. + + positive : bool, default=False + When set to ``True``, forces the coefficients to be positive. + Only 'lbfgs' solver is supported in this case. + + random_state : int, RandomState instance, default=None + Used when ``solver`` == 'sag' or 'saga' to shuffle the data. + See :term:`Glossary ` for details. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + Coefficient of the features in the decision function. + + ``coef_`` is of shape (1, n_features) when the given problem is binary. 
+ + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + n_iter_ : None or ndarray of shape (n_targets,) + Actual number of iterations for each target. Available only for + sag and lsqr solvers. Other solvers will return None. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + solver_ : str + The solver that was used at fit time by the computational + routines. + + .. versionadded:: 1.5 + + See Also + -------- + Ridge : Ridge regression. + RidgeClassifierCV : Ridge classifier with built-in cross validation. + + Notes + ----- + For multi-class classification, n_class classifiers are trained in + a one-versus-all approach. Concretely, this is implemented by taking + advantage of the multi-variate response support in Ridge. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import RidgeClassifier + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = RidgeClassifier().fit(X, y) + >>> clf.score(X, y) + 0.9595... + """ + + _parameter_constraints: dict = { + **_BaseRidge._parameter_constraints, + "class_weight": [dict, StrOptions({"balanced"}), None], + } + + def __init__( + self, + alpha=1.0, + *, + fit_intercept=True, + copy_X=True, + max_iter=None, + tol=1e-4, + class_weight=None, + solver="auto", + positive=False, + random_state=None, + ): + super().__init__( + alpha=alpha, + fit_intercept=fit_intercept, + copy_X=copy_X, + max_iter=max_iter, + tol=tol, + solver=solver, + positive=positive, + random_state=random_state, + ) + self.class_weight = class_weight + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit Ridge classifier model. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + .. versionadded:: 0.17 + *sample_weight* support to RidgeClassifier. + + Returns + ------- + self : object + Instance of the estimator. + """ + X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) + + super().fit(X, Y, sample_weight=sample_weight) + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = (self.solver != "svd") and ( + self.solver != "cholesky" or not self.fit_intercept + ) + return tags + + +def _check_gcv_mode(X, gcv_mode): + if gcv_mode in ["eigen", "svd"]: + return gcv_mode + # if X has more rows than columns, use decomposition of X^T.X, + # otherwise X.X^T + if X.shape[0] > X.shape[1]: + return "svd" + return "eigen" + + +def _find_smallest_angle(query, vectors): + """Find the column of vectors that is most aligned with the query. + + Both query and the columns of vectors must have their l2 norm equal to 1. + + Parameters + ---------- + query : ndarray of shape (n_samples,) + Normalized query vector. + + vectors : ndarray of shape (n_samples, n_features) + Vectors to which we compare query, as columns. Must be normalized. 
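A two-column toy illustration of what this helper returns (editorial aside):

    import numpy as np

    query = np.array([1.0, 0.0])
    vectors = np.array([[0.6, -1.0],
                        [0.8,  0.0]])             # both columns have unit l2 norm
    print(np.argmax(np.abs(query.dot(vectors))))  # 1: the second column is (anti-)parallel to query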
+ """ + abs_cosine = np.abs(query.dot(vectors)) + index = np.argmax(abs_cosine) + return index + + +class _X_CenterStackOp(sparse.linalg.LinearOperator): + """Behaves as centered and scaled X with an added intercept column. + + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]) + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_samples, n_features + 1)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + return ( + safe_sparse_dot(self.X, v[:-1], dense_output=True) + - self.sqrt_sw * self.X_mean.dot(v[:-1]) + + v[-1] * self.sqrt_sw + ) + + def _matmat(self, v): + return ( + safe_sparse_dot(self.X, v[:-1], dense_output=True) + - self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + + v[-1] * self.sqrt_sw[:, None] + ) + + def _transpose(self): + return _XT_CenterStackOp(self.X, self.X_mean, self.sqrt_sw) + + +class _XT_CenterStackOp(sparse.linalg.LinearOperator): + """Behaves as transposed centered and scaled X with an intercept column. + + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_features + 1, n_samples)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + n_features = self.shape[0] + res = np.empty(n_features, dtype=self.X.dtype) + res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - ( + self.X_mean * self.sqrt_sw.dot(v) + ) + res[-1] = np.dot(v, self.sqrt_sw) + return res + + def _matmat(self, v): + n_features = self.shape[0] + res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype) + res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean[ + :, None + ] * self.sqrt_sw.dot(v) + res[-1] = np.dot(self.sqrt_sw, v) + return res + + +class _IdentityRegressor(RegressorMixin, BaseEstimator): + """Fake regressor which will directly output the prediction.""" + + def decision_function(self, y_predict): + return y_predict + + def predict(self, y_predict): + return y_predict + + +class _IdentityClassifier(LinearClassifierMixin, BaseEstimator): + """Fake classifier which will directly output the prediction. + + We inherit from LinearClassifierMixin to get the proper shape for the + output `y`. + """ + + def __init__(self, classes): + self.classes_ = classes + + def decision_function(self, y_predict): + return y_predict + + +class _RidgeGCV(LinearModel): + """Ridge regression with built-in Leave-one-out Cross-Validation. + + This class is not intended to be used directly. Use RidgeCV instead. + + `_RidgeGCV` uses a Generalized Cross-Validation for model selection. It's an + efficient approximation of leave-one-out cross-validation (LOO-CV), where instead of + computing multiple models by excluding one data point at a time, it uses an + algebraic shortcut to approximate the LOO-CV error, making it faster and + computationally more efficient. + + Using a naive grid-search approach with a leave-one-out cross-validation in contrast + requires to fit `n_samples` models to compute the prediction error for each sample + and then to repeat this process for each alpha in the grid. + + Here, the prediction error for each sample is computed by solving a **single** + linear system (in other words a single model) via a matrix factorization (i.e. + eigendecomposition or SVD) solving the problem stated in the Notes section. 
Finally, + we need to repeat this process for each alpha in the grid. The detailed complexity + is further discussed in Sect. 4 in [1]. + + This algebraic approach is only applicable for regularized least squares + problems. It could potentially be extended to kernel ridge regression. + + See the Notes section and references for more details regarding the formulation + and the linear system that is solved. + + Notes + ----- + + We want to solve (K + alpha*Id)c = y, + where K = X X^T is the kernel matrix. + + Let G = (K + alpha*Id). + + Dual solution: c = G^-1y + Primal solution: w = X^T c + + Compute eigendecomposition K = Q V Q^T. + Then G^-1 = Q (V + alpha*Id)^-1 Q^T, + where (V + alpha*Id) is diagonal. + It is thus inexpensive to inverse for many alphas. + + Let loov be the vector of prediction values for each example + when the model was fitted with all examples but this example. + + loov = (KG^-1Y - diag(KG^-1)Y) / diag(I-KG^-1) + + Let looe be the vector of prediction errors for each example + when the model was fitted with all examples but this example. + + looe = y - loov = c / diag(G^-1) + + The best score (negative mean squared error or user-provided scoring) is + stored in the `best_score_` attribute, and the selected hyperparameter in + `alpha_`. + + References + ---------- + [1] http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf + [2] https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf + """ + + def __init__( + self, + alphas=(0.1, 1.0, 10.0), + *, + fit_intercept=True, + scoring=None, + copy_X=True, + gcv_mode=None, + store_cv_results=False, + is_clf=False, + alpha_per_target=False, + ): + self.alphas = alphas + self.fit_intercept = fit_intercept + self.scoring = scoring + self.copy_X = copy_X + self.gcv_mode = gcv_mode + self.store_cv_results = store_cv_results + self.is_clf = is_clf + self.alpha_per_target = alpha_per_target + + @staticmethod + def _decomp_diag(v_prime, Q): + # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) + return (v_prime * Q**2).sum(axis=-1) + + @staticmethod + def _diag_dot(D, B): + # compute dot(diag(D), B) + if len(B.shape) > 1: + # handle case where B is > 1-d + D = D[(slice(None),) + (np.newaxis,) * (len(B.shape) - 1)] + return D * B + + def _compute_gram(self, X, sqrt_sw): + """Computes the Gram matrix XX^T with possible centering. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The preprocessed design matrix. + + sqrt_sw : ndarray of shape (n_samples,) + square roots of sample weights + + Returns + ------- + gram : ndarray of shape (n_samples, n_samples) + The Gram matrix. + X_mean : ndarray of shape (n_feature,) + The weighted mean of ``X`` for each feature. + + Notes + ----- + When X is dense the centering has been done in preprocessing + so the mean is 0 and we just compute XX^T. + + When X is sparse it has not been centered in preprocessing, but it has + been scaled by sqrt(sample weights). + + When self.fit_intercept is False no centering is done. + + The centered X is never actually computed because centering would break + the sparsity of X. + """ + center = self.fit_intercept and sparse.issparse(X) + if not center: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. 
+ X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X, X.T, dense_output=True), X_mean + # X is sparse + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples) + ) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean *= n_samples / sqrt_sw.dot(sqrt_sw) + X_mX = sqrt_sw[:, None] * safe_sparse_dot(X_mean, X.T, dense_output=True) + X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean) + return ( + safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m - X_mX - X_mX.T, + X_mean, + ) + + def _compute_covariance(self, X, sqrt_sw): + """Computes covariance matrix X^TX with possible centering. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The preprocessed design matrix. + + sqrt_sw : ndarray of shape (n_samples,) + square roots of sample weights + + Returns + ------- + covariance : ndarray of shape (n_features, n_features) + The covariance matrix. + X_mean : ndarray of shape (n_feature,) + The weighted mean of ``X`` for each feature. + + Notes + ----- + Since X is sparse it has not been centered in preprocessing, but it has + been scaled by sqrt(sample weights). + + When self.fit_intercept is False no centering is done. + + The centered X is never actually computed because centering would break + the sparsity of X. + """ + if not self.fit_intercept: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X.T, X, dense_output=True), X_mean + # this function only gets called for sparse X + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples) + ) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw) + weight_sum = sqrt_sw.dot(sqrt_sw) + return ( + safe_sparse_dot(X.T, X, dense_output=True) + - weight_sum * np.outer(X_mean, X_mean), + X_mean, + ) + + def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): + """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T) + without explicitly centering X nor computing X.dot(A) + when X is sparse. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + + A : ndarray of shape (n_features, n_features) + + X_mean : ndarray of shape (n_features,) + + sqrt_sw : ndarray of shape (n_features,) + square roots of sample weights + + Returns + ------- + diag : np.ndarray, shape (n_samples,) + The computed diagonal. 
+ """ + intercept_col = scale = sqrt_sw + batch_size = X.shape[1] + diag = np.empty(X.shape[0], dtype=X.dtype) + for start in range(0, X.shape[0], batch_size): + batch = slice(start, min(X.shape[0], start + batch_size), 1) + X_batch = np.empty( + (X[batch].shape[0], X.shape[1] + self.fit_intercept), dtype=X.dtype + ) + if self.fit_intercept: + X_batch[:, :-1] = X[batch].toarray() - X_mean * scale[batch][:, None] + X_batch[:, -1] = intercept_col[batch] + else: + X_batch = X[batch].toarray() + diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1) + return diag + + def _eigen_decompose_gram(self, X, y, sqrt_sw): + """Eigendecomposition of X.X^T, used when n_samples <= n_features.""" + # if X is dense it has already been centered in preprocessing + K, X_mean = self._compute_gram(X, sqrt_sw) + if self.fit_intercept: + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. + # by centering, it is orthogonal to the other columns + K += np.outer(sqrt_sw, sqrt_sw) + eigvals, Q = linalg.eigh(K) + QT_y = np.dot(Q.T, y) + return X_mean, eigvals, Q, QT_y + + def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, eigvals, Q, QT_y): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X.X^T (n_samples <= n_features). + """ + w = 1.0 / (eigvals + alpha) + if self.fit_intercept: + # the vector containing the square roots of the sample weights (1 + # when no sample weights) is the eigenvector of XX^T which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight). + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, Q) + w[intercept_dim] = 0 # cancel regularization for the intercept + + c = np.dot(Q, self._diag_dot(w, QT_y)) + G_inverse_diag = self._decomp_diag(w, Q) + # handle case where y is 2-d + if len(y.shape) != 1: + G_inverse_diag = G_inverse_diag[:, np.newaxis] + return G_inverse_diag, c + + def _eigen_decompose_covariance(self, X, y, sqrt_sw): + """Eigendecomposition of X^T.X, used when n_samples > n_features + and X is sparse. + """ + n_samples, n_features = X.shape + cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype) + cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw) + if not self.fit_intercept: + cov = cov[:-1, :-1] + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. + # by centering, it is orthogonal to the other columns + # when all samples have the same weight we add a column of 1 + else: + cov[-1] = 0 + cov[:, -1] = 0 + cov[-1, -1] = sqrt_sw.dot(sqrt_sw) + nullspace_dim = max(0, n_features - n_samples) + eigvals, V = linalg.eigh(cov) + # remove eigenvalues and vectors in the null space of X^T.X + eigvals = eigvals[nullspace_dim:] + V = V[:, nullspace_dim:] + return X_mean, eigvals, V, X + + def _solve_eigen_covariance_no_intercept( + self, alpha, y, sqrt_sw, X_mean, eigvals, V, X + ): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X^T.X + (n_samples > n_features and X is sparse), and not fitting an intercept. 
+ """ + w = 1 / (eigvals + alpha) + A = (V * w).dot(V.T) + AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) + y_hat = safe_sparse_dot(X, AXy, dense_output=True) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance_intercept( + self, alpha, y, sqrt_sw, X_mean, eigvals, V, X + ): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X^T.X + (n_samples > n_features and X is sparse), + and we are fitting an intercept. + """ + # the vector [0, 0, ..., 0, 1] + # is the eigenvector of X^TX which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight), e.g. n when uniform sample weights. + intercept_sv = np.zeros(V.shape[0]) + intercept_sv[-1] = 1 + intercept_dim = _find_smallest_angle(intercept_sv, V) + w = 1 / (eigvals + alpha) + w[intercept_dim] = 1 / eigvals[intercept_dim] + A = (V * w).dot(V.T) + # add a column to X containing the square roots of sample weights + X_op = _X_CenterStackOp(X, X_mean, sqrt_sw) + AXy = A.dot(X_op.T.dot(y)) + y_hat = X_op.dot(AXy) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + # return (1 - hat_diag), (y - y_hat) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance(self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have a decomposition of X^T.X + (n_samples > n_features and X is sparse). + """ + if self.fit_intercept: + return self._solve_eigen_covariance_intercept( + alpha, y, sqrt_sw, X_mean, eigvals, V, X + ) + return self._solve_eigen_covariance_no_intercept( + alpha, y, sqrt_sw, X_mean, eigvals, V, X + ) + + def _svd_decompose_design_matrix(self, X, y, sqrt_sw): + # X already centered + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + if self.fit_intercept: + # to emulate fit_intercept=True situation, add a column + # containing the square roots of the sample weights + # by centering, the other columns are orthogonal to that one + intercept_column = sqrt_sw[:, None] + X = np.hstack((X, intercept_column)) + U, singvals, _ = linalg.svd(X, full_matrices=0) + singvals_sq = singvals**2 + UT_y = np.dot(U.T, y) + return X_mean, singvals_sq, U, UT_y + + def _solve_svd_design_matrix(self, alpha, y, sqrt_sw, X_mean, singvals_sq, U, UT_y): + """Compute dual coefficients and diagonal of G^-1. + + Used when we have an SVD decomposition of X + (n_samples > n_features and X is dense). + """ + w = ((singvals_sq + alpha) ** -1) - (alpha**-1) + if self.fit_intercept: + # detect intercept column + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, U) + # cancel the regularization for the intercept + w[intercept_dim] = -(alpha**-1) + c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha**-1) * y + G_inverse_diag = self._decomp_diag(w, U) + (alpha**-1) + if len(y.shape) != 1: + # handle case where y is 2-d + G_inverse_diag = G_inverse_diag[:, np.newaxis] + return G_inverse_diag, c + + def fit(self, X, y, sample_weight=None, score_params=None): + """Fit Ridge regression model with gcv. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Training data. 
Will be cast to float64 if necessary. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to float64 if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. Note that the scale of `sample_weight` + has an impact on the loss; i.e. multiplying all weights by `k` + is equivalent to setting `alpha / k`. + + score_params : dict, default=None + Parameters to be passed to the underlying scorer. + + .. versionadded:: 1.5 + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float64], + multi_output=True, + y_numeric=True, + ) + + # alpha_per_target cannot be used in classifier mode. All subclasses + # of _RidgeGCV that are classifiers keep alpha_per_target at its + # default value: False, so the condition below should never happen. + assert not (self.is_clf and self.alpha_per_target) + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + self.alphas = np.asarray(self.alphas) + + unscaled_y = y + X, y, X_offset, y_offset, X_scale = _preprocess_data( + X, + y, + fit_intercept=self.fit_intercept, + copy=self.copy_X, + sample_weight=sample_weight, + ) + + gcv_mode = _check_gcv_mode(X, self.gcv_mode) + + if gcv_mode == "eigen": + decompose = self._eigen_decompose_gram + solve = self._solve_eigen_gram + elif gcv_mode == "svd": + if sparse.issparse(X): + decompose = self._eigen_decompose_covariance + solve = self._solve_eigen_covariance + else: + decompose = self._svd_decompose_design_matrix + solve = self._solve_svd_design_matrix + + n_samples = X.shape[0] + + if sample_weight is not None: + X, y, sqrt_sw = _rescale_data(X, y, sample_weight) + else: + sqrt_sw = np.ones(n_samples, dtype=X.dtype) + + X_mean, *decomposition = decompose(X, y, sqrt_sw) + + n_y = 1 if len(y.shape) == 1 else y.shape[1] + n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas) + + if self.store_cv_results: + self.cv_results_ = np.empty((n_samples * n_y, n_alphas), dtype=X.dtype) + + best_coef, best_score, best_alpha = None, None, None + + for i, alpha in enumerate(np.atleast_1d(self.alphas)): + G_inverse_diag, c = solve(float(alpha), y, sqrt_sw, X_mean, *decomposition) + if self.scoring is None: + squared_errors = (c / G_inverse_diag) ** 2 + alpha_score = self._score_without_scorer(squared_errors=squared_errors) + if self.store_cv_results: + self.cv_results_[:, i] = squared_errors.ravel() + else: + predictions = y - (c / G_inverse_diag) + # Rescale predictions back to original scale + if sample_weight is not None: # avoid the unnecessary division by ones + if predictions.ndim > 1: + predictions /= sqrt_sw[:, None] + else: + predictions /= sqrt_sw + predictions += y_offset + + if self.store_cv_results: + self.cv_results_[:, i] = predictions.ravel() + + score_params = score_params or {} + alpha_score = self._score( + predictions=predictions, + y=unscaled_y, + n_y=n_y, + scorer=self.scoring, + score_params=score_params, + ) + + # Keep track of the best model + if best_score is None: + # initialize + if self.alpha_per_target and n_y > 1: + best_coef = c + best_score = np.atleast_1d(alpha_score) + best_alpha = np.full(n_y, alpha) + else: + best_coef = c + best_score = alpha_score + best_alpha = alpha + else: + # update + if self.alpha_per_target and n_y > 1: + 
to_update = alpha_score > best_score + best_coef[:, to_update] = c[:, to_update] + best_score[to_update] = alpha_score[to_update] + best_alpha[to_update] = alpha + elif alpha_score > best_score: + best_coef, best_score, best_alpha = c, alpha_score, alpha + + self.alpha_ = best_alpha + self.best_score_ = best_score + self.dual_coef_ = best_coef + self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) + if y.ndim == 1 or y.shape[1] == 1: + self.coef_ = self.coef_.ravel() + + if sparse.issparse(X): + X_offset = X_mean * X_scale + else: + X_offset += X_mean * X_scale + self._set_intercept(X_offset, y_offset, X_scale) + + if self.store_cv_results: + if len(y.shape) == 1: + cv_results_shape = n_samples, n_alphas + else: + cv_results_shape = n_samples, n_y, n_alphas + self.cv_results_ = self.cv_results_.reshape(cv_results_shape) + + return self + + def _score_without_scorer(self, squared_errors): + """Performs scoring using squared errors when the scorer is None.""" + if self.alpha_per_target: + _score = -squared_errors.mean(axis=0) + else: + _score = -squared_errors.mean() + + return _score + + def _score(self, *, predictions, y, n_y, scorer, score_params): + """Performs scoring with the specified scorer using the + predictions and the true y values. + """ + if self.is_clf: + identity_estimator = _IdentityClassifier(classes=np.arange(n_y)) + _score = scorer( + identity_estimator, + predictions, + y.argmax(axis=1), + **score_params, + ) + else: + identity_estimator = _IdentityRegressor() + if self.alpha_per_target: + _score = np.array( + [ + scorer( + identity_estimator, + predictions[:, j], + y[:, j], + **score_params, + ) + for j in range(n_y) + ] + ) + else: + _score = scorer(identity_estimator, predictions, y, **score_params) + + return _score + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Required since this is neither a RegressorMixin nor a ClassifierMixin + tags.target_tags.required = True + return tags + + +class _BaseRidgeCV(LinearModel): + _parameter_constraints: dict = { + "alphas": ["array-like", Interval(Real, 0, None, closed="neither")], + "fit_intercept": ["boolean"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "cv": ["cv_object"], + "gcv_mode": [StrOptions({"auto", "svd", "eigen"}), None], + "store_cv_results": ["boolean"], + "alpha_per_target": ["boolean"], + } + + def __init__( + self, + alphas=(0.1, 1.0, 10.0), + *, + fit_intercept=True, + scoring=None, + cv=None, + gcv_mode=None, + store_cv_results=False, + alpha_per_target=False, + ): + self.alphas = alphas + self.fit_intercept = fit_intercept + self.scoring = scoring + self.cv = cv + self.gcv_mode = gcv_mode + self.store_cv_results = store_cv_results + self.alpha_per_target = alpha_per_target + + def fit(self, X, y, sample_weight=None, **params): + """Fit Ridge regression model with cv. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. If using GCV, will be cast to float64 + if necessary. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + **params : dict, default=None + Extra parameters for the underlying scorer. + + .. versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. 
+ See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + When sample_weight is provided, the selected hyperparameter may depend + on whether we use leave-one-out cross-validation (cv=None) + or another form of cross-validation, because only leave-one-out + cross-validation takes the sample weights into account when computing + the validation score. + """ + _raise_for_params(params, self, "fit") + cv = self.cv + scorer = self._get_scorer() + + # `_RidgeGCV` does not work for alpha = 0 + if cv is None: + check_scalar_alpha = partial( + check_scalar, + target_type=numbers.Real, + min_val=0.0, + include_boundaries="neither", + ) + else: + check_scalar_alpha = partial( + check_scalar, + target_type=numbers.Real, + min_val=0.0, + include_boundaries="left", + ) + + if isinstance(self.alphas, (np.ndarray, list, tuple)): + n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas) + if n_alphas != 1: + for index, alpha in enumerate(self.alphas): + alpha = check_scalar_alpha(alpha, f"alphas[{index}]") + else: + self.alphas[0] = check_scalar_alpha(self.alphas[0], "alphas") + alphas = np.asarray(self.alphas) + + if sample_weight is not None: + params["sample_weight"] = sample_weight + + if cv is None: + if _routing_enabled(): + routed_params = process_routing( + self, + "fit", + **params, + ) + else: + routed_params = Bunch(scorer=Bunch(score={})) + if sample_weight is not None: + routed_params.scorer.score["sample_weight"] = sample_weight + + # reset `scorer` variable to original user-intend if no scoring is passed + if self.scoring is None: + scorer = None + + estimator = _RidgeGCV( + alphas, + fit_intercept=self.fit_intercept, + scoring=scorer, + gcv_mode=self.gcv_mode, + store_cv_results=self.store_cv_results, + is_clf=is_classifier(self), + alpha_per_target=self.alpha_per_target, + ) + estimator.fit( + X, + y, + sample_weight=sample_weight, + score_params=routed_params.scorer.score, + ) + self.alpha_ = estimator.alpha_ + self.best_score_ = estimator.best_score_ + if self.store_cv_results: + self.cv_results_ = estimator.cv_results_ + else: + if self.store_cv_results: + raise ValueError("cv!=None and store_cv_results=True are incompatible") + if self.alpha_per_target: + raise ValueError("cv!=None and alpha_per_target=True are incompatible") + + parameters = {"alpha": alphas} + solver = "sparse_cg" if sparse.issparse(X) else "auto" + model = RidgeClassifier if is_classifier(self) else Ridge + estimator = model( + fit_intercept=self.fit_intercept, + solver=solver, + ) + if _routing_enabled(): + estimator.set_fit_request(sample_weight=True) + + grid_search = GridSearchCV( + estimator, + parameters, + cv=cv, + scoring=scorer, + ) + + grid_search.fit(X, y, **params) + estimator = grid_search.best_estimator_ + self.alpha_ = grid_search.best_estimator_.alpha + self.best_score_ = grid_search.best_score_ + + self.coef_ = estimator.coef_ + self.intercept_ = estimator.intercept_ + self.n_features_in_ = estimator.n_features_in_ + if hasattr(estimator, "feature_names_in_"): + self.feature_names_in_ = estimator.feature_names_in_ + + return self + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
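When `cv` is not None, the branch above reduces alpha selection to a grid search over `Ridge` (or `RidgeClassifier`). Ignoring the extra bookkeeping (sparse-aware solver choice, copying back `coef_` and `intercept_`), the selection is roughly equivalent to this sketch::

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import Ridge, RidgeCV
    from sklearn.model_selection import GridSearchCV

    X, y = load_diabetes(return_X_y=True)
    alphas = [0.01, 0.1, 1.0, 10.0]

    reg = RidgeCV(alphas=alphas, cv=5).fit(X, y)
    grid = GridSearchCV(Ridge(), {"alpha": alphas}, cv=5).fit(X, y)

    print(reg.alpha_, grid.best_params_["alpha"])   # expected to agree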
+ """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + scorer=self.scoring, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + ) + return router + + def _get_scorer(self): + scorer = check_scoring(estimator=self, scoring=self.scoring, allow_none=True) + if _routing_enabled() and self.scoring is None: + # This estimator passes an array of 1s as sample_weight even if + # sample_weight is not provided by the user. Therefore we need to + # always request it. But we don't set it if it's passed explicitly + # by the user. + scorer.set_score_request(sample_weight=True) + return scorer + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): + """Ridge regression with built-in cross-validation. + + See glossary entry for :term:`cross-validation estimator`. + + By default, it performs efficient Leave-One-Out Cross-Validation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0) + Array of alpha values to try. + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + If using Leave-One-Out cross-validation, alphas must be strictly positive. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + scoring : str, callable, default=None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: negative :ref:`mean squared error ` if cv is + None (i.e. when using leave-one-out cross-validation), or + :ref:`coefficient of determination ` (:math:`R^2`) otherwise. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the efficient Leave-One-Out cross-validation + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if ``y`` is binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + gcv_mode : {'auto', 'svd', 'eigen'}, default='auto' + Flag indicating which strategy to use when performing + Leave-One-Out Cross-Validation. Options are:: + + 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen' + 'svd' : force use of singular value decomposition of X when X is + dense, eigenvalue decomposition of X^T.X when X is sparse. 
+ 'eigen' : force computation via eigendecomposition of X.X^T + + The 'auto' mode is the default and is intended to pick the cheaper + option of the two depending on the shape of the training data. + + store_cv_results : bool, default=False + Flag indicating if the cross-validation values corresponding to + each alpha should be stored in the ``cv_results_`` attribute (see + below). This flag is only compatible with ``cv=None`` (i.e. using + Leave-One-Out Cross-Validation). + + .. versionchanged:: 1.5 + Parameter name changed from `store_cv_values` to `store_cv_results`. + + alpha_per_target : bool, default=False + Flag indicating whether to optimize the alpha value (picked from the + `alphas` parameter list) for each target separately (for multi-output + settings: multiple prediction targets). When set to `True`, after + fitting, the `alpha_` attribute will contain a value for each target. + When set to `False`, a single alpha is used for all targets. + + .. versionadded:: 0.24 + + Attributes + ---------- + cv_results_ : ndarray of shape (n_samples, n_alphas) or \ + shape (n_samples, n_targets, n_alphas), optional + Cross-validation values for each alpha (only available if + ``store_cv_results=True`` and ``cv=None``). After ``fit()`` has been + called, this attribute will contain the mean squared errors if + `scoring is None` otherwise it will contain standardized per point + prediction values. + + .. versionchanged:: 1.5 + `cv_values_` changed to `cv_results_`. + + coef_ : ndarray of shape (n_features) or (n_targets, n_features) + Weight vector(s). + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + alpha_ : float or ndarray of shape (n_targets,) + Estimated regularization parameter, or, if ``alpha_per_target=True``, + the estimated regularization parameter for each target. + + best_score_ : float or ndarray of shape (n_targets,) + Score of base estimator with best alpha, or, if + ``alpha_per_target=True``, a score for each target. + + .. versionadded:: 0.23 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Ridge : Ridge regression. + RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels. + RidgeClassifierCV : Ridge classifier with built-in cross validation. + + Examples + -------- + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import RidgeCV + >>> X, y = load_diabetes(return_X_y=True) + >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) + >>> clf.score(X, y) + 0.5166... + """ + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit Ridge regression model with cv. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. If using GCV, will be cast to float64 + if necessary. + + y : ndarray of shape (n_samples,) or (n_samples, n_targets) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + **params : dict, default=None + Parameters to be passed to the underlying scorer. + + .. 
versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + When sample_weight is provided, the selected hyperparameter may depend + on whether we use leave-one-out cross-validation (cv=None) + or another form of cross-validation, because only leave-one-out + cross-validation takes the sample weights into account when computing + the validation score. + """ + super().fit(X, y, sample_weight=sample_weight, **params) + return self + + +class RidgeClassifierCV(_RidgeClassifierMixin, _BaseRidgeCV): + """Ridge classifier with built-in cross-validation. + + See glossary entry for :term:`cross-validation estimator`. + + By default, it performs Leave-One-Out Cross-Validation. Currently, + only the n_features > n_samples case is handled efficiently. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0) + Array of alpha values to try. + Regularization strength; must be a positive float. Regularization + improves the conditioning of the problem and reduces the variance of + the estimates. Larger values specify stronger regularization. + Alpha corresponds to ``1 / (2C)`` in other linear models such as + :class:`~sklearn.linear_model.LogisticRegression` or + :class:`~sklearn.svm.LinearSVC`. + If using Leave-One-Out cross-validation, alphas must be strictly positive. + + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations + (i.e. data is expected to be centered). + + scoring : str, callable, default=None + The scoring method to use for cross-validation. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: negative :ref:`mean squared error ` if cv is + None (i.e. when using leave-one-out cross-validation), or + :ref:`accuracy ` otherwise. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the efficient Leave-One-Out cross-validation + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + store_cv_results : bool, default=False + Flag indicating if the cross-validation results corresponding to + each alpha should be stored in the ``cv_results_`` attribute (see + below). This flag is only compatible with ``cv=None`` (i.e. using + Leave-One-Out Cross-Validation). + + .. versionchanged:: 1.5 + Parameter name changed from `store_cv_values` to `store_cv_results`. 
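As with `RidgeCV`, the stored values let the alpha selection be reproduced by hand. A small sketch using the regression variant for brevity: when `scoring is None` and `cv=None`, `cv_results_` holds the squared leave-one-out errors and `alpha_` minimizes their mean::

    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV

    X, y = load_diabetes(return_X_y=True)
    alphas = np.array([0.01, 0.1, 1.0, 10.0])
    reg = RidgeCV(alphas=alphas, store_cv_results=True).fit(X, y)

    print(reg.cv_results_.shape)                          # (n_samples, n_alphas)
    print(alphas[reg.cv_results_.mean(axis=0).argmin()])  # equals reg.alpha_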
+ + Attributes + ---------- + cv_results_ : ndarray of shape (n_samples, n_targets, n_alphas), optional + Cross-validation results for each alpha (only if ``store_cv_results=True`` and + ``cv=None``). After ``fit()`` has been called, this attribute will + contain the mean squared errors if `scoring is None` otherwise it + will contain standardized per point prediction values. + + .. versionchanged:: 1.5 + `cv_values_` changed to `cv_results_`. + + coef_ : ndarray of shape (1, n_features) or (n_targets, n_features) + Coefficient of the features in the decision function. + + ``coef_`` is of shape (1, n_features) when the given problem is binary. + + intercept_ : float or ndarray of shape (n_targets,) + Independent term in decision function. Set to 0.0 if + ``fit_intercept = False``. + + alpha_ : float + Estimated regularization parameter. + + best_score_ : float + Score of base estimator with best alpha. + + .. versionadded:: 0.23 + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Ridge : Ridge regression. + RidgeClassifier : Ridge classifier. + RidgeCV : Ridge regression with built-in cross validation. + + Notes + ----- + For multi-class classification, n_class classifiers are trained in + a one-versus-all approach. Concretely, this is implemented by taking + advantage of the multi-variate response support in Ridge. + + Examples + -------- + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import RidgeClassifierCV + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) + >>> clf.score(X, y) + 0.9630... + """ + + _parameter_constraints: dict = { + **_BaseRidgeCV._parameter_constraints, + "class_weight": [dict, StrOptions({"balanced"}), None], + } + for param in ("gcv_mode", "alpha_per_target"): + _parameter_constraints.pop(param) + + def __init__( + self, + alphas=(0.1, 1.0, 10.0), + *, + fit_intercept=True, + scoring=None, + cv=None, + class_weight=None, + store_cv_results=False, + ): + super().__init__( + alphas=alphas, + fit_intercept=fit_intercept, + scoring=scoring, + cv=cv, + store_cv_results=store_cv_results, + ) + self.class_weight = class_weight + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None, **params): + """Fit Ridge classifier with cv. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples + and `n_features` is the number of features. When using GCV, + will be cast to float64 if necessary. + + y : ndarray of shape (n_samples,) + Target values. Will be cast to X's dtype if necessary. + + sample_weight : float or ndarray of shape (n_samples,), default=None + Individual weights for each sample. If given a float, every sample + will have the same weight. + + **params : dict, default=None + Parameters to be passed to the underlying scorer. + + .. versionadded:: 1.5 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. 
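The one-versus-all reduction described in the Notes above can be reproduced by hand: binarize the labels to {-1, 1}, fit a multi-output `Ridge`, and take the argmax of the per-class decision values. A small sketch (illustrative, not the class's internal code path)::

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.linear_model import Ridge, RidgeClassifier
    from sklearn.preprocessing import LabelBinarizer

    X, y = load_iris(return_X_y=True)

    lb = LabelBinarizer(neg_label=-1, pos_label=1)
    Y = lb.fit_transform(y)                          # shape (n_samples, n_classes)
    multi_ridge = Ridge(alpha=1.0).fit(X, Y)
    clf = RidgeClassifier(alpha=1.0).fit(X, y)

    manual = lb.classes_[np.argmax(multi_ridge.predict(X), axis=1)]
    print(np.array_equal(manual, clf.predict(X)))    # expected True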
+ """ + # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support + # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept + # all sparse format. + X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, solver="eigen") + + # If cv is None, gcv mode will be used and we used the binarized Y + # since y will not be binarized in _RidgeGCV estimator. + # If cv is not None, a GridSearchCV with some RidgeClassifier + # estimators are used where y will be binarized. Thus, we pass y + # instead of the binarized Y. + target = Y if self.cv is None else y + super().fit(X, target, sample_weight=sample_weight, **params) + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py new file mode 100644 index 0000000000000000000000000000000000000000..12e5d049b0b1f88b17405f5633d6d7371a3cca83 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py @@ -0,0 +1,370 @@ +"""Solvers for Ridge and LogisticRegression using SAG algorithm""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..exceptions import ConvergenceWarning +from ..utils import check_array +from ..utils.extmath import row_norms +from ..utils.validation import _check_sample_weight +from ._base import make_dataset +from ._sag_fast import sag32, sag64 + + +def get_auto_step_size( + max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False +): + """Compute automatic step size for SAG solver. + + The step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is + the max sum of squares for over all samples. + + Parameters + ---------- + max_squared_sum : float + Maximum squared sum of X over samples. + + alpha_scaled : float + Constant that multiplies the regularization term, scaled by + 1. / n_samples, the number of samples. + + loss : {'log', 'squared', 'multinomial'} + The loss function used in SAG solver. + + fit_intercept : bool + Specifies if a constant (a.k.a. bias or intercept) will be + added to the decision function. + + n_samples : int, default=None + Number of rows in X. Useful if is_saga=True. + + is_saga : bool, default=False + Whether to return step size for the SAGA algorithm or the SAG + algorithm. + + Returns + ------- + step_size : float + Step size used in SAG solver. + + References + ---------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + + :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014). + "SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + """ + if loss in ("log", "multinomial"): + L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled + elif loss == "squared": + # inverse Lipschitz constant for squared loss + L = max_squared_sum + int(fit_intercept) + alpha_scaled + else: + raise ValueError( + "Unknown loss function for SAG solver, got %s instead of 'log' or 'squared'" + % loss + ) + if is_saga: + # SAGA theoretical step size is 1/3L or 1 / (2 * (L + mu n)) + # See Defazio et al. 
2014 + mun = min(2 * n_samples * alpha_scaled, L) + step = 1.0 / (2 * L + mun) + else: + # SAG theoretical step size is 1/16L but it is recommended to use 1 / L + # see http://www.birs.ca//workshops//2014/14w5003/files/schmidt.pdf, + # slide 65 + step = 1.0 / L + return step + + +def sag_solver( + X, + y, + sample_weight=None, + loss="log", + alpha=1.0, + beta=0.0, + max_iter=1000, + tol=0.001, + verbose=0, + random_state=None, + check_input=True, + max_squared_sum=None, + warm_start_mem=None, + is_saga=False, +): + """SAG solver for Ridge and LogisticRegression. + + SAG stands for Stochastic Average Gradient: the gradient of the loss is + estimated each sample at a time and the model is updated along the way with + a constant learning rate. + + IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the + same scale. You can normalize the data by using + sklearn.preprocessing.StandardScaler on your data before passing it to the + fit method. + + This implementation works with data represented as dense numpy arrays or + sparse scipy arrays of floating point values for the features. It will + fit the data according to squared loss or log loss. + + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using the squared euclidean norm L2. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. With loss='multinomial', y must be label encoded + (see preprocessing.LabelEncoder). For loss='log' it must be in [0, 1]. + + sample_weight : array-like of shape (n_samples,), default=None + Weights applied to individual samples (1. for unweighted). + + loss : {'log', 'squared', 'multinomial'}, default='log' + Loss function that will be optimized: + -'log' is the binary logistic loss, as used in LogisticRegression. + -'squared' is the squared loss, as used in Ridge. + -'multinomial' is the multinomial logistic loss, as used in + LogisticRegression. + + .. versionadded:: 0.18 + *loss='multinomial'* + + alpha : float, default=1. + L2 regularization term in the objective function + ``(0.5 * alpha * || W ||_F^2)``. + + beta : float, default=0. + L1 regularization term in the objective function + ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True. + + max_iter : int, default=1000 + The max number of passes over the training data if the stopping + criteria is not reached. + + tol : float, default=0.001 + The stopping criteria for the weights. The iterations will stop when + max(change in weights) / max(weights) < tol. + + verbose : int, default=0 + The verbosity level. + + random_state : int, RandomState instance or None, default=None + Used when shuffling the data. Pass an int for reproducible output + across multiple function calls. + See :term:`Glossary `. + + check_input : bool, default=True + If False, the input arrays X and y will not be checked. + + max_squared_sum : float, default=None + Maximum squared sum of X over samples. If None, it will be computed, + going through all the samples. The value should be precomputed + to speed up cross validation. + + warm_start_mem : dict, default=None + The initialization parameters used for warm starting. Warm starting is + currently used in LogisticRegression but not in Ridge. + It contains: + - 'coef': the weight vector, with the intercept in last line + if the intercept is fitted. 
+ - 'gradient_memory': the scalar gradient for all seen samples. + - 'sum_gradient': the sum of gradient over all seen samples, + for each feature. + - 'intercept_sum_gradient': the sum of gradient over all seen + samples, for the intercept. + - 'seen': array of boolean describing the seen samples. + - 'num_seen': the number of seen samples. + + is_saga : bool, default=False + Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves + better in the first epochs, and allow for l1 regularisation. + + Returns + ------- + coef_ : ndarray of shape (n_features,) + Weight vector. + + n_iter_ : int + The number of full pass on all samples. + + warm_start_mem : dict + Contains a 'coef' key with the fitted result, and possibly the + fitted intercept at the end of the array. Contains also other keys + used for warm starting. + + Examples + -------- + >>> import numpy as np + >>> from sklearn import linear_model + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> X = rng.randn(n_samples, n_features) + >>> y = rng.randn(n_samples) + >>> clf = linear_model.Ridge(solver='sag') + >>> clf.fit(X, y) + Ridge(solver='sag') + + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> clf = linear_model.LogisticRegression(solver='sag') + >>> clf.fit(X, y) + LogisticRegression(solver='sag') + + References + ---------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + + :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014). + "SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + + See Also + -------- + Ridge, SGDRegressor, ElasticNet, Lasso, SVR, + LogisticRegression, SGDClassifier, LinearSVC, Perceptron + """ + if warm_start_mem is None: + warm_start_mem = {} + # Ridge default max_iter is None + if max_iter is None: + max_iter = 1000 + + if check_input: + _dtype = [np.float64, np.float32] + X = check_array(X, dtype=_dtype, accept_sparse="csr", order="C") + y = check_array(y, dtype=_dtype, ensure_2d=False, order="C") + + n_samples, n_features = X.shape[0], X.shape[1] + # As in SGD, the alpha is scaled by n_samples. + alpha_scaled = float(alpha) / n_samples + beta_scaled = float(beta) / n_samples + + # if loss == 'multinomial', y should be label encoded. + n_classes = int(y.max()) + 1 if loss == "multinomial" else 1 + + # initialization + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if "coef" in warm_start_mem.keys(): + coef_init = warm_start_mem["coef"] + else: + # assume fit_intercept is False + coef_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C") + + # coef_init contains possibly the intercept_init at the end. + # Note that Ridge centers the data before fitting, so fit_intercept=False. 
+ fit_intercept = coef_init.shape[0] == (n_features + 1) + if fit_intercept: + intercept_init = coef_init[-1, :] + coef_init = coef_init[:-1, :] + else: + intercept_init = np.zeros(n_classes, dtype=X.dtype) + + if "intercept_sum_gradient" in warm_start_mem.keys(): + intercept_sum_gradient = warm_start_mem["intercept_sum_gradient"] + else: + intercept_sum_gradient = np.zeros(n_classes, dtype=X.dtype) + + if "gradient_memory" in warm_start_mem.keys(): + gradient_memory_init = warm_start_mem["gradient_memory"] + else: + gradient_memory_init = np.zeros( + (n_samples, n_classes), dtype=X.dtype, order="C" + ) + if "sum_gradient" in warm_start_mem.keys(): + sum_gradient_init = warm_start_mem["sum_gradient"] + else: + sum_gradient_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C") + + if "seen" in warm_start_mem.keys(): + seen_init = warm_start_mem["seen"] + else: + seen_init = np.zeros(n_samples, dtype=np.int32, order="C") + + if "num_seen" in warm_start_mem.keys(): + num_seen_init = warm_start_mem["num_seen"] + else: + num_seen_init = 0 + + dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state) + + if max_squared_sum is None: + max_squared_sum = row_norms(X, squared=True).max() + step_size = get_auto_step_size( + max_squared_sum, + alpha_scaled, + loss, + fit_intercept, + n_samples=n_samples, + is_saga=is_saga, + ) + if step_size * alpha_scaled == 1: + raise ZeroDivisionError( + "Current sag implementation does not handle " + "the case step_size * alpha_scaled == 1" + ) + + sag = sag64 if X.dtype == np.float64 else sag32 + num_seen, n_iter_ = sag( + dataset, + coef_init, + intercept_init, + n_samples, + n_features, + n_classes, + tol, + max_iter, + loss, + step_size, + alpha_scaled, + beta_scaled, + sum_gradient_init, + gradient_memory_init, + seen_init, + num_seen_init, + fit_intercept, + intercept_sum_gradient, + intercept_decay, + is_saga, + verbose, + ) + + if n_iter_ == max_iter: + warnings.warn( + "The max_iter was reached which means the coef_ did not converge", + ConvergenceWarning, + ) + + if fit_intercept: + coef_init = np.vstack((coef_init, intercept_init)) + + warm_start_mem = { + "coef": coef_init, + "sum_gradient": sum_gradient_init, + "intercept_sum_gradient": intercept_sum_gradient, + "gradient_memory": gradient_memory_init, + "seen": seen_init, + "num_seen": num_seen, + } + + if loss == "multinomial": + coef_ = coef_init.T + else: + coef_ = coef_init[:, 0] + + return coef_, n_iter_, warm_start_mem diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag_fast.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag_fast.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..906928673b0b7570cd7d5e819f9dd521539f5233 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag_fast.pyx.tp @@ -0,0 +1,642 @@ +{{py: + +""" + +Template file for easily generate fused types consistent code using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: sag_fast.pyx + +Each class is duplicated for all dtypes (float and double). The keywords +between double braces are substituted during the build. 
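At build time this file is run through Tempita, so everything between the double braces is expanded once per dtype before Cython ever sees it. A toy sketch of that expansion, assuming Cython's bundled Tempita module (as the scikit-learn build helpers are assumed to use)::

    from Cython import Tempita

    toy = Tempita.Template(
        "{{py: dtypes = [('64', 'double'), ('32', 'float')]}}"
        "{{for name_suffix, c_type in dtypes}}\n"
        "cdef {{c_type}} fmax{{name_suffix}}({{c_type}} x, {{c_type}} y)"
        "{{endfor}}\n"
    )
    print(toy.substitute())   # one 'double fmax64' and one 'float fmax32' signature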
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# name_suffix, c_type, np_type +dtypes = [('64', 'double', 'np.float64'), + ('32', 'float', 'np.float32')] + +}} +"""SAG and SAGA implementation""" + +import numpy as np +from libc.math cimport exp, fabs, isfinite, log +from libc.time cimport time, time_t +from libc.stdio cimport printf + +from .._loss._loss cimport ( + CyLossFunction, + CyHalfBinomialLoss, + CyHalfMultinomialLoss, + CyHalfSquaredError, +) +from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef inline {{c_type}} fmax{{name_suffix}}({{c_type}} x, {{c_type}} y) noexcept nogil: + if x > y: + return x + return y + +{{endfor}} + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef inline {{c_type}} _soft_thresholding{{name_suffix}}({{c_type}} x, {{c_type}} shrinkage) noexcept nogil: + return fmax{{name_suffix}}(x - shrinkage, 0) - fmax{{name_suffix}}(- x - shrinkage, 0) + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +def sag{{name_suffix}}( + SequentialDataset{{name_suffix}} dataset, + {{c_type}}[:, ::1] weights_array, + {{c_type}}[::1] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + {{c_type}}[:, ::1] sum_gradient_init, + {{c_type}}[:, ::1] gradient_memory_init, + bint[::1] seen_init, + int num_seen, + bint fit_intercept, + {{c_type}}[::1] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose +): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Some implementation details: + + - Just-in-time (JIT) update: In SAG(A), the average-gradient update is + collinear with the drawn sample X_i. Therefore, if the data is sparse, the + random sample X_i will change the average gradient only on features j where + X_ij != 0. In some cases, the average gradient on feature j might change + only after k random samples with no change. In these cases, instead of + applying k times the same gradient step on feature j, we apply the gradient + step only once, scaled by k. This is called the "just-in-time update", and + it is performed in `lagged_update{{name_suffix}}`. This function also + applies the proximal operator after the gradient step (if L1 regularization + is used in SAGA). + + - Weight scale: In SAG(A), the weights are scaled down at each iteration + due to the L2 regularization. To avoid updating all the weights at each + iteration, the weight scale is factored out in a separate variable `wscale` + which is only used in the JIT update. When this variable is too small, it + is reset for numerical stability using the function + `scale_weights{{name_suffix}}`. This reset requires applying all remaining + JIT updates. This reset is also performed every `n_samples` iterations + before each convergence check, so when the algorithm stops, we are sure + that there is no remaining JIT updates. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014). 
+ "SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives" <1407.0202>` + """ + # the data pointer for x, the current sample + cdef {{c_type}} *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for current sample + cdef {{c_type}} y + # the sample weight + cdef {{c_type}} sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef {{c_type}} max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef {{c_type}} max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef {{c_type}} wscale_update = 1.0 - step_size * alpha + + # helper for cumulative sum + cdef {{c_type}} cum_sum + + # the pointer to the coef_ or weights + cdef {{c_type}}* weights = &weights_array[0, 0] + + # the sum of gradients for each feature + cdef {{c_type}}* sum_gradient = &sum_gradient_init[0, 0] + + # the previously seen gradient for each sample + cdef {{c_type}}* gradient_memory = &gradient_memory_init[0, 0] + + # the cumulative sums needed for JIT params + cdef {{c_type}}[::1] cumulative_sums = np.empty(n_samples, dtype={{np_type}}, order="c") + + # the index for the last time this feature was updated + cdef int[::1] feature_hist = np.zeros(n_features, dtype=np.int32, order="c") + + # the previous weights to use to compute stopping criteria + cdef {{c_type}}[:, ::1] previous_weights_array = np.zeros((n_features, n_classes), dtype={{np_type}}, order="c") + cdef {{c_type}}* previous_weights = &previous_weights_array[0, 0] + + cdef {{c_type}}[::1] prediction = np.zeros(n_classes, dtype={{np_type}}, order="c") + + cdef {{c_type}}[::1] gradient = np.zeros(n_classes, dtype={{np_type}}, order="c") + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef {{c_type}} val + + # Bias correction term in saga + cdef {{c_type}} gradient_correction + + # the scalar used for multiplying z + cdef {{c_type}} wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef {{c_type}}[::1] cumulative_sums_prox + cdef {{c_type}}* cumulative_sums_prox_ptr + + cdef bint prox = beta > 0 and saga + + # Loss function to optimize + cdef CyLossFunction loss + # Whether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef CyHalfMultinomialLoss multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = CyHalfMultinomialLoss() + elif loss_function == "log": + loss = CyHalfBinomialLoss() + elif loss_function == "squared": + loss = CyHalfSquaredError() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox = 
np.empty(n_samples, dtype={{np_type}}, order="c") + cumulative_sums_prox_ptr = &cumulative_sums_prox[0] + else: + cumulative_sums_prox = None + cumulative_sums_prox_ptr = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen_init[sample_ind] == 0: + num_seen += 1 + seen_init[sample_ind] = 1 + + # make the weight updates (just-in-time gradient step, and prox operator) + if sample_itr > 0: + status = lagged_update{{name_suffix}}( + weights=weights, + wscale=wscale, + xnnz=xnnz, + n_samples=n_samples, + n_classes=n_classes, + sample_itr=sample_itr, + cumulative_sums=&cumulative_sums[0], + cumulative_sums_prox=cumulative_sums_prox_ptr, + feature_hist=&feature_hist[0], + prox=prox, + sum_gradient=sum_gradient, + x_ind_ptr=x_ind_ptr, + reset=False, + n_iter=n_iter + ) + if status == -1: + break + + # find the current prediction + predict_sample{{name_suffix}}( + x_data_ptr=x_data_ptr, + x_ind_ptr=x_ind_ptr, + xnnz=xnnz, + w_data_ptr=weights, + wscale=wscale, + intercept=&intercept_array[0], + prediction=&prediction[0], + n_classes=n_classes + ) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss.cy_gradient( + y_true=y, + raw_prediction=prediction, + sample_weight=sample_weight, + gradient_out=gradient, + ) + else: + gradient[0] = loss.cy_gradient(y, prediction[0]) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + # Note that this is not the main gradient step, + # which is performed just-in-time in lagged_update. + # This part is done outside the JIT update + # as it does not depend on the average gradient. + # The prox operator is applied after the JIT update + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. / num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient_init[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. 
/ num_seen) + if saga: + intercept_array[class_ind] -= \ + (step_size * intercept_sum_gradient_init[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept_array[class_ind] -= \ + (step_size * intercept_sum_gradient_init[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not isfinite(intercept_array[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + # This also resets the just-in-time update system. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights{{name_suffix}}( + weights=weights, + wscale=&wscale, + n_features=n_features, + n_samples=n_samples, + n_classes=n_classes, + sample_itr=sample_itr, + cumulative_sums=&cumulative_sums[0], + cumulative_sums_prox=cumulative_sums_prox_ptr, + feature_hist=&feature_hist[0], + prox=prox, + sum_gradient=sum_gradient, + n_iter=n_iter + ) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # We scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + # Because this reset is done before every convergence check, we are + # sure there is no remaining lagged update when the algorithm stops. + status = scale_weights{{name_suffix}}( + weights=weights, + wscale=&wscale, + n_features=n_features, + n_samples=n_samples, + n_classes=n_classes, + sample_itr=n_samples - 1, + cumulative_sums=&cumulative_sums[0], + cumulative_sums_prox=cumulative_sums_prox_ptr, + feature_hist=&feature_hist[0], + prox=prox, + sum_gradient=sum_gradient, + n_iter=n_iter + ) + if status == -1: + break + + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax{{name_suffix}}(max_weight, fabs(weights[idx])) + max_change = fmax{{name_suffix}}(max_change, fabs(weights[idx] - previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8g\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef int scale_weights{{name_suffix}}( + {{c_type}}* weights, + {{c_type}}* wscale, + int n_features, + int n_samples, + int n_classes, + int sample_itr, + {{c_type}}* cumulative_sums, + {{c_type}}* cumulative_sums_prox, + int* feature_hist, + bint prox, + {{c_type}}* sum_gradient, + int n_iter +) noexcept nogil: + """Scale the weights and reset wscale to 1.0 for numerical stability, and + reset the just-in-time (JIT) update system. + + See `sag{{name_suffix}}`'s docstring about the JIT update system. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update{{name_suffix}}( + weights, + wscale[0], + n_features, + n_samples, + n_classes, + sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter + ) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef int lagged_update{{name_suffix}}( + {{c_type}}* weights, + {{c_type}} wscale, + int xnnz, + int n_samples, + int n_classes, + int sample_itr, + {{c_type}}* cumulative_sums, + {{c_type}}* cumulative_sums_prox, + int* feature_hist, + bint prox, + {{c_type}}* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter +) noexcept nogil: + """Hard perform the JIT updates for non-zero features of present sample. + + See `sag{{name_suffix}}`'s docstring about the JIT update system. + + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). + """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef {{c_type}} cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not isfinite(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. 
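+                        # _soft_thresholding below is the L1 proximal map:
+                        #   soft(x, s) = sign(x) * max(|x| - s, 0)
+                        # e.g. soft(0.7, 0.2) = 0.5 and soft(-0.1, 0.2) = 0.0,
+                        # so the grouped step first applies the accumulated gradient
+                        # and then shrinks by the accumulated l1 penalty cum_sum_prox.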
+ weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not isfinite(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +{{endfor}} + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef void predict_sample{{name_suffix}}( + {{c_type}}* x_data_ptr, + int* x_ind_ptr, + int xnnz, + {{c_type}}* w_data_ptr, + {{c_type}} wscale, + {{c_type}}* intercept, + {{c_type}}* prediction, + int n_classes +) noexcept nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : {{c_type}} + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. + + """ + cdef int feature_ind, class_ind, j + cdef {{c_type}} innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..45cdf9172d8c455e3ba27f5755337683dd704aad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp @@ -0,0 +1,661 @@ +{{py: + +""" +Template file to easily generate fused types consistent code using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: _sgd_fast.pyx + +Each relevant function is duplicated for the dtypes float and double. +The keywords between double braces are substituted during the build. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# The dtypes are defined as follows (name_suffix, c_type, np_type) +dtypes = [ + ("64", "double", "np.float64"), + ("32", "float", "np.float32"), +] + +}} +"""SGD implementation""" + +import numpy as np +from time import time + +from cython cimport floating +from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY + +from .._loss._loss cimport CyLossFunction +from ..utils._typedefs cimport uint32_t, uint8_t +from ..utils._weight_vector cimport WeightVector32, WeightVector64 +from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 + + +cdef extern from *: + """ + /* Penalty constants */ + #define NO_PENALTY 0 + #define L1 1 + #define L2 2 + #define ELASTICNET 3 + + /* Learning rate constants */ + #define CONSTANT 1 + #define OPTIMAL 2 + #define INVSCALING 3 + #define ADAPTIVE 4 + #define PA1 5 + #define PA2 6 + """ + int NO_PENALTY = 0 + int L1 = 1 + int L2 = 2 + int ELASTICNET = 3 + + int CONSTANT = 1 + int OPTIMAL = 2 + int INVSCALING = 3 + int ADAPTIVE = 4 + int PA1 = 5 + int PA2 = 6 + + +# ---------------------------------------- +# Extension Types for Loss Functions +# ---------------------------------------- + +cdef class Regression(CyLossFunction): + """Base class for loss functions for regression""" + + def py_loss(self, double p, double y): + """Python version of `loss` for testing only. + + Pytest needs a python function and can't use cdef functions. + + Parameters + ---------- + p : double + The prediction, `p = w^T x + intercept`. + y : double + The true value (aka target). + + Returns + ------- + double + The loss evaluated at `p` and `y`. + """ + return self.cy_loss(y, p) + + def py_dloss(self, double p, double y): + """Python version of `dloss` for testing only. + + Pytest needs a python function and can't use cdef functions. + + Parameters + ---------- + p : double + The prediction, `p = w^T x`. + y : double + The true value (aka target). + + Returns + ------- + double + The derivative of the loss function with regards to `p`. + """ + return self.cy_gradient(y, p) + + +cdef class Classification(CyLossFunction): + """Base class for loss functions for classification""" + + def py_loss(self, double p, double y): + """Python version of `loss` for testing only.""" + return self.cy_loss(y, p) + + def py_dloss(self, double p, double y): + """Python version of `dloss` for testing only.""" + return self.cy_gradient(y, p) + + +cdef class ModifiedHuber(Classification): + """Modified Huber loss for binary classification with y in {-1, 1} + + This is equivalent to quadratically smoothed SVM with gamma = 2. + + See T. Zhang 'Solving Large Scale Linear Prediction Problems Using + Stochastic Gradient Descent', ICML'04. + """ + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z >= 1.0: + return 0.0 + elif z >= -1.0: + return (1.0 - z) * (1.0 - z) + else: + return -4.0 * z + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z >= 1.0: + return 0.0 + elif z >= -1.0: + return 2.0 * (1.0 - z) * -y + else: + return -4.0 * y + + def __reduce__(self): + return ModifiedHuber, () + + +cdef class Hinge(Classification): + """Hinge loss for binary classification tasks with y in {-1,1} + + Parameters + ---------- + + threshold : float > 0.0 + Margin threshold. When threshold=1.0, one gets the loss used by SVM. + When threshold=0.0, one gets the loss used by the Perceptron. 
+ """ + + cdef double threshold + + def __init__(self, double threshold=1.0): + self.threshold = threshold + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z <= self.threshold: + return self.threshold - z + return 0.0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z = p * y + if z <= self.threshold: + return -y + return 0.0 + + def __reduce__(self): + return Hinge, (self.threshold,) + + +cdef class SquaredHinge(Classification): + """Squared Hinge loss for binary classification tasks with y in {-1,1} + + Parameters + ---------- + + threshold : float > 0.0 + Margin threshold. When threshold=1.0, one gets the loss used by + (quadratically penalized) SVM. + """ + + cdef double threshold + + def __init__(self, double threshold=1.0): + self.threshold = threshold + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double z = self.threshold - p * y + if z > 0: + return z * z + return 0.0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z = self.threshold - p * y + if z > 0: + return -2 * y * z + return 0.0 + + def __reduce__(self): + return SquaredHinge, (self.threshold,) + + +cdef class EpsilonInsensitive(Regression): + """Epsilon-Insensitive loss (used by SVR). + + loss = max(0, |y - p| - epsilon) + """ + + cdef double epsilon + + def __init__(self, double epsilon): + self.epsilon = epsilon + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double ret = fabs(y - p) - self.epsilon + return ret if ret > 0 else 0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + if y - p > self.epsilon: + return -1 + elif p - y > self.epsilon: + return 1 + else: + return 0 + + def __reduce__(self): + return EpsilonInsensitive, (self.epsilon,) + + +cdef class SquaredEpsilonInsensitive(Regression): + """Epsilon-Insensitive loss. + + loss = max(0, |y - p| - epsilon)^2 + """ + + cdef double epsilon + + def __init__(self, double epsilon): + self.epsilon = epsilon + + cdef double cy_loss(self, double y, double p) noexcept nogil: + cdef double ret = fabs(y - p) - self.epsilon + return ret * ret if ret > 0 else 0 + + cdef double cy_gradient(self, double y, double p) noexcept nogil: + cdef double z + z = y - p + if z > self.epsilon: + return -2 * (z - self.epsilon) + elif z < -self.epsilon: + return 2 * (-z - self.epsilon) + else: + return 0 + + def __reduce__(self): + return SquaredEpsilonInsensitive, (self.epsilon,) + +{{for name_suffix, c_type, np_type in dtypes}} + +def _plain_sgd{{name_suffix}}( + const {{c_type}}[::1] weights, + double intercept, + const {{c_type}}[::1] average_weights, + double average_intercept, + CyLossFunction loss, + int penalty_type, + double alpha, + double C, + double l1_ratio, + SequentialDataset{{name_suffix}} dataset, + const uint8_t[::1] validation_mask, + bint early_stopping, + validation_score_cb, + int n_iter_no_change, + unsigned int max_iter, + double tol, + int fit_intercept, + int verbose, + bint shuffle, + uint32_t seed, + double weight_pos, + double weight_neg, + int learning_rate, + double eta0, + double power_t, + bint one_class, + double t=1.0, + double intercept_decay=1.0, + int average=0, +): + """SGD for generic loss functions and penalties with optional averaging + + Parameters + ---------- + weights : ndarray[{{c_type}}, ndim=1] + The allocated vector of weights. + intercept : double + The initial intercept. 
+ average_weights : ndarray[{{c_type}}, ndim=1] + The average weights as computed for ASGD. Should be None if average + is 0. + average_intercept : double + The average intercept for ASGD. Should be 0 if average is 0. + loss : CyLossFunction + A concrete ``CyLossFunction`` object. + penalty_type : int + The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net. + alpha : float + The regularization parameter. + C : float + Maximum step size for passive aggressive. + l1_ratio : float + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + dataset : SequentialDataset + A concrete ``SequentialDataset`` object. + validation_mask : ndarray[uint8_t, ndim=1] + Equal to True on the validation set. + early_stopping : boolean + Whether to use a stopping criterion based on the validation set. + validation_score_cb : callable + A callable to compute a validation score given the current + coefficients and intercept values. + Used only if early_stopping is True. + n_iter_no_change : int + Number of iteration with no improvement to wait before stopping. + max_iter : int + The maximum number of iterations (epochs). + tol: double + The tolerance for the stopping criterion. + fit_intercept : int + Whether or not to fit the intercept (1 or 0). + verbose : int + Print verbose output; 0 for quite. + shuffle : boolean + Whether to shuffle the training data before each epoch. + weight_pos : float + The weight of the positive class. + weight_neg : float + The weight of the negative class. + seed : uint32_t + Seed of the pseudorandom number generator used to shuffle the data. + learning_rate : int + The learning rate: + (1) constant, eta = eta0 + (2) optimal, eta = 1.0/(alpha * t). + (3) inverse scaling, eta = eta0 / pow(t, power_t) + (4) adaptive decrease + (5) Passive Aggressive-I, eta = min(alpha, loss/norm(x)) + (6) Passive Aggressive-II, eta = 1.0 / (norm(x) + 0.5*alpha) + eta0 : double + The initial learning rate. + power_t : double + The exponent for inverse scaling learning rate. + one_class : boolean + Whether to solve the One-Class SVM optimization problem. + t : double + Initial state of the learning rate. This value is equal to the + iteration count except when the learning rate is set to `optimal`. + Default: 1.0. + average : int + The number of iterations before averaging starts. average=1 is + equivalent to averaging for all iterations. + + + Returns + ------- + weights : array, shape=[n_features] + The fitted weight vector. + intercept : float + The fitted intercept term. + average_weights : array shape=[n_features] + The averaged weights across iterations. Values are valid only if + average > 0. + average_intercept : float + The averaged intercept across iterations. + Values are valid only if average > 0. + n_iter_ : int + The actual number of iter (epochs). 
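# A small, self-contained illustration of the eta schedules listed above
# (constant, optimal and inverse scaling); t0 stands in for the offset that
# the optimal schedule derives from alpha (optimal_init in the function body).
def eta_constant(eta0, t):
    return eta0

def eta_optimal(alpha, t, t0=0.0):
    return 1.0 / (alpha * (t0 + t))

def eta_invscaling(eta0, t, power_t=0.5):
    return eta0 / t ** power_t

for t in (1, 10, 100):
    print(t, eta_constant(0.1, t), eta_optimal(1e-2, t), eta_invscaling(0.1, t))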
+ """ + + # get the data information into easy vars + cdef Py_ssize_t n_samples = dataset.n_samples + cdef Py_ssize_t n_features = weights.shape[0] + + cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights) + cdef {{c_type}} *x_data_ptr = NULL + cdef int *x_ind_ptr = NULL + + # helper variables + cdef int no_improvement_count = 0 + cdef bint infinity = False + cdef int xnnz + cdef double eta = 0.0 + cdef double p = 0.0 + cdef double update = 0.0 + cdef double intercept_update = 0.0 + cdef double sumloss = 0.0 + cdef double score = 0.0 + cdef double best_loss = INFINITY + cdef double best_score = -INFINITY + cdef {{c_type}} y = 0.0 + cdef {{c_type}} sample_weight + cdef {{c_type}} class_weight = 1.0 + cdef unsigned int count = 0 + cdef unsigned int train_count = n_samples - np.sum(validation_mask) + cdef unsigned int epoch = 0 + cdef unsigned int i = 0 + cdef int is_hinge = isinstance(loss, Hinge) + cdef double optimal_init = 0.0 + cdef double dloss = 0.0 + cdef double MAX_DLOSS = 1e12 + + cdef long long sample_index + + # q vector is only used for L1 regularization + cdef {{c_type}}[::1] q = None + cdef {{c_type}} * q_data_ptr = NULL + if penalty_type == L1 or penalty_type == ELASTICNET: + q = np.zeros((n_features,), dtype={{np_type}}, order="c") + q_data_ptr = &q[0] + cdef double u = 0.0 + + if penalty_type == L2: + l1_ratio = 0.0 + elif penalty_type == L1: + l1_ratio = 1.0 + + eta = eta0 + + if learning_rate == OPTIMAL: + typw = np.sqrt(1.0 / np.sqrt(alpha)) + # computing eta0, the initial learning rate + initial_eta0 = typw / max(1.0, loss.cy_gradient(1.0, -typw)) + # initialize t such that eta at first sample equals eta0 + optimal_init = 1.0 / (initial_eta0 * alpha) + + t_start = time() + with nogil: + for epoch in range(max_iter): + sumloss = 0 + if verbose > 0: + with gil: + print("-- Epoch %d" % (epoch + 1)) + if shuffle: + dataset.shuffle(seed) + for i in range(n_samples): + dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + sample_index = dataset.index_data_ptr[dataset.current_index] + if validation_mask[sample_index]: + # do not learn on the validation set + continue + + p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept + if learning_rate == OPTIMAL: + eta = 1.0 / (alpha * (optimal_init + t - 1)) + elif learning_rate == INVSCALING: + eta = eta0 / pow(t, power_t) + + if verbose or not early_stopping: + sumloss += loss.cy_loss(y, p) + + if y > 0.0: + class_weight = weight_pos + else: + class_weight = weight_neg + + if learning_rate == PA1: + update = sqnorm(x_data_ptr, x_ind_ptr, xnnz) + if update == 0: + continue + update = min(C, loss.cy_loss(y, p) / update) + elif learning_rate == PA2: + update = sqnorm(x_data_ptr, x_ind_ptr, xnnz) + update = loss.cy_loss(y, p) / (update + 0.5 / C) + else: + dloss = loss.cy_gradient(y, p) + # clip dloss with large values to avoid numerical + # instabilities + if dloss < -MAX_DLOSS: + dloss = -MAX_DLOSS + elif dloss > MAX_DLOSS: + dloss = MAX_DLOSS + update = -eta * dloss + + if learning_rate >= PA1: + if is_hinge: + # classification + update *= y + elif y - p < 0: + # regression + update *= -1 + + update *= class_weight * sample_weight + + if penalty_type >= L2: + # do not scale to negative values when eta or alpha are too + # big: instead set the weights to zero + w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha))) + + if update != 0.0: + w.add(x_data_ptr, x_ind_ptr, xnnz, update) + if fit_intercept == 1: + intercept_update = update + if one_class: # specific for One-Class SVM + 
intercept_update -= 2. * eta * alpha + if intercept_update != 0: + intercept += intercept_update * intercept_decay + + if 0 < average <= t: + # compute the average for the intercept and update the + # average weights, this is done regardless as to whether + # the update is 0 + + w.add_average(x_data_ptr, x_ind_ptr, xnnz, + update, (t - average + 1)) + average_intercept += ((intercept - average_intercept) / + (t - average + 1)) + + if penalty_type == L1 or penalty_type == ELASTICNET: + u += (l1_ratio * eta * alpha) + l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u) + + t += 1 + count += 1 + + # report epoch information + if verbose > 0: + with gil: + print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, " + "Avg. loss: %f" + % (w.norm(), np.nonzero(weights)[0].shape[0], + intercept, count, sumloss / train_count)) + print("Total training time: %.2f seconds." + % (time() - t_start)) + + # floating-point under-/overflow check. + if (not isfinite(intercept) or any_nonfinite(weights)): + infinity = True + break + + # evaluate the score on the validation set + if early_stopping: + with gil: + score = validation_score_cb(weights.base, intercept) + if tol > -INFINITY and score < best_score + tol: + no_improvement_count += 1 + else: + no_improvement_count = 0 + if score > best_score: + best_score = score + # or evaluate the loss on the training set + else: + if tol > -INFINITY and sumloss > best_loss - tol * train_count: + no_improvement_count += 1 + else: + no_improvement_count = 0 + if sumloss < best_loss: + best_loss = sumloss + + # if there is no improvement several times in a row + if no_improvement_count >= n_iter_no_change: + if learning_rate == ADAPTIVE and eta > 1e-6: + eta = eta / 5 + no_improvement_count = 0 + else: + if verbose: + with gil: + print("Convergence after %d epochs took %.2f " + "seconds" % (epoch + 1, time() - t_start)) + break + + if infinity: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % (epoch + 1)) + + w.reset_wscale() + + return ( + weights.base, + intercept, + None if average_weights is None else average_weights.base, + average_intercept, + epoch + 1 + ) + +{{endfor}} + + +cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil: + for i in range(w.shape[0]): + if not isfinite(w[i]): + return True + return 0 + + +cdef inline double sqnorm( + floating * x_data_ptr, + int * x_ind_ptr, + int xnnz, +) noexcept nogil: + cdef double x_norm = 0.0 + cdef int j + cdef double z + for j in range(xnnz): + z = x_data_ptr[j] + x_norm += z * z + return x_norm + + +{{for name_suffix, c_type, np_type in dtypes}} + +cdef void l1penalty{{name_suffix}}( + WeightVector{{name_suffix}} w, + {{c_type}} * q_data_ptr, + int *x_ind_ptr, + int xnnz, + double u, +) noexcept nogil: + """Apply the L1 penalty to each updated feature + + This implements the truncated gradient approach by + [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009]. 
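# A standalone NumPy sketch of the cumulative (truncated-gradient) L1 penalty
# cited above: u is the total L1 penalty available so far, q[j] records how
# much was actually applied to feature j, and weights are clipped at zero
# instead of being allowed to cross it. wscale handling is omitted here.
import numpy as np

def apply_l1_penalty(w, q, u):
    for j in range(w.shape[0]):
        z = w[j]
        if z > 0.0:
            w[j] = max(0.0, z - (u + q[j]))
        elif z < 0.0:
            w[j] = min(0.0, z + (u - q[j]))
        q[j] += w[j] - z              # penalty actually applied to feature j
    return w, q

w, q = np.array([0.8, -0.3, 0.05]), np.zeros(3)
print(apply_l1_penalty(w, q, u=0.1))
# -> (array([ 0.7, -0.2,  0. ]), array([-0.1 ,  0.1 , -0.05]))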
+ """ + cdef double z = 0.0 + cdef int j = 0 + cdef int idx = 0 + cdef double wscale = w.wscale + cdef {{c_type}} *w_data_ptr = w.w_data_ptr + for j in range(xnnz): + idx = x_ind_ptr[j] + z = w_data_ptr[idx] + if wscale * z > 0.0: + w_data_ptr[idx] = max( + 0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale)) + + elif wscale * z < 0.0: + w_data_ptr[idx] = min( + 0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale)) + + q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_stochastic_gradient.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_stochastic_gradient.py new file mode 100644 index 0000000000000000000000000000000000000000..8f7c814000614e91e2daf605835c0ebc69fc76c3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_stochastic_gradient.py @@ -0,0 +1,2604 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +"""Classification, regression and One-Class SVM using Stochastic Gradient +Descent (SGD). +""" + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np + +from .._loss._loss import CyHalfBinomialLoss, CyHalfSquaredError, CyHuberLoss +from ..base import ( + BaseEstimator, + OutlierMixin, + RegressorMixin, + _fit_context, + clone, + is_classifier, +) +from ..exceptions import ConvergenceWarning +from ..model_selection import ShuffleSplit, StratifiedShuffleSplit +from ..utils import check_random_state, compute_class_weight +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.metaestimators import available_if +from ..utils.multiclass import _check_partial_fit_first_call +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._base import LinearClassifierMixin, SparseCoefMixin, make_dataset +from ._sgd_fast import ( + EpsilonInsensitive, + Hinge, + ModifiedHuber, + SquaredEpsilonInsensitive, + SquaredHinge, + _plain_sgd32, + _plain_sgd64, +) + +LEARNING_RATE_TYPES = { + "constant": 1, + "optimal": 2, + "invscaling": 3, + "adaptive": 4, + "pa1": 5, + "pa2": 6, +} + +PENALTY_TYPES = {"none": 0, "l2": 2, "l1": 1, "elasticnet": 3} + +DEFAULT_EPSILON = 0.1 +# Default value of ``epsilon`` parameter. 
+ +MAX_INT = np.iinfo(np.int32).max + + +class _ValidationScoreCallback: + """Callback for early stopping based on validation score""" + + def __init__(self, estimator, X_val, y_val, sample_weight_val, classes=None): + self.estimator = clone(estimator) + self.estimator.t_ = 1 # to pass check_is_fitted + if classes is not None: + self.estimator.classes_ = classes + self.X_val = X_val + self.y_val = y_val + self.sample_weight_val = sample_weight_val + + def __call__(self, coef, intercept): + est = self.estimator + est.coef_ = coef.reshape(1, -1) + est.intercept_ = np.atleast_1d(intercept) + return est.score(self.X_val, self.y_val, self.sample_weight_val) + + +class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for SGD classification and regression.""" + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left"), None], + "shuffle": ["boolean"], + "verbose": ["verbose"], + "random_state": ["random_state"], + "warm_start": ["boolean"], + "average": [Interval(Integral, 0, None, closed="neither"), "boolean"], + } + + def __init__( + self, + loss, + *, + penalty="l2", + alpha=0.0001, + C=1.0, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=0.1, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=False, + average=False, + ): + self.loss = loss + self.penalty = penalty + self.learning_rate = learning_rate + self.epsilon = epsilon + self.alpha = alpha + self.C = C + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.shuffle = shuffle + self.random_state = random_state + self.verbose = verbose + self.eta0 = eta0 + self.power_t = power_t + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.warm_start = warm_start + self.average = average + self.max_iter = max_iter + self.tol = tol + + @abstractmethod + def fit(self, X, y): + """Fit model.""" + + def _more_validate_params(self, for_partial_fit=False): + """Validate input params.""" + if self.early_stopping and for_partial_fit: + raise ValueError("early_stopping should be False with partial_fit") + if ( + self.learning_rate in ("constant", "invscaling", "adaptive") + and self.eta0 <= 0.0 + ): + raise ValueError("eta0 must be > 0") + if self.learning_rate == "optimal" and self.alpha == 0: + raise ValueError( + "alpha must be > 0 since " + "learning_rate is 'optimal'. alpha is used " + "to compute the optimal learning rate." + ) + if self.penalty == "elasticnet" and self.l1_ratio is None: + raise ValueError("l1_ratio must be set when penalty is 'elasticnet'") + + # raises ValueError if not registered + self._get_penalty_type(self.penalty) + self._get_learning_rate_type(self.learning_rate) + + def _get_l1_ratio(self): + if self.l1_ratio is None: + # plain_sgd expects a float. Any value is fine since at this point + # penalty can't be "elsaticnet" so l1_ratio is not used. 
+ return 0.0 + return self.l1_ratio + + def _get_loss_function(self, loss): + """Get concrete ``LossFunction`` object for str ``loss``.""" + loss_ = self.loss_functions[loss] + loss_class, args = loss_[0], loss_[1:] + if loss in ("huber", "epsilon_insensitive", "squared_epsilon_insensitive"): + args = (self.epsilon,) + return loss_class(*args) + + def _get_learning_rate_type(self, learning_rate): + return LEARNING_RATE_TYPES[learning_rate] + + def _get_penalty_type(self, penalty): + penalty = str(penalty).lower() + return PENALTY_TYPES[penalty] + + def _allocate_parameter_mem( + self, + n_classes, + n_features, + input_dtype, + coef_init=None, + intercept_init=None, + one_class=0, + ): + """Allocate mem for parameters; initialize if provided.""" + if n_classes > 2: + # allocate coef_ for multi-class + if coef_init is not None: + coef_init = np.asarray(coef_init, dtype=input_dtype, order="C") + if coef_init.shape != (n_classes, n_features): + raise ValueError("Provided ``coef_`` does not match dataset. ") + self.coef_ = coef_init + else: + self.coef_ = np.zeros( + (n_classes, n_features), dtype=input_dtype, order="C" + ) + + # allocate intercept_ for multi-class + if intercept_init is not None: + intercept_init = np.asarray( + intercept_init, order="C", dtype=input_dtype + ) + if intercept_init.shape != (n_classes,): + raise ValueError("Provided intercept_init does not match dataset.") + self.intercept_ = intercept_init + else: + self.intercept_ = np.zeros(n_classes, dtype=input_dtype, order="C") + else: + # allocate coef_ + if coef_init is not None: + coef_init = np.asarray(coef_init, dtype=input_dtype, order="C") + coef_init = coef_init.ravel() + if coef_init.shape != (n_features,): + raise ValueError("Provided coef_init does not match dataset.") + self.coef_ = coef_init + else: + self.coef_ = np.zeros(n_features, dtype=input_dtype, order="C") + + # allocate intercept_ + if intercept_init is not None: + intercept_init = np.asarray(intercept_init, dtype=input_dtype) + if intercept_init.shape != (1,) and intercept_init.shape != (): + raise ValueError("Provided intercept_init does not match dataset.") + if one_class: + self.offset_ = intercept_init.reshape( + 1, + ) + else: + self.intercept_ = intercept_init.reshape( + 1, + ) + else: + if one_class: + self.offset_ = np.zeros(1, dtype=input_dtype, order="C") + else: + self.intercept_ = np.zeros(1, dtype=input_dtype, order="C") + + # initialize average parameters + if self.average > 0: + self._standard_coef = self.coef_ + self._average_coef = np.zeros( + self.coef_.shape, dtype=input_dtype, order="C" + ) + if one_class: + self._standard_intercept = 1 - self.offset_ + else: + self._standard_intercept = self.intercept_ + + self._average_intercept = np.zeros( + self._standard_intercept.shape, dtype=input_dtype, order="C" + ) + + def _make_validation_split(self, y, sample_mask): + """Split the dataset between training set and validation set. + + Parameters + ---------- + y : ndarray of shape (n_samples, ) + Target values. + + sample_mask : ndarray of shape (n_samples, ) + A boolean array indicating whether each sample should be included + for validation set. + + Returns + ------- + validation_mask : ndarray of shape (n_samples, ) + Equal to True on the validation set, False on the training set. 
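# Self-contained sketch of the boolean validation mask described above, using
# a stratified split as the classifier branch does (regressors use a plain
# ShuffleSplit). Sizes and seed are arbitrary toy choices.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
cv = StratifiedShuffleSplit(test_size=0.25, random_state=0)
idx_train, idx_val = next(cv.split(np.zeros((len(y), 1)), y))
validation_mask = np.zeros(len(y), dtype=bool)
validation_mask[idx_val] = True
print(validation_mask.sum(), "validation samples")  # 2 validation samples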
+ """ + n_samples = y.shape[0] + validation_mask = np.zeros(n_samples, dtype=np.bool_) + if not self.early_stopping: + # use the full set for training, with an empty validation set + return validation_mask + + if is_classifier(self): + splitter_type = StratifiedShuffleSplit + else: + splitter_type = ShuffleSplit + cv = splitter_type( + test_size=self.validation_fraction, random_state=self.random_state + ) + idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y)) + + if not np.any(sample_mask[idx_val]): + raise ValueError( + "The sample weights for validation set are all zero, consider using a" + " different random state." + ) + + if idx_train.shape[0] == 0 or idx_val.shape[0] == 0: + raise ValueError( + "Splitting %d samples into a train set and a validation set " + "with validation_fraction=%r led to an empty set (%d and %d " + "samples). Please either change validation_fraction, increase " + "number of samples, or disable early_stopping." + % ( + n_samples, + self.validation_fraction, + idx_train.shape[0], + idx_val.shape[0], + ) + ) + + validation_mask[idx_val] = True + return validation_mask + + def _make_validation_score_cb( + self, validation_mask, X, y, sample_weight, classes=None + ): + if not self.early_stopping: + return None + + return _ValidationScoreCallback( + self, + X[validation_mask], + y[validation_mask], + sample_weight[validation_mask], + classes=classes, + ) + + +def _prepare_fit_binary(est, y, i, input_dtype, label_encode=True): + """Initialization for fit_binary. + + Returns y, coef, intercept, average_coef, average_intercept. + """ + y_i = np.ones(y.shape, dtype=input_dtype, order="C") + if label_encode: + # y in {0, 1} + y_i[y != est.classes_[i]] = 0.0 + else: + # y in {-1, +1} + y_i[y != est.classes_[i]] = -1.0 + average_intercept = 0 + average_coef = None + + if len(est.classes_) == 2: + if not est.average: + coef = est.coef_.ravel() + intercept = est.intercept_[0] + else: + coef = est._standard_coef.ravel() + intercept = est._standard_intercept[0] + average_coef = est._average_coef.ravel() + average_intercept = est._average_intercept[0] + else: + if not est.average: + coef = est.coef_[i] + intercept = est.intercept_[i] + else: + coef = est._standard_coef[i] + intercept = est._standard_intercept[i] + average_coef = est._average_coef[i] + average_intercept = est._average_intercept[i] + + return y_i, coef, intercept, average_coef, average_intercept + + +def fit_binary( + est, + i, + X, + y, + alpha, + C, + learning_rate, + max_iter, + pos_weight, + neg_weight, + sample_weight, + validation_mask=None, + random_state=None, +): + """Fit a single binary classifier. + + The i'th class is considered the "positive" class. + + Parameters + ---------- + est : Estimator object + The estimator to fit + + i : int + Index of the positive class + + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + + y : numpy array of shape [n_samples, ] + Target values + + alpha : float + The regularization parameter + + C : float + Maximum step size for passive aggressive + + learning_rate : str + The learning rate. Accepted values are 'constant', 'optimal', + 'invscaling', 'pa1' and 'pa2'. 
+ + max_iter : int + The maximum number of iterations (epochs) + + pos_weight : float + The weight of the positive class + + neg_weight : float + The weight of the negative class + + sample_weight : numpy array of shape [n_samples, ] + The weight of each sample + + validation_mask : numpy array of shape [n_samples, ], default=None + Precomputed validation mask in case _fit_binary is called in the + context of a one-vs-rest reduction. + + random_state : int, RandomState instance, default=None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + """ + # if average is not true, average_coef, and average_intercept will be + # unused + label_encode = isinstance(est._loss_function_, CyHalfBinomialLoss) + y_i, coef, intercept, average_coef, average_intercept = _prepare_fit_binary( + est, y, i, input_dtype=X.dtype, label_encode=label_encode + ) + assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0] + + random_state = check_random_state(random_state) + dataset, intercept_decay = make_dataset( + X, y_i, sample_weight, random_state=random_state + ) + + penalty_type = est._get_penalty_type(est.penalty) + learning_rate_type = est._get_learning_rate_type(learning_rate) + + if validation_mask is None: + validation_mask = est._make_validation_split(y_i, sample_mask=sample_weight > 0) + classes = np.array([-1, 1], dtype=y_i.dtype) + validation_score_cb = est._make_validation_score_cb( + validation_mask, X, y_i, sample_weight, classes=classes + ) + + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(MAX_INT) + + tol = est.tol if est.tol is not None else -np.inf + + _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype) + coef, intercept, average_coef, average_intercept, n_iter_ = _plain_sgd( + coef, + intercept, + average_coef, + average_intercept, + est._loss_function_, + penalty_type, + alpha, + C, + est._get_l1_ratio(), + dataset, + validation_mask, + est.early_stopping, + validation_score_cb, + int(est.n_iter_no_change), + max_iter, + tol, + int(est.fit_intercept), + int(est.verbose), + int(est.shuffle), + seed, + pos_weight, + neg_weight, + learning_rate_type, + est.eta0, + est.power_t, + 0, + est.t_, + intercept_decay, + est.average, + ) + + if est.average: + if len(est.classes_) == 2: + est._average_intercept[0] = average_intercept + else: + est._average_intercept[i] = average_intercept + + return coef, intercept, n_iter_ + + +def _get_plain_sgd_function(input_dtype): + return _plain_sgd32 if input_dtype == np.float32 else _plain_sgd64 + + +class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): + loss_functions = { + "hinge": (Hinge, 1.0), + "squared_hinge": (SquaredHinge, 1.0), + "perceptron": (Hinge, 0.0), + "log_loss": (CyHalfBinomialLoss,), + "modified_huber": (ModifiedHuber,), + "squared_error": (CyHalfSquaredError,), + "huber": (CyHuberLoss, DEFAULT_EPSILON), + "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), + "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, DEFAULT_EPSILON), + } + + _parameter_constraints: dict = { + **BaseSGD._parameter_constraints, + "loss": [StrOptions(set(loss_functions))], + "early_stopping": ["boolean"], + "validation_fraction": [Interval(Real, 0, 1, closed="neither")], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + 
"class_weight": [StrOptions({"balanced"}), dict, None], + } + + @abstractmethod + def __init__( + self, + loss="hinge", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + n_jobs=None, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + class_weight=None, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + warm_start=warm_start, + average=average, + ) + self.class_weight = class_weight + self.n_jobs = n_jobs + + def _partial_fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + max_iter, + classes, + sample_weight, + coef_init, + intercept_init, + ): + first_call = not hasattr(self, "classes_") + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + reset=first_call, + ) + + n_samples, n_features = X.shape + + _check_partial_fit_first_call(self, classes) + + n_classes = self.classes_.shape[0] + + # Allocate datastructures from input arguments + self._expanded_class_weight = compute_class_weight( + self.class_weight, classes=self.classes_, y=y + ) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if getattr(self, "coef_", None) is None or coef_init is not None: + self._allocate_parameter_mem( + n_classes=n_classes, + n_features=n_features, + input_dtype=X.dtype, + coef_init=coef_init, + intercept_init=intercept_init, + ) + elif n_features != self.coef_.shape[-1]: + raise ValueError( + "Number of features %d does not match previous data %d." 
+ % (n_features, self.coef_.shape[-1]) + ) + + self._loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + if n_classes > 2: + self._fit_multiclass( + X, + y, + alpha=alpha, + C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter, + ) + elif n_classes == 2: + self._fit_binary( + X, + y, + alpha=alpha, + C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter, + ) + else: + raise ValueError( + "The number of classes has to be greater than one; got %d class" + % n_classes + ) + + return self + + def _fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + coef_init=None, + intercept_init=None, + sample_weight=None, + ): + if hasattr(self, "classes_"): + # delete the attribute otherwise _partial_fit thinks it's not the first call + delattr(self, "classes_") + + # labels can be encoded as float, int, or string literals + # np.unique sorts in asc order; largest class id is positive class + y = validate_data(self, y=y) + classes = np.unique(y) + + if self.warm_start and hasattr(self, "coef_"): + if coef_init is None: + coef_init = self.coef_ + if intercept_init is None: + intercept_init = self.intercept_ + else: + self.coef_ = None + self.intercept_ = None + + if self.average > 0: + self._standard_coef = self.coef_ + self._standard_intercept = self.intercept_ + self._average_coef = None + self._average_intercept = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit( + X, + y, + alpha, + C, + loss, + learning_rate, + self.max_iter, + classes, + sample_weight, + coef_init, + intercept_init, + ) + + if ( + self.tol is not None + and self.tol > -np.inf + and self.n_iter_ == self.max_iter + ): + warnings.warn( + ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." + ), + ConvergenceWarning, + ) + return self + + def _fit_binary(self, X, y, alpha, C, sample_weight, learning_rate, max_iter): + """Fit a binary classifier on X and y.""" + coef, intercept, n_iter_ = fit_binary( + self, + 1, + X, + y, + alpha, + C, + learning_rate, + max_iter, + self._expanded_class_weight[1], + self._expanded_class_weight[0], + sample_weight, + random_state=self.random_state, + ) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + # need to be 2d + if self.average > 0: + if self.average <= self.t_ - 1: + self.coef_ = self._average_coef.reshape(1, -1) + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef.reshape(1, -1) + self._standard_intercept = np.atleast_1d(intercept) + self.intercept_ = self._standard_intercept + else: + self.coef_ = coef.reshape(1, -1) + # intercept is a float, need to convert it to an array of length 1 + self.intercept_ = np.atleast_1d(intercept) + + def _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter): + """Fit a multi-class classifier by combining binary classifiers + + Each binary classifier predicts one class versus all others. This + strategy is called OvA (One versus All) or OvR (One versus Rest). + """ + # Precompute the validation split using the multiclass labels + # to ensure proper balancing of the classes. + validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0) + + # Use joblib to fit OvA in parallel. 
+ # Pick the random seed for each job outside of fit_binary to avoid + # sharing the estimator random state between threads which could lead + # to non-deterministic behavior + random_state = check_random_state(self.random_state) + seeds = random_state.randint(MAX_INT, size=len(self.classes_)) + result = Parallel( + n_jobs=self.n_jobs, verbose=self.verbose, require="sharedmem" + )( + delayed(fit_binary)( + self, + i, + X, + y, + alpha, + C, + learning_rate, + max_iter, + self._expanded_class_weight[i], + 1.0, + sample_weight, + validation_mask=validation_mask, + random_state=seed, + ) + for i, seed in enumerate(seeds) + ) + + # take the maximum of n_iter_ over every binary fit + n_iter_ = 0.0 + for i, (_, intercept, n_iter_i) in enumerate(result): + self.intercept_[i] = intercept + n_iter_ = max(n_iter_, n_iter_i) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + if self.average > 0: + if self.average <= self.t_ - 1.0: + self.coef_ = self._average_coef + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef + self._standard_intercept = np.atleast_1d(self.intercept_) + self.intercept_ = self._standard_intercept + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, classes=None, sample_weight=None): + """Perform one epoch of stochastic gradient descent on given samples. + + Internally, this method uses ``max_iter = 1``. Therefore, it is not + guaranteed that a minimum of the cost function is reached after calling + it once. Matters such as objective convergence, early stopping, and + learning rate adjustments should be handled by the user. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + + y : ndarray of shape (n_samples,) + Subset of the target values. + + classes : ndarray of shape (n_classes,), default=None + Classes across all calls to partial_fit. + Can be obtained by via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. + + Returns + ------- + self : object + Returns an instance of self. + """ + if not hasattr(self, "classes_"): + self._more_validate_params(for_partial_fit=True) + + if self.class_weight == "balanced": + raise ValueError( + "class_weight '{0}' is not supported for " + "partial_fit. In order to use 'balanced' weights," + " use compute_class_weight('{0}', " + "classes=classes, y=y). " + "In place of y you can use a large enough sample " + "of the full training set target to properly " + "estimate the class frequency distributions. " + "Pass the resulting weights as the class_weight " + "parameter.".format(self.class_weight) + ) + + return self._partial_fit( + X, + y, + alpha=self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + max_iter=1, + classes=classes, + sample_weight=sample_weight, + coef_init=None, + intercept_init=None, + ) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): + """Fit linear model with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. 
+ + y : ndarray of shape (n_samples,) + Target values. + + coef_init : ndarray of shape (n_classes, n_features), default=None + The initial coefficients to warm-start the optimization. + + intercept_init : ndarray of shape (n_classes,), default=None + The initial intercept to warm-start the optimization. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. These weights will + be multiplied with class_weight (passed through the + constructor) if class_weight is specified. + + Returns + ------- + self : object + Returns an instance of self. + """ + self._more_validate_params() + + return self._fit( + X, + y, + alpha=self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + coef_init=coef_init, + intercept_init=intercept_init, + sample_weight=sample_weight, + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SGDClassifier(BaseSGDClassifier): + """Linear classifiers (SVM, logistic regression, etc.) with SGD training. + + This estimator implements regularized linear models with stochastic + gradient descent (SGD) learning: the gradient of the loss is estimated + each sample at a time and the model is updated along the way with a + decreasing strength schedule (aka learning rate). SGD allows minibatch + (online/out-of-core) learning via the `partial_fit` method. + For best results using the default learning rate schedule, the data should + have zero mean and unit variance. + + This implementation works with data represented as dense or sparse arrays + of floating point values for the features. The model it fits can be + controlled with the loss parameter; by default, it fits a linear support + vector machine (SVM). + + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using either the squared euclidean norm + L2 or the absolute norm L1 or a combination of both (Elastic Net). If the + parameter update crosses the 0.0 value because of the regularizer, the + update is truncated to 0.0 to allow for learning sparse models and achieve + online feature selection. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : {'hinge', 'log_loss', 'modified_huber', 'squared_hinge',\ + 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive',\ + 'squared_epsilon_insensitive'}, default='hinge' + The loss function to be used. + + - 'hinge' gives a linear SVM. + - 'log_loss' gives logistic regression, a probabilistic classifier. + - 'modified_huber' is another smooth loss that brings tolerance to + outliers as well as probability estimates. + - 'squared_hinge' is like hinge but is quadratically penalized. + - 'perceptron' is the linear loss used by the perceptron algorithm. + - The other losses, 'squared_error', 'huber', 'epsilon_insensitive' and + 'squared_epsilon_insensitive' are designed for regression but can be useful + in classification as well; see + :class:`~sklearn.linear_model.SGDRegressor` for a description. + + More details about the losses formulas can be found in the :ref:`User Guide + ` and you can find a visualisation of the loss + functions in + :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_loss_functions.py`. + + penalty : {'l2', 'l1', 'elasticnet', None}, default='l2' + The penalty (aka regularization term) to be used. Defaults to 'l2' + which is the standard regularizer for linear SVM models. 
'l1' and + 'elasticnet' might bring sparsity to the model (feature selection) + not achievable with 'l2'. No penalty is added when set to `None`. + + You can see a visualisation of the penalties in + :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_penalties.py`. + + alpha : float, default=0.0001 + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. Also used to compute the + learning rate when `learning_rate` is set to 'optimal'. + Values must be in the range `[0.0, inf)`. + + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. + Values must be in the range `[0.0, 1.0]` or can be `None` if + `penalty` is not `elasticnet`. + + .. versionchanged:: 1.7 + `l1_ratio` can be `None` when `penalty` is not "elasticnet". + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + Values must be in the range `[1, inf)`. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, training will stop + when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive + epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.19 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + Values must be in the range `[0, inf)`. + + epsilon : float, default=0.1 + Epsilon in the epsilon-insensitive loss functions; only if `loss` is + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + For 'huber', determines the threshold at which it becomes less + important to get the prediction exactly right. + For epsilon-insensitive, any differences between the current prediction + and the correct label are ignored if they are less than this threshold. + Values must be in the range `[0.0, inf)`. + + n_jobs : int, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance, default=None + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + Integer values must be in the range `[0, 2**32 - 1]`. + + learning_rate : str, default='optimal' + The learning rate schedule: + + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where `t0` is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': `eta = eta0`, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + `early_stopping` is `True`, the current learning rate is divided by 5. + + .. versionadded:: 0.20 + Added 'adaptive' option. 
+ + eta0 : float, default=0.0 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + Values must be in the range `[0.0, inf)`. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + Values must be in the range `(-inf, inf)`. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to `True`, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score returned by the `score` method is not + improving by at least tol for n_iter_no_change consecutive epochs. + + See :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_early_stopping.py` for an + example of the effects of early stopping. + + .. versionadded:: 0.20 + Added 'early_stopping' option + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if `early_stopping` is True. + Values must be in the range `(0.0, 1.0)`. + + .. versionadded:: 0.20 + Added 'validation_fraction' option + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Integer values must be in the range `[1, max_iter)`. + + .. versionadded:: 0.20 + Added 'n_iter_no_change' option + + class_weight : dict, {class_label: weight} or "balanced", default=None + Preset for the class_weight fit parameter. + + Weights associated with classes. If not given, all classes + are supposed to have weight one. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, default=False + When set to `True`, computes the averaged SGD weights across all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. + Integer values must be in the range `[1, n_samples]`. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. 
+ + classes_ : array of shape (n_classes,) + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.svm.LinearSVC : Linear support vector classification. + LogisticRegression : Logistic regression. + Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to + ``SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", + penalty=None)``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import SGDClassifier + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> Y = np.array([1, 1, 2, 2]) + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> clf = make_pipeline(StandardScaler(), + ... SGDClassifier(max_iter=1000, tol=1e-3)) + >>> clf.fit(X, Y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdclassifier', SGDClassifier())]) + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + + _parameter_constraints: dict = { + **BaseSGDClassifier._parameter_constraints, + "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both"), None], + "power_t": [Interval(Real, None, None, closed="neither")], + "epsilon": [Interval(Real, 0, None, closed="left")], + "learning_rate": [ + StrOptions({"constant", "optimal", "invscaling", "adaptive"}), + Hidden(StrOptions({"pa1", "pa2"})), + ], + "eta0": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + loss="hinge", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + n_jobs=None, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + class_weight=None, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + n_jobs=n_jobs, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + class_weight=class_weight, + warm_start=warm_start, + average=average, + ) + + def _check_proba(self): + if self.loss not in ("log_loss", "modified_huber"): + raise AttributeError( + "probability estimates are not available for loss=%r" % self.loss + ) + return True + + @available_if(_check_proba) + def predict_proba(self, X): + """Probability estimates. + + This method is only available for log loss and modified Huber loss. + + Multiclass probability estimates are derived from binary (one-vs.-rest) + estimates by simple normalization, as recommended by Zadrozny and + Elkan. + + Binary probability estimates for loss="modified_huber" are given by + (clip(decision_function(X), -1, 1) + 1) / 2. 
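# The modified_huber probability mapping quoted above, checked on a few raw
# decision values (illustrative only):
import numpy as np

scores = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
proba_positive = (np.clip(scores, -1, 1) + 1) / 2
print(proba_positive)                   # [0.   0.25 0.5  0.75 1.  ]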
For other loss functions + it is necessary to perform proper probability calibration by wrapping + the classifier with + :class:`~sklearn.calibration.CalibratedClassifierCV` instead. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data for prediction. + + Returns + ------- + ndarray of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in `self.classes_`. + + References + ---------- + Zadrozny and Elkan, "Transforming classifier scores into multiclass + probability estimates", SIGKDD'02, + https://dl.acm.org/doi/pdf/10.1145/775047.775151 + + The justification for the formula in the loss="modified_huber" + case is in the appendix B in: + http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf + """ + check_is_fitted(self) + + if self.loss == "log_loss": + return self._predict_proba_lr(X) + + elif self.loss == "modified_huber": + binary = len(self.classes_) == 2 + scores = self.decision_function(X) + + if binary: + prob2 = np.ones((scores.shape[0], 2)) + prob = prob2[:, 1] + else: + prob = scores + + np.clip(scores, -1, 1, prob) + prob += 1.0 + prob /= 2.0 + + if binary: + prob2[:, 0] -= prob + prob = prob2 + else: + # the above might assign zero to all classes, which doesn't + # normalize neatly; work around this to produce uniform + # probabilities + prob_sum = prob.sum(axis=1) + all_zero = prob_sum == 0 + if np.any(all_zero): + prob[all_zero, :] = 1 + prob_sum[all_zero] = len(self.classes_) + + # normalize + prob /= prob_sum.reshape((prob.shape[0], -1)) + + return prob + + else: + raise NotImplementedError( + "predict_(log_)proba only supported when" + " loss='log_loss' or loss='modified_huber' " + "(%r given)" % self.loss + ) + + @available_if(_check_proba) + def predict_log_proba(self, X): + """Log of probability estimates. + + This method is only available for log loss and modified Huber loss. + + When loss="modified_huber", probability estimates may be hard zeros + and ones, so taking the logarithm is not possible. + + See ``predict_proba`` for details. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data for prediction. + + Returns + ------- + T : array-like, shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in + `self.classes_`. 
+ """ + return np.log(self.predict_proba(X)) + + +class BaseSGDRegressor(RegressorMixin, BaseSGD): + loss_functions = { + "squared_error": (CyHalfSquaredError,), + "huber": (CyHuberLoss, DEFAULT_EPSILON), + "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), + "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, DEFAULT_EPSILON), + } + + _parameter_constraints: dict = { + **BaseSGD._parameter_constraints, + "loss": [StrOptions(set(loss_functions))], + "early_stopping": ["boolean"], + "validation_fraction": [Interval(Real, 0, 1, closed="neither")], + "n_iter_no_change": [Interval(Integral, 1, None, closed="left")], + } + + @abstractmethod + def __init__( + self, + loss="squared_error", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + random_state=None, + learning_rate="invscaling", + eta0=0.01, + power_t=0.25, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + warm_start=warm_start, + average=average, + ) + + def _partial_fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + max_iter, + sample_weight, + coef_init, + intercept_init, + ): + first_call = getattr(self, "coef_", None) is None + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + copy=False, + order="C", + dtype=[np.float64, np.float32], + accept_large_sparse=False, + reset=first_call, + ) + y = y.astype(X.dtype, copy=False) + + n_samples, n_features = X.shape + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # Allocate datastructures from input arguments + if first_call: + self._allocate_parameter_mem( + n_classes=1, + n_features=n_features, + input_dtype=X.dtype, + coef_init=coef_init, + intercept_init=intercept_init, + ) + if self.average > 0 and getattr(self, "_average_coef", None) is None: + self._average_coef = np.zeros(n_features, dtype=X.dtype, order="C") + self._average_intercept = np.zeros(1, dtype=X.dtype, order="C") + + self._fit_regressor( + X, y, alpha, C, loss, learning_rate, sample_weight, max_iter + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, sample_weight=None): + """Perform one epoch of stochastic gradient descent on given samples. + + Internally, this method uses ``max_iter = 1``. Therefore, it is not + guaranteed that a minimum of the cost function is reached after calling + it once. Matters such as objective convergence and early stopping + should be handled by the user. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of training data. + + y : numpy array of shape (n_samples,) + Subset of target values. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. + + Returns + ------- + self : object + Returns an instance of self. 
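A sketch of the usage pattern this implies (toy data; the batch size and number of passes are arbitrary choices): the caller drives the epochs and decides when to stop, since each partial_fit call performs a single pass over the given samples.

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.randn(1000, 5)
y = X @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + 0.1 * rng.randn(1000)

reg = SGDRegressor(random_state=0)
for epoch in range(20):                          # caller-controlled epochs
    for batch in np.array_split(np.arange(len(X)), 10):
        reg.partial_fit(X[batch], y[batch])      # one SGD pass per call
print(reg.coef_)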
+ """ + if not hasattr(self, "coef_"): + self._more_validate_params(for_partial_fit=True) + + return self._partial_fit( + X, + y, + self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + max_iter=1, + sample_weight=sample_weight, + coef_init=None, + intercept_init=None, + ) + + def _fit( + self, + X, + y, + alpha, + C, + loss, + learning_rate, + coef_init=None, + intercept_init=None, + sample_weight=None, + ): + if self.warm_start and getattr(self, "coef_", None) is not None: + if coef_init is None: + coef_init = self.coef_ + if intercept_init is None: + intercept_init = self.intercept_ + else: + self.coef_ = None + self.intercept_ = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit( + X, + y, + alpha, + C, + loss, + learning_rate, + self.max_iter, + sample_weight, + coef_init, + intercept_init, + ) + + if ( + self.tol is not None + and self.tol > -np.inf + and self.n_iter_ == self.max_iter + ): + warnings.warn( + ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." + ), + ConvergenceWarning, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): + """Fit linear model with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : ndarray of shape (n_samples,) + Target values. + + coef_init : ndarray of shape (n_features,), default=None + The initial coefficients to warm-start the optimization. + + intercept_init : ndarray of shape (1,), default=None + The initial intercept to warm-start the optimization. + + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples (1. for unweighted). + + Returns + ------- + self : object + Fitted `SGDRegressor` estimator. + """ + self._more_validate_params() + + return self._fit( + X, + y, + alpha=self.alpha, + C=1.0, + loss=self.loss, + learning_rate=self.learning_rate, + coef_init=coef_init, + intercept_init=intercept_init, + sample_weight=sample_weight, + ) + + def _decision_function(self, X): + """Predict using the linear model + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + + Returns + ------- + ndarray of shape (n_samples,) + Predicted target values per element in X. + """ + check_is_fitted(self) + + X = validate_data(self, X, accept_sparse="csr", reset=False) + + scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ + return scores.ravel() + + def predict(self, X): + """Predict using the linear model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data. + + Returns + ------- + ndarray of shape (n_samples,) + Predicted target values per element in X. 
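An illustrative check (toy data) that the prediction is the linear decision function described above, i.e. X @ coef_ + intercept_:

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = X @ np.array([2.0, -1.0, 0.5])

reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=0).fit(X, y)
assert np.allclose(reg.predict(X), X @ reg.coef_ + reg.intercept_)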
+ """ + return self._decision_function(X) + + def _fit_regressor( + self, X, y, alpha, C, loss, learning_rate, sample_weight, max_iter + ): + loss_function = self._get_loss_function(loss) + penalty_type = self._get_penalty_type(self.penalty) + learning_rate_type = self._get_learning_rate_type(learning_rate) + + if not hasattr(self, "t_"): + self.t_ = 1.0 + + validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0) + validation_score_cb = self._make_validation_score_cb( + validation_mask, X, y, sample_weight + ) + + random_state = check_random_state(self.random_state) + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(0, MAX_INT) + + dataset, intercept_decay = make_dataset( + X, y, sample_weight, random_state=random_state + ) + + tol = self.tol if self.tol is not None else -np.inf + + if self.average: + coef = self._standard_coef + intercept = self._standard_intercept + average_coef = self._average_coef + average_intercept = self._average_intercept + else: + coef = self.coef_ + intercept = self.intercept_ + average_coef = None # Not used + average_intercept = [0] # Not used + + _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype) + coef, intercept, average_coef, average_intercept, self.n_iter_ = _plain_sgd( + coef, + intercept[0], + average_coef, + average_intercept[0], + loss_function, + penalty_type, + alpha, + C, + self._get_l1_ratio(), + dataset, + validation_mask, + self.early_stopping, + validation_score_cb, + int(self.n_iter_no_change), + max_iter, + tol, + int(self.fit_intercept), + int(self.verbose), + int(self.shuffle), + seed, + 1.0, + 1.0, + learning_rate_type, + self.eta0, + self.power_t, + 0, + self.t_, + intercept_decay, + self.average, + ) + + self.t_ += self.n_iter_ * X.shape[0] + + if self.average > 0: + self._average_intercept = np.atleast_1d(average_intercept) + self._standard_intercept = np.atleast_1d(intercept) + + if self.average <= self.t_ - 1.0: + # made enough updates for averaging to be taken into account + self.coef_ = average_coef + self.intercept_ = np.atleast_1d(average_intercept) + else: + self.coef_ = coef + self.intercept_ = np.atleast_1d(intercept) + + else: + self.intercept_ = np.atleast_1d(intercept) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SGDRegressor(BaseSGDRegressor): + """Linear model fitted by minimizing a regularized empirical loss with SGD. + + SGD stands for Stochastic Gradient Descent: the gradient of the loss is + estimated each sample at a time and the model is updated along the way with + a decreasing strength schedule (aka learning rate). + + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using either the squared euclidean norm + L2 or the absolute norm L1 or a combination of both (Elastic Net). If the + parameter update crosses the 0.0 value because of the regularizer, the + update is truncated to 0.0 to allow for learning sparse models and achieve + online feature selection. + + This implementation works with data represented as dense numpy arrays of + floating point values for the features. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + loss : str, default='squared_error' + The loss function to be used. The possible values are 'squared_error', + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive' + + The 'squared_error' refers to the ordinary least squares fit. 
+ 'huber' modifies 'squared_error' to focus less on getting outliers + correct by switching from squared to linear loss past a distance of + epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is + linear past that; this is the loss function used in SVR. + 'squared_epsilon_insensitive' is the same but becomes squared loss past + a tolerance of epsilon. + + More details about the losses formulas can be found in the + :ref:`User Guide `. + + penalty : {'l2', 'l1', 'elasticnet', None}, default='l2' + The penalty (aka regularization term) to be used. Defaults to 'l2' + which is the standard regularizer for linear SVM models. 'l1' and + 'elasticnet' might bring sparsity to the model (feature selection) + not achievable with 'l2'. No penalty is added when set to `None`. + + You can see a visualisation of the penalties in + :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_penalties.py`. + + alpha : float, default=0.0001 + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. Also used to compute the + learning rate when `learning_rate` is set to 'optimal'. + Values must be in the range `[0.0, inf)`. + + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. + Values must be in the range `[0.0, 1.0]` or can be `None` if + `penalty` is not `elasticnet`. + + .. versionchanged:: 1.7 + `l1_ratio` can be `None` when `penalty` is not "elasticnet". + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + Values must be in the range `[1, inf)`. + + .. versionadded:: 0.19 + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, training will stop + when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive + epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Values must be in the range `[0.0, inf)`. + + .. versionadded:: 0.19 + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + + verbose : int, default=0 + The verbosity level. + Values must be in the range `[0, inf)`. + + epsilon : float, default=0.1 + Epsilon in the epsilon-insensitive loss functions; only if `loss` is + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + For 'huber', determines the threshold at which it becomes less + important to get the prediction exactly right. + For epsilon-insensitive, any differences between the current prediction + and the correct label are ignored if they are less than this threshold. + Values must be in the range `[0.0, inf)`. + + random_state : int, RandomState instance, default=None + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + learning_rate : str, default='invscaling' + The learning rate schedule: + + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. 
+ - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + + .. versionadded:: 0.20 + Added 'adaptive' option. + + eta0 : float, default=0.01 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.01. + Values must be in the range `[0.0, inf)`. + + power_t : float, default=0.25 + The exponent for inverse scaling learning rate. + Values must be in the range `(-inf, inf)`. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a fraction of training data as validation and terminate + training when validation score returned by the `score` method is not + improving by at least `tol` for `n_iter_no_change` consecutive + epochs. + + See :ref:`sphx_glr_auto_examples_linear_model_plot_sgd_early_stopping.py` for an + example of the effects of early stopping. + + .. versionadded:: 0.20 + Added 'early_stopping' option + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if `early_stopping` is True. + Values must be in the range `(0.0, 1.0)`. + + .. versionadded:: 0.20 + Added 'validation_fraction' option + + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + Integer values must be in the range `[1, max_iter)`. + + .. versionadded:: 0.20 + Added 'n_iter_no_change' option + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights across all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) + Weights assigned to the features. + + intercept_ : ndarray of shape (1,) + The intercept term. + + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + HuberRegressor : Linear regression model that is robust to outliers. + Lars : Least Angle Regression model. + Lasso : Linear Model trained with L1 prior as regularizer. + RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm. + Ridge : Linear least squares with l2 regularization. + sklearn.svm.SVR : Epsilon-Support Vector Regression. + TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import SGDRegressor + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> reg = make_pipeline(StandardScaler(), + ... SGDRegressor(max_iter=1000, tol=1e-3)) + >>> reg.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdregressor', SGDRegressor())]) + """ + + _parameter_constraints: dict = { + **BaseSGDRegressor._parameter_constraints, + "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None], + "alpha": [Interval(Real, 0, None, closed="left")], + "l1_ratio": [Interval(Real, 0, 1, closed="both"), None], + "power_t": [Interval(Real, None, None, closed="neither")], + "learning_rate": [ + StrOptions({"constant", "optimal", "invscaling", "adaptive"}), + Hidden(StrOptions({"pa1", "pa2"})), + ], + "epsilon": [Interval(Real, 0, None, closed="left")], + "eta0": [Interval(Real, 0, None, closed="left")], + } + + def __init__( + self, + loss="squared_error", + *, + penalty="l2", + alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + epsilon=DEFAULT_EPSILON, + random_state=None, + learning_rate="invscaling", + eta0=0.01, + power_t=0.25, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=False, + average=False, + ): + super().__init__( + loss=loss, + penalty=penalty, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=epsilon, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, + warm_start=warm_start, + average=average, + ) + + +class SGDOneClassSVM(OutlierMixin, BaseSGD): + """Solves linear One-Class SVM using Stochastic Gradient Descent. + + This implementation is meant to be used with a kernel approximation + technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results + similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by + default. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + nu : float, default=0.5 + The nu parameter of the One Class SVM: an upper bound on the + fraction of training errors and a lower bound of the fraction of + support vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. Defaults to True. + + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + `partial_fit`. Defaults to 1000. 
+ Values must be in the range `[1, inf)`. + + tol : float or None, default=1e-3 + The stopping criterion. If it is not None, the iterations will stop + when (loss > previous_loss - tol). Defaults to 1e-3. + Values must be in the range `[0.0, inf)`. + + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + Defaults to True. + + verbose : int, default=0 + The verbosity level. + + random_state : int, RandomState instance or None, default=None + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + + learning_rate : {'constant', 'optimal', 'invscaling', 'adaptive'}, default='optimal' + The learning rate schedule to use with `fit`. (If using `partial_fit`, + learning rate must be controlled directly). + + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + + eta0 : float, default=0.0 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + Values must be in the range `[0.0, inf)`. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + Values must be in the range `(-inf, inf)`. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. + + average : bool or int, default=False + When set to True, computes the averaged SGD weights and stores the + result in the ``coef_`` attribute. If set to an int greater than 1, + averaging will begin once the total number of samples seen reaches + average. So ``average=10`` will begin averaging after seeing 10 + samples. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features. + + offset_ : ndarray of shape (1,) + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - offset. + + n_iter_ : int + The actual number of iterations to reach the stopping criterion. + + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples + 1)``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + sklearn.svm.OneClassSVM : Unsupervised Outlier Detection. + + Notes + ----- + This estimator has a linear complexity in the number of training samples + and is thus better suited than the `sklearn.svm.OneClassSVM` + implementation for datasets with a large number of training samples (say + > 10,000). + + Examples + -------- + >>> import numpy as np + >>> from sklearn import linear_model + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> clf = linear_model.SGDOneClassSVM(random_state=42) + >>> clf.fit(X) + SGDOneClassSVM(random_state=42) + + >>> print(clf.predict([[4, 4]])) + [1] + """ + + loss_functions = {"hinge": (Hinge, 1.0)} + + _parameter_constraints: dict = { + **BaseSGD._parameter_constraints, + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + "learning_rate": [ + StrOptions({"constant", "optimal", "invscaling", "adaptive"}), + Hidden(StrOptions({"pa1", "pa2"})), + ], + "eta0": [Interval(Real, 0, None, closed="left")], + "power_t": [Interval(Real, None, None, closed="neither")], + } + + def __init__( + self, + nu=0.5, + fit_intercept=True, + max_iter=1000, + tol=1e-3, + shuffle=True, + verbose=0, + random_state=None, + learning_rate="optimal", + eta0=0.0, + power_t=0.5, + warm_start=False, + average=False, + ): + self.nu = nu + super().__init__( + loss="hinge", + penalty="l2", + C=1.0, + l1_ratio=0, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + shuffle=shuffle, + verbose=verbose, + epsilon=DEFAULT_EPSILON, + random_state=random_state, + learning_rate=learning_rate, + eta0=eta0, + power_t=power_t, + early_stopping=False, + validation_fraction=0.1, + n_iter_no_change=5, + warm_start=warm_start, + average=average, + ) + + def _fit_one_class(self, X, alpha, C, sample_weight, learning_rate, max_iter): + """Uses SGD implementation with X and y=np.ones(n_samples).""" + + # The One-Class SVM uses the SGD implementation with + # y=np.ones(n_samples). + n_samples = X.shape[0] + y = np.ones(n_samples, dtype=X.dtype, order="C") + + dataset, offset_decay = make_dataset(X, y, sample_weight) + + penalty_type = self._get_penalty_type(self.penalty) + learning_rate_type = self._get_learning_rate_type(learning_rate) + + # early stopping is set to False for the One-Class SVM. thus + # validation_mask and validation_score_cb will be set to values + # associated to early_stopping=False in _make_validation_split and + # _make_validation_score_cb respectively. + validation_mask = self._make_validation_split(y, sample_mask=sample_weight > 0) + validation_score_cb = self._make_validation_score_cb( + validation_mask, X, y, sample_weight + ) + + random_state = check_random_state(self.random_state) + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(0, np.iinfo(np.int32).max) + + tol = self.tol if self.tol is not None else -np.inf + + one_class = 1 + # There are no class weights for the One-Class SVM and they are + # therefore set to 1. 
+ pos_weight = 1 + neg_weight = 1 + + if self.average: + coef = self._standard_coef + intercept = self._standard_intercept + average_coef = self._average_coef + average_intercept = self._average_intercept + else: + coef = self.coef_ + intercept = 1 - self.offset_ + average_coef = None # Not used + average_intercept = [0] # Not used + + _plain_sgd = _get_plain_sgd_function(input_dtype=coef.dtype) + coef, intercept, average_coef, average_intercept, self.n_iter_ = _plain_sgd( + coef, + intercept[0], + average_coef, + average_intercept[0], + self._loss_function_, + penalty_type, + alpha, + C, + self.l1_ratio, + dataset, + validation_mask, + self.early_stopping, + validation_score_cb, + int(self.n_iter_no_change), + max_iter, + tol, + int(self.fit_intercept), + int(self.verbose), + int(self.shuffle), + seed, + neg_weight, + pos_weight, + learning_rate_type, + self.eta0, + self.power_t, + one_class, + self.t_, + offset_decay, + self.average, + ) + + self.t_ += self.n_iter_ * n_samples + + if self.average > 0: + self._average_intercept = np.atleast_1d(average_intercept) + self._standard_intercept = np.atleast_1d(intercept) + + if self.average <= self.t_ - 1.0: + # made enough updates for averaging to be taken into account + self.coef_ = average_coef + self.offset_ = 1 - np.atleast_1d(average_intercept) + else: + self.coef_ = coef + self.offset_ = 1 - np.atleast_1d(intercept) + + else: + self.offset_ = 1 - np.atleast_1d(intercept) + + def _partial_fit( + self, + X, + alpha, + C, + loss, + learning_rate, + max_iter, + sample_weight, + coef_init, + offset_init, + ): + first_call = getattr(self, "coef_", None) is None + X = validate_data( + self, + X, + None, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + accept_large_sparse=False, + reset=first_call, + ) + + n_features = X.shape[1] + + # Allocate datastructures from input arguments + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # We use intercept = 1 - offset where intercept is the intercept of + # the SGD implementation and offset is the offset of the One-Class SVM + # optimization problem. + if getattr(self, "coef_", None) is None or coef_init is not None: + self._allocate_parameter_mem( + n_classes=1, + n_features=n_features, + input_dtype=X.dtype, + coef_init=coef_init, + intercept_init=offset_init, + one_class=1, + ) + elif n_features != self.coef_.shape[-1]: + raise ValueError( + "Number of features %d does not match previous data %d." + % (n_features, self.coef_.shape[-1]) + ) + + if self.average and getattr(self, "_average_coef", None) is None: + self._average_coef = np.zeros(n_features, dtype=X.dtype, order="C") + self._average_intercept = np.zeros(1, dtype=X.dtype, order="C") + + self._loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + self._fit_one_class( + X, + alpha=alpha, + C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter, + ) + + return self + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Fit linear One-Class SVM with Stochastic Gradient Descent. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like, shape (n_samples,), optional + Weights applied to individual samples. + If not provided, uniform weights are assumed. 
+
+        Returns
+        -------
+        self : object
+            Returns a fitted instance of self.
+        """
+        if not hasattr(self, "coef_"):
+            self._more_validate_params(for_partial_fit=True)
+
+        alpha = self.nu / 2
+        return self._partial_fit(
+            X,
+            alpha,
+            C=1.0,
+            loss=self.loss,
+            learning_rate=self.learning_rate,
+            max_iter=1,
+            sample_weight=sample_weight,
+            coef_init=None,
+            offset_init=None,
+        )
+
+    def _fit(
+        self,
+        X,
+        alpha,
+        C,
+        loss,
+        learning_rate,
+        coef_init=None,
+        offset_init=None,
+        sample_weight=None,
+    ):
+        if self.warm_start and hasattr(self, "coef_"):
+            if coef_init is None:
+                coef_init = self.coef_
+            if offset_init is None:
+                offset_init = self.offset_
+        else:
+            self.coef_ = None
+            self.offset_ = None
+
+        # Clear iteration count for multiple call to fit.
+        self.t_ = 1.0
+
+        self._partial_fit(
+            X,
+            alpha,
+            C,
+            loss,
+            learning_rate,
+            self.max_iter,
+            sample_weight,
+            coef_init,
+            offset_init,
+        )
+
+        if (
+            self.tol is not None
+            and self.tol > -np.inf
+            and self.n_iter_ == self.max_iter
+        ):
+            warnings.warn(
+                (
+                    "Maximum number of iteration reached before "
+                    "convergence. Consider increasing max_iter to "
+                    "improve the fit."
+                ),
+                ConvergenceWarning,
+            )
+
+        return self
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None):
+        """Fit linear One-Class SVM with Stochastic Gradient Descent.
+
+        This solves an equivalent optimization problem of the
+        One-Class SVM primal optimization problem and returns a weight vector
+        w and an offset rho such that the decision function is given by
+        <w, x> - rho.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Training data.
+        y : Ignored
+            Not used, present for API consistency by convention.
+
+        coef_init : array, shape (n_classes, n_features)
+            The initial coefficients to warm-start the optimization.
+
+        offset_init : array, shape (n_classes,)
+            The initial offset to warm-start the optimization.
+
+        sample_weight : array-like, shape (n_samples,), optional
+            Weights applied to individual samples.
+            If not provided, uniform weights are assumed. These weights will
+            be multiplied with class_weight (passed through the
+            constructor) if class_weight is specified.
+
+        Returns
+        -------
+        self : object
+            Returns a fitted instance of self.
+        """
+        self._more_validate_params()
+
+        alpha = self.nu / 2
+        self._fit(
+            X,
+            alpha=alpha,
+            C=1.0,
+            loss=self.loss,
+            learning_rate=self.learning_rate,
+            coef_init=coef_init,
+            offset_init=offset_init,
+            sample_weight=sample_weight,
+        )
+
+        return self
+
+    def decision_function(self, X):
+        """Signed distance to the separating hyperplane.
+
+        Signed distance is positive for an inlier and negative for an
+        outlier.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Testing data.
+
+        Returns
+        -------
+        dec : array-like, shape (n_samples,)
+            Decision function values of the samples.
+        """
+
+        check_is_fitted(self, "coef_")
+
+        X = validate_data(self, X, accept_sparse="csr", reset=False)
+        decisions = safe_sparse_dot(X, self.coef_.T, dense_output=True) - self.offset_
+
+        return decisions.ravel()
+
+    def score_samples(self, X):
+        """Raw scoring function of the samples.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            Testing data.
+
+        Returns
+        -------
+        score_samples : array-like, shape (n_samples,)
+            Unshifted scoring function values of the samples.
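A small illustrative check (toy data) of the relations stated in these docstrings: score_samples is the decision function shifted back by offset_, and predict is the sign of the decision function.

import numpy as np
from sklearn.linear_model import SGDOneClassSVM

X = np.array([[-1.0, -1.0], [-2.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
clf = SGDOneClassSVM(random_state=42).fit(X)

# decision_function = score_samples - offset_, so score_samples = decision_function + offset_
assert np.allclose(clf.score_samples(X), clf.decision_function(X) + clf.offset_)
# predict() returns +1 for inliers (non-negative decision value) and -1 for outliers.
assert np.array_equal(clf.predict(X), np.where(clf.decision_function(X) >= 0, 1, -1))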
+ """ + score_samples = self.decision_function(X) + self.offset_ + return score_samples + + def predict(self, X): + """Return labels (1 inlier, -1 outlier) of the samples. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Testing data. + + Returns + ------- + y : array, shape (n_samples,) + Labels of the samples. + """ + y = (self.decision_function(X) >= 0).astype(np.int32) + y[y == 0] = -1 # for consistency with outlier detectors + return y + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/_theil_sen.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_theil_sen.py new file mode 100644 index 0000000000000000000000000000000000000000..4b25145a8ca55efe3f99e80f24a8da6e4b1a9f50 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/_theil_sen.py @@ -0,0 +1,467 @@ +""" +A Theil-Sen Estimator for Multiple Linear Regression Model +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from itertools import combinations +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy import linalg +from scipy.linalg.lapack import get_lapack_funcs +from scipy.special import binom + +from ..base import RegressorMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..utils import check_random_state +from ..utils._param_validation import Hidden, Interval, StrOptions +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data +from ._base import LinearModel + +_EPSILON = np.finfo(np.double).eps + + +def _modified_weiszfeld_step(X, x_old): + """Modified Weiszfeld step. + + This function defines one iteration step in order to approximate the + spatial median (L1 median). It is a form of an iteratively re-weighted + least squares method. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + x_old : ndarray of shape = (n_features,) + Current start vector. + + Returns + ------- + x_new : ndarray of shape (n_features,) + New iteration step. + + References + ---------- + - On Computation of Spatial Median for Robust Data Mining, 2005 + T. Kärkkäinen and S. Äyrämö + http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf + """ + diff = X - x_old + diff_norm = np.sqrt(np.sum(diff**2, axis=1)) + mask = diff_norm >= _EPSILON + # x_old equals one of our samples + is_x_old_in_X = int(mask.sum() < X.shape[0]) + + diff = diff[mask] + diff_norm = diff_norm[mask][:, np.newaxis] + quotient_norm = linalg.norm(np.sum(diff / diff_norm, axis=0)) + + if quotient_norm > _EPSILON: # to avoid division by zero + new_direction = np.sum(X[mask, :] / diff_norm, axis=0) / np.sum( + 1 / diff_norm, axis=0 + ) + else: + new_direction = 1.0 + quotient_norm = 1.0 + + return ( + max(0.0, 1.0 - is_x_old_in_X / quotient_norm) * new_direction + + min(1.0, is_x_old_in_X / quotient_norm) * x_old + ) + + +def _spatial_median(X, max_iter=300, tol=1.0e-3): + """Spatial median (L1 median). + + The spatial median is member of a class of so-called M-estimators which + are defined by an optimization problem. Given a number of p points in an + n-dimensional space, the point x minimizing the sum of all distances to the + p other points is called spatial median. 
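As a conceptual sketch of the spatial median (a plain Weiszfeld iteration on toy data, not the modified step implemented in this module):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 2) + [3.0, -1.0]

x = X.mean(axis=0)                       # start from the coordinate-wise mean
for _ in range(100):
    d = np.maximum(np.linalg.norm(X - x, axis=1), 1e-12)   # guard against division by zero
    x_new = (X / d[:, None]).sum(axis=0) / (1.0 / d).sum()
    if np.linalg.norm(x_new - x) < 1e-8:
        break
    x = x_new
print(x)  # approximately [3, -1]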
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + max_iter : int, default=300 + Maximum number of iterations. + + tol : float, default=1.e-3 + Stop the algorithm if spatial_median has converged. + + Returns + ------- + spatial_median : ndarray of shape = (n_features,) + Spatial median. + + n_iter : int + Number of iterations needed. + + References + ---------- + - On Computation of Spatial Median for Robust Data Mining, 2005 + T. Kärkkäinen and S. Äyrämö + http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf + """ + if X.shape[1] == 1: + return 1, np.median(X.ravel(), keepdims=True) + + tol **= 2 # We are computing the tol on the squared norm + spatial_median_old = np.mean(X, axis=0) + + for n_iter in range(max_iter): + spatial_median = _modified_weiszfeld_step(X, spatial_median_old) + if np.sum((spatial_median_old - spatial_median) ** 2) < tol: + break + else: + spatial_median_old = spatial_median + else: + warnings.warn( + "Maximum number of iterations {max_iter} reached in " + "spatial median for TheilSen regressor." + "".format(max_iter=max_iter), + ConvergenceWarning, + ) + return n_iter, spatial_median + + +def _breakdown_point(n_samples, n_subsamples): + """Approximation of the breakdown point. + + Parameters + ---------- + n_samples : int + Number of samples. + + n_subsamples : int + Number of subsamples to consider. + + Returns + ------- + breakdown_point : float + Approximation of breakdown point. + """ + return ( + 1 + - ( + 0.5 ** (1 / n_subsamples) * (n_samples - n_subsamples + 1) + + n_subsamples + - 1 + ) + / n_samples + ) + + +def _lstsq(X, y, indices, fit_intercept): + """Least Squares Estimator for TheilSenRegressor class. + + This function calculates the least squares method on a subset of rows of X + and y defined by the indices array. Optionally, an intercept column is + added if intercept is set to true. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Design matrix, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : ndarray of shape (n_samples,) + Target vector, where `n_samples` is the number of samples. + + indices : ndarray of shape (n_subpopulation, n_subsamples) + Indices of all subsamples with respect to the chosen subpopulation. + + fit_intercept : bool + Fit intercept or not. + + Returns + ------- + weights : ndarray of shape (n_subpopulation, n_features + intercept) + Solution matrix of n_subpopulation solved least square problems. + """ + fit_intercept = int(fit_intercept) + n_features = X.shape[1] + fit_intercept + n_subsamples = indices.shape[1] + weights = np.empty((indices.shape[0], n_features)) + X_subpopulation = np.ones((n_subsamples, n_features)) + # gelss need to pad y_subpopulation to be of the max dim of X_subpopulation + y_subpopulation = np.zeros((max(n_subsamples, n_features))) + (lstsq,) = get_lapack_funcs(("gelss",), (X_subpopulation, y_subpopulation)) + + for index, subset in enumerate(indices): + X_subpopulation[:, fit_intercept:] = X[subset, :] + y_subpopulation[:n_subsamples] = y[subset] + weights[index] = lstsq(X_subpopulation, y_subpopulation)[1][:n_features] + + return weights + + +class TheilSenRegressor(RegressorMixin, LinearModel): + """Theil-Sen Estimator: robust multivariate regression model. + + The algorithm calculates least square solutions on subsets with size + n_subsamples of the samples in X. 
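A hedged illustration of the robustness this buys (toy data; exact numbers depend on the noise): ordinary least squares is pulled towards corrupted targets, while Theil-Sen typically stays close to the true slope as long as the fraction of outliers is below its breakdown point.

import numpy as np
from sklearn.linear_model import LinearRegression, TheilSenRegressor

rng = np.random.RandomState(0)
X = rng.randn(100, 1)
y = 3.0 * X.ravel() + 0.1 * rng.randn(100)
outliers = np.argsort(X.ravel())[-10:]   # corrupt the 10 samples with largest x
y[outliers] += 50.0

print(LinearRegression().fit(X, y).coef_)                 # slope pulled well above 3
print(TheilSenRegressor(random_state=0).fit(X, y).coef_)  # typically close to 3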
Any value of n_subsamples between the + number of features and samples leads to an estimator with a compromise + between robustness and efficiency. Since the number of least square + solutions is "n_samples choose n_subsamples", it can be extremely large + and can therefore be limited with max_subpopulation. If this limit is + reached, the subsets are chosen randomly. In a final step, the spatial + median (or L1 median) is calculated of all least square solutions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + fit_intercept : bool, default=True + Whether to calculate the intercept for this model. If set + to false, no intercept will be used in calculations. + + copy_X : bool, default=True + If True, X will be copied; else, it may be overwritten. + + .. deprecated:: 1.6 + `copy_X` was deprecated in 1.6 and will be removed in 1.8. + It has no effect as a copy is always made. + + max_subpopulation : int, default=1e4 + Instead of computing with a set of cardinality 'n choose k', where n is + the number of samples and k is the number of subsamples (at least + number of features), consider only a stochastic subpopulation of a + given maximal size if 'n choose k' is larger than max_subpopulation. + For other than small problem sizes this parameter will determine + memory usage and runtime if n_subsamples is not changed. Note that the + data type should be int but floats such as 1e4 can be accepted too. + + n_subsamples : int, default=None + Number of samples to calculate the parameters. This is at least the + number of features (plus 1 if fit_intercept=True) and the number of + samples as a maximum. A lower number leads to a higher breakdown + point and a low efficiency while a high number leads to a low + breakdown point and a high efficiency. If None, take the + minimum number of subsamples leading to maximal robustness. + If n_subsamples is set to n_samples, Theil-Sen is identical to least + squares. + + max_iter : int, default=300 + Maximum number of iterations for the calculation of spatial median. + + tol : float, default=1e-3 + Tolerance when calculating spatial median. + + random_state : int, RandomState instance or None, default=None + A random number generator instance to define the state of the random + permutations generator. Pass an int for reproducible output across + multiple function calls. + See :term:`Glossary `. + + n_jobs : int, default=None + Number of CPUs to use during the cross validation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : bool, default=False + Verbose mode when fitting the model. + + Attributes + ---------- + coef_ : ndarray of shape (n_features,) + Coefficients of the regression model (median of distribution). + + intercept_ : float + Estimated intercept of regression model. + + breakdown_ : float + Approximated breakdown point. + + n_iter_ : int + Number of iterations needed for the spatial median. + + n_subpopulation_ : int + Number of combinations taken into account from 'n choose k', where n is + the number of samples and k is the number of subsamples. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + HuberRegressor : Linear regression model that is robust to outliers. + RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm. + SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD. + + References + ---------- + - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009 + Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang + http://home.olemiss.edu/~xdang/papers/MTSE.pdf + + Examples + -------- + >>> from sklearn.linear_model import TheilSenRegressor + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression( + ... n_samples=200, n_features=2, noise=4.0, random_state=0) + >>> reg = TheilSenRegressor(random_state=0).fit(X, y) + >>> reg.score(X, y) + 0.9884 + >>> reg.predict(X[:1,]) + array([-31.5871]) + """ + + _parameter_constraints: dict = { + "fit_intercept": ["boolean"], + "copy_X": ["boolean", Hidden(StrOptions({"deprecated"}))], + # target_type should be Integral but can accept Real for backward compatibility + "max_subpopulation": [Interval(Real, 1, None, closed="left")], + "n_subsamples": [None, Integral], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "random_state": ["random_state"], + "n_jobs": [None, Integral], + "verbose": ["verbose"], + } + + def __init__( + self, + *, + fit_intercept=True, + copy_X="deprecated", + max_subpopulation=1e4, + n_subsamples=None, + max_iter=300, + tol=1.0e-3, + random_state=None, + n_jobs=None, + verbose=False, + ): + self.fit_intercept = fit_intercept + self.copy_X = copy_X + self.max_subpopulation = max_subpopulation + self.n_subsamples = n_subsamples + self.max_iter = max_iter + self.tol = tol + self.random_state = random_state + self.n_jobs = n_jobs + self.verbose = verbose + + def _check_subparams(self, n_samples, n_features): + n_subsamples = self.n_subsamples + + if self.fit_intercept: + n_dim = n_features + 1 + else: + n_dim = n_features + + if n_subsamples is not None: + if n_subsamples > n_samples: + raise ValueError( + "Invalid parameter since n_subsamples > " + "n_samples ({0} > {1}).".format(n_subsamples, n_samples) + ) + if n_samples >= n_features: + if n_dim > n_subsamples: + plus_1 = "+1" if self.fit_intercept else "" + raise ValueError( + "Invalid parameter since n_features{0} " + "> n_subsamples ({1} > {2})." + "".format(plus_1, n_dim, n_subsamples) + ) + else: # if n_samples < n_features + if n_subsamples != n_samples: + raise ValueError( + "Invalid parameter since n_subsamples != " + "n_samples ({0} != {1}) while n_samples " + "< n_features.".format(n_subsamples, n_samples) + ) + else: + n_subsamples = min(n_dim, n_samples) + + all_combinations = max(1, np.rint(binom(n_samples, n_subsamples))) + n_subpopulation = int(min(self.max_subpopulation, all_combinations)) + + return n_subsamples, n_subpopulation + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit linear model. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted `TheilSenRegressor` estimator. + """ + if self.copy_X != "deprecated": + warnings.warn( + "`copy_X` was deprecated in 1.6 and will be removed in 1.8 since it " + "has no effect internally. 
Simply leave this parameter to its default " + "value to avoid this warning.", + FutureWarning, + ) + + random_state = check_random_state(self.random_state) + X, y = validate_data(self, X, y, y_numeric=True) + n_samples, n_features = X.shape + n_subsamples, self.n_subpopulation_ = self._check_subparams( + n_samples, n_features + ) + self.breakdown_ = _breakdown_point(n_samples, n_subsamples) + + if self.verbose: + print("Breakdown point: {0}".format(self.breakdown_)) + print("Number of samples: {0}".format(n_samples)) + tol_outliers = int(self.breakdown_ * n_samples) + print("Tolerable outliers: {0}".format(tol_outliers)) + print("Number of subpopulations: {0}".format(self.n_subpopulation_)) + + # Determine indices of subpopulation + if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation: + indices = list(combinations(range(n_samples), n_subsamples)) + else: + indices = [ + random_state.choice(n_samples, size=n_subsamples, replace=False) + for _ in range(self.n_subpopulation_) + ] + + n_jobs = effective_n_jobs(self.n_jobs) + index_list = np.array_split(indices, n_jobs) + weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_lstsq)(X, y, index_list[job], self.fit_intercept) + for job in range(n_jobs) + ) + weights = np.vstack(weights) + self.n_iter_, coefs = _spatial_median( + weights, max_iter=self.max_iter, tol=self.tol + ) + + if self.fit_intercept: + self.intercept_ = coefs[0] + self.coef_ = coefs[1:] + else: + self.intercept_ = 0.0 + self.coef_ = coefs + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/meson.build b/.venv/lib/python3.12/site-packages/sklearn/linear_model/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..6d8405c7933891dcdbbc340d47108cde68089d1c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/meson.build @@ -0,0 +1,32 @@ +# .pyx is generated, so this is needed to make Cython compilation work +linear_model_cython_tree = [ + fs.copyfile('__init__.py'), +] + +py.extension_module( + '_cd_fast', + [cython_gen.process('_cd_fast.pyx'), utils_cython_tree], + subdir: 'sklearn/linear_model', + install: true +) + +name_list = ['_sgd_fast', '_sag_fast'] + +foreach name: name_list + pyx = custom_target( + name + '_pyx', + output: name + '.pyx', + input: name + '.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. 
For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [linear_model_cython_tree, utils_cython_tree, _loss_cython_tree], + ) + py.extension_module( + name, + cython_gen.process(pyx), + subdir: 'sklearn/linear_model', + install: true +) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..2483a26644cbbe30388703efc4f687bb01ba5f62 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_common.py @@ -0,0 +1,234 @@ +# SPDX-License-Identifier: BSD-3-Clause + +import inspect + +import numpy as np +import pytest + +from sklearn.base import is_classifier +from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression +from sklearn.linear_model import ( + ARDRegression, + BayesianRidge, + ElasticNet, + ElasticNetCV, + GammaRegressor, + HuberRegressor, + Lars, + LarsCV, + Lasso, + LassoCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + LinearRegression, + LogisticRegression, + LogisticRegressionCV, + MultiTaskElasticNet, + MultiTaskElasticNetCV, + MultiTaskLasso, + MultiTaskLassoCV, + OrthogonalMatchingPursuit, + OrthogonalMatchingPursuitCV, + PassiveAggressiveClassifier, + PassiveAggressiveRegressor, + Perceptron, + PoissonRegressor, + Ridge, + RidgeClassifier, + RidgeClassifierCV, + RidgeCV, + SGDClassifier, + SGDRegressor, + TheilSenRegressor, + TweedieRegressor, +) +from sklearn.preprocessing import MinMaxScaler +from sklearn.svm import LinearSVC, LinearSVR +from sklearn.utils._testing import set_random_state + + +# Note: GammaRegressor() and TweedieRegressor(power != 1) have a non-canonical link. +@pytest.mark.parametrize( + "model", + [ + ARDRegression(), + BayesianRidge(), + ElasticNet(), + ElasticNetCV(), + Lars(), + LarsCV(), + Lasso(), + LassoCV(), + LassoLarsCV(), + LassoLarsIC(), + LinearRegression(), + # TODO: FIx SAGA which fails badly with sample_weights. + # This is a known limitation, see: + # https://github.com/scikit-learn/scikit-learn/issues/21305 + pytest.param( + LogisticRegression( + penalty="elasticnet", solver="saga", l1_ratio=0.5, tol=1e-15 + ), + marks=pytest.mark.xfail(reason="Missing importance sampling scheme"), + ), + LogisticRegressionCV(tol=1e-6), + MultiTaskElasticNet(), + MultiTaskElasticNetCV(), + MultiTaskLasso(), + MultiTaskLassoCV(), + OrthogonalMatchingPursuit(), + OrthogonalMatchingPursuitCV(), + PoissonRegressor(), + Ridge(), + RidgeCV(), + pytest.param( + SGDRegressor(tol=1e-15), + marks=pytest.mark.xfail(reason="Insufficient precision."), + ), + SGDRegressor(penalty="elasticnet", max_iter=10_000), + TweedieRegressor(power=0), # same as Ridge + ], + ids=lambda x: x.__class__.__name__, +) +@pytest.mark.parametrize("with_sample_weight", [False, True]) +def test_balance_property(model, with_sample_weight, global_random_seed): + # Test that sum(y_predicted) == sum(y_observed) on the training set. + # This must hold for all linear models with deviance of an exponential disperson + # family as loss and the corresponding canonical link if fit_intercept=True. + # Examples: + # - squared error and identity link (most linear models) + # - Poisson deviance with log link + # - log loss with logit link + # This is known as balance property or unconditional calibration/unbiasedness. + # For reference, see Corollary 3.18, 3.20 and Chapter 5.1.5 of + # M.V. Wuthrich and M. 
Merz, "Statistical Foundations of Actuarial Learning and its + # Applications" (June 3, 2022). http://doi.org/10.2139/ssrn.3822407 + + if ( + with_sample_weight + and "sample_weight" not in inspect.signature(model.fit).parameters.keys() + ): + pytest.skip("Estimator does not support sample_weight.") + + rel = 2e-4 # test precision + if isinstance(model, SGDRegressor): + rel = 1e-1 + elif hasattr(model, "solver") and model.solver == "saga": + rel = 1e-2 + + rng = np.random.RandomState(global_random_seed) + n_train, n_features, n_targets = 100, 10, None + if isinstance( + model, + (MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLasso, MultiTaskLassoCV), + ): + n_targets = 3 + X = make_low_rank_matrix(n_samples=n_train, n_features=n_features, random_state=rng) + if n_targets: + coef = ( + rng.uniform(low=-2, high=2, size=(n_features, n_targets)) + / np.max(X, axis=0)[:, None] + ) + else: + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + + expectation = np.exp(X @ coef + 0.5) + y = rng.poisson(lam=expectation) + 1 # strict positive, i.e. y > 0 + if is_classifier(model): + y = (y > expectation + 1).astype(np.float64) + + if with_sample_weight: + sw = rng.uniform(low=1, high=10, size=y.shape[0]) + else: + sw = None + + model.set_params(fit_intercept=True) # to be sure + if with_sample_weight: + model.fit(X, y, sample_weight=sw) + else: + model.fit(X, y) + # Assert balance property. + if is_classifier(model): + assert np.average(model.predict_proba(X)[:, 1], weights=sw) == pytest.approx( + np.average(y, weights=sw), rel=rel + ) + else: + assert np.average(model.predict(X), weights=sw, axis=0) == pytest.approx( + np.average(y, weights=sw, axis=0), rel=rel + ) + + +@pytest.mark.filterwarnings("ignore:The default of 'normalize'") +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize( + "Regressor", + [ + ARDRegression, + BayesianRidge, + ElasticNet, + ElasticNetCV, + GammaRegressor, + HuberRegressor, + Lars, + LarsCV, + Lasso, + LassoCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + LinearSVR, + LinearRegression, + OrthogonalMatchingPursuit, + OrthogonalMatchingPursuitCV, + PassiveAggressiveRegressor, + PoissonRegressor, + Ridge, + RidgeCV, + SGDRegressor, + TheilSenRegressor, + TweedieRegressor, + ], +) +@pytest.mark.parametrize("ndim", [1, 2]) +def test_linear_model_regressor_coef_shape(Regressor, ndim): + """Check the consistency of linear models `coef` shape.""" + if Regressor is LinearRegression: + pytest.xfail("LinearRegression does not follow `coef_` shape contract!") + + X, y = make_regression(random_state=0, n_samples=200, n_features=20) + y = MinMaxScaler().fit_transform(y.reshape(-1, 1))[:, 0] + 1 + y = y[:, np.newaxis] if ndim == 2 else y + + regressor = Regressor() + set_random_state(regressor) + regressor.fit(X, y) + assert regressor.coef_.shape == (X.shape[1],) + + +@pytest.mark.parametrize( + "Classifier", + [ + LinearSVC, + LogisticRegression, + LogisticRegressionCV, + PassiveAggressiveClassifier, + Perceptron, + RidgeClassifier, + RidgeClassifierCV, + SGDClassifier, + ], +) +@pytest.mark.parametrize("n_classes", [2, 3]) +def test_linear_model_classifier_coef_shape(Classifier, n_classes): + if Classifier in (RidgeClassifier, RidgeClassifierCV): + pytest.xfail(f"{Classifier} does not follow `coef_` shape contract!") + + X, y = make_classification(n_informative=10, n_classes=n_classes, random_state=0) + n_features = X.shape[1] + + classifier = Classifier() + set_random_state(classifier) + classifier.fit(X, y) + 
expected_shape = (1, n_features) if n_classes == 2 else (n_classes, n_features) + assert classifier.coef_.shape == expected_shape diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_least_angle.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_least_angle.py new file mode 100644 index 0000000000000000000000000000000000000000..9b4a39750e03a495afd512a219b036433e31070c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_least_angle.py @@ -0,0 +1,869 @@ +import warnings + +import numpy as np +import pytest +from scipy import linalg + +from sklearn import datasets, linear_model +from sklearn.base import clone +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import ( + Lars, + LarsCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + lars_path, +) +from sklearn.linear_model._least_angle import _lars_path_residues +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import ( + TempMemmap, + assert_allclose, + assert_array_almost_equal, + ignore_warnings, +) + +# TODO: use another dataset that has multiple drops +diabetes = datasets.load_diabetes() +X, y = diabetes.data, diabetes.target +G = np.dot(X.T, X) +Xy = np.dot(X.T, y) +n_samples = y.size + + +def test_simple(): + # Principle of Lars is to keep covariances tied and decreasing + + # also test verbose output + import sys + from io import StringIO + + old_stdout = sys.stdout + try: + sys.stdout = StringIO() + + _, _, coef_path_ = linear_model.lars_path(X, y, method="lar", verbose=10) + + sys.stdout = old_stdout + + for i, coef_ in enumerate(coef_path_.T): + res = y - np.dot(X, coef_) + cov = np.dot(X.T, res) + C = np.max(abs(cov)) + eps = 1e-3 + ocur = len(cov[C - eps < abs(cov)]) + if i < X.shape[1]: + assert ocur == i + 1 + else: + # no more than max_pred variables can go into the active set + assert ocur == X.shape[1] + finally: + sys.stdout = old_stdout + + +def test_simple_precomputed(): + # The same, with precomputed Gram matrix + + _, _, coef_path_ = linear_model.lars_path(X, y, Gram=G, method="lar") + + for i, coef_ in enumerate(coef_path_.T): + res = y - np.dot(X, coef_) + cov = np.dot(X.T, res) + C = np.max(abs(cov)) + eps = 1e-3 + ocur = len(cov[C - eps < abs(cov)]) + if i < X.shape[1]: + assert ocur == i + 1 + else: + # no more than max_pred variables can go into the active set + assert ocur == X.shape[1] + + +def _assert_same_lars_path_result(output1, output2): + assert len(output1) == len(output2) + for o1, o2 in zip(output1, output2): + assert_allclose(o1, o2) + + +@pytest.mark.parametrize("method", ["lar", "lasso"]) +@pytest.mark.parametrize("return_path", [True, False]) +def test_lars_path_gram_equivalent(method, return_path): + _assert_same_lars_path_result( + linear_model.lars_path_gram( + Xy=Xy, Gram=G, n_samples=n_samples, method=method, return_path=return_path + ), + linear_model.lars_path(X, y, Gram=G, method=method, return_path=return_path), + ) + + +def test_x_none_gram_none_raises_value_error(): + # Test that lars_path with no X and Gram raises exception + Xy = np.dot(X.T, y) + with pytest.raises(ValueError, match="X and Gram cannot both be unspecified"): + linear_model.lars_path(None, y, Gram=None, Xy=Xy) + + +def test_all_precomputed(): + # Test that lars_path with precomputed Gram and Xy gives the right answer + G = np.dot(X.T, X) + Xy = np.dot(X.T, y) + for method in "lar", "lasso": + 
output = linear_model.lars_path(X, y, method=method) + output_pre = linear_model.lars_path(X, y, Gram=G, Xy=Xy, method=method) + for expected, got in zip(output, output_pre): + assert_array_almost_equal(expected, got) + + +# TODO: remove warning filter when numpy min version >= 2.0.0 +@pytest.mark.filterwarnings("ignore: `rcond` parameter will change") +def test_lars_lstsq(): + # Test that Lars gives least square solution at the end + # of the path + X1 = 3 * X # use un-normalized dataset + clf = linear_model.LassoLars(alpha=0.0) + clf.fit(X1, y) + coef_lstsq = np.linalg.lstsq(X1, y)[0] + assert_array_almost_equal(clf.coef_, coef_lstsq) + + +# TODO: remove warning filter when numpy min version >= 2.0.0 +@pytest.mark.filterwarnings("ignore: `rcond` parameter will change") +def test_lasso_gives_lstsq_solution(): + # Test that Lars Lasso gives least square solution at the end + # of the path + _, _, coef_path_ = linear_model.lars_path(X, y, method="lasso") + coef_lstsq = np.linalg.lstsq(X, y)[0] + assert_array_almost_equal(coef_lstsq, coef_path_[:, -1]) + + +def test_collinearity(): + # Check that lars_path is robust to collinearity in input + X = np.array([[3.0, 3.0, 1.0], [2.0, 2.0, 0.0], [1.0, 1.0, 0]]) + y = np.array([1.0, 0.0, 0]) + rng = np.random.RandomState(0) + + f = ignore_warnings + _, _, coef_path_ = f(linear_model.lars_path)(X, y, alpha_min=0.01) + assert not np.isnan(coef_path_).any() + residual = np.dot(X, coef_path_[:, -1]) - y + assert (residual**2).sum() < 1.0 # just make sure it's bounded + + n_samples = 10 + X = rng.rand(n_samples, 5) + y = np.zeros(n_samples) + _, _, coef_path_ = linear_model.lars_path( + X, + y, + Gram="auto", + copy_X=False, + copy_Gram=False, + alpha_min=0.0, + method="lasso", + verbose=0, + max_iter=500, + ) + assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_)) + + +def test_no_path(): + # Test that the ``return_path=False`` option returns the correct output + alphas_, _, coef_path_ = linear_model.lars_path(X, y, method="lar") + alpha_, _, coef = linear_model.lars_path(X, y, method="lar", return_path=False) + + assert_array_almost_equal(coef, coef_path_[:, -1]) + assert alpha_ == alphas_[-1] + + +def test_no_path_precomputed(): + # Test that the ``return_path=False`` option with Gram remains correct + alphas_, _, coef_path_ = linear_model.lars_path(X, y, method="lar", Gram=G) + alpha_, _, coef = linear_model.lars_path( + X, y, method="lar", Gram=G, return_path=False + ) + + assert_array_almost_equal(coef, coef_path_[:, -1]) + assert alpha_ == alphas_[-1] + + +def test_no_path_all_precomputed(): + # Test that the ``return_path=False`` option with Gram and Xy remains + # correct + X, y = 3 * diabetes.data, diabetes.target + G = np.dot(X.T, X) + Xy = np.dot(X.T, y) + alphas_, _, coef_path_ = linear_model.lars_path( + X, y, method="lasso", Xy=Xy, Gram=G, alpha_min=0.9 + ) + alpha_, _, coef = linear_model.lars_path( + X, y, method="lasso", Gram=G, Xy=Xy, alpha_min=0.9, return_path=False + ) + + assert_array_almost_equal(coef, coef_path_[:, -1]) + assert alpha_ == alphas_[-1] + + +@pytest.mark.parametrize( + "classifier", [linear_model.Lars, linear_model.LarsCV, linear_model.LassoLarsIC] +) +def test_lars_precompute(classifier): + # Check for different values of precompute + G = np.dot(X.T, X) + + clf = classifier(precompute=G) + output_1 = ignore_warnings(clf.fit)(X, y).coef_ + for precompute in [True, False, "auto", None]: + clf = classifier(precompute=precompute) + output_2 = clf.fit(X, y).coef_ + assert_array_almost_equal(output_1, 
output_2, decimal=8) + + +def test_singular_matrix(): + # Test when input is a singular matrix + X1 = np.array([[1, 1.0], [1.0, 1.0]]) + y1 = np.array([1, 1]) + _, _, coef_path = linear_model.lars_path(X1, y1) + assert_array_almost_equal(coef_path.T, [[0, 0], [1, 0]]) + + +def test_rank_deficient_design(): + # consistency test that checks that LARS Lasso is handling rank + # deficient input data (with n_features < rank) in the same way + # as coordinate descent Lasso + y = [5, 0, 5] + for X in ([[5, 0], [0, 5], [10, 10]], [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]]): + # To be able to use the coefs to compute the objective function, + # we need to turn off normalization + lars = linear_model.LassoLars(0.1) + coef_lars_ = lars.fit(X, y).coef_ + obj_lars = 1.0 / (2.0 * 3.0) * linalg.norm( + y - np.dot(X, coef_lars_) + ) ** 2 + 0.1 * linalg.norm(coef_lars_, 1) + coord_descent = linear_model.Lasso(0.1, tol=1e-6) + coef_cd_ = coord_descent.fit(X, y).coef_ + obj_cd = (1.0 / (2.0 * 3.0)) * linalg.norm( + y - np.dot(X, coef_cd_) + ) ** 2 + 0.1 * linalg.norm(coef_cd_, 1) + assert obj_lars < obj_cd * (1.0 + 1e-8) + + +def test_lasso_lars_vs_lasso_cd(): + # Test that LassoLars and Lasso using coordinate descent give the + # same results. + X = 3 * diabetes.data + + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso") + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) + for c, a in zip(lasso_path.T, alphas): + if a == 0: + continue + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + # similar test, with the classifiers + for alpha in np.linspace(1e-2, 1 - 1e-2, 20): + clf1 = linear_model.LassoLars(alpha=alpha).fit(X, y) + clf2 = linear_model.Lasso(alpha=alpha, tol=1e-8).fit(X, y) + err = linalg.norm(clf1.coef_ - clf2.coef_) + assert err < 1e-3 + + # same test, with normalized data + X = diabetes.data + X = X - X.sum(axis=0) + X /= np.linalg.norm(X, axis=0) + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso") + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) + for c, a in zip(lasso_path.T, alphas): + if a == 0: + continue + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + +def test_lasso_lars_vs_lasso_cd_early_stopping(): + # Test that LassoLars and Lasso using coordinate descent give the + # same results when early stopping is used. 
+ # (test : before, in the middle, and in the last part of the path) + alphas_min = [10, 0.9, 1e-4] + + X = diabetes.data + + for alpha_min in alphas_min: + alphas, _, lasso_path = linear_model.lars_path( + X, y, method="lasso", alpha_min=alpha_min + ) + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) + lasso_cd.alpha = alphas[-1] + lasso_cd.fit(X, y) + error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) + assert error < 0.01 + + # same test, with normalization + X = diabetes.data - diabetes.data.sum(axis=0) + X /= np.linalg.norm(X, axis=0) + + for alpha_min in alphas_min: + alphas, _, lasso_path = linear_model.lars_path( + X, y, method="lasso", alpha_min=alpha_min + ) + lasso_cd = linear_model.Lasso(tol=1e-8) + lasso_cd.alpha = alphas[-1] + lasso_cd.fit(X, y) + error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) + assert error < 0.01 + + +def test_lasso_lars_path_length(): + # Test that the path length of the LassoLars is right + lasso = linear_model.LassoLars() + lasso.fit(X, y) + lasso2 = linear_model.LassoLars(alpha=lasso.alphas_[2]) + lasso2.fit(X, y) + assert_array_almost_equal(lasso.alphas_[:3], lasso2.alphas_) + # Also check that the sequence of alphas is always decreasing + assert np.all(np.diff(lasso.alphas_) < 0) + + +def test_lasso_lars_vs_lasso_cd_ill_conditioned(): + # Test lasso lars on a very ill-conditioned design, and check that + # it does not blow up, and stays somewhat close to a solution given + # by the coordinate descent solver + # Also test that lasso_path (using lars_path output style) gives + # the same result as lars_path and previous lasso output style + # under these conditions. + rng = np.random.RandomState(42) + + # Generate data + n, m = 70, 100 + k = 5 + X = rng.randn(n, m) + w = np.zeros((m, 1)) + i = np.arange(0, m) + rng.shuffle(i) + supp = i[:k] + w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1) + y = np.dot(X, w) + sigma = 0.2 + y += sigma * rng.rand(*y.shape) + y = y.squeeze() + lars_alphas, _, lars_coef = linear_model.lars_path(X, y, method="lasso") + + _, lasso_coef2, _ = linear_model.lasso_path(X, y, alphas=lars_alphas, tol=1e-6) + + assert_array_almost_equal(lars_coef, lasso_coef2, decimal=1) + + +def test_lasso_lars_vs_lasso_cd_ill_conditioned2(): + # Create an ill-conditioned situation in which the LARS has to go + # far in the path to converge, and check that LARS and coordinate + # descent give the same answers + # Note it used to be the case that Lars had to use the drop for good + # strategy for this but this is no longer the case with the + # equality_tolerance checks + X = [[1e20, 1e20, 0], [-1e-32, 0, 0], [1, 1, 1]] + y = [10, 10, 1] + alpha = 0.0001 + + def objective_function(coef): + return 1.0 / (2.0 * len(X)) * linalg.norm( + y - np.dot(X, coef) + ) ** 2 + alpha * linalg.norm(coef, 1) + + lars = linear_model.LassoLars(alpha=alpha) + warning_message = "Regressors in active set degenerate." 
+ with pytest.warns(ConvergenceWarning, match=warning_message): + lars.fit(X, y) + lars_coef_ = lars.coef_ + lars_obj = objective_function(lars_coef_) + + coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-4) + cd_coef_ = coord_descent.fit(X, y).coef_ + cd_obj = objective_function(cd_coef_) + + assert lars_obj < cd_obj * (1.0 + 1e-8) + + +def test_lars_add_features(): + # assure that at least some features get added if necessary + # test for 6d2b4c + # Hilbert matrix + n = 5 + H = 1.0 / (np.arange(1, n + 1) + np.arange(n)[:, np.newaxis]) + clf = linear_model.Lars(fit_intercept=False).fit(H, np.arange(n)) + assert np.all(np.isfinite(clf.coef_)) + + +def test_lars_n_nonzero_coefs(verbose=False): + lars = linear_model.Lars(n_nonzero_coefs=6, verbose=verbose) + lars.fit(X, y) + assert len(lars.coef_.nonzero()[0]) == 6 + # The path should be of length 6 + 1 in a Lars going down to 6 + # non-zero coefs + assert len(lars.alphas_) == 7 + + +def test_multitarget(): + # Assure that estimators receiving multidimensional y do the right thing + Y = np.vstack([y, y**2]).T + n_targets = Y.shape[1] + estimators = [ + linear_model.LassoLars(), + linear_model.Lars(), + # regression test for gh-1615 + linear_model.LassoLars(fit_intercept=False), + linear_model.Lars(fit_intercept=False), + ] + + for estimator in estimators: + estimator.fit(X, Y) + Y_pred = estimator.predict(X) + alphas, active, coef, path = ( + estimator.alphas_, + estimator.active_, + estimator.coef_, + estimator.coef_path_, + ) + for k in range(n_targets): + estimator.fit(X, Y[:, k]) + y_pred = estimator.predict(X) + assert_array_almost_equal(alphas[k], estimator.alphas_) + assert_array_almost_equal(active[k], estimator.active_) + assert_array_almost_equal(coef[k], estimator.coef_) + assert_array_almost_equal(path[k], estimator.coef_path_) + assert_array_almost_equal(Y_pred[:, k], y_pred) + + +def test_lars_cv(): + # Test the LassoLarsCV object by checking that the optimal alpha + # increases as the number of samples increases. + # This property is not actually guaranteed in general and is just a + # property of the given dataset, with the given steps chosen. + old_alpha = 0 + lars_cv = linear_model.LassoLarsCV() + for length in (400, 200, 100): + X = diabetes.data[:length] + y = diabetes.target[:length] + lars_cv.fit(X, y) + np.testing.assert_array_less(old_alpha, lars_cv.alpha_) + old_alpha = lars_cv.alpha_ + assert not hasattr(lars_cv, "n_nonzero_coefs") + + +def test_lars_cv_max_iter(recwarn): + warnings.simplefilter("always") + with np.errstate(divide="raise", invalid="raise"): + X = diabetes.data + y = diabetes.target + rng = np.random.RandomState(42) + x = rng.randn(len(y)) + X = diabetes.data + X = np.c_[X, x, x] # add correlated features + X = StandardScaler().fit_transform(X) + lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5) + lars_cv.fit(X, y) + + # Check that there is no warning in general and no ConvergenceWarning + # in particular. + # Materialize the string representation of the warning to get a more + # informative error message in case of AssertionError. + recorded_warnings = [str(w) for w in recwarn] + assert len(recorded_warnings) == 0 + + +def test_lasso_lars_ic(): + # Test the LassoLarsIC object by checking that + # - some good features are selected. 
+ # - alpha_bic > alpha_aic + # - n_nonzero_bic < n_nonzero_aic + lars_bic = linear_model.LassoLarsIC("bic") + lars_aic = linear_model.LassoLarsIC("aic") + rng = np.random.RandomState(42) + X = diabetes.data + X = np.c_[X, rng.randn(X.shape[0], 5)] # add 5 bad features + X = StandardScaler().fit_transform(X) + lars_bic.fit(X, y) + lars_aic.fit(X, y) + nonzero_bic = np.where(lars_bic.coef_)[0] + nonzero_aic = np.where(lars_aic.coef_)[0] + assert lars_bic.alpha_ > lars_aic.alpha_ + assert len(nonzero_bic) < len(nonzero_aic) + assert np.max(nonzero_bic) < diabetes.data.shape[1] + + +def test_lars_path_readonly_data(): + # When using automated memory mapping on large input, the + # fold data is in read-only mode + # This is a non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/4597 + splitted_data = train_test_split(X, y, random_state=42) + with TempMemmap(splitted_data) as (X_train, X_test, y_train, y_test): + # The following should not fail despite copy=False + _lars_path_residues(X_train, y_train, X_test, y_test, copy=False) + + +def test_lars_path_positive_constraint(): + # this is the main test for the positive parameter on the lars_path method + # the estimator classes just make use of this function + + # we do the test on the diabetes dataset + + # ensure that we get negative coefficients when positive=False + # and all positive when positive=True + # for method 'lar' (default) and lasso + + err_msg = "Positive constraint not supported for 'lar' coding method." + with pytest.raises(ValueError, match=err_msg): + linear_model.lars_path( + diabetes["data"], diabetes["target"], method="lar", positive=True + ) + + method = "lasso" + _, _, coefs = linear_model.lars_path( + X, y, return_path=True, method=method, positive=False + ) + assert coefs.min() < 0 + + _, _, coefs = linear_model.lars_path( + X, y, return_path=True, method=method, positive=True + ) + assert coefs.min() >= 0 + + +# now we gonna test the positive option for all estimator classes + +default_parameter = {"fit_intercept": False} + +estimator_parameter_map = { + "LassoLars": {"alpha": 0.1}, + "LassoLarsCV": {}, + "LassoLarsIC": {}, +} + + +def test_estimatorclasses_positive_constraint(): + # testing the transmissibility for the positive option of all estimator + # classes in this same function here + default_parameter = {"fit_intercept": False} + + estimator_parameter_map = { + "LassoLars": {"alpha": 0.1}, + "LassoLarsCV": {}, + "LassoLarsIC": {}, + } + for estname in estimator_parameter_map: + params = default_parameter.copy() + params.update(estimator_parameter_map[estname]) + estimator = getattr(linear_model, estname)(positive=False, **params) + estimator.fit(X, y) + assert estimator.coef_.min() < 0 + estimator = getattr(linear_model, estname)(positive=True, **params) + estimator.fit(X, y) + assert min(estimator.coef_) >= 0 + + +def test_lasso_lars_vs_lasso_cd_positive(): + # Test that LassoLars and Lasso using coordinate descent give the + # same results when using the positive option + + # This test is basically a copy of the above with additional positive + # option. However for the middle part, the comparison of coefficient values + # for a range of alphas, we had to make an adaptations. See below. 
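# --- Editor's aside (illustrative sketch, not part of the scikit-learn file above) ---
# The comment above refers to "the smallest alpha reached by the Lars-Lasso
# algorithm" under the positive constraint. One way to see where that cut-off
# lies for a given dataset is to look at the last knot of the constrained path,
# e.g. (assuming the diabetes data, as loaded at the top of this test module):
#
#     from sklearn import datasets, linear_model
#     X, y = datasets.load_diabetes(return_X_y=True)
#     alphas, _, _ = linear_model.lars_path(X, y, method="lasso", positive=True)
#     print(alphas[-1])  # smallest alpha on the positive Lasso-LARS path
#
# Below that value the positive LassoLars and the positive coordinate-descent
# Lasso are not expected to agree, which is why the alpha grid compared in this
# test starts at 6e-1 rather than at 1e-2.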
+ + # not normalized data + X = 3 * diabetes.data + + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso", positive=True) + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True) + for c, a in zip(lasso_path.T, alphas): + if a == 0: + continue + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + # The range of alphas chosen for coefficient comparison here is restricted + # as compared with the above test without the positive option. This is due + # to the circumstance that the Lars-Lasso algorithm does not converge to + # the least-squares-solution for small alphas, see 'Least Angle Regression' + # by Efron et al 2004. The coefficients are typically in congruence up to + # the smallest alpha reached by the Lars-Lasso algorithm and start to + # diverge thereafter. See + # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff + + for alpha in np.linspace(6e-1, 1 - 1e-2, 20): + clf1 = linear_model.LassoLars( + fit_intercept=False, alpha=alpha, positive=True + ).fit(X, y) + clf2 = linear_model.Lasso( + fit_intercept=False, alpha=alpha, tol=1e-8, positive=True + ).fit(X, y) + err = linalg.norm(clf1.coef_ - clf2.coef_) + assert err < 1e-3 + + # normalized data + X = diabetes.data - diabetes.data.sum(axis=0) + X /= np.linalg.norm(X, axis=0) + alphas, _, lasso_path = linear_model.lars_path(X, y, method="lasso", positive=True) + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True) + for c, a in zip(lasso_path.T[:-1], alphas[:-1]): # don't include alpha=0 + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + + +def test_lasso_lars_vs_R_implementation(): + # Test that sklearn LassoLars implementation agrees with the LassoLars + # implementation available in R (lars library) when fit_intercept=False. 
+ + # Let's generate the data used in the bug report 7778 + y = np.array([-6.45006793, -3.51251449, -8.52445396, 6.12277822, -19.42109366]) + x = np.array( + [ + [0.47299829, 0, 0, 0, 0], + [0.08239882, 0.85784863, 0, 0, 0], + [0.30114139, -0.07501577, 0.80895216, 0, 0], + [-0.01460346, -0.1015233, 0.0407278, 0.80338378, 0], + [-0.69363927, 0.06754067, 0.18064514, -0.0803561, 0.40427291], + ] + ) + + X = x.T + + # The R result was obtained using the following code: + # + # library(lars) + # model_lasso_lars = lars(X, t(y), type="lasso", intercept=FALSE, + # trace=TRUE, normalize=FALSE) + # r = t(model_lasso_lars$beta) + # + + r = np.array( + [ + [ + 0, + 0, + 0, + 0, + 0, + -79.810362809499026, + -83.528788732782829, + -83.777653739190711, + -83.784156932888934, + -84.033390591756657, + ], + [0, 0, 0, 0, -0.476624256777266, 0, 0, 0, 0, 0.025219751009936], + [ + 0, + -3.577397088285891, + -4.702795355871871, + -7.016748621359461, + -7.614898471899412, + -0.336938391359179, + 0, + 0, + 0.001213370600853, + 0.048162321585148, + ], + [ + 0, + 0, + 0, + 2.231558436628169, + 2.723267514525966, + 2.811549786389614, + 2.813766976061531, + 2.817462468949557, + 2.817368178703816, + 2.816221090636795, + ], + [ + 0, + 0, + -1.218422599914637, + -3.457726183014808, + -4.021304522060710, + -45.827461592423745, + -47.776608869312305, + -47.911561610746404, + -47.914845922736234, + -48.039562334265717, + ], + ] + ) + + model_lasso_lars = linear_model.LassoLars(alpha=0, fit_intercept=False) + model_lasso_lars.fit(X, y) + skl_betas = model_lasso_lars.coef_path_ + + assert_array_almost_equal(r, skl_betas, decimal=12) + + +@pytest.mark.parametrize("copy_X", [True, False]) +def test_lasso_lars_copyX_behaviour(copy_X): + """ + Test that user input regarding copy_X is not being overridden (it was until + at least version 0.21) + + """ + lasso_lars = LassoLarsIC(copy_X=copy_X, precompute=False) + rng = np.random.RandomState(0) + X = rng.normal(0, 1, (100, 5)) + X_copy = X.copy() + y = X[:, 2] + lasso_lars.fit(X, y) + assert copy_X == np.array_equal(X, X_copy) + + +@pytest.mark.parametrize("copy_X", [True, False]) +def test_lasso_lars_fit_copyX_behaviour(copy_X): + """ + Test that user input to .fit for copy_X overrides default __init__ value + + """ + lasso_lars = LassoLarsIC(precompute=False) + rng = np.random.RandomState(0) + X = rng.normal(0, 1, (100, 5)) + X_copy = X.copy() + y = X[:, 2] + lasso_lars.fit(X, y, copy_X=copy_X) + assert copy_X == np.array_equal(X, X_copy) + + +@pytest.mark.parametrize("est", (LassoLars(alpha=1e-3), Lars())) +def test_lars_with_jitter(est): + # Test that a small amount of jitter helps stability, + # using example provided in issue #2746 + + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = [-2.5, -2.5] + expected_coef = [0, 2.5, 0, 2.5, 0] + + # set to fit_intercept to False since target is constant and we want check + # the value of coef. coef would be all zeros otherwise. 
+ est.set_params(fit_intercept=False) + est_jitter = clone(est).set_params(jitter=10e-8, random_state=0) + + est.fit(X, y) + est_jitter.fit(X, y) + + assert np.mean((est.coef_ - est_jitter.coef_) ** 2) > 0.1 + np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3) + + +def test_X_none_gram_not_none(): + with pytest.raises(ValueError, match="X cannot be None if Gram is not None"): + lars_path(X=None, y=np.array([1]), Gram=True) + + +def test_copy_X_with_auto_gram(): + # Non-regression test for #17789, `copy_X=True` and Gram='auto' does not + # overwrite X + rng = np.random.RandomState(42) + X = rng.rand(6, 6) + y = rng.rand(6) + + X_before = X.copy() + linear_model.lars_path(X, y, Gram="auto", copy_X=True, method="lasso") + # X did not change + assert_allclose(X, X_before) + + +@pytest.mark.parametrize( + "LARS, has_coef_path, args", + ( + (Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, True, {"max_iter": 5}), + ), +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_lars_dtype_match(LARS, has_coef_path, args, dtype): + # The test ensures that the fit method preserves input dtype + rng = np.random.RandomState(0) + X = rng.rand(20, 6).astype(dtype) + y = rng.rand(20).astype(dtype) + + model = LARS(**args) + model.fit(X, y) + assert model.coef_.dtype == dtype + if has_coef_path: + assert model.coef_path_.dtype == dtype + assert model.intercept_.dtype == dtype + + +@pytest.mark.parametrize( + "LARS, has_coef_path, args", + ( + (Lars, True, {}), + (LassoLars, True, {}), + (LassoLarsIC, False, {}), + (LarsCV, True, {}), + # max_iter=5 is for avoiding ConvergenceWarning + (LassoLarsCV, True, {"max_iter": 5}), + ), +) +def test_lars_numeric_consistency(LARS, has_coef_path, args): + # The test ensures numerical consistency between trained coefficients + # of float32 and float64. + rtol = 1e-5 + atol = 1e-5 + + rng = np.random.RandomState(0) + X_64 = rng.rand(10, 6) + y_64 = rng.rand(10) + + model_64 = LARS(**args).fit(X_64, y_64) + model_32 = LARS(**args).fit(X_64.astype(np.float32), y_64.astype(np.float32)) + + assert_allclose(model_64.coef_, model_32.coef_, rtol=rtol, atol=atol) + if has_coef_path: + assert_allclose(model_64.coef_path_, model_32.coef_path_, rtol=rtol, atol=atol) + assert_allclose(model_64.intercept_, model_32.intercept_, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("criterion", ["aic", "bic"]) +def test_lassolarsic_alpha_selection(criterion): + """Check that we properly compute the AIC and BIC score. + + In this test, we reproduce the example of the Fig. 2 of Zou et al. + (reference [1] in LassoLarsIC) In this example, only 7 features should be + selected. 
+ """ + model = make_pipeline(StandardScaler(), LassoLarsIC(criterion=criterion)) + model.fit(X, y) + + best_alpha_selected = np.argmin(model[-1].criterion_) + assert best_alpha_selected == 7 + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_lassolarsic_noise_variance(fit_intercept): + """Check the behaviour when `n_samples` < `n_features` and that one needs + to provide the noise variance.""" + rng = np.random.RandomState(0) + X, y = datasets.make_regression( + n_samples=10, n_features=11 - fit_intercept, random_state=rng + ) + + model = make_pipeline(StandardScaler(), LassoLarsIC(fit_intercept=fit_intercept)) + + err_msg = ( + "You are using LassoLarsIC in the case where the number of samples is smaller" + " than the number of features" + ) + with pytest.raises(ValueError, match=err_msg): + model.fit(X, y) + + model.set_params(lassolarsic__noise_variance=1.0) + model.fit(X, y).predict(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_linear_loss.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_linear_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a273656b3dbb8508bb468d6f5ac906b16dbc03f5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_linear_loss.py @@ -0,0 +1,510 @@ +""" +Tests for LinearModelLoss + +Note that correctness of losses (which compose LinearModelLoss) is already well +covered in the _loss module. +""" + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import linalg, optimize + +from sklearn._loss.loss import ( + HalfBinomialLoss, + HalfMultinomialLoss, + HalfPoissonLoss, +) +from sklearn.datasets import make_low_rank_matrix +from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.utils.extmath import squared_norm +from sklearn.utils.fixes import CSR_CONTAINERS + +# We do not need to test all losses, just what LinearModelLoss does on top of the +# base losses. +LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss] + + +def random_X_y_coef( + linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42 +): + """Random generate y, X and coef in valid range.""" + rng = np.random.RandomState(seed) + n_dof = n_features + linear_model_loss.fit_intercept + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + random_state=rng, + ) + coef = linear_model_loss.init_zero_coef(X) + + if linear_model_loss.base_loss.is_multiclass: + n_classes = linear_model_loss.base_loss.n_classes + coef.flat[:] = rng.uniform( + low=coef_bound[0], + high=coef_bound[1], + size=n_classes * n_dof, + ) + if linear_model_loss.fit_intercept: + raw_prediction = X @ coef[:, :-1].T + coef[:, -1] + else: + raw_prediction = X @ coef.T + proba = linear_model_loss.base_loss.link.inverse(raw_prediction) + + # y = rng.choice(np.arange(n_classes), p=proba) does not work. 
+ # See https://stackoverflow.com/a/34190035/16761084 + def choice_vectorized(items, p): + s = p.cumsum(axis=1) + r = rng.rand(p.shape[0])[:, None] + k = (s < r).sum(axis=1) + return items[k] + + y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64) + else: + coef.flat[:] = rng.uniform( + low=coef_bound[0], + high=coef_bound[1], + size=n_dof, + ) + if linear_model_loss.fit_intercept: + raw_prediction = X @ coef[:-1] + coef[-1] + else: + raw_prediction = X @ coef + y = linear_model_loss.base_loss.link.inverse( + raw_prediction + rng.uniform(low=-1, high=1, size=n_samples) + ) + + return X, y, coef + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("n_features", [0, 1, 10]) +@pytest.mark.parametrize("dtype", [None, np.float32, np.float64, np.int64]) +def test_init_zero_coef( + base_loss, fit_intercept, n_features, dtype, global_random_seed +): + """Test that init_zero_coef initializes coef correctly.""" + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept) + rng = np.random.RandomState(global_random_seed) + X = rng.normal(size=(5, n_features)) + coef = loss.init_zero_coef(X, dtype=dtype) + if loss.base_loss.is_multiclass: + n_classes = loss.base_loss.n_classes + assert coef.shape == (n_classes, n_features + fit_intercept) + assert coef.flags["F_CONTIGUOUS"] + else: + assert coef.shape == (n_features + fit_intercept,) + + if dtype is None: + assert coef.dtype == X.dtype + else: + assert coef.dtype == dtype + + assert np.count_nonzero(coef) == 0 + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_loss_grad_hess_are_the_same( + base_loss, + fit_intercept, + sample_weight, + l2_reg_strength, + csr_container, + global_random_seed, +): + """Test that loss and gradient are the same across different functions.""" + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept) + X, y, coef = random_X_y_coef( + linear_model_loss=loss, n_samples=10, n_features=5, seed=global_random_seed + ) + X_old, y_old, coef_old = X.copy(), y.copy(), coef.copy() + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + l1 = loss.loss( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g1 = loss.gradient( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + l2, g2 = loss.loss_gradient( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g3, h3 = loss.gradient_hessian_product( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g4, h4, _ = loss.gradient_hessian( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + assert_allclose(l1, l2) + assert_allclose(g1, g2) + assert_allclose(g1, g3) + assert_allclose(g1, g4) + # The ravelling only takes effect for multiclass. + assert_allclose(h4 @ g4.ravel(order="F"), h3(g3).ravel(order="F")) + # Test that gradient_out and hessian_out are considered properly. 
+ g_out = np.empty_like(coef) + h_out = np.empty_like(coef, shape=(coef.size, coef.size)) + g5, h5, _ = loss.gradient_hessian( + coef, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + gradient_out=g_out, + hessian_out=h_out, + ) + assert np.shares_memory(g5, g_out) + assert np.shares_memory(h5, h_out) + assert_allclose(g5, g_out) + assert_allclose(h5, h_out) + assert_allclose(g1, g5) + assert_allclose(h5, h4) + + # same for sparse X + Xs = csr_container(X) + l1_sp = loss.loss( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g1_sp = loss.gradient( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + l2_sp, g2_sp = loss.loss_gradient( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g3_sp, h3_sp = loss.gradient_hessian_product( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + g4_sp, h4_sp, _ = loss.gradient_hessian( + coef, Xs, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + assert_allclose(l1, l1_sp) + assert_allclose(l1, l2_sp) + assert_allclose(g1, g1_sp) + assert_allclose(g1, g2_sp) + assert_allclose(g1, g3_sp) + assert_allclose(h3(g1), h3_sp(g1_sp)) + assert_allclose(g1, g4_sp) + assert_allclose(h4, h4_sp) + + # X, y and coef should not have changed + assert_allclose(X, X_old) + assert_allclose(Xs.toarray(), X_old) + assert_allclose(y, y_old) + assert_allclose(coef, coef_old) + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1]) +@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [None]) +def test_loss_gradients_hessp_intercept( + base_loss, sample_weight, l2_reg_strength, X_container, global_random_seed +): + """Test that loss and gradient handle intercept correctly.""" + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=False) + loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True) + n_samples, n_features = 10, 5 + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + + X[:, -1] = 1 # make last column of 1 to mimic intercept term + X_inter = X[ + :, :-1 + ] # exclude intercept column as it is added automatically by loss_inter + + if X_container is not None: + X = X_container(X) + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + l, g = loss.loss_gradient( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + _, hessp = loss.gradient_hessian_product( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + l_inter, g_inter = loss_inter.loss_gradient( + coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + _, hessp_inter = loss_inter.gradient_hessian_product( + coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + + # Note, that intercept gets no L2 penalty. 
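# --- Editor's aside (illustrative, not part of the scikit-learn file above) ---
# Sketch of the bookkeeping checked below, under this test's convention (the
# intercept is mimicked by a penalized all-ones column in X for `loss`, while
# `loss_inter` fits a true, unpenalized intercept): with identical coefficients
# the raw predictions and hence the data-fit terms agree, so the two losses can
# only differ through the L2 term, from which the intercept is excluded:
#
#     loss       = data_fit + 0.5 * l2_reg_strength * ||coef||^2
#     loss_inter = data_fit + 0.5 * l2_reg_strength * ||coef[..., :-1]||^2
#     =>  loss   = loss_inter + 0.5 * l2_reg_strength * ||coef[..., -1]||^2
#
# Differentiating that extra term once and twice in the last coordinate gives the
# `l2_reg_strength * coef.T[-1]` and `l2_reg_strength * s.T[-1]` corrections
# applied to the gradient and to the Hessian-vector product in the asserts below.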
+ assert l == pytest.approx( + l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1]) + ) + + g_inter_corrected = g_inter + g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1] + assert_allclose(g, g_inter_corrected) + + s = np.random.RandomState(global_random_seed).randn(*coef.shape) + h = hessp(s) + h_inter = hessp_inter(s) + h_inter_corrected = h_inter + h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1] + assert_allclose(h, h_inter_corrected) + + +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1]) +def test_gradients_hessians_numerically( + base_loss, fit_intercept, sample_weight, l2_reg_strength, global_random_seed +): + """Test gradients and hessians with numerical derivatives. + + Gradient should equal the numerical derivatives of the loss function. + Hessians should equal the numerical derivatives of gradients. + """ + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept) + n_samples, n_features = 10, 5 + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + coef = coef.ravel(order="F") # this is important only for multinomial loss + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + # 1. Check gradients numerically + eps = 1e-6 + g, hessp = loss.gradient_hessian_product( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) + # Use a trick to get central finite difference of accuracy 4 (five-point stencil) + # https://en.wikipedia.org/wiki/Numerical_differentiation + # https://en.wikipedia.org/wiki/Finite_difference_coefficient + # approx_g1 = (f(x + eps) - f(x - eps)) / (2*eps) + approx_g1 = optimize.approx_fprime( + coef, + lambda coef: loss.loss( + coef - eps, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + ), + 2 * eps, + ) + # approx_g2 = (f(x + 2*eps) - f(x - 2*eps)) / (4*eps) + approx_g2 = optimize.approx_fprime( + coef, + lambda coef: loss.loss( + coef - 2 * eps, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + ), + 4 * eps, + ) + # Five-point stencil approximation + # See: https://en.wikipedia.org/wiki/Five-point_stencil#1D_first_derivative + approx_g = (4 * approx_g1 - approx_g2) / 3 + assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8) + + # 2. Check hessp numerically along the second direction of the gradient + vector = np.zeros_like(g) + vector[1] = 1 + hess_col = hessp(vector) + # Computation of the Hessian is particularly fragile to numerical errors when doing + # simple finite differences. 
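# --- Editor's aside (illustrative sketch, not part of the scikit-learn file above) ---
# The combination used above, approx_g = (4 * approx_g1 - approx_g2) / 3, is the
# standard five-point stencil: combining a central difference with step h and one
# with step 2*h cancels the O(h^2) error term and leaves an O(h^4) approximation.
# A quick standalone sanity check on f(x) = exp(x) at x = 0 (true derivative 1.0):
#
#     import numpy as np
#     f, x0, h = np.exp, 0.0, 1e-3
#     g1 = (f(x0 + h) - f(x0 - h)) / (2 * h)          # central difference, O(h^2)
#     g2 = (f(x0 + 2 * h) - f(x0 - 2 * h)) / (4 * h)  # same, with doubled step
#     assert abs((4 * g1 - g2) / 3 - 1.0) < 1e-10     # five-point stencil, O(h^4)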
Here we compute the grad along a path in the direction + # of the vector and then use a least-square regression to estimate the slope + eps = 1e-3 + d_x = np.linspace(-eps, eps, 30) + d_grad = np.array( + [ + loss.gradient( + coef + t * vector, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + ) + for t in d_x + ] + ) + d_grad -= d_grad.mean(axis=0) + approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel() + assert_allclose(approx_hess_col, hess_col, rtol=1e-3) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_multinomial_coef_shape(fit_intercept, global_random_seed): + """Test that multinomial LinearModelLoss respects shape of coef.""" + loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=fit_intercept) + n_samples, n_features = 10, 5 + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + s = np.random.RandomState(global_random_seed).randn(*coef.shape) + + l, g = loss.loss_gradient(coef, X, y) + g1 = loss.gradient(coef, X, y) + g2, hessp = loss.gradient_hessian_product(coef, X, y) + h = hessp(s) + assert g.shape == coef.shape + assert h.shape == coef.shape + assert_allclose(g, g1) + assert_allclose(g, g2) + g3, hess, _ = loss.gradient_hessian(coef, X, y) + assert g3.shape == coef.shape + # But full hessian is always 2d. + assert hess.shape == (coef.size, coef.size) + + coef_r = coef.ravel(order="F") + s_r = s.ravel(order="F") + l_r, g_r = loss.loss_gradient(coef_r, X, y) + g1_r = loss.gradient(coef_r, X, y) + g2_r, hessp_r = loss.gradient_hessian_product(coef_r, X, y) + h_r = hessp_r(s_r) + assert g_r.shape == coef_r.shape + assert h_r.shape == coef_r.shape + assert_allclose(g_r, g1_r) + assert_allclose(g_r, g2_r) + + assert_allclose(g, g_r.reshape(loss.base_loss.n_classes, -1, order="F")) + assert_allclose(h, h_r.reshape(loss.base_loss.n_classes, -1, order="F")) + + +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_multinomial_hessian_3_classes(sample_weight, global_random_seed): + """Test multinomial hessian for 3 classes and 2 points. + + For n_classes = 3 and n_samples = 2, we have + p0 = [p0_0, p0_1] + p1 = [p1_0, p1_1] + p2 = [p2_0, p2_1] + and with 2 x 2 diagonal subblocks + H = [p0 * (1-p0), -p0 * p1, -p0 * p2] + [ -p0 * p1, p1 * (1-p1), -p1 * p2] + [ -p0 * p2, -p1 * p2, p2 * (1-p2)] + hess = X' H X + """ + n_samples, n_features, n_classes = 2, 5, 3 + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), fit_intercept=False + ) + X, y, coef = random_X_y_coef( + linear_model_loss=loss, + n_samples=n_samples, + n_features=n_features, + seed=global_random_seed, + ) + coef = coef.ravel(order="F") # this is important only for multinomial loss + + if sample_weight == "range": + sample_weight = np.linspace(1, y.shape[0], num=y.shape[0]) + + grad, hess, _ = loss.gradient_hessian( + coef, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=0, + ) + # Hessian must be a symmetrix matrix. 
+ assert_allclose(hess, hess.T) + + weights, intercept, raw_prediction = loss.weight_intercept_raw(coef, X) + grad_pointwise, proba = loss.base_loss.gradient_proba( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + p0d, p1d, p2d, oned = ( + np.diag(proba[:, 0]), + np.diag(proba[:, 1]), + np.diag(proba[:, 2]), + np.diag(np.ones(2)), + ) + h = np.block( + [ + [p0d * (oned - p0d), -p0d * p1d, -p0d * p2d], + [-p0d * p1d, p1d * (oned - p1d), -p1d * p2d], + [-p0d * p2d, -p1d * p2d, p2d * (oned - p2d)], + ] + ) + h = h.reshape((n_classes, n_samples, n_classes, n_samples)) + if sample_weight is None: + h /= n_samples + else: + h *= sample_weight / np.sum(sample_weight) + # hess_expected.shape = (n_features, n_classes, n_classes, n_features) + hess_expected = np.einsum("ij, mini, ik->jmnk", X, h, X) + hess_expected = np.moveaxis(hess_expected, 2, 3) + hess_expected = hess_expected.reshape( + n_classes * n_features, n_classes * n_features, order="C" + ) + assert_allclose(hess_expected, hess_expected.T) + assert_allclose(hess, hess_expected) + + +def test_linear_loss_gradient_hessian_raises_wrong_out_parameters(): + """Test that wrong gradient_out and hessian_out raises errors.""" + n_samples, n_features, n_classes = 5, 2, 3 + loss = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=False) + X = np.ones((n_samples, n_features)) + y = np.ones(n_samples) + coef = loss.init_zero_coef(X) + gradient_out = np.zeros(1) + with pytest.raises( + ValueError, match="gradient_out is required to have shape coef.shape" + ): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=gradient_out, + hessian_out=None, + ) + hessian_out = np.zeros(1) + with pytest.raises(ValueError, match="hessian_out is required to have shape"): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=None, + hessian_out=hessian_out, + ) + + loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=False) + coef = loss.init_zero_coef(X) + gradient_out = np.zeros((2 * n_classes, n_features))[::2] + with pytest.raises(ValueError, match="gradient_out must be F-contiguous"): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=gradient_out, + ) + hessian_out = np.zeros((2 * n_classes * n_features, n_classes * n_features))[::2] + with pytest.raises(ValueError, match="hessian_out must be contiguous"): + loss.gradient_hessian( + coef=coef, + X=X, + y=y, + gradient_out=None, + hessian_out=hessian_out, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_logistic.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_logistic.py new file mode 100644 index 0000000000000000000000000000000000000000..007c900dd36776ba4bd3d5731f5f40cf882028b3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_logistic.py @@ -0,0 +1,2471 @@ +import itertools +import os +import warnings +from functools import partial + +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from scipy import sparse +from scipy.linalg import LinAlgWarning, svd + +from sklearn import config_context +from sklearn._loss import HalfMultinomialLoss +from sklearn.base import clone +from sklearn.datasets import load_iris, make_classification, make_low_rank_matrix +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import SGDClassifier +from sklearn.linear_model._logistic import ( + 
LogisticRegression as LogisticRegressionDefault, +) +from sklearn.linear_model._logistic import ( + LogisticRegressionCV as LogisticRegressionCVDefault, +) +from sklearn.linear_model._logistic import ( + _log_reg_scoring_path, + _logistic_regression_path, +) +from sklearn.metrics import get_scorer, log_loss +from sklearn.model_selection import ( + GridSearchCV, + LeaveOneGroupOut, + StratifiedKFold, + cross_val_score, + train_test_split, +) +from sklearn.multiclass import OneVsRestClassifier +from sklearn.preprocessing import LabelEncoder, StandardScaler, scale +from sklearn.svm import l1_min_c +from sklearn.utils import compute_class_weight, shuffle +from sklearn.utils._testing import ignore_warnings, skip_if_no_parallel +from sklearn.utils.fixes import _IS_32BIT, COO_CONTAINERS, CSR_CONTAINERS + +pytestmark = pytest.mark.filterwarnings( + "error::sklearn.exceptions.ConvergenceWarning:sklearn.*" +) +# Fixing random_state helps prevent ConvergenceWarnings +LogisticRegression = partial(LogisticRegressionDefault, random_state=0) +LogisticRegressionCV = partial(LogisticRegressionCVDefault, random_state=0) + + +SOLVERS = ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga") +X = [[-1, 0], [0, 1], [1, 1]] +Y1 = [0, 1, 1] +Y2 = [2, 1, 0] +iris = load_iris() + + +def check_predictions(clf, X, y): + """Check that the model is able to fit the classification data""" + n_samples = len(y) + classes = np.unique(y) + n_classes = classes.shape[0] + + predicted = clf.fit(X, y).predict(X) + assert_array_equal(clf.classes_, classes) + + assert predicted.shape == (n_samples,) + assert_array_equal(predicted, y) + + probabilities = clf.predict_proba(X) + assert probabilities.shape == (n_samples, n_classes) + assert_array_almost_equal(probabilities.sum(axis=1), np.ones(n_samples)) + assert_array_equal(probabilities.argmax(axis=1), y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_predict_2_classes(csr_container): + # Simple sanity check on a 2 classes dataset + # Make sure it predicts the correct result on simple datasets. 
+ check_predictions(LogisticRegression(random_state=0), X, Y1) + check_predictions(LogisticRegression(random_state=0), csr_container(X), Y1) + + check_predictions(LogisticRegression(C=100, random_state=0), X, Y1) + check_predictions(LogisticRegression(C=100, random_state=0), csr_container(X), Y1) + + check_predictions(LogisticRegression(fit_intercept=False, random_state=0), X, Y1) + check_predictions( + LogisticRegression(fit_intercept=False, random_state=0), csr_container(X), Y1 + ) + + +def test_logistic_cv_mock_scorer(): + class MockScorer: + def __init__(self): + self.calls = 0 + self.scores = [0.1, 0.4, 0.8, 0.5] + + def __call__(self, model, X, y, sample_weight=None): + score = self.scores[self.calls % len(self.scores)] + self.calls += 1 + return score + + mock_scorer = MockScorer() + Cs = [1, 2, 3, 4] + cv = 2 + + lr = LogisticRegressionCV(Cs=Cs, scoring=mock_scorer, cv=cv) + X, y = make_classification(random_state=0) + lr.fit(X, y) + + # Cs[2] has the highest score (0.8) from MockScorer + assert lr.C_[0] == Cs[2] + + # scorer called 8 times (cv*len(Cs)) + assert mock_scorer.calls == cv * len(Cs) + + # reset mock_scorer + mock_scorer.calls = 0 + custom_score = lr.score(X, lr.predict(X)) + + assert custom_score == mock_scorer.scores[0] + assert mock_scorer.calls == 1 + + +@skip_if_no_parallel +def test_lr_liblinear_warning(): + X, y = make_classification(random_state=0) + + lr = LogisticRegression(solver="liblinear", n_jobs=2) + warning_message = ( + "'n_jobs' > 1 does not have any effect when" + " 'solver' is set to 'liblinear'. Got 'n_jobs'" + " = 2." + ) + with pytest.warns(UserWarning, match=warning_message): + lr.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_predict_3_classes(csr_container): + check_predictions(LogisticRegression(C=10), X, Y2) + check_predictions(LogisticRegression(C=10), csr_container(X), Y2) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(C=len(iris.data), solver="liblinear", multi_class="ovr"), + LogisticRegression(C=len(iris.data), solver="lbfgs"), + LogisticRegression(C=len(iris.data), solver="newton-cg"), + LogisticRegression( + C=len(iris.data), solver="sag", tol=1e-2, multi_class="ovr", random_state=42 + ), + LogisticRegression( + C=len(iris.data), + solver="saga", + tol=1e-2, + multi_class="ovr", + random_state=42, + ), + LogisticRegression(C=len(iris.data), solver="newton-cholesky"), + ], +) +def test_predict_iris(clf): + """Test logistic regression with the iris dataset. + + Test that both multinomial and OvR solvers handle multiclass data correctly and + give good accuracy score (>0.95) for the training data. 
+ """ + n_samples, n_features = iris.data.shape + target = iris.target_names[iris.target] + + if clf.solver == "lbfgs": + # lbfgs has convergence issues on the iris data with its default max_iter=100 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf.fit(iris.data, target) + else: + clf.fit(iris.data, target) + assert_array_equal(np.unique(target), clf.classes_) + + pred = clf.predict(iris.data) + assert np.mean(pred == target) > 0.95 + + probabilities = clf.predict_proba(iris.data) + assert_allclose(probabilities.sum(axis=1), np.ones(n_samples)) + + pred = iris.target_names[probabilities.argmax(axis=1)] + assert np.mean(pred == target) > 0.95 + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("LR", [LogisticRegression, LogisticRegressionCV]) +def test_check_solver_option(LR): + X, y = iris.data, iris.target + + # only 'liblinear' solver + for solver in ["liblinear"]: + msg = f"Solver {solver} does not support a multinomial backend." + lr = LR(solver=solver, multi_class="multinomial") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + # all solvers except 'liblinear' and 'saga' + for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag"]: + msg = "Solver %s supports only 'l2' or None penalties," % solver + lr = LR(solver=solver, penalty="l1", multi_class="ovr") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]: + msg = "Solver %s supports only dual=False, got dual=True" % solver + lr = LR(solver=solver, dual=True, multi_class="ovr") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + # only saga supports elasticnet. We only test for liblinear because the + # error is raised before for the other solvers (solver %s supports only l2 + # penalties) + for solver in ["liblinear"]: + msg = f"Only 'saga' solver supports elasticnet penalty, got solver={solver}." + lr = LR(solver=solver, penalty="elasticnet") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + # liblinear does not support penalty='none' + # (LogisticRegressionCV does not supports penalty='none' at all) + if LR is LogisticRegression: + msg = "penalty=None is not supported for the liblinear solver" + lr = LR(penalty=None, solver="liblinear") + with pytest.raises(ValueError, match=msg): + lr.fit(X, y) + + +@pytest.mark.parametrize("LR", [LogisticRegression, LogisticRegressionCV]) +def test_elasticnet_l1_ratio_err_helpful(LR): + # Check that an informative error message is raised when penalty="elasticnet" + # but l1_ratio is not specified. + model = LR(penalty="elasticnet", solver="saga") + with pytest.raises(ValueError, match=r".*l1_ratio.*"): + model.fit(np.array([[1, 2], [3, 4]]), np.array([0, 1])) + + +# TODO(1.8): remove whole test with deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cg", "sag", "saga"]) +def test_multinomial_binary(solver): + # Test multinomial LR on a binary problem. 
+ target = (iris.target > 0).astype(np.intp) + target = np.array(["setosa", "not-setosa"])[target] + + clf = LogisticRegression( + solver=solver, multi_class="multinomial", random_state=42, max_iter=2000 + ) + clf.fit(iris.data, target) + + assert clf.coef_.shape == (1, iris.data.shape[1]) + assert clf.intercept_.shape == (1,) + assert_array_equal(clf.predict(iris.data), target) + + mlr = LogisticRegression( + solver=solver, multi_class="multinomial", random_state=42, fit_intercept=False + ) + mlr.fit(iris.data, target) + pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), axis=1)] + assert np.mean(pred == target) > 0.9 + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Maybe even remove this whole test as correctness of multinomial loss is tested +# elsewhere. +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_multinomial_binary_probabilities(global_random_seed): + # Test multinomial LR gives expected probabilities based on the + # decision function, for a binary problem. + X, y = make_classification(random_state=global_random_seed) + clf = LogisticRegression( + multi_class="multinomial", + solver="saga", + tol=1e-3, + random_state=global_random_seed, + ) + clf.fit(X, y) + + decision = clf.decision_function(X) + proba = clf.predict_proba(X) + + expected_proba_class_1 = np.exp(decision) / (np.exp(decision) + np.exp(-decision)) + expected_proba = np.c_[1 - expected_proba_class_1, expected_proba_class_1] + + assert_almost_equal(proba, expected_proba) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_sparsify(coo_container): + # Test sparsify and densify members. + n_samples, n_features = iris.data.shape + target = iris.target_names[iris.target] + X = scale(iris.data) + clf = LogisticRegression(random_state=0).fit(X, target) + + pred_d_d = clf.decision_function(X) + + clf.sparsify() + assert sparse.issparse(clf.coef_) + pred_s_d = clf.decision_function(X) + + sp_data = coo_container(X) + pred_s_s = clf.decision_function(sp_data) + + clf.densify() + pred_d_s = clf.decision_function(sp_data) + + assert_array_almost_equal(pred_d_d, pred_s_d) + assert_array_almost_equal(pred_d_d, pred_s_s) + assert_array_almost_equal(pred_d_d, pred_d_s) + + +def test_inconsistent_input(): + # Test that an exception is raised on inconsistent input + rng = np.random.RandomState(0) + X_ = rng.random_sample((5, 10)) + y_ = np.ones(X_.shape[0]) + y_[0] = 0 + + clf = LogisticRegression(random_state=0) + + # Wrong dimensions for training data + y_wrong = y_[:-1] + + with pytest.raises(ValueError): + clf.fit(X, y_wrong) + + # Wrong dimensions for test data + with pytest.raises(ValueError): + clf.fit(X_, y_).predict(rng.random_sample((3, 12))) + + +def test_write_parameters(): + # Test that we can write to coef_ and intercept_ + clf = LogisticRegression(random_state=0) + clf.fit(X, Y1) + clf.coef_[:] = 0 + clf.intercept_[:] = 0 + assert_array_almost_equal(clf.decision_function(X), 0) + + +def test_nan(): + # Test proper NaN handling. + # Regression test for Issue #252: fit used to go into an infinite loop. 
+ Xnan = np.array(X, dtype=np.float64) + Xnan[0, 1] = np.nan + logistic = LogisticRegression(random_state=0) + + with pytest.raises(ValueError): + logistic.fit(Xnan, Y1) + + +def test_consistency_path(): + # Test that the path algorithm is consistent + rng = np.random.RandomState(0) + X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) + y = [1] * 100 + [-1] * 100 + Cs = np.logspace(0, 4, 10) + + f = ignore_warnings + # can't test with fit_intercept=True since LIBLINEAR + # penalizes the intercept + for solver in ["sag", "saga"]: + coefs, Cs, _ = f(_logistic_regression_path)( + X, + y, + Cs=Cs, + fit_intercept=False, + tol=1e-5, + solver=solver, + max_iter=1000, + random_state=0, + ) + for i, C in enumerate(Cs): + lr = LogisticRegression( + C=C, + fit_intercept=False, + tol=1e-5, + solver=solver, + random_state=0, + max_iter=1000, + ) + lr.fit(X, y) + lr_coef = lr.coef_.ravel() + assert_array_almost_equal( + lr_coef, coefs[i], decimal=4, err_msg="with solver = %s" % solver + ) + + # test for fit_intercept=True + for solver in ("lbfgs", "newton-cg", "newton-cholesky", "liblinear", "sag", "saga"): + Cs = [1e3] + coefs, Cs, _ = f(_logistic_regression_path)( + X, + y, + Cs=Cs, + tol=1e-6, + solver=solver, + intercept_scaling=10000.0, + random_state=0, + ) + lr = LogisticRegression( + C=Cs[0], + tol=1e-6, + intercept_scaling=10000.0, + random_state=0, + solver=solver, + ) + lr.fit(X, y) + lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_]) + assert_array_almost_equal( + lr_coef, coefs[0], decimal=4, err_msg="with solver = %s" % solver + ) + + +def test_logistic_regression_path_convergence_fail(): + rng = np.random.RandomState(0) + X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) + y = [1] * 100 + [-1] * 100 + Cs = [1e3] + + # Check that the convergence message points to both a model agnostic + # advice (scaling the data) and to the logistic regression specific + # documentation that includes hints on the solver configuration. 
+ with pytest.warns(ConvergenceWarning) as record: + _logistic_regression_path( + X, y, Cs=Cs, tol=0.0, max_iter=1, random_state=0, verbose=0 + ) + + assert len(record) == 1 + warn_msg = record[0].message.args[0] + assert "lbfgs failed to converge after 1 iteration(s)" in warn_msg + assert "Increase the number of iterations" in warn_msg + assert "scale the data" in warn_msg + assert "linear_model.html#logistic-regression" in warn_msg + + +def test_liblinear_dual_random_state(): + # random_state is relevant for liblinear solver only if dual=True + X, y = make_classification(n_samples=20, random_state=0) + lr1 = LogisticRegression( + random_state=0, + dual=True, + tol=1e-3, + solver="liblinear", + ) + lr1.fit(X, y) + lr2 = LogisticRegression( + random_state=0, + dual=True, + tol=1e-3, + solver="liblinear", + ) + lr2.fit(X, y) + lr3 = LogisticRegression( + random_state=8, + dual=True, + tol=1e-3, + solver="liblinear", + ) + lr3.fit(X, y) + + # same result for same random state + assert_array_almost_equal(lr1.coef_, lr2.coef_) + # different results for different random states + msg = "Arrays are not almost equal to 6 decimals" + with pytest.raises(AssertionError, match=msg): + assert_array_almost_equal(lr1.coef_, lr3.coef_) + + +def test_logistic_cv(): + # test for LogisticRegressionCV object + n_samples, n_features = 50, 5 + rng = np.random.RandomState(0) + X_ref = rng.randn(n_samples, n_features) + y = np.sign(X_ref.dot(5 * rng.randn(n_features))) + X_ref -= X_ref.mean() + X_ref /= X_ref.std() + lr_cv = LogisticRegressionCV( + Cs=[1.0], fit_intercept=False, solver="liblinear", cv=3 + ) + lr_cv.fit(X_ref, y) + lr = LogisticRegression(C=1.0, fit_intercept=False, solver="liblinear") + lr.fit(X_ref, y) + assert_array_almost_equal(lr.coef_, lr_cv.coef_) + + assert_array_equal(lr_cv.coef_.shape, (1, n_features)) + assert_array_equal(lr_cv.classes_, [-1, 1]) + assert len(lr_cv.classes_) == 2 + + coefs_paths = np.asarray(list(lr_cv.coefs_paths_.values())) + assert_array_equal(coefs_paths.shape, (1, 3, 1, n_features)) + assert_array_equal(lr_cv.Cs_.shape, (1,)) + scores = np.asarray(list(lr_cv.scores_.values())) + assert_array_equal(scores.shape, (1, 3, 1)) + + +@pytest.mark.parametrize( + "scoring, multiclass_agg_list", + [ + ("accuracy", [""]), + ("precision", ["_macro", "_weighted"]), + # no need to test for micro averaging because it + # is the same as accuracy for f1, precision, + # and recall (see https://github.com/ + # scikit-learn/scikit-learn/pull/ + # 11578#discussion_r203250062) + ("f1", ["_macro", "_weighted"]), + ("neg_log_loss", [""]), + ("recall", ["_macro", "_weighted"]), + ], +) +def test_logistic_cv_multinomial_score(scoring, multiclass_agg_list): + # test that LogisticRegressionCV uses the right score to compute its + # cross-validation scores when using a multinomial scoring + # see https://github.com/scikit-learn/scikit-learn/issues/8720 + X, y = make_classification( + n_samples=100, random_state=0, n_classes=3, n_informative=6 + ) + train, test = np.arange(80), np.arange(80, 100) + lr = LogisticRegression(C=1.0) + # we use lbfgs to support multinomial + params = lr.get_params() + # we store the params to set them further in _log_reg_scoring_path + for key in ["C", "n_jobs", "warm_start"]: + del params[key] + lr.fit(X[train], y[train]) + for averaging in multiclass_agg_list: + scorer = get_scorer(scoring + averaging) + assert_array_almost_equal( + _log_reg_scoring_path( + X, + y, + train, + test, + Cs=[1.0], + scoring=scorer, + pos_class=None, + max_squared_sum=None, + 
sample_weight=None, + score_params=None, + **(params | {"multi_class": "multinomial"}), + )[2][0], + scorer(lr, X[test], y[test]), + ) + + +def test_multinomial_logistic_regression_string_inputs(): + # Test with string labels for LogisticRegression(CV) + n_samples, n_features, n_classes = 50, 5, 3 + X_ref, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + n_informative=3, + random_state=0, + ) + y_str = LabelEncoder().fit(["bar", "baz", "foo"]).inverse_transform(y) + # For numerical labels, let y values be taken from set (-1, 0, 1) + y = np.array(y) - 1 + # Test for string labels + lr = LogisticRegression() + lr_cv = LogisticRegressionCV(Cs=3) + lr_str = LogisticRegression() + lr_cv_str = LogisticRegressionCV(Cs=3) + + lr.fit(X_ref, y) + lr_cv.fit(X_ref, y) + lr_str.fit(X_ref, y_str) + lr_cv_str.fit(X_ref, y_str) + + assert_array_almost_equal(lr.coef_, lr_str.coef_) + assert sorted(lr_str.classes_) == ["bar", "baz", "foo"] + assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_) + assert sorted(lr_str.classes_) == ["bar", "baz", "foo"] + assert sorted(lr_cv_str.classes_) == ["bar", "baz", "foo"] + + # The predictions should be in original labels + assert sorted(np.unique(lr_str.predict(X_ref))) == ["bar", "baz", "foo"] + assert sorted(np.unique(lr_cv_str.predict(X_ref))) == ["bar", "baz", "foo"] + + # Make sure class weights can be given with string labels + lr_cv_str = LogisticRegression(class_weight={"bar": 1, "baz": 2, "foo": 0}).fit( + X_ref, y_str + ) + assert sorted(np.unique(lr_cv_str.predict(X_ref))) == ["bar", "baz"] + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_logistic_cv_sparse(csr_container): + X, y = make_classification(n_samples=50, n_features=5, random_state=0) + X[X < 1.0] = 0.0 + csr = csr_container(X) + + clf = LogisticRegressionCV() + clf.fit(X, y) + clfs = LogisticRegressionCV() + clfs.fit(csr, y) + assert_array_almost_equal(clfs.coef_, clf.coef_) + assert_array_almost_equal(clfs.intercept_, clf.intercept_) + assert clfs.C_ == clf.C_ + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Best remove this whole test. +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_ovr_multinomial_iris(): + # Test that OvR and multinomial are correct using the iris dataset. + train, target = iris.data, iris.target + n_samples, n_features = train.shape + + # The cv indices from stratified kfold (where stratification is done based + # on the fine-grained iris classes, i.e, before the classes 0 and 1 are + # conflated) is used for both clf and clf1 + n_cv = 2 + cv = StratifiedKFold(n_cv) + precomputed_folds = list(cv.split(train, target)) + + # Train clf on the original dataset where classes 0 and 1 are separated + clf = LogisticRegressionCV(cv=precomputed_folds, multi_class="ovr") + clf.fit(train, target) + + # Conflate classes 0 and 1 and train clf1 on this modified dataset + clf1 = LogisticRegressionCV(cv=precomputed_folds, multi_class="ovr") + target_copy = target.copy() + target_copy[target_copy == 0] = 1 + clf1.fit(train, target_copy) + + # Ensure that what OvR learns for class2 is same regardless of whether + # classes 0 and 1 are separated or not + assert_allclose(clf.scores_[2], clf1.scores_[2]) + assert_allclose(clf.intercept_[2:], clf1.intercept_) + assert_allclose(clf.coef_[2][np.newaxis, :], clf1.coef_) + + # Test the shape of various attributes. 
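+    # (Hedged restatement of the layout checked below: with the default grid of
+    # 10 Cs values, each per-class entry of `coefs_paths_` is laid out as
+    # (n_folds, n_Cs, n_features + 1), the trailing +1 being the intercept
+    # column, while each per-class entry of `scores_` is (n_folds, n_Cs).)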
+ assert clf.coef_.shape == (3, n_features) + assert_array_equal(clf.classes_, [0, 1, 2]) + coefs_paths = np.asarray(list(clf.coefs_paths_.values())) + assert coefs_paths.shape == (3, n_cv, 10, n_features + 1) + assert clf.Cs_.shape == (10,) + scores = np.asarray(list(clf.scores_.values())) + assert scores.shape == (3, n_cv, 10) + + # Test that for the iris data multinomial gives a better accuracy than OvR + for solver in ["lbfgs", "newton-cg", "sag", "saga"]: + max_iter = 500 if solver in ["sag", "saga"] else 30 + clf_multi = LogisticRegressionCV( + solver=solver, + max_iter=max_iter, + random_state=42, + tol=1e-3 if solver in ["sag", "saga"] else 1e-2, + cv=2, + ) + if solver == "lbfgs": + # lbfgs requires scaling to avoid convergence warnings + train = scale(train) + + clf_multi.fit(train, target) + multi_score = clf_multi.score(train, target) + ovr_score = clf.score(train, target) + assert multi_score > ovr_score + + # Test attributes of LogisticRegressionCV + assert clf.coef_.shape == clf_multi.coef_.shape + assert_array_equal(clf_multi.classes_, [0, 1, 2]) + coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values())) + assert coefs_paths.shape == (3, n_cv, 10, n_features + 1) + assert clf_multi.Cs_.shape == (10,) + scores = np.asarray(list(clf_multi.scores_.values())) + assert scores.shape == (3, n_cv, 10) + + +def test_logistic_regression_solvers(): + """Test solvers converge to the same result.""" + X, y = make_classification(n_features=10, n_informative=5, random_state=0) + + params = dict(fit_intercept=False, random_state=42) + + regressors = { + solver: LogisticRegression(solver=solver, **params).fit(X, y) + for solver in SOLVERS + } + + for solver_1, solver_2 in itertools.combinations(regressors, r=2): + assert_array_almost_equal( + regressors[solver_1].coef_, regressors[solver_2].coef_, decimal=3 + ) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_logistic_regression_solvers_multiclass(fit_intercept): + """Test solvers converge to the same result for multiclass problems.""" + X, y = make_classification( + n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0 + ) + tol = 1e-8 + params = dict(fit_intercept=fit_intercept, tol=tol, random_state=42) + + # Override max iteration count for specific solvers to allow for + # proper convergence. + solver_max_iter = {"lbfgs": 200, "sag": 10_000, "saga": 10_000} + + regressors = { + solver: LogisticRegression( + solver=solver, max_iter=solver_max_iter.get(solver, 100), **params + ).fit(X, y) + for solver in set(SOLVERS) - set(["liblinear"]) + } + + for solver_1, solver_2 in itertools.combinations(regressors, r=2): + assert_allclose( + regressors[solver_1].coef_, + regressors[solver_2].coef_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + if fit_intercept: + assert_allclose( + regressors[solver_1].intercept_, + regressors[solver_2].intercept_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_logistic_regression_solvers_multiclass_unpenalized( + fit_intercept, global_random_seed +): + """Test and compare solver results for unpenalized multinomial multiclass.""" + # We want to avoid perfect separation. 
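+    # (Hedged rationale: with C=np.inf there is no penalty, so a perfectly
+    # separable sample would let the likelihood be improved indefinitely by
+    # rescaling the coefficients and no finite optimum would exist for the
+    # solvers to agree on. The data below is therefore drawn from an explicit
+    # multinomial model with overlapping classes.)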
+ n_samples, n_features, n_classes = 100, 4, 3 + rng = np.random.RandomState(global_random_seed) + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features + fit_intercept, + effective_rank=n_features + fit_intercept, + tail_strength=0.1, + random_state=rng, + ) + if fit_intercept: + X[:, -1] = 1 + U, s, Vt = svd(X) + assert np.all(s > 1e-3) # to be sure that X is not singular + assert np.max(s) / np.min(s) < 100 # condition number of X + if fit_intercept: + X = X[:, :-1] + coef = rng.uniform(low=1, high=3, size=n_features * n_classes) + coef = coef.reshape(n_classes, n_features) + intercept = rng.uniform(low=-1, high=1, size=n_classes) * fit_intercept + raw_prediction = X @ coef.T + intercept + + loss = HalfMultinomialLoss(n_classes=n_classes) + proba = loss.link.inverse(raw_prediction) + # Only newer numpy version (1.22) support more dimensions on pvals. + y = np.zeros(n_samples) + for i in range(n_samples): + y[i] = np.argwhere(rng.multinomial(n=1, pvals=proba[i, :]))[0, 0] + + tol = 1e-9 + params = dict(fit_intercept=fit_intercept, random_state=42) + solver_max_iter = {"lbfgs": 200, "sag": 10_000, "saga": 10_000} + solver_tol = {"sag": 1e-8, "saga": 1e-8} + regressors = { + solver: LogisticRegression( + C=np.inf, + solver=solver, + tol=solver_tol.get(solver, tol), + max_iter=solver_max_iter.get(solver, 100), + **params, + ).fit(X, y) + for solver in set(SOLVERS) - set(["liblinear"]) + } + for solver in regressors.keys(): + # See the docstring of test_multinomial_identifiability_on_iris for reference. + assert_allclose( + regressors[solver].coef_.sum(axis=0), 0, atol=1e-10, err_msg=solver + ) + + for solver_1, solver_2 in itertools.combinations(regressors, r=2): + assert_allclose( + regressors[solver_1].coef_, + regressors[solver_2].coef_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 2e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + if fit_intercept: + assert_allclose( + regressors[solver_1].intercept_, + regressors[solver_2].intercept_, + rtol=5e-3 if (solver_1 == "saga" or solver_2 == "saga") else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", + ) + + +@pytest.mark.parametrize("weight", [{0: 0.1, 1: 0.2}, {0: 0.1, 1: 0.2, 2: 0.5}]) +@pytest.mark.parametrize("class_weight", ["weight", "balanced"]) +def test_logistic_regressioncv_class_weights(weight, class_weight, global_random_seed): + """Test class_weight for LogisticRegressionCV.""" + n_classes = len(weight) + if class_weight == "weight": + class_weight = weight + + X, y = make_classification( + n_samples=30, + n_features=3, + n_repeated=0, + n_informative=3, + n_redundant=0, + n_classes=n_classes, + random_state=global_random_seed, + ) + params = dict( + Cs=1, + fit_intercept=False, + class_weight=class_weight, + tol=1e-8, + ) + clf_lbfgs = LogisticRegressionCV(solver="lbfgs", **params) + + # XXX: lbfgs' line search can fail and cause a ConvergenceWarning for some + # 10% of the random seeds, but only on specific platforms (in particular + # when using Atlas BLAS/LAPACK implementation). Doubling the maxls internal + # parameter of the solver does not help. However this lack of proper + # convergence does not seem to prevent the assertion to pass, so we ignore + # the warning for now. 
+ # See: https://github.com/scikit-learn/scikit-learn/pull/27649 + with ignore_warnings(category=ConvergenceWarning): + clf_lbfgs.fit(X, y) + + for solver in set(SOLVERS) - set(["lbfgs", "liblinear", "newton-cholesky"]): + clf = LogisticRegressionCV(solver=solver, **params) + if solver in ("sag", "saga"): + clf.set_params( + tol=1e-18, max_iter=10000, random_state=global_random_seed + 1 + ) + clf.fit(X, y) + + assert_allclose( + clf.coef_, clf_lbfgs.coef_, rtol=1e-3, err_msg=f"{solver} vs lbfgs" + ) + + +@pytest.mark.parametrize("problem", ("single", "cv")) +@pytest.mark.parametrize( + "solver", ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga") +) +def test_logistic_regression_sample_weights(problem, solver, global_random_seed): + n_samples_per_cv_group = 200 + n_cv_groups = 3 + + X, y = make_classification( + n_samples=n_samples_per_cv_group * n_cv_groups, + n_features=5, + n_informative=3, + n_classes=2, + n_redundant=0, + random_state=global_random_seed, + ) + rng = np.random.RandomState(global_random_seed) + sw = np.ones(y.shape[0]) + + kw_weighted = { + "random_state": global_random_seed, + "fit_intercept": False, + "max_iter": 100_000 if solver.startswith("sag") else 1_000, + "tol": 1e-8, + } + kw_repeated = kw_weighted.copy() + sw[:n_samples_per_cv_group] = rng.randint(0, 5, size=n_samples_per_cv_group) + X_repeated = np.repeat(X, sw.astype(int), axis=0) + y_repeated = np.repeat(y, sw.astype(int), axis=0) + + if problem == "single": + LR = LogisticRegression + elif problem == "cv": + LR = LogisticRegressionCV + # We weight the first fold 2 times more. + groups_weighted = np.concatenate( + [ + np.full(n_samples_per_cv_group, 0), + np.full(n_samples_per_cv_group, 1), + np.full(n_samples_per_cv_group, 2), + ] + ) + splits_weighted = list(LeaveOneGroupOut().split(X, groups=groups_weighted)) + kw_weighted.update({"Cs": 100, "cv": splits_weighted}) + + groups_repeated = np.repeat(groups_weighted, sw.astype(int), axis=0) + splits_repeated = list( + LeaveOneGroupOut().split(X_repeated, groups=groups_repeated) + ) + kw_repeated.update({"Cs": 100, "cv": splits_repeated}) + + clf_sw_weighted = LR(solver=solver, **kw_weighted) + clf_sw_repeated = LR(solver=solver, **kw_repeated) + + if solver == "lbfgs": + # lbfgs has convergence issues on the data but this should not impact + # the quality of the results. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf_sw_weighted.fit(X, y, sample_weight=sw) + clf_sw_repeated.fit(X_repeated, y_repeated) + + else: + clf_sw_weighted.fit(X, y, sample_weight=sw) + clf_sw_repeated.fit(X_repeated, y_repeated) + + if problem == "cv": + assert_allclose(clf_sw_weighted.scores_[1], clf_sw_repeated.scores_[1]) + assert_allclose(clf_sw_weighted.coef_, clf_sw_repeated.coef_, atol=1e-5) + + +@pytest.mark.parametrize( + "solver", ("lbfgs", "newton-cg", "newton-cholesky", "sag", "saga") +) +def test_logistic_regression_solver_class_weights(solver, global_random_seed): + # Test that passing class_weight as [1, 2] is the same as + # passing class weight = [1,1] but adjusting sample weights + # to be 2 for all instances of class 1. 
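+    # Hedged sketch of the equivalence exercised here: for a binary target,
+    # class_weight={0: 1, 1: 2} weights a sample by 1 if y == 0 and by 2 if
+    # y == 1, i.e. exactly the `sample_weight = y + 1` vector built below, e.g.
+    #     np.array([1, 2])[np.array([0, 1, 1])]  # -> array([1, 2, 2])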
+ + X, y = make_classification( + n_samples=300, + n_features=5, + n_informative=3, + n_classes=2, + random_state=global_random_seed, + ) + + sample_weight = y + 1 + + kw_weighted = { + "random_state": global_random_seed, + "fit_intercept": False, + "max_iter": 100_000, + "tol": 1e-8, + } + clf_cw_12 = LogisticRegression( + solver=solver, class_weight={0: 1, 1: 2}, **kw_weighted + ) + clf_cw_12.fit(X, y) + clf_sw_12 = LogisticRegression(solver=solver, **kw_weighted) + clf_sw_12.fit(X, y, sample_weight=sample_weight) + assert_allclose(clf_cw_12.coef_, clf_sw_12.coef_, atol=1e-6) + + +def test_sample_and_class_weight_equivalence_liblinear(global_random_seed): + # Test the above for l1 penalty and l2 penalty with dual=True. + # since the patched liblinear code is different. + + X, y = make_classification( + n_samples=300, + n_features=5, + n_informative=3, + n_classes=2, + random_state=global_random_seed, + ) + + sample_weight = y + 1 + + clf_cw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + class_weight={0: 1, 1: 2}, + penalty="l1", + max_iter=10_000, + tol=1e-12, + random_state=global_random_seed, + ) + clf_cw.fit(X, y) + clf_sw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + penalty="l1", + max_iter=10_000, + tol=1e-12, + random_state=global_random_seed, + ) + clf_sw.fit(X, y, sample_weight) + assert_allclose(clf_cw.coef_, clf_sw.coef_, atol=1e-10) + + clf_cw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + class_weight={0: 1, 1: 2}, + penalty="l2", + max_iter=10_000, + tol=1e-12, + dual=True, + random_state=global_random_seed, + ) + clf_cw.fit(X, y) + clf_sw = LogisticRegression( + solver="liblinear", + fit_intercept=False, + penalty="l2", + max_iter=10_000, + tol=1e-12, + dual=True, + random_state=global_random_seed, + ) + clf_sw.fit(X, y, sample_weight) + assert_allclose(clf_cw.coef_, clf_sw.coef_, atol=1e-10) + + +def _compute_class_weight_dictionary(y): + # helper for returning a dictionary instead of an array + classes = np.unique(y) + class_weight = compute_class_weight("balanced", classes=classes, y=y) + class_weight_dict = dict(zip(classes, class_weight)) + return class_weight_dict + + +@pytest.mark.parametrize("csr_container", [lambda x: x] + CSR_CONTAINERS) +def test_logistic_regression_class_weights(csr_container): + # Scale data to avoid convergence warnings with the lbfgs solver + X_iris = scale(iris.data) + # Multinomial case: remove 90% of class 0 + X = X_iris[45:, :] + X = csr_container(X) + y = iris.target[45:] + class_weight_dict = _compute_class_weight_dictionary(y) + + for solver in set(SOLVERS) - set(["liblinear", "newton-cholesky"]): + params = dict(solver=solver, max_iter=1000) + clf1 = LogisticRegression(class_weight="balanced", **params) + clf2 = LogisticRegression(class_weight=class_weight_dict, **params) + clf1.fit(X, y) + clf2.fit(X, y) + assert len(clf1.classes_) == 3 + assert_allclose(clf1.coef_, clf2.coef_, rtol=1e-4) + # Same as appropriate sample_weight. 
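+        # (Hedged reminder: "balanced" resolves each class weight to
+        # n_samples / (n_classes * n_c), which is what
+        # _compute_class_weight_dictionary returned above, so folding it into
+        # per-sample weights below should reach the same optimum.)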
+        sw = np.ones(X.shape[0])
+        for c in clf1.classes_:
+            sw[y == c] *= class_weight_dict[c]
+        clf3 = LogisticRegression(**params).fit(X, y, sample_weight=sw)
+        assert_allclose(clf3.coef_, clf2.coef_, rtol=1e-4)
+
+    # Binary case: remove 90% of class 0 and 100% of class 2
+    X = X_iris[45:100, :]
+    y = iris.target[45:100]
+    class_weight_dict = _compute_class_weight_dictionary(y)
+
+    for solver in SOLVERS:
+        params = dict(solver=solver, max_iter=1000)
+        clf1 = LogisticRegression(class_weight="balanced", **params)
+        clf2 = LogisticRegression(class_weight=class_weight_dict, **params)
+        clf1.fit(X, y)
+        clf2.fit(X, y)
+        assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=6)
+
+
+def test_logistic_regression_multinomial():
+    # Tests for the multinomial option in logistic regression
+
+    # Some basic attributes of Logistic Regression
+    n_samples, n_features, n_classes = 50, 20, 3
+    X, y = make_classification(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=10,
+        n_classes=n_classes,
+        random_state=0,
+    )
+
+    X = StandardScaler(with_mean=False).fit_transform(X)
+
+    # 'lbfgs' is used as the reference solver
+    solver = "lbfgs"
+    ref_i = LogisticRegression(solver=solver, tol=1e-6)
+    ref_w = LogisticRegression(solver=solver, fit_intercept=False, tol=1e-6)
+    ref_i.fit(X, y)
+    ref_w.fit(X, y)
+    assert ref_i.coef_.shape == (n_classes, n_features)
+    assert ref_w.coef_.shape == (n_classes, n_features)
+    for solver in ["sag", "saga", "newton-cg"]:
+        clf_i = LogisticRegression(
+            solver=solver,
+            random_state=42,
+            max_iter=2000,
+            tol=1e-7,
+        )
+        clf_w = LogisticRegression(
+            solver=solver,
+            random_state=42,
+            max_iter=2000,
+            tol=1e-7,
+            fit_intercept=False,
+        )
+        clf_i.fit(X, y)
+        clf_w.fit(X, y)
+        assert clf_i.coef_.shape == (n_classes, n_features)
+        assert clf_w.coef_.shape == (n_classes, n_features)
+
+        # Compare solutions between lbfgs and the other solvers
+        assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-3)
+        assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-2)
+        assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-3)
+
+    # Test that the path gives almost the same results. However, since in this
+    # case we take the average of the coefs after fitting across all the
+    # folds, it need not be exactly the same.
+    for solver in ["lbfgs", "newton-cg", "sag", "saga"]:
+        clf_path = LogisticRegressionCV(
+            solver=solver, max_iter=2000, tol=1e-6, Cs=[1.0]
+        )
+        clf_path.fit(X, y)
+        assert_allclose(clf_path.coef_, ref_i.coef_, rtol=1e-2)
+        assert_allclose(clf_path.intercept_, ref_i.intercept_, rtol=1e-2)
+
+
+def test_liblinear_decision_function_zero():
+    # Test negative prediction when decision_function values are zero.
+    # Liblinear predicts the positive class when decision_function values
+    # are zero. This is a test to verify that we do not do the same.
+    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
+    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
+    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
+    clf = LogisticRegression(fit_intercept=False, solver="liblinear")
+    clf.fit(X, y)
+
+    # Dummy data such that the decision function becomes zero.
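+    # Hedged illustration (added example, not part of the original test): with
+    # fit_intercept=False the decision function is just X @ coef_.T, so an
+    # all-zero row scores exactly 0 and the tie must go to classes_[0].
+    assert_allclose(clf.decision_function(np.zeros((1, 5))), 0)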
+    X = np.zeros((5, 5))
+    assert_array_equal(clf.predict(X), np.zeros(5))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_liblinear_logregcv_sparse(csr_container):
+    # Test LogRegCV with solver='liblinear' works for sparse matrices
+
+    X, y = make_classification(n_samples=10, n_features=5, random_state=0)
+    clf = LogisticRegressionCV(solver="liblinear")
+    clf.fit(csr_container(X), y)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_saga_sparse(csr_container):
+    # Test LogRegCV with solver='saga' works for sparse matrices
+
+    X, y = make_classification(n_samples=10, n_features=5, random_state=0)
+    clf = LogisticRegressionCV(solver="saga", tol=1e-2)
+    clf.fit(csr_container(X), y)
+
+
+def test_logreg_intercept_scaling_zero():
+    # Test that intercept_scaling is ignored when fit_intercept is False
+
+    clf = LogisticRegression(fit_intercept=False)
+    clf.fit(X, Y1)
+    assert clf.intercept_ == 0.0
+
+
+def test_logreg_l1():
+    # Because liblinear penalizes the intercept and saga does not, we do not
+    # fit the intercept to make it possible to compare the coefficients of
+    # the two models at convergence.
+    rng = np.random.RandomState(42)
+    n_samples = 50
+    X, y = make_classification(n_samples=n_samples, n_features=20, random_state=0)
+    X_noise = rng.normal(size=(n_samples, 3))
+    X_constant = np.ones(shape=(n_samples, 2))
+    X = np.concatenate((X, X_noise, X_constant), axis=1)
+    lr_liblinear = LogisticRegression(
+        penalty="l1",
+        C=1.0,
+        solver="liblinear",
+        fit_intercept=False,
+        tol=1e-10,
+    )
+    lr_liblinear.fit(X, y)
+
+    lr_saga = LogisticRegression(
+        penalty="l1",
+        C=1.0,
+        solver="saga",
+        fit_intercept=False,
+        max_iter=1000,
+        tol=1e-10,
+    )
+    lr_saga.fit(X, y)
+    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)
+
+    # Noise and constant features should be regularized to zero by the l1
+    # penalty
+    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
+    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_logreg_l1_sparse_data(csr_container):
+    # Because liblinear penalizes the intercept and saga does not, we do not
+    # fit the intercept to make it possible to compare the coefficients of
+    # the two models at convergence.
+ rng = np.random.RandomState(42) + n_samples = 50 + X, y = make_classification(n_samples=n_samples, n_features=20, random_state=0) + X_noise = rng.normal(scale=0.1, size=(n_samples, 3)) + X_constant = np.zeros(shape=(n_samples, 2)) + X = np.concatenate((X, X_noise, X_constant), axis=1) + X[X < 1] = 0 + X = csr_container(X) + + lr_liblinear = LogisticRegression( + penalty="l1", + C=1.0, + solver="liblinear", + fit_intercept=False, + tol=1e-10, + ) + lr_liblinear.fit(X, y) + + lr_saga = LogisticRegression( + penalty="l1", + C=1.0, + solver="saga", + fit_intercept=False, + max_iter=1000, + tol=1e-10, + ) + lr_saga.fit(X, y) + assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_) + # Noise and constant features should be regularized to zero by the l1 + # penalty + assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5)) + assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5)) + + # Check that solving on the sparse and dense data yield the same results + lr_saga_dense = LogisticRegression( + penalty="l1", + C=1.0, + solver="saga", + fit_intercept=False, + max_iter=1000, + tol=1e-10, + ) + lr_saga_dense.fit(X.toarray(), y) + assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_) + + +@pytest.mark.parametrize("random_seed", [42]) +@pytest.mark.parametrize("penalty", ["l1", "l2"]) +def test_logistic_regression_cv_refit(random_seed, penalty): + # Test that when refit=True, logistic regression cv with the saga solver + # converges to the same solution as logistic regression with a fixed + # regularization parameter. + # Internally the LogisticRegressionCV model uses a warm start to refit on + # the full data model with the optimal C found by CV. As the penalized + # logistic regression loss is convex, we should still recover exactly + # the same solution as long as the stopping criterion is strict enough (and + # that there are no exactly duplicated features when penalty='l1'). + X, y = make_classification(n_samples=100, n_features=20, random_state=random_seed) + common_params = dict( + solver="saga", + penalty=penalty, + random_state=random_seed, + max_iter=1000, + tol=1e-12, + ) + lr_cv = LogisticRegressionCV(Cs=[1.0], refit=True, **common_params) + lr_cv.fit(X, y) + lr = LogisticRegression(C=1.0, **common_params) + lr.fit(X, y) + assert_array_almost_equal(lr_cv.coef_, lr.coef_) + + +def test_logreg_predict_proba_multinomial(): + X, y = make_classification( + n_samples=10, n_features=20, random_state=0, n_classes=3, n_informative=10 + ) + + # Predicted probabilities using the true-entropy loss should give a + # smaller loss than those using the ovr method. + clf_multi = LogisticRegression(solver="lbfgs") + clf_multi.fit(X, y) + clf_multi_loss = log_loss(y, clf_multi.predict_proba(X)) + clf_ovr = OneVsRestClassifier(LogisticRegression(solver="lbfgs")) + clf_ovr.fit(X, y) + clf_ovr_loss = log_loss(y, clf_ovr.predict_proba(X)) + assert clf_ovr_loss > clf_multi_loss + + # Predicted probabilities using the soft-max function should give a + # smaller loss than those using the logistic function. 
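+    # Hedged illustration (added example): the multinomial probabilities are a
+    # softmax over the decision function, so every predicted row sums to one.
+    assert_allclose(clf_multi.predict_proba(X).sum(axis=1), 1.0)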
+ clf_multi_loss = log_loss(y, clf_multi.predict_proba(X)) + clf_wrong_loss = log_loss(y, clf_multi._predict_proba_lr(X)) + assert clf_wrong_loss > clf_multi_loss + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("max_iter", np.arange(1, 5)) +@pytest.mark.parametrize("multi_class", ["ovr", "multinomial"]) +@pytest.mark.parametrize( + "solver, message", + [ + ( + "newton-cg", + "newton-cg failed to converge.* Increase the number of iterations.", + ), + ( + "liblinear", + "Liblinear failed to converge, increase the number of iterations.", + ), + ("sag", "The max_iter was reached which means the coef_ did not converge"), + ("saga", "The max_iter was reached which means the coef_ did not converge"), + ("lbfgs", "lbfgs failed to converge"), + ("newton-cholesky", "Newton solver did not converge after [0-9]* iterations"), + ], +) +def test_max_iter(max_iter, multi_class, solver, message): + # Test that the maximum number of iteration is reached + X, y_bin = iris.data, iris.target.copy() + y_bin[y_bin == 2] = 0 + + if solver in ("liblinear",) and multi_class == "multinomial": + pytest.skip("'multinomial' is not supported by liblinear") + if solver == "newton-cholesky" and max_iter > 1: + pytest.skip("solver newton-cholesky might converge very fast") + + lr = LogisticRegression( + max_iter=max_iter, + tol=1e-15, + multi_class=multi_class, + random_state=0, + solver=solver, + ) + with pytest.warns(ConvergenceWarning, match=message): + lr.fit(X, y_bin) + + assert lr.n_iter_[0] == max_iter + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize("solver", SOLVERS) +def test_n_iter(solver): + # Test that self.n_iter_ has the correct format. + X, y = iris.data, iris.target + if solver == "lbfgs": + # lbfgs requires scaling to avoid convergence warnings + X = scale(X) + + n_classes = np.unique(y).shape[0] + assert n_classes == 3 + + # Also generate a binary classification sub-problem. + y_bin = y.copy() + y_bin[y_bin == 2] = 0 + + n_Cs = 4 + n_cv_fold = 2 + + # Binary classification case + clf = LogisticRegression(tol=1e-2, C=1.0, solver=solver, random_state=42) + clf.fit(X, y_bin) + assert clf.n_iter_.shape == (1,) + + clf_cv = LogisticRegressionCV( + tol=1e-2, solver=solver, Cs=n_Cs, cv=n_cv_fold, random_state=42 + ) + clf_cv.fit(X, y_bin) + assert clf_cv.n_iter_.shape == (1, n_cv_fold, n_Cs) + + # OvR case + clf.set_params(multi_class="ovr").fit(X, y) + assert clf.n_iter_.shape == (n_classes,) + + clf_cv.set_params(multi_class="ovr").fit(X, y) + assert clf_cv.n_iter_.shape == (n_classes, n_cv_fold, n_Cs) + + # multinomial case + if solver in ("liblinear",): + # This solver only supports one-vs-rest multiclass classification. 
+ return + + # When using the multinomial objective function, there is a single + # optimization problem to solve for all classes at once: + clf.set_params(multi_class="multinomial").fit(X, y) + assert clf.n_iter_.shape == (1,) + + clf_cv.set_params(multi_class="multinomial").fit(X, y) + assert clf_cv.n_iter_.shape == (1, n_cv_fold, n_Cs) + + +@pytest.mark.parametrize("solver", sorted(set(SOLVERS) - set(["liblinear"]))) +@pytest.mark.parametrize("warm_start", (True, False)) +@pytest.mark.parametrize("fit_intercept", (True, False)) +def test_warm_start(solver, warm_start, fit_intercept): + # A 1-iteration second fit on same data should give almost same result + # with warm starting, and quite different result without warm starting. + # Warm starting does not work with liblinear solver. + X, y = iris.data, iris.target + + clf = LogisticRegression( + tol=1e-4, + warm_start=warm_start, + solver=solver, + random_state=42, + fit_intercept=fit_intercept, + ) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + coef_1 = clf.coef_ + + clf.max_iter = 1 + clf.fit(X, y) + cum_diff = np.sum(np.abs(coef_1 - clf.coef_)) + msg = ( + f"Warm starting issue with solver {solver}" + f"with {fit_intercept=} and {warm_start=}" + ) + if warm_start: + assert 2.0 > cum_diff, msg + else: + assert cum_diff > 2.0, msg + + +@pytest.mark.parametrize("solver", ["newton-cholesky", "newton-cg"]) +@pytest.mark.parametrize("fit_intercept", (True, False)) +@pytest.mark.parametrize("penalty", ("l2", None)) +def test_warm_start_newton_solver(global_random_seed, solver, fit_intercept, penalty): + """Test that 2 steps at once are the same as 2 single steps with warm start.""" + X, y = iris.data, iris.target + + clf1 = LogisticRegression( + solver=solver, + max_iter=2, + fit_intercept=fit_intercept, + penalty=penalty, + random_state=global_random_seed, + ) + with ignore_warnings(category=ConvergenceWarning): + clf1.fit(X, y) + + clf2 = LogisticRegression( + solver=solver, + max_iter=1, + warm_start=True, + fit_intercept=fit_intercept, + penalty=penalty, + random_state=global_random_seed, + ) + with ignore_warnings(category=ConvergenceWarning): + clf2.fit(X, y) + clf2.fit(X, y) + + assert_allclose(clf2.coef_, clf1.coef_) + if fit_intercept: + assert_allclose(clf2.intercept_, clf1.intercept_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_saga_vs_liblinear(csr_container): + iris = load_iris() + X, y = iris.data, iris.target + X = np.concatenate([X] * 3) + y = np.concatenate([y] * 3) + + X_bin = X[y <= 1] + y_bin = y[y <= 1] * 2 - 1 + + X_sparse, y_sparse = make_classification( + n_samples=50, n_features=20, random_state=0 + ) + X_sparse = csr_container(X_sparse) + + for X, y in ((X_bin, y_bin), (X_sparse, y_sparse)): + for penalty in ["l1", "l2"]: + n_samples = X.shape[0] + # alpha=1e-3 is time consuming + for alpha in np.logspace(-1, 1, 3): + saga = LogisticRegression( + C=1.0 / (n_samples * alpha), + solver="saga", + max_iter=200, + fit_intercept=False, + penalty=penalty, + random_state=0, + tol=1e-6, + ) + + liblinear = LogisticRegression( + C=1.0 / (n_samples * alpha), + solver="liblinear", + max_iter=200, + fit_intercept=False, + penalty=penalty, + random_state=0, + tol=1e-6, + ) + + saga.fit(X, y) + liblinear.fit(X, y) + # Convergence for alpha=1e-3 is very slow + assert_array_almost_equal(saga.coef_, liblinear.coef_, 3) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") 
+@pytest.mark.parametrize("multi_class", ["ovr", "multinomial"]) +@pytest.mark.parametrize( + "solver", ["liblinear", "newton-cg", "newton-cholesky", "saga"] +) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dtype_match(solver, multi_class, fit_intercept, csr_container): + # Test that np.float32 input data is not cast to np.float64 when possible + # and that the output is approximately the same no matter the input format. + + if solver == "liblinear" and multi_class == "multinomial": + pytest.skip(f"Solver={solver} does not support multinomial logistic.") + + out32_type = np.float64 if solver == "liblinear" else np.float32 + + X_32 = np.array(X).astype(np.float32) + y_32 = np.array(Y1).astype(np.float32) + X_64 = np.array(X).astype(np.float64) + y_64 = np.array(Y1).astype(np.float64) + X_sparse_32 = csr_container(X, dtype=np.float32) + X_sparse_64 = csr_container(X, dtype=np.float64) + solver_tol = 5e-4 + + lr_templ = LogisticRegression( + solver=solver, + multi_class=multi_class, + random_state=42, + tol=solver_tol, + fit_intercept=fit_intercept, + ) + + # Check 32-bit type consistency + lr_32 = clone(lr_templ) + lr_32.fit(X_32, y_32) + assert lr_32.coef_.dtype == out32_type + + # Check 32-bit type consistency with sparsity + lr_32_sparse = clone(lr_templ) + lr_32_sparse.fit(X_sparse_32, y_32) + assert lr_32_sparse.coef_.dtype == out32_type + + # Check 64-bit type consistency + lr_64 = clone(lr_templ) + lr_64.fit(X_64, y_64) + assert lr_64.coef_.dtype == np.float64 + + # Check 64-bit type consistency with sparsity + lr_64_sparse = clone(lr_templ) + lr_64_sparse.fit(X_sparse_64, y_64) + assert lr_64_sparse.coef_.dtype == np.float64 + + # solver_tol bounds the norm of the loss gradient + # dw ~= inv(H)*grad ==> |dw| ~= |inv(H)| * solver_tol, where H - hessian + # + # See https://github.com/scikit-learn/scikit-learn/pull/13645 + # + # with Z = np.hstack((np.ones((3,1)), np.array(X))) + # In [8]: np.linalg.norm(np.diag([0,2,2]) + np.linalg.inv((Z.T @ Z)/4)) + # Out[8]: 1.7193336918135917 + + # factor of 2 to get the ball diameter + atol = 2 * 1.72 * solver_tol + if os.name == "nt" and _IS_32BIT: + # FIXME + atol = 1e-2 + + # Check accuracy consistency + assert_allclose(lr_32.coef_, lr_64.coef_.astype(np.float32), atol=atol) + + if solver == "saga" and fit_intercept: + # FIXME: SAGA on sparse data fits the intercept inaccurately with the + # default tol and max_iter parameters. + atol = 1e-1 + + assert_allclose(lr_32.coef_, lr_32_sparse.coef_, atol=atol) + assert_allclose(lr_64.coef_, lr_64_sparse.coef_, atol=atol) + + +def test_warm_start_converge_LR(): + # Test to see that the logistic regression converges on warm start, + # with multi_class='multinomial'. 
Non-regressive test for #10836 + + rng = np.random.RandomState(0) + X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) + y = np.array([1] * 100 + [-1] * 100) + lr_no_ws = LogisticRegression(solver="sag", warm_start=False, random_state=0) + lr_ws = LogisticRegression(solver="sag", warm_start=True, random_state=0) + + lr_no_ws_loss = log_loss(y, lr_no_ws.fit(X, y).predict_proba(X)) + for i in range(5): + lr_ws.fit(X, y) + lr_ws_loss = log_loss(y, lr_ws.predict_proba(X)) + assert_allclose(lr_no_ws_loss, lr_ws_loss, rtol=1e-5) + + +def test_elastic_net_coeffs(): + # make sure elasticnet penalty gives different coefficients from l1 and l2 + # with saga solver (l1_ratio different from 0 or 1) + X, y = make_classification(random_state=0) + + C = 2.0 + l1_ratio = 0.5 + coeffs = list() + for penalty, ratio in (("elasticnet", l1_ratio), ("l1", None), ("l2", None)): + lr = LogisticRegression( + penalty=penalty, + C=C, + solver="saga", + random_state=0, + l1_ratio=ratio, + tol=1e-3, + max_iter=200, + ) + lr.fit(X, y) + coeffs.append(lr.coef_) + + elastic_net_coeffs, l1_coeffs, l2_coeffs = coeffs + # make sure coeffs differ by at least .1 + assert not np.allclose(elastic_net_coeffs, l1_coeffs, rtol=0, atol=0.1) + assert not np.allclose(elastic_net_coeffs, l2_coeffs, rtol=0, atol=0.1) + assert not np.allclose(l2_coeffs, l1_coeffs, rtol=0, atol=0.1) + + +@pytest.mark.parametrize("C", [0.001, 0.1, 1, 10, 100, 1000, 1e6]) +@pytest.mark.parametrize("penalty, l1_ratio", [("l1", 1), ("l2", 0)]) +def test_elastic_net_l1_l2_equivalence(C, penalty, l1_ratio): + # Make sure elasticnet is equivalent to l1 when l1_ratio=1 and to l2 when + # l1_ratio=0. + X, y = make_classification(random_state=0) + + lr_enet = LogisticRegression( + penalty="elasticnet", + C=C, + l1_ratio=l1_ratio, + solver="saga", + random_state=0, + tol=1e-2, + ) + lr_expected = LogisticRegression( + penalty=penalty, C=C, solver="saga", random_state=0, tol=1e-2 + ) + lr_enet.fit(X, y) + lr_expected.fit(X, y) + + assert_array_almost_equal(lr_enet.coef_, lr_expected.coef_) + + +@pytest.mark.parametrize("C", [0.001, 1, 100, 1e6]) +def test_elastic_net_vs_l1_l2(C): + # Make sure that elasticnet with grid search on l1_ratio gives same or + # better results than just l1 or just l2. + + X, y = make_classification(500, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + param_grid = {"l1_ratio": np.linspace(0, 1, 5)} + + enet_clf = LogisticRegression( + penalty="elasticnet", C=C, solver="saga", random_state=0, tol=1e-2 + ) + gs = GridSearchCV(enet_clf, param_grid, refit=True) + + l1_clf = LogisticRegression( + penalty="l1", C=C, solver="saga", random_state=0, tol=1e-2 + ) + l2_clf = LogisticRegression( + penalty="l2", C=C, solver="saga", random_state=0, tol=1e-2 + ) + + for clf in (gs, l1_clf, l2_clf): + clf.fit(X_train, y_train) + + assert gs.score(X_test, y_test) >= l1_clf.score(X_test, y_test) + assert gs.score(X_test, y_test) >= l2_clf.score(X_test, y_test) + + +@pytest.mark.parametrize("C", np.logspace(-3, 2, 4)) +@pytest.mark.parametrize("l1_ratio", [0.1, 0.5, 0.9]) +def test_LogisticRegression_elastic_net_objective(C, l1_ratio): + # Check that training with a penalty matching the objective leads + # to a lower objective. + # Here we train a logistic regression with l2 (a) and elasticnet (b) + # penalties, and compute the elasticnet objective. That of a should be + # greater than that of b (both objectives are convex). 
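+    # (Hedged note: enet_objective below computes
+    #     C * log_loss + l1_ratio * ||w||_1 + 0.5 * (1 - l1_ratio) * ||w||_2^2,
+    # i.e. an elasticnet criterion matching the penalty of the first model, so
+    # the l2-penalized fit is expected to attain a larger value of it.)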
+ X, y = make_classification( + n_samples=1000, + n_classes=2, + n_features=20, + n_informative=10, + n_redundant=0, + n_repeated=0, + random_state=0, + ) + X = scale(X) + + lr_enet = LogisticRegression( + penalty="elasticnet", + solver="saga", + random_state=0, + C=C, + l1_ratio=l1_ratio, + fit_intercept=False, + ) + lr_l2 = LogisticRegression( + penalty="l2", solver="saga", random_state=0, C=C, fit_intercept=False + ) + lr_enet.fit(X, y) + lr_l2.fit(X, y) + + def enet_objective(lr): + coef = lr.coef_.ravel() + obj = C * log_loss(y, lr.predict_proba(X)) + obj += l1_ratio * np.sum(np.abs(coef)) + obj += (1.0 - l1_ratio) * 0.5 * np.dot(coef, coef) + return obj + + assert enet_objective(lr_enet) < enet_objective(lr_l2) + + +@pytest.mark.parametrize("n_classes", (2, 3)) +def test_LogisticRegressionCV_GridSearchCV_elastic_net(n_classes): + # make sure LogisticRegressionCV gives same best params (l1 and C) as + # GridSearchCV when penalty is elasticnet + + X, y = make_classification( + n_samples=100, n_classes=n_classes, n_informative=3, random_state=0 + ) + + cv = StratifiedKFold(5) + + l1_ratios = np.linspace(0, 1, 3) + Cs = np.logspace(-4, 4, 3) + + lrcv = LogisticRegressionCV( + penalty="elasticnet", + Cs=Cs, + solver="saga", + cv=cv, + l1_ratios=l1_ratios, + random_state=0, + tol=1e-2, + ) + lrcv.fit(X, y) + + param_grid = {"C": Cs, "l1_ratio": l1_ratios} + lr = LogisticRegression( + penalty="elasticnet", + solver="saga", + random_state=0, + tol=1e-2, + ) + gs = GridSearchCV(lr, param_grid, cv=cv) + gs.fit(X, y) + + assert gs.best_params_["l1_ratio"] == lrcv.l1_ratio_[0] + assert gs.best_params_["C"] == lrcv.C_[0] + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Maybe remove whole test after removal of the deprecated multi_class. +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr(): + # make sure LogisticRegressionCV gives same best params (l1 and C) as + # GridSearchCV when penalty is elasticnet and multiclass is ovr. We can't + # compare best_params like in the previous test because + # LogisticRegressionCV with multi_class='ovr' will have one C and one + # l1_param for each class, while LogisticRegression will share the + # parameters over the *n_classes* classifiers. 
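+    # (Hedged note on the check used instead: because the per-class
+    # (C, l1_ratio) pairs are not directly comparable, the assertions below
+    # only require the two fitted models to agree on at least 80% of the train
+    # and test predictions.)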
+ + X, y = make_classification( + n_samples=100, n_classes=3, n_informative=3, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + cv = StratifiedKFold(5) + + l1_ratios = np.linspace(0, 1, 3) + Cs = np.logspace(-4, 4, 3) + + lrcv = LogisticRegressionCV( + penalty="elasticnet", + Cs=Cs, + solver="saga", + cv=cv, + l1_ratios=l1_ratios, + random_state=0, + multi_class="ovr", + tol=1e-2, + ) + lrcv.fit(X_train, y_train) + + param_grid = {"C": Cs, "l1_ratio": l1_ratios} + lr = LogisticRegression( + penalty="elasticnet", + solver="saga", + random_state=0, + multi_class="ovr", + tol=1e-2, + ) + gs = GridSearchCV(lr, param_grid, cv=cv) + gs.fit(X_train, y_train) + + # Check that predictions are 80% the same + assert (lrcv.predict(X_train) == gs.predict(X_train)).mean() >= 0.8 + assert (lrcv.predict(X_test) == gs.predict(X_test)).mean() >= 0.8 + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("penalty", ("l2", "elasticnet")) +@pytest.mark.parametrize("multi_class", ("ovr", "multinomial", "auto")) +def test_LogisticRegressionCV_no_refit(penalty, multi_class): + # Test LogisticRegressionCV attribute shapes when refit is False + + n_classes = 3 + n_features = 20 + X, y = make_classification( + n_samples=200, + n_classes=n_classes, + n_informative=n_classes, + n_features=n_features, + random_state=0, + ) + + Cs = np.logspace(-4, 4, 3) + if penalty == "elasticnet": + l1_ratios = np.linspace(0, 1, 2) + else: + l1_ratios = None + + lrcv = LogisticRegressionCV( + penalty=penalty, + Cs=Cs, + solver="saga", + l1_ratios=l1_ratios, + random_state=0, + multi_class=multi_class, + tol=1e-2, + refit=False, + ) + lrcv.fit(X, y) + assert lrcv.C_.shape == (n_classes,) + assert lrcv.l1_ratio_.shape == (n_classes,) + assert lrcv.coef_.shape == (n_classes, n_features) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +# Remove multi_class an change first element of the expected n_iter_.shape from +# n_classes to 1 (according to the docstring). +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +def test_LogisticRegressionCV_elasticnet_attribute_shapes(): + # Make sure the shapes of scores_ and coefs_paths_ attributes are correct + # when using elasticnet (added one dimension for l1_ratios) + + n_classes = 3 + n_features = 20 + X, y = make_classification( + n_samples=200, + n_classes=n_classes, + n_informative=n_classes, + n_features=n_features, + random_state=0, + ) + + Cs = np.logspace(-4, 4, 3) + l1_ratios = np.linspace(0, 1, 2) + + n_folds = 2 + lrcv = LogisticRegressionCV( + penalty="elasticnet", + Cs=Cs, + solver="saga", + cv=n_folds, + l1_ratios=l1_ratios, + multi_class="ovr", + random_state=0, + tol=1e-2, + ) + lrcv.fit(X, y) + coefs_paths = np.asarray(list(lrcv.coefs_paths_.values())) + assert coefs_paths.shape == ( + n_classes, + n_folds, + Cs.size, + l1_ratios.size, + n_features + 1, + ) + scores = np.asarray(list(lrcv.scores_.values())) + assert scores.shape == (n_classes, n_folds, Cs.size, l1_ratios.size) + + assert lrcv.n_iter_.shape == (n_classes, n_folds, Cs.size, l1_ratios.size) + + +def test_l1_ratio_non_elasticnet(): + msg = ( + r"l1_ratio parameter is only used when penalty is" + r" 'elasticnet'\. 
Got \(penalty=l1\)" + ) + with pytest.warns(UserWarning, match=msg): + LogisticRegression(penalty="l1", solver="saga", l1_ratio=0.5).fit(X, Y1) + + +@pytest.mark.parametrize("C", np.logspace(-3, 2, 4)) +@pytest.mark.parametrize("l1_ratio", [0.1, 0.5, 0.9]) +def test_elastic_net_versus_sgd(C, l1_ratio): + # Compare elasticnet penalty in LogisticRegression() and SGD(loss='log') + n_samples = 500 + X, y = make_classification( + n_samples=n_samples, + n_classes=2, + n_features=5, + n_informative=5, + n_redundant=0, + n_repeated=0, + random_state=1, + ) + X = scale(X) + + sgd = SGDClassifier( + penalty="elasticnet", + random_state=1, + fit_intercept=False, + tol=None, + max_iter=2000, + l1_ratio=l1_ratio, + alpha=1.0 / C / n_samples, + loss="log_loss", + ) + log = LogisticRegression( + penalty="elasticnet", + random_state=1, + fit_intercept=False, + tol=1e-5, + max_iter=1000, + l1_ratio=l1_ratio, + C=C, + solver="saga", + ) + + sgd.fit(X, y) + log.fit(X, y) + assert_array_almost_equal(sgd.coef_, log.coef_, decimal=1) + + +def test_logistic_regression_path_coefs_multinomial(): + # Make sure that the returned coefs by logistic_regression_path when + # multi_class='multinomial' don't override each other (used to be a + # bug). + X, y = make_classification( + n_samples=200, + n_classes=3, + n_informative=2, + n_redundant=0, + n_clusters_per_class=1, + random_state=0, + n_features=2, + ) + Cs = [0.00001, 1, 10000] + coefs, _, _ = _logistic_regression_path( + X, + y, + penalty="l1", + Cs=Cs, + solver="saga", + random_state=0, + multi_class="multinomial", + ) + + with pytest.raises(AssertionError): + assert_array_almost_equal(coefs[0], coefs[1], decimal=1) + with pytest.raises(AssertionError): + assert_array_almost_equal(coefs[0], coefs[2], decimal=1) + with pytest.raises(AssertionError): + assert_array_almost_equal(coefs[1], coefs[2], decimal=1) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize( + "est", + [ + LogisticRegression(random_state=0, max_iter=500), + LogisticRegressionCV(random_state=0, cv=3, Cs=3, tol=1e-3, max_iter=500), + ], + ids=lambda x: x.__class__.__name__, +) +@pytest.mark.parametrize("solver", SOLVERS) +def test_logistic_regression_multi_class_auto(est, solver): + # check multi_class='auto' => multi_class='ovr' + # iff binary y or liblinear + + def fit(X, y, **kw): + return clone(est).set_params(**kw).fit(X, y) + + scaled_data = scale(iris.data) + X = scaled_data[::10] + X2 = scaled_data[1::10] + y_multi = iris.target[::10] + y_bin = y_multi == 0 + est_auto_bin = fit(X, y_bin, multi_class="auto", solver=solver) + est_ovr_bin = fit(X, y_bin, multi_class="ovr", solver=solver) + assert_allclose(est_auto_bin.coef_, est_ovr_bin.coef_) + assert_allclose(est_auto_bin.predict_proba(X2), est_ovr_bin.predict_proba(X2)) + + est_auto_multi = fit(X, y_multi, multi_class="auto", solver=solver) + if solver == "liblinear": + est_ovr_multi = fit(X, y_multi, multi_class="ovr", solver=solver) + assert_allclose(est_auto_multi.coef_, est_ovr_multi.coef_) + assert_allclose( + est_auto_multi.predict_proba(X2), est_ovr_multi.predict_proba(X2) + ) + else: + est_multi_multi = fit(X, y_multi, multi_class="multinomial", solver=solver) + assert_allclose(est_auto_multi.coef_, est_multi_multi.coef_) + assert_allclose( + est_auto_multi.predict_proba(X2), 
est_multi_multi.predict_proba(X2) + ) + + # Make sure multi_class='ovr' is distinct from ='multinomial' + assert not np.allclose( + est_auto_bin.coef_, + fit(X, y_bin, multi_class="multinomial", solver=solver).coef_, + ) + assert not np.allclose( + est_auto_bin.coef_, + fit(X, y_multi, multi_class="multinomial", solver=solver).coef_, + ) + + +@pytest.mark.parametrize("solver", sorted(set(SOLVERS) - set(["liblinear"]))) +def test_penalty_none(solver): + # - Make sure warning is raised if penalty=None and C is set to a + # non-default value. + # - Make sure setting penalty=None is equivalent to setting C=np.inf with + # l2 penalty. + X, y = make_classification(n_samples=1000, n_redundant=0, random_state=0) + + msg = "Setting penalty=None will ignore the C" + lr = LogisticRegression(penalty=None, solver=solver, C=4) + with pytest.warns(UserWarning, match=msg): + lr.fit(X, y) + + lr_none = LogisticRegression(penalty=None, solver=solver, random_state=0) + lr_l2_C_inf = LogisticRegression( + penalty="l2", C=np.inf, solver=solver, random_state=0 + ) + pred_none = lr_none.fit(X, y).predict(X) + pred_l2_C_inf = lr_l2_C_inf.fit(X, y).predict(X) + assert_array_equal(pred_none, pred_l2_C_inf) + + +@pytest.mark.parametrize( + "params", + [ + {"penalty": "l1", "dual": False, "tol": 1e-6, "max_iter": 1000}, + {"penalty": "l2", "dual": True, "tol": 1e-12, "max_iter": 1000}, + {"penalty": "l2", "dual": False, "tol": 1e-12, "max_iter": 1000}, + ], +) +def test_logisticregression_liblinear_sample_weight(params): + # check that we support sample_weight with liblinear in all possible cases: + # l1-primal, l2-primal, l2-dual + X = np.array( + [ + [1, 3], + [1, 3], + [1, 3], + [1, 3], + [2, 1], + [2, 1], + [2, 1], + [2, 1], + [3, 3], + [3, 3], + [3, 3], + [3, 3], + [4, 1], + [4, 1], + [4, 1], + [4, 1], + ], + dtype=np.dtype("float"), + ) + y = np.array( + [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype("int") + ) + + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) + sample_weight = np.ones(shape=len(y) * 2) + sample_weight[len(y) :] = 0 + X2, y2, sample_weight = shuffle(X2, y2, sample_weight, random_state=0) + + base_clf = LogisticRegression(solver="liblinear", random_state=42) + base_clf.set_params(**params) + clf_no_weight = clone(base_clf).fit(X, y) + clf_with_weight = clone(base_clf).fit(X2, y2, sample_weight=sample_weight) + + for method in ("predict", "predict_proba", "decision_function"): + X_clf_no_weight = getattr(clf_no_weight, method)(X) + X_clf_with_weight = getattr(clf_with_weight, method)(X) + assert_allclose(X_clf_no_weight, X_clf_with_weight) + + +def test_scores_attribute_layout_elasticnet(): + # Non regression test for issue #14955. + # when penalty is elastic net the scores_ attribute has shape + # (n_classes, n_Cs, n_l1_ratios) + # We here make sure that the second dimension indeed corresponds to Cs and + # the third dimension corresponds to l1_ratios. 
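+    # (Hedged clarification: `scores_` is a dict keyed by class label; each
+    # value is laid out as (n_folds, n_Cs, n_l1_ratios). Averaging over axis 0
+    # below leaves an (n_Cs, n_l1_ratios) grid indexed as [i, j] for Cs[i] and
+    # l1_ratios[j].)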
+ + X, y = make_classification(n_samples=1000, random_state=0) + cv = StratifiedKFold(n_splits=5) + + l1_ratios = [0.1, 0.9] + Cs = [0.1, 1, 10] + + lrcv = LogisticRegressionCV( + penalty="elasticnet", + solver="saga", + l1_ratios=l1_ratios, + Cs=Cs, + cv=cv, + random_state=0, + max_iter=250, + tol=1e-3, + ) + lrcv.fit(X, y) + + avg_scores_lrcv = lrcv.scores_[1].mean(axis=0) # average over folds + + for i, C in enumerate(Cs): + for j, l1_ratio in enumerate(l1_ratios): + lr = LogisticRegression( + penalty="elasticnet", + solver="saga", + C=C, + l1_ratio=l1_ratio, + random_state=0, + max_iter=250, + tol=1e-3, + ) + + avg_score_lr = cross_val_score(lr, X, y, cv=cv).mean() + assert avg_scores_lrcv[i, j] == pytest.approx(avg_score_lr) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cg", "newton-cholesky"]) +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_multinomial_identifiability_on_iris(solver, fit_intercept): + """Test that the multinomial classification is identifiable. + + A multinomial with c classes can be modeled with + probability_k = exp(X@coef_k) / sum(exp(X@coef_l), l=1..c) for k=1..c. + This is not identifiable, unless one chooses a further constraint. + According to [1], the maximum of the L2 penalized likelihood automatically + satisfies the symmetric constraint: + sum(coef_k, k=1..c) = 0 + + Further details can be found in [2]. + + Reference + --------- + .. [1] :doi:`Zhu, Ji and Trevor J. Hastie. "Classification of gene microarrays by + penalized logistic regression". Biostatistics 5 3 (2004): 427-43. + <10.1093/biostatistics/kxg046>` + + .. [2] :arxiv:`Noah Simon and Jerome Friedman and Trevor Hastie. (2013) + "A Blockwise Descent Algorithm for Group-penalized Multiresponse and + Multinomial Regression". <1311.6529>` + """ + # Test logistic regression with the iris dataset + n_samples, n_features = iris.data.shape + target = iris.target_names[iris.target] + + clf = LogisticRegression( + C=len(iris.data), + solver="lbfgs", + fit_intercept=fit_intercept, + ) + # Scaling X to ease convergence. + X_scaled = scale(iris.data) + clf.fit(X_scaled, target) + + # axis=0 is sum over classes + assert_allclose(clf.coef_.sum(axis=0), 0, atol=1e-10) + if fit_intercept: + assert clf.intercept_.sum(axis=0) == pytest.approx(0, abs=1e-11) + + +# TODO(1.8): remove filterwarnings after the deprecation of multi_class +@pytest.mark.filterwarnings("ignore:.*'multi_class' was deprecated.*:FutureWarning") +@pytest.mark.parametrize("multi_class", ["ovr", "multinomial", "auto"]) +@pytest.mark.parametrize("class_weight", [{0: 1.0, 1: 10.0, 2: 1.0}, "balanced"]) +def test_sample_weight_not_modified(multi_class, class_weight): + X, y = load_iris(return_X_y=True) + n_features = len(X) + W = np.ones(n_features) + W[: n_features // 2] = 2 + + expected = W.copy() + + clf = LogisticRegression( + random_state=0, class_weight=class_weight, max_iter=200, multi_class=multi_class + ) + clf.fit(X, y, sample_weight=W) + assert_allclose(expected, W) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_large_sparse_matrix(solver, global_random_seed, csr_container): + # Solvers either accept large sparse matrices, or raise helpful error. + # Non-regression test for pull-request #21093. 
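+    # (Hedged note: casting the CSR index arrays to int64 below mimics what
+    # scipy does for matrices too large for 32-bit indexing; only the solvers
+    # listed in the branch further down require 32-bit indices and are then
+    # expected to raise the helpful ValueError.)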
+ + # generate sparse matrix with int64 indices + X = csr_container(sparse.rand(20, 10, random_state=global_random_seed)) + for attr in ["indices", "indptr"]: + setattr(X, attr, getattr(X, attr).astype("int64")) + rng = np.random.RandomState(global_random_seed) + y = rng.randint(2, size=X.shape[0]) + + if solver in ["liblinear", "sag", "saga"]: + msg = "Only sparse matrices with 32-bit integer indices" + with pytest.raises(ValueError, match=msg): + LogisticRegression(solver=solver).fit(X, y) + else: + LogisticRegression(solver=solver).fit(X, y) + + +def test_single_feature_newton_cg(): + # Test that Newton-CG works with a single feature and intercept. + # Non-regression test for issue #23605. + + X = np.array([[0.5, 0.65, 1.1, 1.25, 0.8, 0.54, 0.95, 0.7]]).T + y = np.array([1, 1, 0, 0, 1, 1, 0, 1]) + assert X.shape[1] == 1 + LogisticRegression(solver="newton-cg", fit_intercept=True).fit(X, y) + + +def test_liblinear_not_stuck(): + # Non-regression https://github.com/scikit-learn/scikit-learn/issues/18264 + X = iris.data.copy() + y = iris.target.copy() + X = X[y != 2] + y = y[y != 2] + X_prep = StandardScaler().fit_transform(X) + + C = l1_min_c(X, y, loss="log") * 10 ** (10 / 29) + clf = LogisticRegression( + penalty="l1", + solver="liblinear", + tol=1e-6, + max_iter=100, + intercept_scaling=10000.0, + random_state=0, + C=C, + ) + + # test that the fit does not raise a ConvergenceWarning + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + clf.fit(X_prep, y) + + +@config_context(enable_metadata_routing=True) +def test_lr_cv_scores_differ_when_sample_weight_is_requested(): + """Test that `sample_weight` is correctly passed to the scorer in + `LogisticRegressionCV.fit` and `LogisticRegressionCV.score` by + checking the difference in scores with the case when `sample_weight` + is not requested. 
+ """ + rng = np.random.RandomState(10) + X, y = make_classification(n_samples=10, random_state=rng) + X_t, y_t = make_classification(n_samples=10, random_state=rng) + sample_weight = np.ones(len(y)) + sample_weight[: len(y) // 2] = 2 + kwargs = {"sample_weight": sample_weight} + + scorer1 = get_scorer("accuracy") + lr_cv1 = LogisticRegressionCV(scoring=scorer1) + lr_cv1.fit(X, y, **kwargs) + + scorer2 = get_scorer("accuracy") + scorer2.set_score_request(sample_weight=True) + lr_cv2 = LogisticRegressionCV(scoring=scorer2) + lr_cv2.fit(X, y, **kwargs) + + assert not np.allclose(lr_cv1.scores_[1], lr_cv2.scores_[1]) + + score_1 = lr_cv1.score(X_t, y_t, **kwargs) + score_2 = lr_cv2.score(X_t, y_t, **kwargs) + + assert not np.allclose(score_1, score_2) + + +def test_lr_cv_scores_without_enabling_metadata_routing(): + """Test that `sample_weight` is passed correctly to the scorer in + `LogisticRegressionCV.fit` and `LogisticRegressionCV.score` even + when `enable_metadata_routing=False` + """ + rng = np.random.RandomState(10) + X, y = make_classification(n_samples=10, random_state=rng) + X_t, y_t = make_classification(n_samples=10, random_state=rng) + sample_weight = np.ones(len(y)) + sample_weight[: len(y) // 2] = 2 + kwargs = {"sample_weight": sample_weight} + + with config_context(enable_metadata_routing=False): + scorer1 = get_scorer("accuracy") + lr_cv1 = LogisticRegressionCV(scoring=scorer1) + lr_cv1.fit(X, y, **kwargs) + score_1 = lr_cv1.score(X_t, y_t, **kwargs) + + with config_context(enable_metadata_routing=True): + scorer2 = get_scorer("accuracy") + scorer2.set_score_request(sample_weight=True) + lr_cv2 = LogisticRegressionCV(scoring=scorer2) + lr_cv2.fit(X, y, **kwargs) + score_2 = lr_cv2.score(X_t, y_t, **kwargs) + + assert_allclose(lr_cv1.scores_[1], lr_cv2.scores_[1]) + assert_allclose(score_1, score_2) + + +@pytest.mark.parametrize("solver", SOLVERS) +def test_zero_max_iter(solver): + # Make sure we can inspect the state of LogisticRegression right after + # initialization (before the first weight update). + X, y = load_iris(return_X_y=True) + y = y == 2 + with ignore_warnings(category=ConvergenceWarning): + clf = LogisticRegression(solver=solver, max_iter=0).fit(X, y) + if solver not in ["saga", "sag"]: + # XXX: sag and saga have n_iter_ = [1]... + assert clf.n_iter_ == 0 + + if solver != "lbfgs": + # XXX: lbfgs has already started to update the coefficients... 
+ assert_allclose(clf.coef_, np.zeros_like(clf.coef_)) + assert_allclose( + clf.decision_function(X), + np.full(shape=X.shape[0], fill_value=clf.intercept_), + ) + assert_allclose( + clf.predict_proba(X), + np.full(shape=(X.shape[0], 2), fill_value=0.5), + ) + assert clf.score(X, y) < 0.7 + + +def test_passing_params_without_enabling_metadata_routing(): + """Test that the right error message is raised when metadata params + are passed while not supported when `enable_metadata_routing=False`.""" + X, y = make_classification(n_samples=10, random_state=0) + lr_cv = LogisticRegressionCV() + msg = "is only supported if enable_metadata_routing=True" + + with config_context(enable_metadata_routing=False): + params = {"extra_param": 1.0} + + with pytest.raises(ValueError, match=msg): + lr_cv.fit(X, y, **params) + + with pytest.raises(ValueError, match=msg): + lr_cv.score(X, y, **params) + + +# TODO(1.8): remove +def test_multi_class_deprecated(): + """Check `multi_class` parameter deprecated.""" + X, y = make_classification(n_classes=3, n_samples=50, n_informative=6) + lr = LogisticRegression(multi_class="ovr") + msg = "'multi_class' was deprecated" + with pytest.warns(FutureWarning, match=msg): + lr.fit(X, y) + + lrCV = LogisticRegressionCV(multi_class="ovr") + with pytest.warns(FutureWarning, match=msg): + lrCV.fit(X, y) + + # Special warning for "binary multinomial" + X, y = make_classification(n_classes=2, n_samples=50, n_informative=6) + lr = LogisticRegression(multi_class="multinomial") + msg = "'multi_class' was deprecated.*binary problems" + with pytest.warns(FutureWarning, match=msg): + lr.fit(X, y) + + lrCV = LogisticRegressionCV(multi_class="multinomial") + with pytest.warns(FutureWarning, match=msg): + lrCV.fit(X, y) + + +def test_newton_cholesky_fallback_to_lbfgs(global_random_seed): + # Wide data matrix should lead to a rank-deficient Hessian matrix + # hence make the Newton-Cholesky solver raise a warning and fallback to + # lbfgs. + X, y = make_classification( + n_samples=10, n_features=20, random_state=global_random_seed + ) + C = 1e30 # very high C to nearly disable regularization + + # Check that LBFGS can converge without any warning on this problem. + lr_lbfgs = LogisticRegression(solver="lbfgs", C=C) + with warnings.catch_warnings(): + warnings.simplefilter("error") + lr_lbfgs.fit(X, y) + n_iter_lbfgs = lr_lbfgs.n_iter_[0] + + assert n_iter_lbfgs >= 1 + + # Check that the Newton-Cholesky solver raises a warning and falls back to + # LBFGS. This should converge with the same number of iterations as the + # above call of lbfgs since the Newton-Cholesky triggers the fallback + # before completing the first iteration, for the problem setting at hand. 
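+    # Why the fallback happens (sketch, illustration only): with
+    # n_features > n_samples the essentially unpenalized Hessian X.T @ D @ X
+    # (D a diagonal weight matrix) has rank at most n_samples, so it is
+    # singular, the Cholesky factorization fails with a LinAlgWarning, and the
+    # solver restarts with lbfgs.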
+ lr_nc = LogisticRegression(solver="newton-cholesky", C=C) + with ignore_warnings(category=LinAlgWarning): + lr_nc.fit(X, y) + n_iter_nc = lr_nc.n_iter_[0] + + assert n_iter_nc == n_iter_lbfgs + + # Trying to fit the same model again with a small iteration budget should + # therefore raise a ConvergenceWarning: + lr_nc_limited = LogisticRegression( + solver="newton-cholesky", C=C, max_iter=n_iter_lbfgs - 1 + ) + with ignore_warnings(category=LinAlgWarning): + with pytest.warns(ConvergenceWarning, match="lbfgs failed to converge"): + lr_nc_limited.fit(X, y) + n_iter_nc_limited = lr_nc_limited.n_iter_[0] + + assert n_iter_nc_limited == lr_nc_limited.max_iter - 1 + + +# TODO(1.8): check for an error instead +@pytest.mark.parametrize("Estimator", [LogisticRegression, LogisticRegressionCV]) +def test_liblinear_multiclass_warning(Estimator): + """Check that liblinear warns on multiclass problems.""" + msg = ( + "Using the 'liblinear' solver for multiclass classification is " + "deprecated. An error will be raised in 1.8. Either use another " + "solver which supports the multinomial loss or wrap the estimator " + "in a OneVsRestClassifier to keep applying a one-versus-rest " + "scheme." + ) + with pytest.warns(FutureWarning, match=msg): + Estimator(solver="liblinear").fit(iris.data, iris.target) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_passive_aggressive.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_passive_aggressive.py new file mode 100644 index 0000000000000000000000000000000000000000..bcfd58b1eab2b51ecd8cc1097bd48577e2babe0d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_passive_aggressive.py @@ -0,0 +1,268 @@ +import numpy as np +import pytest + +from sklearn.base import ClassifierMixin +from sklearn.datasets import load_iris +from sklearn.linear_model import PassiveAggressiveClassifier, PassiveAggressiveRegressor +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +iris = load_iris() +random_state = check_random_state(12) +indices = np.arange(iris.data.shape[0]) +random_state.shuffle(indices) +X = iris.data[indices] +y = iris.target[indices] + + +class MyPassiveAggressive(ClassifierMixin): + def __init__( + self, + C=1.0, + epsilon=0.01, + loss="hinge", + fit_intercept=True, + n_iter=1, + random_state=None, + ): + self.C = C + self.epsilon = epsilon + self.loss = loss + self.fit_intercept = fit_intercept + self.n_iter = n_iter + + def fit(self, X, y): + n_samples, n_features = X.shape + self.w = np.zeros(n_features, dtype=np.float64) + self.b = 0.0 + + for t in range(self.n_iter): + for i in range(n_samples): + p = self.project(X[i]) + if self.loss in ("hinge", "squared_hinge"): + loss = max(1 - y[i] * p, 0) + else: + loss = max(np.abs(p - y[i]) - self.epsilon, 0) + + sqnorm = np.dot(X[i], X[i]) + + if self.loss in ("hinge", "epsilon_insensitive"): + step = min(self.C, loss / sqnorm) + elif self.loss in ("squared_hinge", "squared_epsilon_insensitive"): + step = loss / (sqnorm + 1.0 / (2 * self.C)) + + if self.loss in ("hinge", "squared_hinge"): + step *= y[i] + else: + step *= np.sign(y[i] - p) + + self.w += step * X[i] + if self.fit_intercept: + self.b += step + + def project(self, X): + return np.dot(X, self.w) + self.b + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("fit_intercept", [True, 
False]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_classifier_accuracy(csr_container, fit_intercept, average): + data = csr_container(X) if csr_container is not None else X + clf = PassiveAggressiveClassifier( + C=1.0, + max_iter=30, + fit_intercept=fit_intercept, + random_state=1, + average=average, + tol=None, + ) + clf.fit(data, y) + score = clf.score(data, y) + assert score > 0.79 + if average: + assert hasattr(clf, "_average_coef") + assert hasattr(clf, "_average_intercept") + assert hasattr(clf, "_standard_intercept") + assert hasattr(clf, "_standard_coef") + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_classifier_partial_fit(csr_container, average): + classes = np.unique(y) + data = csr_container(X) if csr_container is not None else X + clf = PassiveAggressiveClassifier(random_state=0, average=average, max_iter=5) + for t in range(30): + clf.partial_fit(data, y, classes) + score = clf.score(data, y) + assert score > 0.79 + if average: + assert hasattr(clf, "_average_coef") + assert hasattr(clf, "_average_intercept") + assert hasattr(clf, "_standard_intercept") + assert hasattr(clf, "_standard_coef") + + +def test_classifier_refit(): + # Classifier can be retrained on different labels and features. + clf = PassiveAggressiveClassifier(max_iter=5).fit(X, y) + assert_array_equal(clf.classes_, np.unique(y)) + + clf.fit(X[:, :-1], iris.target_names[y]) + assert_array_equal(clf.classes_, iris.target_names) + + +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +@pytest.mark.parametrize("loss", ("hinge", "squared_hinge")) +def test_classifier_correctness(loss, csr_container): + y_bin = y.copy() + y_bin[y != 1] = -1 + + clf1 = MyPassiveAggressive(loss=loss, n_iter=2) + clf1.fit(X, y_bin) + + data = csr_container(X) if csr_container is not None else X + clf2 = PassiveAggressiveClassifier(loss=loss, max_iter=2, shuffle=False, tol=None) + clf2.fit(data, y_bin) + + assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2) + + +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "transform"] +) +def test_classifier_undefined_methods(response_method): + clf = PassiveAggressiveClassifier(max_iter=100) + with pytest.raises(AttributeError): + getattr(clf, response_method) + + +def test_class_weights(): + # Test class weights. 
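+    # How this plays out (assumption, for illustration only): class_weight
+    # effectively rescales the update applied to samples of each class, so
+    # class_weight={1: 0.001} makes updates from class 1 nearly negligible and
+    # the decision boundary shifts towards the other class, as asserted below.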
+ X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier( + C=0.1, max_iter=100, class_weight=None, random_state=100 + ) + clf.fit(X2, y2) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf = PassiveAggressiveClassifier( + C=0.1, max_iter=100, class_weight={1: 0.001}, random_state=100 + ) + clf.fit(X2, y2) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +def test_partial_fit_weight_class_balanced(): + # partial_fit with class_weight='balanced' not supported + clf = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100) + with pytest.raises(ValueError): + clf.partial_fit(X, y, classes=np.unique(y)) + + +def test_equal_class_weight(): + X2 = [[1, 0], [1, 0], [0, 1], [0, 1]] + y2 = [0, 0, 1, 1] + clf = PassiveAggressiveClassifier(C=0.1, tol=None, class_weight=None) + clf.fit(X2, y2) + + # Already balanced, so "balanced" weights should have no effect + clf_balanced = PassiveAggressiveClassifier(C=0.1, tol=None, class_weight="balanced") + clf_balanced.fit(X2, y2) + + clf_weighted = PassiveAggressiveClassifier( + C=0.1, tol=None, class_weight={0: 0.5, 1: 0.5} + ) + clf_weighted.fit(X2, y2) + + # should be similar up to some epsilon due to learning rate schedule + assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) + assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2) + + +def test_wrong_class_weight_label(): + # ValueError due to wrong class_weight label. + X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(class_weight={0: 0.5}, max_iter=100) + with pytest.raises(ValueError): + clf.fit(X2, y2) + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_regressor_mse(csr_container, fit_intercept, average): + y_bin = y.copy() + y_bin[y != 1] = -1 + + data = csr_container(X) if csr_container is not None else X + reg = PassiveAggressiveRegressor( + C=1.0, + fit_intercept=fit_intercept, + random_state=0, + average=average, + max_iter=5, + ) + reg.fit(data, y_bin) + pred = reg.predict(data) + assert np.mean((pred - y_bin) ** 2) < 1.7 + if average: + assert hasattr(reg, "_average_coef") + assert hasattr(reg, "_average_intercept") + assert hasattr(reg, "_standard_intercept") + assert hasattr(reg, "_standard_coef") + + +@pytest.mark.parametrize("average", [False, True]) +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +def test_regressor_partial_fit(csr_container, average): + y_bin = y.copy() + y_bin[y != 1] = -1 + + data = csr_container(X) if csr_container is not None else X + reg = PassiveAggressiveRegressor(random_state=0, average=average, max_iter=100) + for t in range(50): + reg.partial_fit(data, y_bin) + pred = reg.predict(data) + assert np.mean((pred - y_bin) ** 2) < 1.7 + if average: + assert hasattr(reg, "_average_coef") + assert hasattr(reg, "_average_intercept") + assert hasattr(reg, "_standard_intercept") + assert hasattr(reg, "_standard_coef") + + +@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS]) +@pytest.mark.parametrize("loss", ("epsilon_insensitive", "squared_epsilon_insensitive")) +def test_regressor_correctness(loss, csr_container): + 
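+    # Reminder (sketch, illustration only): MyPassiveAggressive above implements
+    # the classic passive-aggressive update; for the epsilon-insensitive losses a
+    # single PA-I style step looks like this hypothetical helper.
+    def _pa1_regression_step(w, x, y_target, C=1.0, epsilon=0.01):
+        p = np.dot(w, x)
+        loss = max(abs(p - y_target) - epsilon, 0)
+        tau = min(C, loss / np.dot(x, x))
+        return w + tau * np.sign(y_target - p) * x
+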
y_bin = y.copy() + y_bin[y != 1] = -1 + + reg1 = MyPassiveAggressive(loss=loss, n_iter=2) + reg1.fit(X, y_bin) + + data = csr_container(X) if csr_container is not None else X + reg2 = PassiveAggressiveRegressor(tol=None, loss=loss, max_iter=2, shuffle=False) + reg2.fit(data, y_bin) + + assert_array_almost_equal(reg1.w, reg2.coef_.ravel(), decimal=2) + + +def test_regressor_undefined_methods(): + reg = PassiveAggressiveRegressor(max_iter=100) + with pytest.raises(AttributeError): + reg.transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_perceptron.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_perceptron.py new file mode 100644 index 0000000000000000000000000000000000000000..71456ae72132ccebc76da96aea9213fd55f47c9d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_perceptron.py @@ -0,0 +1,88 @@ +import numpy as np +import pytest + +from sklearn.datasets import load_iris +from sklearn.linear_model import Perceptron +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_almost_equal +from sklearn.utils.fixes import CSR_CONTAINERS + +iris = load_iris() +random_state = check_random_state(12) +indices = np.arange(iris.data.shape[0]) +random_state.shuffle(indices) +X = iris.data[indices] +y = iris.target[indices] + + +class MyPerceptron: + def __init__(self, n_iter=1): + self.n_iter = n_iter + + def fit(self, X, y): + n_samples, n_features = X.shape + self.w = np.zeros(n_features, dtype=np.float64) + self.b = 0.0 + + for t in range(self.n_iter): + for i in range(n_samples): + if self.predict(X[i])[0] != y[i]: + self.w += y[i] * X[i] + self.b += y[i] + + def project(self, X): + return np.dot(X, self.w) + self.b + + def predict(self, X): + X = np.atleast_2d(X) + return np.sign(self.project(X)) + + +@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array]) +def test_perceptron_accuracy(container): + data = container(X) + clf = Perceptron(max_iter=100, tol=None, shuffle=False) + clf.fit(data, y) + score = clf.score(data, y) + assert score > 0.7 + + +def test_perceptron_correctness(): + y_bin = y.copy() + y_bin[y != 1] = -1 + + clf1 = MyPerceptron(n_iter=2) + clf1.fit(X, y_bin) + + clf2 = Perceptron(max_iter=2, shuffle=False, tol=None) + clf2.fit(X, y_bin) + + assert_array_almost_equal(clf1.w, clf2.coef_.ravel()) + + +def test_undefined_methods(): + clf = Perceptron(max_iter=100) + for meth in ("predict_proba", "predict_log_proba"): + with pytest.raises(AttributeError): + getattr(clf, meth) + + +def test_perceptron_l1_ratio(): + """Check that `l1_ratio` has an impact when `penalty='elasticnet'`""" + clf1 = Perceptron(l1_ratio=0, penalty="elasticnet") + clf1.fit(X, y) + + clf2 = Perceptron(l1_ratio=0.15, penalty="elasticnet") + clf2.fit(X, y) + + assert clf1.score(X, y) != clf2.score(X, y) + + # check that the bounds of elastic net which should correspond to an l1 or + # l2 penalty depending of `l1_ratio` value. 
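+    # Sketch (assumption about the exact scaling, for illustration only): the
+    # elastic-net penalty interpolates between the two extremes, roughly
+    #   penalty(w) ~ l1_ratio * ||w||_1 + (1 - l1_ratio) * ||w||_2^2
+    # so l1_ratio=1 should reproduce penalty="l1" and l1_ratio=0 penalty="l2",
+    # which is what the assertions below check.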
+ clf_l1 = Perceptron(penalty="l1").fit(X, y) + clf_elasticnet = Perceptron(l1_ratio=1, penalty="elasticnet").fit(X, y) + assert_allclose(clf_l1.coef_, clf_elasticnet.coef_) + + clf_l2 = Perceptron(penalty="l2").fit(X, y) + clf_elasticnet = Perceptron(l1_ratio=0, penalty="elasticnet").fit(X, y) + assert_allclose(clf_l2.coef_, clf_elasticnet.coef_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_quantile.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..1d166b14091ccc11e148184056a6d4a58a48a664 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_quantile.py @@ -0,0 +1,283 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest +from pytest import approx +from scipy.optimize import minimize + +from sklearn.datasets import make_regression +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import HuberRegressor, QuantileRegressor +from sklearn.metrics import mean_pinball_loss +from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + parse_version, + sp_version, +) + + +@pytest.fixture +def X_y_data(): + X, y = make_regression(n_samples=10, n_features=1, random_state=0, noise=1) + return X, y + + +@pytest.mark.skipif( + parse_version(sp_version.base_version) >= parse_version("1.11"), + reason="interior-point solver is not available in SciPy 1.11", +) +@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"]) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_incompatible_solver_for_sparse_input(X_y_data, solver, csc_container): + X, y = X_y_data + X_sparse = csc_container(X) + err_msg = ( + f"Solver {solver} does not support sparse X. Use solver 'highs' for example." 
+ ) + with pytest.raises(ValueError, match=err_msg): + QuantileRegressor(solver=solver).fit(X_sparse, y) + + +@pytest.mark.parametrize( + "quantile, alpha, intercept, coef", + [ + # for 50% quantile w/o regularization, any slope in [1, 10] is okay + [0.5, 0, 1, None], + # if positive error costs more, the slope is maximal + [0.51, 0, 1, 10], + # if negative error costs more, the slope is minimal + [0.49, 0, 1, 1], + # for a small lasso penalty, the slope is also minimal + [0.5, 0.01, 1, 1], + # for a large lasso penalty, the model predicts the constant median + [0.5, 100, 2, 0], + ], +) +def test_quantile_toy_example(quantile, alpha, intercept, coef): + # test how different parameters affect a small intuitive example + X = [[0], [1], [1]] + y = [1, 2, 11] + model = QuantileRegressor(quantile=quantile, alpha=alpha).fit(X, y) + assert_allclose(model.intercept_, intercept, atol=1e-2) + if coef is not None: + assert_allclose(model.coef_[0], coef, atol=1e-2) + if alpha < 100: + assert model.coef_[0] >= 1 + assert model.coef_[0] <= 10 + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_quantile_equals_huber_for_low_epsilon(fit_intercept): + X, y = make_regression(n_samples=100, n_features=20, random_state=0, noise=1.0) + alpha = 1e-4 + huber = HuberRegressor( + epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept + ).fit(X, y) + quant = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit(X, y) + assert_allclose(huber.coef_, quant.coef_, atol=1e-1) + if fit_intercept: + assert huber.intercept_ == approx(quant.intercept_, abs=1e-1) + # check that we still predict fraction + assert np.mean(y < quant.predict(X)) == approx(0.5, abs=1e-1) + + +@pytest.mark.parametrize("q", [0.5, 0.9, 0.05]) +def test_quantile_estimates_calibration(q): + # Test that model estimates percentage of points below the prediction + X, y = make_regression(n_samples=1000, n_features=20, random_state=0, noise=1.0) + quant = QuantileRegressor(quantile=q, alpha=0).fit(X, y) + assert np.mean(y < quant.predict(X)) == approx(q, abs=1e-2) + + +def test_quantile_sample_weight(): + # test that with unequal sample weights we still estimate weighted fraction + n = 1000 + X, y = make_regression(n_samples=n, n_features=5, random_state=0, noise=10.0) + weight = np.ones(n) + # when we increase weight of upper observations, + # estimate of quantile should go up + weight[y > y.mean()] = 100 + quant = QuantileRegressor(quantile=0.5, alpha=1e-8) + quant.fit(X, y, sample_weight=weight) + fraction_below = np.mean(y < quant.predict(X)) + assert fraction_below > 0.5 + weighted_fraction_below = np.average(y < quant.predict(X), weights=weight) + assert weighted_fraction_below == approx(0.5, abs=3e-2) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_asymmetric_error(quantile): + """Test quantile regression for asymmetric distributed targets.""" + n_samples = 1000 + rng = np.random.RandomState(42) + X = np.concatenate( + ( + np.abs(rng.randn(n_samples)[:, None]), + -rng.randint(2, size=(n_samples, 1)), + ), + axis=1, + ) + intercept = 1.23 + coef = np.array([0.5, -2]) + # Take care that X @ coef + intercept > 0 + assert np.min(X @ coef + intercept) > 0 + # For an exponential distribution with rate lambda, e.g. 
exp(-lambda * x), + # the quantile at level q is: + # quantile(q) = - log(1 - q) / lambda + # scale = 1/lambda = -quantile(q) / log(1 - q) + y = rng.exponential( + scale=-(X @ coef + intercept) / np.log(1 - quantile), size=n_samples + ) + model = QuantileRegressor( + quantile=quantile, + alpha=0, + ).fit(X, y) + # This test can be made to pass with any solver but in the interest + # of sparing continuous integration resources, the test is performed + # with the fastest solver only. + + assert model.intercept_ == approx(intercept, rel=0.2) + assert_allclose(model.coef_, coef, rtol=0.6) + assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2) + + # Now compare to Nelder-Mead optimization with L1 penalty + alpha = 0.01 + model.set_params(alpha=alpha).fit(X, y) + model_coef = np.r_[model.intercept_, model.coef_] + + def func(coef): + loss = mean_pinball_loss(y, X @ coef[1:] + coef[0], alpha=quantile) + L1 = np.sum(np.abs(coef[1:])) + return loss + alpha * L1 + + res = minimize( + fun=func, + x0=[1, 0, -1], + method="Nelder-Mead", + tol=1e-12, + options={"maxiter": 2000}, + ) + + assert func(model_coef) == approx(func(res.x)) + assert_allclose(model.intercept_, res.x[0]) + assert_allclose(model.coef_, res.x[1:]) + assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2) + + +@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8]) +def test_equivariance(quantile): + """Test equivariace of quantile regression. + + See Koenker (2005) Quantile Regression, Chapter 2.2.3. + """ + rng = np.random.RandomState(42) + n_samples, n_features = 100, 5 + X, y = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_features, + noise=0, + random_state=rng, + shuffle=False, + ) + # make y asymmetric + y += rng.exponential(scale=100, size=y.shape) + params = dict(alpha=0) + model1 = QuantileRegressor(quantile=quantile, **params).fit(X, y) + + # coef(q; a*y, X) = a * coef(q; y, X) + a = 2.5 + model2 = QuantileRegressor(quantile=quantile, **params).fit(X, a * y) + assert model2.intercept_ == approx(a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, a * model1.coef_, rtol=1e-5) + + # coef(1-q; -a*y, X) = -a * coef(q; y, X) + model2 = QuantileRegressor(quantile=1 - quantile, **params).fit(X, -a * y) + assert model2.intercept_ == approx(-a * model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, -a * model1.coef_, rtol=1e-5) + + # coef(q; y + X @ g, X) = coef(q; y, X) + g + g_intercept, g_coef = rng.randn(), rng.randn(n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X, y + X @ g_coef + g_intercept) + assert model2.intercept_ == approx(model1.intercept_ + g_intercept) + assert_allclose(model2.coef_, model1.coef_ + g_coef, rtol=1e-6) + + # coef(q; y, X @ A) = A^-1 @ coef(q; y, X) + A = rng.randn(n_features, n_features) + model2 = QuantileRegressor(quantile=quantile, **params) + model2.fit(X @ A, y) + assert model2.intercept_ == approx(model1.intercept_, rel=1e-5) + assert_allclose(model2.coef_, np.linalg.solve(A, model1.coef_), rtol=1e-5) + + +@pytest.mark.skipif( + parse_version(sp_version.base_version) >= parse_version("1.11"), + reason="interior-point solver is not available in SciPy 1.11", +) +@pytest.mark.filterwarnings("ignore:`method='interior-point'` is deprecated") +def test_linprog_failure(): + """Test that linprog fails.""" + X = np.linspace(0, 10, num=10).reshape(-1, 1) + y = np.linspace(0, 10, num=10) + reg = QuantileRegressor( + alpha=0, solver="interior-point", solver_options={"maxiter": 1} + ) + + msg 
= "Linear programming for QuantileRegressor did not succeed." + with pytest.warns(ConvergenceWarning, match=msg): + reg.fit(X, y) + + +@pytest.mark.parametrize( + "sparse_container", CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS +) +@pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_sparse_input(sparse_container, solver, fit_intercept, global_random_seed): + """Test that sparse and dense X give same results.""" + n_informative = 10 + quantile_level = 0.6 + X, y = make_regression( + n_samples=300, + n_features=20, + n_informative=10, + random_state=global_random_seed, + noise=1.0, + ) + X_sparse = sparse_container(X) + alpha = 0.1 + quant_dense = QuantileRegressor( + quantile=quantile_level, alpha=alpha, fit_intercept=fit_intercept + ).fit(X, y) + quant_sparse = QuantileRegressor( + quantile=quantile_level, alpha=alpha, fit_intercept=fit_intercept, solver=solver + ).fit(X_sparse, y) + assert_allclose(quant_sparse.coef_, quant_dense.coef_, rtol=1e-2) + sparse_support = quant_sparse.coef_ != 0 + dense_support = quant_dense.coef_ != 0 + assert dense_support.sum() == pytest.approx(n_informative, abs=1) + assert sparse_support.sum() == pytest.approx(n_informative, abs=1) + if fit_intercept: + assert quant_sparse.intercept_ == approx(quant_dense.intercept_) + # check that we still predict fraction + empirical_coverage = np.mean(y < quant_sparse.predict(X_sparse)) + assert empirical_coverage == approx(quantile_level, abs=3e-2) + + +def test_error_interior_point_future(X_y_data, monkeypatch): + """Check that we will raise a proper error when requesting + `solver='interior-point'` in SciPy >= 1.11. + """ + X, y = X_y_data + import sklearn.linear_model._quantile + + with monkeypatch.context() as m: + m.setattr(sklearn.linear_model._quantile, "sp_version", parse_version("1.11.0")) + err_msg = "Solver interior-point is not anymore available in SciPy >= 1.11.0." 
+ with pytest.raises(ValueError, match=err_msg): + QuantileRegressor(solver="interior-point").fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ransac.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ransac.py new file mode 100644 index 0000000000000000000000000000000000000000..7b2bc66160ef3f5e686da7c546cf01314035ae57 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ransac.py @@ -0,0 +1,545 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from sklearn.datasets import make_regression +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import ( + LinearRegression, + OrthogonalMatchingPursuit, + RANSACRegressor, + Ridge, +) +from sklearn.linear_model._ransac import _dynamic_max_trials +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS + +# Generate coordinates of line +X = np.arange(-200, 200) +y = 0.2 * X + 20 +data = np.column_stack([X, y]) + +# Add some faulty data +rng = np.random.RandomState(1000) +outliers = np.unique(rng.randint(len(X), size=200)) +data[outliers, :] += 50 + rng.rand(len(outliers), 2) * 10 + +X = data[:, 0][:, np.newaxis] +y = data[:, 1] + + +def test_ransac_inliers_outliers(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + + # Estimate parameters of corrupted data + ransac_estimator.fit(X, y) + + # Ground truth / reference inlier mask + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_is_data_valid(): + def is_data_valid(X, y): + assert X.shape[0] == 2 + assert y.shape[0] == 2 + return False + + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + y = rng.rand(10, 1) + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + is_data_valid=is_data_valid, + random_state=0, + ) + with pytest.raises(ValueError): + ransac_estimator.fit(X, y) + + +def test_ransac_is_model_valid(): + def is_model_valid(estimator, X, y): + assert X.shape[0] == 2 + assert y.shape[0] == 2 + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + is_model_valid=is_model_valid, + random_state=0, + ) + with pytest.raises(ValueError): + ransac_estimator.fit(X, y) + + +def test_ransac_max_trials(): + estimator = LinearRegression() + + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + max_trials=0, + random_state=0, + ) + with pytest.raises(ValueError): + ransac_estimator.fit(X, y) + + # there is a 1e-9 chance it will take these many trials. 
No good reason + # 1e-2 isn't enough, can still happen + # 2 is the what ransac defines as min_samples = X.shape[1] + 1 + max_trials = _dynamic_max_trials(len(X) - len(outliers), X.shape[0], 2, 1 - 1e-9) + ransac_estimator = RANSACRegressor(estimator, min_samples=2) + for i in range(50): + ransac_estimator.set_params(min_samples=2, random_state=i) + ransac_estimator.fit(X, y) + assert ransac_estimator.n_trials_ < max_trials + 1 + + +def test_ransac_stop_n_inliers(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + stop_n_inliers=2, + random_state=0, + ) + ransac_estimator.fit(X, y) + + assert ransac_estimator.n_trials_ == 1 + + +def test_ransac_stop_score(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + stop_score=0, + random_state=0, + ) + ransac_estimator.fit(X, y) + + assert ransac_estimator.n_trials_ == 1 + + +def test_ransac_score(): + X = np.arange(100)[:, None] + y = np.zeros((100,)) + y[0] = 1 + y[1] = 100 + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=0.5, random_state=0 + ) + ransac_estimator.fit(X, y) + + assert ransac_estimator.score(X[2:], y[2:]) == 1 + assert ransac_estimator.score(X[:2], y[:2]) < 1 + + +def test_ransac_predict(): + X = np.arange(100)[:, None] + y = np.zeros((100,)) + y[0] = 1 + y[1] = 100 + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=0.5, random_state=0 + ) + ransac_estimator.fit(X, y) + + assert_array_equal(ransac_estimator.predict(X), np.zeros(100)) + + +def test_ransac_no_valid_data(): + def is_data_valid(X, y): + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, is_data_valid=is_data_valid, max_trials=5 + ) + + msg = "RANSAC could not find a valid consensus set" + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 5 + assert ransac_estimator.n_skips_invalid_model_ == 0 + + +def test_ransac_no_valid_model(): + def is_model_valid(estimator, X, y): + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, is_model_valid=is_model_valid, max_trials=5 + ) + + msg = "RANSAC could not find a valid consensus set" + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 0 + assert ransac_estimator.n_skips_invalid_model_ == 5 + + +def test_ransac_exceed_max_skips(): + def is_data_valid(X, y): + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, is_data_valid=is_data_valid, max_trials=5, max_skips=3 + ) + + msg = "RANSAC skipped more iterations than `max_skips`" + with pytest.raises(ValueError, match=msg): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 4 + assert ransac_estimator.n_skips_invalid_model_ == 0 + + +def test_ransac_warn_exceed_max_skips(): + global cause_skip + cause_skip = False + + def is_data_valid(X, y): + global cause_skip + if not cause_skip: + cause_skip = True + return True + else: + return False + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, 
is_data_valid=is_data_valid, max_skips=3, max_trials=5 + ) + warning_message = ( + "RANSAC found a valid consensus set but exited " + "early due to skipping more iterations than " + "`max_skips`. See estimator attributes for " + "diagnostics." + ) + with pytest.warns(ConvergenceWarning, match=warning_message): + ransac_estimator.fit(X, y) + assert ransac_estimator.n_skips_no_inliers_ == 0 + assert ransac_estimator.n_skips_invalid_data_ == 4 + assert ransac_estimator.n_skips_invalid_model_ == 0 + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSR_CONTAINERS + CSC_CONTAINERS +) +def test_ransac_sparse(sparse_container): + X_sparse = sparse_container(X) + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator.fit(X_sparse, y) + + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_none_estimator(): + estimator = LinearRegression() + + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_none_estimator = RANSACRegressor( + None, min_samples=2, residual_threshold=5, random_state=0 + ) + + ransac_estimator.fit(X, y) + ransac_none_estimator.fit(X, y) + + assert_array_almost_equal( + ransac_estimator.predict(X), ransac_none_estimator.predict(X) + ) + + +def test_ransac_min_n_samples(): + estimator = LinearRegression() + ransac_estimator1 = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator2 = RANSACRegressor( + estimator, + min_samples=2.0 / X.shape[0], + residual_threshold=5, + random_state=0, + ) + ransac_estimator5 = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator6 = RANSACRegressor(estimator, residual_threshold=5, random_state=0) + ransac_estimator7 = RANSACRegressor( + estimator, min_samples=X.shape[0] + 1, residual_threshold=5, random_state=0 + ) + # GH #19390 + ransac_estimator8 = RANSACRegressor( + Ridge(), min_samples=None, residual_threshold=5, random_state=0 + ) + + ransac_estimator1.fit(X, y) + ransac_estimator2.fit(X, y) + ransac_estimator5.fit(X, y) + ransac_estimator6.fit(X, y) + + assert_array_almost_equal( + ransac_estimator1.predict(X), ransac_estimator2.predict(X) + ) + assert_array_almost_equal( + ransac_estimator1.predict(X), ransac_estimator5.predict(X) + ) + assert_array_almost_equal( + ransac_estimator1.predict(X), ransac_estimator6.predict(X) + ) + + with pytest.raises(ValueError): + ransac_estimator7.fit(X, y) + + err_msg = "`min_samples` needs to be explicitly set" + with pytest.raises(ValueError, match=err_msg): + ransac_estimator8.fit(X, y) + + +def test_ransac_multi_dimensional_targets(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + + # 3-D target values + yyy = np.column_stack([y, y, y]) + + # Estimate parameters of corrupted data + ransac_estimator.fit(X, yyy) + + # Ground truth / reference inlier mask + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_residual_loss(): + def loss_multi1(y_true, y_pred): + return np.sum(np.abs(y_true - y_pred), axis=1) + + def 
loss_multi2(y_true, y_pred): + return np.sum((y_true - y_pred) ** 2, axis=1) + + def loss_mono(y_true, y_pred): + return np.abs(y_true - y_pred) + + yyy = np.column_stack([y, y, y]) + + estimator = LinearRegression() + ransac_estimator0 = RANSACRegressor( + estimator, min_samples=2, residual_threshold=5, random_state=0 + ) + ransac_estimator1 = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + random_state=0, + loss=loss_multi1, + ) + ransac_estimator2 = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + random_state=0, + loss=loss_multi2, + ) + + # multi-dimensional + ransac_estimator0.fit(X, yyy) + ransac_estimator1.fit(X, yyy) + ransac_estimator2.fit(X, yyy) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator1.predict(X) + ) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator2.predict(X) + ) + + # one-dimensional + ransac_estimator0.fit(X, y) + ransac_estimator2.loss = loss_mono + ransac_estimator2.fit(X, y) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator2.predict(X) + ) + ransac_estimator3 = RANSACRegressor( + estimator, + min_samples=2, + residual_threshold=5, + random_state=0, + loss="squared_error", + ) + ransac_estimator3.fit(X, y) + assert_array_almost_equal( + ransac_estimator0.predict(X), ransac_estimator2.predict(X) + ) + + +def test_ransac_default_residual_threshold(): + estimator = LinearRegression() + ransac_estimator = RANSACRegressor(estimator, min_samples=2, random_state=0) + + # Estimate parameters of corrupted data + ransac_estimator.fit(X, y) + + # Ground truth / reference inlier mask + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + +def test_ransac_dynamic_max_trials(): + # Numbers hand-calculated and confirmed on page 119 (Table 4.3) in + # Hartley, R.~I. 
and Zisserman, A., 2004, + # Multiple View Geometry in Computer Vision, Second Edition, + # Cambridge University Press, ISBN: 0521540518 + + # e = 0%, min_samples = X + assert _dynamic_max_trials(100, 100, 2, 0.99) == 1 + + # e = 5%, min_samples = 2 + assert _dynamic_max_trials(95, 100, 2, 0.99) == 2 + # e = 10%, min_samples = 2 + assert _dynamic_max_trials(90, 100, 2, 0.99) == 3 + # e = 30%, min_samples = 2 + assert _dynamic_max_trials(70, 100, 2, 0.99) == 7 + # e = 50%, min_samples = 2 + assert _dynamic_max_trials(50, 100, 2, 0.99) == 17 + + # e = 5%, min_samples = 8 + assert _dynamic_max_trials(95, 100, 8, 0.99) == 5 + # e = 10%, min_samples = 8 + assert _dynamic_max_trials(90, 100, 8, 0.99) == 9 + # e = 30%, min_samples = 8 + assert _dynamic_max_trials(70, 100, 8, 0.99) == 78 + # e = 50%, min_samples = 8 + assert _dynamic_max_trials(50, 100, 8, 0.99) == 1177 + + # e = 0%, min_samples = 10 + assert _dynamic_max_trials(1, 100, 10, 0) == 0 + assert _dynamic_max_trials(1, 100, 10, 1) == float("inf") + + +def test_ransac_fit_sample_weight(): + ransac_estimator = RANSACRegressor(random_state=0) + n_samples = y.shape[0] + weights = np.ones(n_samples) + ransac_estimator.fit(X, y, sample_weight=weights) + # sanity check + assert ransac_estimator.inlier_mask_.shape[0] == n_samples + + ref_inlier_mask = np.ones_like(ransac_estimator.inlier_mask_).astype(np.bool_) + ref_inlier_mask[outliers] = False + # check that mask is correct + assert_array_equal(ransac_estimator.inlier_mask_, ref_inlier_mask) + + # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + random_state = check_random_state(0) + X_ = random_state.randint(0, 200, [10, 1]) + y_ = np.ndarray.flatten(0.2 * X_ + 2) + sample_weight = random_state.randint(0, 10, 10) + outlier_X = random_state.randint(0, 1000, [1, 1]) + outlier_weight = random_state.randint(0, 10, 1) + outlier_y = random_state.randint(-1000, 0, 1) + + X_flat = np.append( + np.repeat(X_, sample_weight, axis=0), + np.repeat(outlier_X, outlier_weight, axis=0), + axis=0, + ) + y_flat = np.ndarray.flatten( + np.append( + np.repeat(y_, sample_weight, axis=0), + np.repeat(outlier_y, outlier_weight, axis=0), + axis=0, + ) + ) + ransac_estimator.fit(X_flat, y_flat) + ref_coef_ = ransac_estimator.estimator_.coef_ + + sample_weight = np.append(sample_weight, outlier_weight) + X_ = np.append(X_, outlier_X, axis=0) + y_ = np.append(y_, outlier_y) + ransac_estimator.fit(X_, y_, sample_weight=sample_weight) + + assert_allclose(ransac_estimator.estimator_.coef_, ref_coef_) + + # check that if estimator.fit doesn't support + # sample_weight, raises error + estimator = OrthogonalMatchingPursuit() + ransac_estimator = RANSACRegressor(estimator, min_samples=10) + + err_msg = f"{estimator.__class__.__name__} does not support sample_weight." 
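+    # Side note on test_ransac_dynamic_max_trials above (sketch, illustration
+    # only): the hand-checked values follow the usual RANSAC bound
+    #   max_trials = ceil(log(1 - probability) / log(1 - inlier_ratio ** min_samples))
+    # e.g. ceil(log(0.01) / log(1 - 0.5 ** 8)) == 1177 for an inlier ratio of
+    # 50% and min_samples=8.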
+ with pytest.raises(ValueError, match=err_msg): + ransac_estimator.fit(X, y, sample_weight=weights) + + +def test_ransac_final_model_fit_sample_weight(): + X, y = make_regression(n_samples=1000, random_state=10) + rng = check_random_state(42) + sample_weight = rng.randint(1, 4, size=y.shape[0]) + sample_weight = sample_weight / sample_weight.sum() + ransac = RANSACRegressor(random_state=0) + ransac.fit(X, y, sample_weight=sample_weight) + + final_model = LinearRegression() + mask_samples = ransac.inlier_mask_ + final_model.fit( + X[mask_samples], y[mask_samples], sample_weight=sample_weight[mask_samples] + ) + + assert_allclose(ransac.estimator_.coef_, final_model.coef_, atol=1e-12) + + +def test_perfect_horizontal_line(): + """Check that we can fit a line where all samples are inliers. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19497 + """ + X = np.arange(100)[:, None] + y = np.zeros((100,)) + + estimator = LinearRegression() + ransac_estimator = RANSACRegressor(estimator, random_state=0) + ransac_estimator.fit(X, y) + + assert_allclose(ransac_estimator.estimator_.coef_, 0.0) + assert_allclose(ransac_estimator.estimator_.intercept_, 0.0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ridge.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ridge.py new file mode 100644 index 0000000000000000000000000000000000000000..24515195fb7ccd674091ab6b90a91b43a59a14aa --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_ridge.py @@ -0,0 +1,2380 @@ +import warnings +from itertools import product + +import numpy as np +import pytest +from scipy import linalg + +from sklearn import config_context, datasets +from sklearn.base import clone +from sklearn.datasets import ( + make_classification, + make_low_rank_matrix, + make_multilabel_classification, + make_regression, +) +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import ( + LinearRegression, + Ridge, + RidgeClassifier, + RidgeClassifierCV, + RidgeCV, + ridge_regression, +) +from sklearn.linear_model._ridge import ( + _check_gcv_mode, + _RidgeGCV, + _solve_cholesky, + _solve_cholesky_kernel, + _solve_lbfgs, + _solve_svd, + _X_CenterStackOp, +) +from sklearn.metrics import get_scorer, make_scorer, mean_squared_error +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + KFold, + LeaveOneOut, + cross_val_predict, +) +from sklearn.preprocessing import minmax_scale +from sklearn.utils import check_random_state +from sklearn.utils._array_api import ( + _NUMPY_NAMESPACE_NAMES, + _atol_for_type, + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, + yield_namespaces, +) +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, + check_array_api_input_and_values, +) +from sklearn.utils.fixes import ( + _IS_32BIT, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +SOLVERS = ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga"] +SPARSE_SOLVERS_WITH_INTERCEPT = ("sparse_cg", "sag") +SPARSE_SOLVERS_WITHOUT_INTERCEPT = ("sparse_cg", "cholesky", "lsqr", "sag", "saga") + +diabetes = datasets.load_diabetes() +X_diabetes, y_diabetes = diabetes.data, 
diabetes.target +ind = np.arange(X_diabetes.shape[0]) +rng = np.random.RandomState(0) +rng.shuffle(ind) +ind = ind[:200] +X_diabetes, y_diabetes = X_diabetes[ind], y_diabetes[ind] + +iris = datasets.load_iris() +X_iris, y_iris = iris.data, iris.target + + +def _accuracy_callable(y_test, y_pred, **kwargs): + return np.mean(y_test == y_pred) + + +def _mean_squared_error_callable(y_test, y_pred): + return ((y_test - y_pred) ** 2).mean() + + +@pytest.fixture(params=["long", "wide"]) +def ols_ridge_dataset(global_random_seed, request): + """Dataset with OLS and Ridge solutions, well conditioned X. + + The construction is based on the SVD decomposition of X = U S V'. + + Parameters + ---------- + type : {"long", "wide"} + If "long", then n_samples > n_features. + If "wide", then n_features > n_samples. + + For "wide", we return the minimum norm solution w = X' (XX')^-1 y: + + min ||w||_2 subject to X w = y + + Returns + ------- + X : ndarray + Last column of 1, i.e. intercept. + y : ndarray + coef_ols : ndarray of shape + Minimum norm OLS solutions, i.e. min ||X w - y||_2_2 (with minimum ||w||_2 in + case of ambiguity) + Last coefficient is intercept. + coef_ridge : ndarray of shape (5,) + Ridge solution with alpha=1, i.e. min ||X w - y||_2_2 + ||w||_2^2. + Last coefficient is intercept. + """ + # Make larger dim more than double as big as the smaller one. + # This helps when constructing singular matrices like (X, X). + if request.param == "long": + n_samples, n_features = 12, 4 + else: + n_samples, n_features = 4, 12 + k = min(n_samples, n_features) + rng = np.random.RandomState(global_random_seed) + X = make_low_rank_matrix( + n_samples=n_samples, n_features=n_features, effective_rank=k, random_state=rng + ) + X[:, -1] = 1 # last columns acts as intercept + U, s, Vt = linalg.svd(X) + assert np.all(s > 1e-3) # to be sure + U1, U2 = U[:, :k], U[:, k:] + Vt1, _ = Vt[:k, :], Vt[k:, :] + + if request.param == "long": + # Add a term that vanishes in the product X'y + coef_ols = rng.uniform(low=-10, high=10, size=n_features) + y = X @ coef_ols + y += U2 @ rng.normal(size=n_samples - n_features) ** 2 + else: + y = rng.uniform(low=-10, high=10, size=n_samples) + # w = X'(XX')^-1 y = V s^-1 U' y + coef_ols = Vt1.T @ np.diag(1 / s) @ U1.T @ y + + # Add penalty alpha * ||coef||_2^2 for alpha=1 and solve via normal equations. + # Note that the problem is well conditioned such that we get accurate results. + alpha = 1 + d = alpha * np.identity(n_features) + d[-1, -1] = 0 # intercept gets no penalty + coef_ridge = linalg.solve(X.T @ X + d, X.T @ y) + + # To be sure + R_OLS = y - X @ coef_ols + R_Ridge = y - X @ coef_ridge + assert np.linalg.norm(R_OLS) < np.linalg.norm(R_Ridge) + + return X, y, coef_ols, coef_ridge + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression(solver, fit_intercept, ols_ridge_dataset, global_random_seed): + """Test that Ridge converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + """ + X, y, _, coef = ols_ridge_dataset + alpha = 1.0 # because ols_ridge_dataset uses this. + params = dict( + alpha=alpha, + fit_intercept=True, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + + # Calculate residuals and R2. 
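+    # Sketch of what follows (illustration only): R2 is computed against the
+    # null model that always predicts the mean of y,
+    #   R2 = 1 - sum((y - X @ coef) ** 2) / sum((y - mean(y)) ** 2)
+    # and the fitted estimator must reproduce it through model.score(X, y).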
+ res_null = y - np.mean(y) + res_Ridge = y - X @ coef + R2_Ridge = 1 - np.sum(res_Ridge**2) / np.sum(res_null**2) + + model = Ridge(**params) + X = X[:, :-1] # remove intercept + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + model.fit(X, y) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + assert model.score(X, y) == pytest.approx(R2_Ridge) + + # Same with sample_weight. + model = Ridge(**params).fit(X, y, sample_weight=np.ones(X.shape[0])) + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + assert model.score(X, y) == pytest.approx(R2_Ridge) + + assert model.solver_ == solver + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_hstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that Ridge converges for all solvers to correct solution on hstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2. + For long X, [X, X] is a singular matrix. + """ + X, y, _, coef = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because ols_ridge_dataset uses this. + + model = Ridge( + alpha=alpha / 2, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + X = X[:, :-1] # remove intercept + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1) + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + model.fit(X, y) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + # coefficients are not all on the same magnitude, adding a small atol to + # make this test less brittle + assert_allclose(model.coef_, np.r_[coef, coef], atol=1e-8) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_vstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that Ridge converges for all solvers to correct solution on vstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X], [y] + [X], [y] with 2 * alpha. + For wide X, [X', X'] is a singular matrix. + """ + X, y, _, coef = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because ols_ridge_dataset uses this. 
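+    # Why alpha/2 with X := 0.5 * [X, X] reproduces the original fit (sketch,
+    # illustration only): for v = [w, w] we have (0.5 * [X, X]) @ v = X @ w and
+    # (alpha / 2) * ||v||_2^2 = alpha * ||w||_2^2, so the two penalized
+    # objectives agree and the duplicated coefficients fitted below must match
+    # the reference coef.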
+ + model = Ridge( + alpha=2 * alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + X = X[:, :-1] # remove intercept + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + model.fit(X, y) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + # coefficients are not all on the same magnitude, adding a small atol to + # make this test less brittle + assert_allclose(model.coef_, coef, atol=1e-8) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_unpenalized( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that unpenalized Ridge = OLS converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + Note: This checks the minimum norm solution for wide X, i.e. + n_samples < n_features: + min ||w||_2 subject to X w = y + """ + X, y, coef, _ = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 0 # OLS + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + + model = Ridge(**params) + # Note that cholesky might give a warning: "Singular matrix in solving dual + # problem. Using least-squares solution instead." + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + model.fit(X, y) + + # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails + # for the wide/fat case with n_features > n_samples. The current Ridge solvers do + # NOT return the minimum norm solution with fit_intercept=True. + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + else: + # As it is an underdetermined problem, residuals = 0. This shows that we get + # a solution to X w = y .... + assert_allclose(model.predict(X), y) + assert_allclose(X @ coef + intercept, y) + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef] + ) + + pytest.xfail(reason="Ridge does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_unpenalized_hstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that unpenalized Ridge = OLS converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + OLS fit on [X] is the same as fit on [X, X]/2. 
+ For long X, [X, X] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to min ||X w - y||_2 + """ + X, y, coef, _ = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 0 # OLS + + model = Ridge( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + model.fit(X, y) + + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + if solver == "cholesky": + # Cholesky is a bad choice for singular X. + pytest.skip() + assert_allclose(model.coef_, np.r_[coef, coef]) + else: + # FIXME: Same as in test_ridge_regression_unpenalized. + # As it is an underdetermined problem, residuals = 0. This shows that we get + # a solution to X w = y .... + assert_allclose(model.predict(X), y) + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef, coef] + ) + + pytest.xfail(reason="Ridge does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, np.r_[coef, coef]) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_ridge_regression_unpenalized_vstacked_X( + solver, fit_intercept, ols_ridge_dataset, global_random_seed +): + """Test that unpenalized Ridge = OLS converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + OLS fit on [X] is the same as fit on [X], [y] + [X], [y]. + For wide X, [X', X'] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to X w = y + """ + X, y, coef, _ = ols_ridge_dataset + n_samples, n_features = X.shape + alpha = 0 # OLS + + model = Ridge( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ("sag", "saga") else 1e-10, + random_state=global_random_seed, + ) + + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + model.fit(X, y) + + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + else: + # FIXME: Same as in test_ridge_regression_unpenalized. + # As it is an underdetermined problem, residuals = 0. This shows that we get + # a solution to X w = y .... + assert_allclose(model.predict(X), y) + # But it is not the minimum norm solution. (This should be equal.) 
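+        # (For a consistent system X w = y, the minimum L2-norm solution is the
+        # pseudoinverse solution np.linalg.pinv(X) @ y; the norm comparison below
+        # shows the fitted coefficients are a different, larger-norm interpolant.)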
+ assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef] + ) + + pytest.xfail(reason="Ridge does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +@pytest.mark.parametrize("alpha", [1.0, 1e-2]) +def test_ridge_regression_sample_weights( + solver, + fit_intercept, + sparse_container, + alpha, + ols_ridge_dataset, + global_random_seed, +): + """Test that Ridge with sample weights gives correct results. + + We use the following trick: + ||y - Xw||_2 = (z - Aw)' W (z - Aw) + for z=[y, y], A' = [X', X'] (vstacked), and W[:n/2] + W[n/2:] = 1, W=diag(W) + """ + if sparse_container is not None: + if fit_intercept and solver not in SPARSE_SOLVERS_WITH_INTERCEPT: + pytest.skip() + elif not fit_intercept and solver not in SPARSE_SOLVERS_WITHOUT_INTERCEPT: + pytest.skip() + X, y, _, coef = ols_ridge_dataset + n_samples, n_features = X.shape + sw = rng.uniform(low=0, high=1, size=n_samples) + + model = Ridge( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-15 if solver in ["sag", "saga"] else 1e-10, + max_iter=100_000, + random_state=global_random_seed, + ) + X = X[:, :-1] # remove intercept + X = np.concatenate((X, X), axis=0) + y = np.r_[y, y] + sw = np.r_[sw, 1 - sw] * alpha + if fit_intercept: + intercept = coef[-1] + else: + X = X - X.mean(axis=0) + y = y - y.mean() + intercept = 0 + if sparse_container is not None: + X = sparse_container(X) + model.fit(X, y, sample_weight=sw) + coef = coef[:-1] + + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + +def test_primal_dual_relationship(): + y = y_diabetes.reshape(-1, 1) + coef = _solve_cholesky(X_diabetes, y, alpha=[1e-2]) + K = np.dot(X_diabetes, X_diabetes.T) + dual_coef = _solve_cholesky_kernel(K, y, alpha=[1e-2]) + coef2 = np.dot(X_diabetes.T, dual_coef).T + assert_array_almost_equal(coef, coef2) + + +def test_ridge_regression_convergence_fail(): + rng = np.random.RandomState(0) + y = rng.randn(5) + X = rng.randn(5, 10) + warning_message = r"sparse_cg did not converge after [0-9]+ iterations." 
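+    # With tol=0.0 the stopping criterion is effectively unreachable, so the
+    # conjugate gradient iterations hit their limit and ridge_regression is
+    # expected to emit the ConvergenceWarning matched above.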
+ with pytest.warns(ConvergenceWarning, match=warning_message): + ridge_regression( + X, y, alpha=1.0, solver="sparse_cg", tol=0.0, max_iter=None, verbose=1 + ) + + +def test_ridge_shapes_type(): + # Test shape of coef_ and intercept_ + rng = np.random.RandomState(0) + n_samples, n_features = 5, 10 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + Y1 = y[:, np.newaxis] + Y = np.c_[y, 1 + y] + + ridge = Ridge() + + ridge.fit(X, y) + assert ridge.coef_.shape == (n_features,) + assert ridge.intercept_.shape == () + assert isinstance(ridge.coef_, np.ndarray) + assert isinstance(ridge.intercept_, float) + + ridge.fit(X, Y1) + assert ridge.coef_.shape == (n_features,) + assert ridge.intercept_.shape == (1,) + assert isinstance(ridge.coef_, np.ndarray) + assert isinstance(ridge.intercept_, np.ndarray) + + ridge.fit(X, Y) + assert ridge.coef_.shape == (2, n_features) + assert ridge.intercept_.shape == (2,) + assert isinstance(ridge.coef_, np.ndarray) + assert isinstance(ridge.intercept_, np.ndarray) + + +def test_ridge_intercept(): + # Test intercept with multiple targets GH issue #708 + rng = np.random.RandomState(0) + n_samples, n_features = 5, 10 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + Y = np.c_[y, 1.0 + y] + + ridge = Ridge() + + ridge.fit(X, y) + intercept = ridge.intercept_ + + ridge.fit(X, Y) + assert_almost_equal(ridge.intercept_[0], intercept) + assert_almost_equal(ridge.intercept_[1], intercept + 1.0) + + +def test_ridge_vs_lstsq(): + # On alpha=0., Ridge and OLS yield the same solution. + + rng = np.random.RandomState(0) + # we need more samples than features + n_samples, n_features = 5, 4 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + + ridge = Ridge(alpha=0.0, fit_intercept=False) + ols = LinearRegression(fit_intercept=False) + + ridge.fit(X, y) + ols.fit(X, y) + assert_almost_equal(ridge.coef_, ols.coef_) + + ridge.fit(X, y) + ols.fit(X, y) + assert_almost_equal(ridge.coef_, ols.coef_) + + +def test_ridge_individual_penalties(): + # Tests the ridge object using individual penalties + + rng = np.random.RandomState(42) + + n_samples, n_features, n_targets = 20, 10, 5 + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples, n_targets) + + penalties = np.arange(n_targets) + + coef_cholesky = np.array( + [ + Ridge(alpha=alpha, solver="cholesky").fit(X, target).coef_ + for alpha, target in zip(penalties, y.T) + ] + ) + + coefs_indiv_pen = [ + Ridge(alpha=penalties, solver=solver, tol=1e-12).fit(X, y).coef_ + for solver in ["svd", "sparse_cg", "lsqr", "cholesky", "sag", "saga"] + ] + for coef_indiv_pen in coefs_indiv_pen: + assert_array_almost_equal(coef_cholesky, coef_indiv_pen) + + # Test error is raised when number of targets and penalties do not match. 
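+    # (penalties[:-1] leaves 4 alphas for the 5 targets, hence the "4 != 5"
+    # message checked below.)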
+ ridge = Ridge(alpha=penalties[:-1]) + err_msg = "Number of targets and number of penalties do not correspond: 4 != 5" + with pytest.raises(ValueError, match=err_msg): + ridge.fit(X, y) + + +@pytest.mark.parametrize("n_col", [(), (1,), (3,)]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_X_CenterStackOp(n_col, csr_container): + rng = np.random.RandomState(0) + X = rng.randn(11, 8) + X_m = rng.randn(8) + sqrt_sw = rng.randn(len(X)) + Y = rng.randn(11, *n_col) + A = rng.randn(9, *n_col) + operator = _X_CenterStackOp(csr_container(X), X_m, sqrt_sw) + reference_operator = np.hstack([X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]]) + assert_allclose(reference_operator.dot(A), operator.dot(A)) + assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y)) + + +@pytest.mark.parametrize("shape", [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize("uniform_weights", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_compute_gram(shape, uniform_weights, csr_container): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_gram = X_centered.dot(X_centered.T) + X_sparse = csr_container(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_gram, computed_mean = gcv._compute_gram(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_gram, computed_gram) + + +@pytest.mark.parametrize("shape", [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize("uniform_weights", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_compute_covariance(shape, uniform_weights, csr_container): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_covariance = X_centered.T.dot(X_centered) + X_sparse = csr_container(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_covariance, computed_cov) + + +def _make_sparse_offset_regression( + n_samples=100, + n_features=100, + proportion_nonzero=0.5, + n_informative=10, + n_targets=1, + bias=13.0, + X_offset=30.0, + noise=30.0, + shuffle=True, + coef=False, + positive=False, + random_state=None, +): + X, y, c = make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_targets=n_targets, + bias=bias, + noise=noise, + shuffle=shuffle, + coef=True, + random_state=random_state, + ) + if n_features == 1: + c = np.asarray([c]) + X += X_offset + mask = ( + np.random.RandomState(random_state).binomial(1, proportion_nonzero, X.shape) > 0 + ) + removed_X = X.copy() + X[~mask] = 0.0 + removed_X[mask] = 0.0 + y -= removed_X.dot(c) + if positive: + y += X.dot(np.abs(c) + 1 - c) + c = np.abs(c) + 1 + if n_features == 1: + c = c[0] + if coef: + return X, y, c + return X, y + + +@pytest.mark.parametrize( + "solver, sparse_container", + ( + (solver, sparse_container) + for (solver, sparse_container) in product( + ["cholesky", "sag", "sparse_cg", "lsqr", "saga", "ridgecv"], + [None] + CSR_CONTAINERS, + ) + if 
sparse_container is None or solver in ["sparse_cg", "ridgecv"] + ), +) +@pytest.mark.parametrize( + "n_samples,dtype,proportion_nonzero", + [(20, "float32", 0.1), (40, "float32", 1.0), (20, "float64", 0.2)], +) +def test_solver_consistency( + solver, proportion_nonzero, n_samples, dtype, sparse_container, global_random_seed +): + alpha = 1.0 + noise = 50.0 if proportion_nonzero > 0.9 else 500.0 + X, y = _make_sparse_offset_regression( + bias=10, + n_features=30, + proportion_nonzero=proportion_nonzero, + noise=noise, + random_state=global_random_seed, + n_samples=n_samples, + ) + # Manually scale the data to avoid pathological cases. We use + # minmax_scale to deal with the sparse case without breaking + # the sparsity pattern. + X = minmax_scale(X) + + svd_ridge = Ridge(solver="svd", alpha=alpha).fit(X, y) + X = X.astype(dtype, copy=False) + y = y.astype(dtype, copy=False) + if sparse_container is not None: + X = sparse_container(X) + if solver == "ridgecv": + ridge = RidgeCV(alphas=[alpha]) + else: + if solver.startswith("sag"): + # Avoid ConvergenceWarning for sag and saga solvers. + tol = 1e-7 + max_iter = 100_000 + else: + tol = 1e-10 + max_iter = None + + ridge = Ridge( + alpha=alpha, + solver=solver, + max_iter=max_iter, + tol=tol, + random_state=global_random_seed, + ) + ridge.fit(X, y) + assert_allclose(ridge.coef_, svd_ridge.coef_, atol=1e-3, rtol=1e-3) + assert_allclose(ridge.intercept_, svd_ridge.intercept_, atol=1e-3, rtol=1e-3) + + +@pytest.mark.parametrize("gcv_mode", ["svd", "eigen"]) +@pytest.mark.parametrize("X_container", [np.asarray] + CSR_CONTAINERS) +@pytest.mark.parametrize("X_shape", [(11, 8), (11, 20)]) +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize( + "y_shape, noise", + [ + ((11,), 1.0), + ((11, 1), 30.0), + ((11, 3), 150.0), + ], +) +def test_ridge_gcv_vs_ridge_loo_cv( + gcv_mode, X_container, X_shape, y_shape, fit_intercept, noise +): + n_samples, n_features = X_shape + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=n_samples, + n_features=n_features, + n_targets=n_targets, + random_state=0, + shuffle=False, + noise=noise, + n_informative=5, + ) + y = y.reshape(y_shape) + + alphas = [1e-3, 0.1, 1.0, 10.0, 1e3] + loo_ridge = RidgeCV( + cv=n_samples, + fit_intercept=fit_intercept, + alphas=alphas, + scoring="neg_mean_squared_error", + ) + gcv_ridge = RidgeCV( + gcv_mode=gcv_mode, + fit_intercept=fit_intercept, + alphas=alphas, + ) + + loo_ridge.fit(X, y) + + X_gcv = X_container(X) + gcv_ridge.fit(X_gcv, y) + + assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_) + assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3) + + +def test_ridge_loo_cv_asym_scoring(): + # checking on asymmetric scoring + scoring = "explained_variance" + n_samples, n_features = 10, 5 + n_targets = 1 + X, y = _make_sparse_offset_regression( + n_samples=n_samples, + n_features=n_features, + n_targets=n_targets, + random_state=0, + shuffle=False, + noise=1, + n_informative=5, + ) + + alphas = [1e-3, 0.1, 1.0, 10.0, 1e3] + loo_ridge = RidgeCV( + cv=n_samples, fit_intercept=True, alphas=alphas, scoring=scoring + ) + + gcv_ridge = RidgeCV(fit_intercept=True, alphas=alphas, scoring=scoring) + + loo_ridge.fit(X, y) + gcv_ridge.fit(X, y) + + assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_), ( + f"{gcv_ridge.alpha_=}, {loo_ridge.alpha_=}" + ) + assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3) + 
assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize("gcv_mode", ["svd", "eigen"]) +@pytest.mark.parametrize("X_container", [np.asarray] + CSR_CONTAINERS) +@pytest.mark.parametrize("n_features", [8, 20]) +@pytest.mark.parametrize( + "y_shape, fit_intercept, noise", + [ + ((11,), True, 1.0), + ((11, 1), True, 20.0), + ((11, 3), True, 150.0), + ((11, 3), False, 30.0), + ], +) +def test_ridge_gcv_sample_weights( + gcv_mode, X_container, fit_intercept, n_features, y_shape, noise +): + alphas = [1e-3, 0.1, 1.0, 10.0, 1e3] + rng = np.random.RandomState(0) + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=11, + n_features=n_features, + n_targets=n_targets, + random_state=0, + shuffle=False, + noise=noise, + ) + y = y.reshape(y_shape) + + sample_weight = 3 * rng.randn(len(X)) + sample_weight = (sample_weight - sample_weight.min() + 1).astype(int) + indices = np.repeat(np.arange(X.shape[0]), sample_weight) + sample_weight = sample_weight.astype(float) + X_tiled, y_tiled = X[indices], y[indices] + + cv = GroupKFold(n_splits=X.shape[0]) + splits = cv.split(X_tiled, y_tiled, groups=indices) + kfold = RidgeCV( + alphas=alphas, + cv=splits, + scoring="neg_mean_squared_error", + fit_intercept=fit_intercept, + ) + kfold.fit(X_tiled, y_tiled) + + ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept) + splits = cv.split(X_tiled, y_tiled, groups=indices) + predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits) + if predictions.shape != y_tiled.shape: + predictions = predictions.reshape(y_tiled.shape) + kfold_errors = (y_tiled - predictions) ** 2 + kfold_errors = [ + np.sum(kfold_errors[indices == i], axis=0) for i in np.arange(X.shape[0]) + ] + kfold_errors = np.asarray(kfold_errors) + + X_gcv = X_container(X) + gcv_ridge = RidgeCV( + alphas=alphas, + store_cv_results=True, + gcv_mode=gcv_mode, + fit_intercept=fit_intercept, + ) + gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight) + if len(y_shape) == 2: + gcv_errors = gcv_ridge.cv_results_[:, :, alphas.index(kfold.alpha_)] + else: + gcv_errors = gcv_ridge.cv_results_[:, alphas.index(kfold.alpha_)] + + assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_) + assert_allclose(gcv_errors, kfold_errors, rtol=1e-3) + assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +@pytest.mark.parametrize( + "mode, mode_n_greater_than_p, mode_p_greater_than_n", + [ + (None, "svd", "eigen"), + ("auto", "svd", "eigen"), + ("eigen", "eigen", "eigen"), + ("svd", "svd", "svd"), + ], +) +def test_check_gcv_mode_choice( + sparse_container, mode, mode_n_greater_than_p, mode_p_greater_than_n +): + X, _ = make_regression(n_samples=5, n_features=2) + if sparse_container is not None: + X = sparse_container(X) + assert _check_gcv_mode(X, mode) == mode_n_greater_than_p + assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n + + +def _test_ridge_loo(sparse_container): + # test that can work with both dense or sparse matrices + n_samples = X_diabetes.shape[0] + + ret = [] + + if sparse_container is None: + X, fit_intercept = X_diabetes, True + else: + X, fit_intercept = sparse_container(X_diabetes), False + ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept) + + # check best alpha + ridge_gcv.fit(X, y_diabetes) + alpha_ = ridge_gcv.alpha_ + ret.append(alpha_) + + # check that we get same best alpha with 
custom loss_func + f = ignore_warnings + scoring = make_scorer(mean_squared_error, greater_is_better=False) + ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring) + f(ridge_gcv2.fit)(X, y_diabetes) + assert ridge_gcv2.alpha_ == pytest.approx(alpha_) + + # check that we get same best alpha with custom score_func + def func(x, y): + return -mean_squared_error(x, y) + + scoring = make_scorer(func) + ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring) + f(ridge_gcv3.fit)(X, y_diabetes) + assert ridge_gcv3.alpha_ == pytest.approx(alpha_) + + # check that we get same best alpha with a scorer + scorer = get_scorer("neg_mean_squared_error") + ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer) + ridge_gcv4.fit(X, y_diabetes) + assert ridge_gcv4.alpha_ == pytest.approx(alpha_) + + # check that we get same best alpha with sample weights + if sparse_container is None: + ridge_gcv.fit(X, y_diabetes, sample_weight=np.ones(n_samples)) + assert ridge_gcv.alpha_ == pytest.approx(alpha_) + + # simulate several responses + Y = np.vstack((y_diabetes, y_diabetes)).T + + ridge_gcv.fit(X, Y) + Y_pred = ridge_gcv.predict(X) + ridge_gcv.fit(X, y_diabetes) + y_pred = ridge_gcv.predict(X) + + assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred, rtol=1e-5) + + return ret + + +def _test_ridge_cv(sparse_container): + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + ridge_cv = RidgeCV() + ridge_cv.fit(X, y_diabetes) + ridge_cv.predict(X) + + assert len(ridge_cv.coef_.shape) == 1 + assert type(ridge_cv.intercept_) is np.float64 + + cv = KFold(5) + ridge_cv.set_params(cv=cv) + ridge_cv.fit(X, y_diabetes) + ridge_cv.predict(X) + + assert len(ridge_cv.coef_.shape) == 1 + assert type(ridge_cv.intercept_) is np.float64 + + +@pytest.mark.parametrize( + "ridge, make_dataset", + [ + (RidgeCV(store_cv_results=False), make_regression), + (RidgeClassifierCV(store_cv_results=False), make_classification), + ], +) +def test_ridge_gcv_cv_results_not_stored(ridge, make_dataset): + # Check that `cv_results_` is not stored when store_cv_results is False + X, y = make_dataset(n_samples=6, random_state=42) + ridge.fit(X, y) + assert not hasattr(ridge, "cv_results_") + + +@pytest.mark.parametrize( + "ridge, make_dataset", + [(RidgeCV(), make_regression), (RidgeClassifierCV(), make_classification)], +) +@pytest.mark.parametrize("cv", [None, 3]) +def test_ridge_best_score(ridge, make_dataset, cv): + # check that the best_score_ is store + X, y = make_dataset(n_samples=6, random_state=42) + ridge.set_params(store_cv_results=False, cv=cv) + ridge.fit(X, y) + assert hasattr(ridge, "best_score_") + assert isinstance(ridge.best_score_, float) + + +def test_ridge_cv_individual_penalties(): + # Tests the ridge_cv object optimizing individual penalties for each target + + rng = np.random.RandomState(42) + + # Create random dataset with multiple targets. Each target should have + # a different optimal alpha. 
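+    # Each target is injected into X with a different magnitude (1.0, 0.05 and
+    # 0.001 below), so the signal-to-noise ratio, and therefore the best penalty,
+    # differs per target.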
+ n_samples, n_features, n_targets = 20, 5, 3 + y = rng.randn(n_samples, n_targets) + X = ( + np.dot(y[:, [0]], np.ones((1, n_features))) + + np.dot(y[:, [1]], 0.05 * np.ones((1, n_features))) + + np.dot(y[:, [2]], 0.001 * np.ones((1, n_features))) + + rng.randn(n_samples, n_features) + ) + + alphas = (1, 100, 1000) + + # Find optimal alpha for each target + optimal_alphas = [RidgeCV(alphas=alphas).fit(X, target).alpha_ for target in y.T] + + # Find optimal alphas for all targets simultaneously + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True).fit(X, y) + assert_array_equal(optimal_alphas, ridge_cv.alpha_) + + # The resulting regression weights should incorporate the different + # alpha values. + assert_array_almost_equal( + Ridge(alpha=ridge_cv.alpha_).fit(X, y).coef_, ridge_cv.coef_ + ) + + # Test shape of alpha_ and cv_results_ + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True, store_cv_results=True).fit( + X, y + ) + assert ridge_cv.alpha_.shape == (n_targets,) + assert ridge_cv.best_score_.shape == (n_targets,) + assert ridge_cv.cv_results_.shape == (n_samples, len(alphas), n_targets) + + # Test edge case of there being only one alpha value + ridge_cv = RidgeCV(alphas=1, alpha_per_target=True, store_cv_results=True).fit(X, y) + assert ridge_cv.alpha_.shape == (n_targets,) + assert ridge_cv.best_score_.shape == (n_targets,) + assert ridge_cv.cv_results_.shape == (n_samples, n_targets, 1) + + # Test edge case of there being only one target + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True, store_cv_results=True).fit( + X, y[:, 0] + ) + assert np.isscalar(ridge_cv.alpha_) + assert np.isscalar(ridge_cv.best_score_) + assert ridge_cv.cv_results_.shape == (n_samples, len(alphas)) + + # Try with a custom scoring function + ridge_cv = RidgeCV(alphas=alphas, alpha_per_target=True, scoring="r2").fit(X, y) + assert_array_equal(optimal_alphas, ridge_cv.alpha_) + assert_array_almost_equal( + Ridge(alpha=ridge_cv.alpha_).fit(X, y).coef_, ridge_cv.coef_ + ) + + # Using a custom CV object should throw an error in combination with + # alpha_per_target=True + ridge_cv = RidgeCV(alphas=alphas, cv=LeaveOneOut(), alpha_per_target=True) + msg = "cv!=None and alpha_per_target=True are incompatible" + with pytest.raises(ValueError, match=msg): + ridge_cv.fit(X, y) + ridge_cv = RidgeCV(alphas=alphas, cv=6, alpha_per_target=True) + with pytest.raises(ValueError, match=msg): + ridge_cv.fit(X, y) + + +def _test_ridge_diabetes(sparse_container): + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + ridge = Ridge(fit_intercept=False) + ridge.fit(X, y_diabetes) + return np.round(ridge.score(X, y_diabetes), 5) + + +def _test_multi_ridge_diabetes(sparse_container): + # simulate several responses + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + Y = np.vstack((y_diabetes, y_diabetes)).T + n_features = X_diabetes.shape[1] + + ridge = Ridge(fit_intercept=False) + ridge.fit(X, Y) + assert ridge.coef_.shape == (2, n_features) + Y_pred = ridge.predict(X) + ridge.fit(X, y_diabetes) + y_pred = ridge.predict(X) + assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3) + + +def _test_ridge_classifiers(sparse_container): + n_classes = np.unique(y_iris).shape[0] + n_features = X_iris.shape[1] + X = X_iris if sparse_container is None else sparse_container(X_iris) + + for reg in (RidgeClassifier(), RidgeClassifierCV()): + reg.fit(X, y_iris) + assert reg.coef_.shape == (n_classes, n_features) + y_pred = reg.predict(X) + assert 
np.mean(y_iris == y_pred) > 0.79 + + cv = KFold(5) + reg = RidgeClassifierCV(cv=cv) + reg.fit(X, y_iris) + y_pred = reg.predict(X) + assert np.mean(y_iris == y_pred) >= 0.8 + + +@pytest.mark.parametrize("scoring", [None, "accuracy", _accuracy_callable]) +@pytest.mark.parametrize("cv", [None, KFold(5)]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +def test_ridge_classifier_with_scoring(sparse_container, scoring, cv): + # non-regression test for #14672 + # check that RidgeClassifierCV works with all sort of scoring and + # cross-validation + X = X_iris if sparse_container is None else sparse_container(X_iris) + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + clf = RidgeClassifierCV(scoring=scoring_, cv=cv) + # Smoke test to check that fit/predict does not raise error + clf.fit(X, y_iris).predict(X) + + +@pytest.mark.parametrize("cv", [None, KFold(5)]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +def test_ridge_regression_custom_scoring(sparse_container, cv): + # check that custom scoring is working as expected + # check the tie breaking strategy (keep the first alpha tried) + + def _dummy_score(y_test, y_pred, **kwargs): + return 0.42 + + X = X_iris if sparse_container is None else sparse_container(X_iris) + alphas = np.logspace(-2, 2, num=5) + clf = RidgeClassifierCV(alphas=alphas, scoring=make_scorer(_dummy_score), cv=cv) + clf.fit(X, y_iris) + assert clf.best_score_ == pytest.approx(0.42) + # In case of tie score, the first alphas will be kept + assert clf.alpha_ == pytest.approx(alphas[0]) + + +def _test_tolerance(sparse_container): + X = X_diabetes if sparse_container is None else sparse_container(X_diabetes) + + ridge = Ridge(tol=1e-5, fit_intercept=False) + ridge.fit(X, y_diabetes) + score = ridge.score(X, y_diabetes) + + ridge2 = Ridge(tol=1e-3, fit_intercept=False) + ridge2.fit(X, y_diabetes) + score2 = ridge2.score(X, y_diabetes) + + assert score >= score2 + + +def check_array_api_attributes(name, estimator, array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + + X_iris_np = X_iris.astype(dtype_name) + y_iris_np = y_iris.astype(dtype_name) + + X_iris_xp = xp.asarray(X_iris_np, device=device) + y_iris_xp = xp.asarray(y_iris_np, device=device) + + estimator.fit(X_iris_np, y_iris_np) + coef_np = estimator.coef_ + intercept_np = estimator.intercept_ + + with config_context(array_api_dispatch=True): + estimator_xp = clone(estimator).fit(X_iris_xp, y_iris_xp) + coef_xp = estimator_xp.coef_ + assert coef_xp.shape == (4,) + assert coef_xp.dtype == X_iris_xp.dtype + + assert_allclose( + _convert_to_numpy(coef_xp, xp=xp), + coef_np, + atol=_atol_for_type(dtype_name), + ) + intercept_xp = estimator_xp.intercept_ + assert intercept_xp.shape == () + assert intercept_xp.dtype == X_iris_xp.dtype + + assert_allclose( + _convert_to_numpy(intercept_xp, xp=xp), + intercept_np, + atol=_atol_for_type(dtype_name), + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values, check_array_api_attributes], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [Ridge(solver="svd")], + ids=_get_check_estimator_ids, +) +def test_ridge_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, 
device=device, dtype_name=dtype_name) + + +@pytest.mark.parametrize( + "array_namespace", yield_namespaces(include_numpy_namespaces=False) +) +def test_array_api_error_and_warnings_for_solver_parameter(array_namespace): + xp = _array_api_for_tests(array_namespace, device=None) + + X_iris_xp = xp.asarray(X_iris[:5]) + y_iris_xp = xp.asarray(y_iris[:5]) + + available_solvers = Ridge._parameter_constraints["solver"][0].options + for solver in available_solvers - {"auto", "svd"}: + ridge = Ridge(solver=solver, positive=solver == "lbfgs") + expected_msg = ( + f"Array API dispatch to namespace {xp.__name__} only supports " + f"solver 'svd'. Got '{solver}'." + ) + + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + ridge = Ridge(solver="auto", positive=True) + expected_msg = ( + "The solvers that support positive fitting do not support " + f"Array API dispatch to namespace {xp.__name__}. Please " + "either disable Array API dispatch, or use a numpy-like " + "namespace, or set `positive=False`." + ) + + with pytest.raises(ValueError, match=expected_msg): + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + ridge = Ridge() + expected_msg = ( + f"Using Array API dispatch to namespace {xp.__name__} with `solver='auto'` " + "will result in using the solver 'svd'. The results may differ from those " + "when using a Numpy array, because in that case the preferred solver would " + "be cholesky. Set `solver='svd'` to suppress this warning." + ) + with pytest.warns(UserWarning, match=expected_msg): + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + +@pytest.mark.parametrize("array_namespace", sorted(_NUMPY_NAMESPACE_NAMES)) +def test_array_api_numpy_namespace_no_warning(array_namespace): + xp = _array_api_for_tests(array_namespace, device=None) + + X_iris_xp = xp.asarray(X_iris[:5]) + y_iris_xp = xp.asarray(y_iris[:5]) + + ridge = Ridge() + expected_msg = ( + "Results might be different than when Array API dispatch is " + "disabled, or when a numpy-like namespace is used" + ) + + with warnings.catch_warnings(): + warnings.filterwarnings("error", message=expected_msg, category=UserWarning) + with config_context(array_api_dispatch=True): + ridge.fit(X_iris_xp, y_iris_xp) + + # All numpy namespaces are compatible with all solver, in particular + # solvers that support `positive=True` (like 'lbfgs') should work. + with config_context(array_api_dispatch=True): + Ridge(solver="auto", positive=True).fit(X_iris_xp, y_iris_xp) + + +@pytest.mark.parametrize( + "test_func", + ( + _test_ridge_loo, + _test_ridge_cv, + _test_ridge_diabetes, + _test_multi_ridge_diabetes, + _test_ridge_classifiers, + _test_tolerance, + ), +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_dense_sparse(test_func, csr_container): + # test dense matrix + ret_dense = test_func(None) + # test sparse matrix + ret_sparse = test_func(csr_container) + # test that the outputs are the same + if ret_dense is not None and ret_sparse is not None: + assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) + + +def test_class_weights(): + # Test class weights. 
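+    # RidgeClassifier encodes the labels as {-1, +1} regression targets and, in
+    # effect, folds class_weight into per-sample weights, so down-weighting a
+    # class should shift the decision boundary, as checked below.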
+ X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + reg = RidgeClassifier(class_weight=None) + reg.fit(X, y) + assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + reg = RidgeClassifier(class_weight={1: 0.001}) + reg.fit(X, y) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([-1])) + + # check if class_weight = 'balanced' can handle negative labels. + reg = RidgeClassifier(class_weight="balanced") + reg.fit(X, y) + assert_array_equal(reg.predict([[0.2, -1.0]]), np.array([1])) + + # class_weight = 'balanced', and class_weight = None should return + # same values when y has equal number of all labels + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0]]) + y = [1, 1, -1, -1] + reg = RidgeClassifier(class_weight=None) + reg.fit(X, y) + rega = RidgeClassifier(class_weight="balanced") + rega.fit(X, y) + assert len(rega.classes_) == 2 + assert_array_almost_equal(reg.coef_, rega.coef_) + assert_array_almost_equal(reg.intercept_, rega.intercept_) + + +@pytest.mark.parametrize("reg", (RidgeClassifier, RidgeClassifierCV)) +def test_class_weight_vs_sample_weight(reg): + """Check class_weights resemble sample_weights behavior.""" + + # Iris is balanced, so no effect expected for using 'balanced' weights + reg1 = reg() + reg1.fit(iris.data, iris.target) + reg2 = reg(class_weight="balanced") + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Inflate importance of class 1, check against user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1.0, 1: 100.0, 2: 1.0} + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Check that sample_weight and class_weight are multiplicative + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight**2) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(reg1.coef_, reg2.coef_) + + +def test_class_weights_cv(): + # Test class weights for cross validated ridge classifier. 
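+    # Same toy data as in test_class_weights above; here the class weighting is
+    # exercised together with the internal alpha search of RidgeClassifierCV.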
+ X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + reg = RidgeClassifierCV(class_weight=None, alphas=[0.01, 0.1, 1]) + reg.fit(X, y) + + # we give a small weights to class 1 + reg = RidgeClassifierCV(class_weight={1: 0.001}, alphas=[0.01, 0.1, 1, 10]) + reg.fit(X, y) + + assert_array_equal(reg.predict([[-0.2, 2]]), np.array([-1])) + + +@pytest.mark.parametrize( + "scoring", [None, "neg_mean_squared_error", _mean_squared_error_callable] +) +def test_ridgecv_store_cv_results(scoring): + rng = np.random.RandomState(42) + + n_samples = 8 + n_features = 5 + x = rng.randn(n_samples, n_features) + alphas = [1e-1, 1e0, 1e1] + n_alphas = len(alphas) + + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + + r = RidgeCV(alphas=alphas, cv=None, store_cv_results=True, scoring=scoring_) + + # with len(y.shape) == 1 + y = rng.randn(n_samples) + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_alphas) + + # with len(y.shape) == 2 + n_targets = 3 + y = rng.randn(n_samples, n_targets) + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_targets, n_alphas) + + r = RidgeCV(cv=3, store_cv_results=True, scoring=scoring) + with pytest.raises(ValueError, match="cv!=None and store_cv_results"): + r.fit(x, y) + + +@pytest.mark.parametrize("scoring", [None, "accuracy", _accuracy_callable]) +def test_ridge_classifier_cv_store_cv_results(scoring): + x = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = np.array([1, 1, 1, -1, -1]) + + n_samples = x.shape[0] + alphas = [1e-1, 1e0, 1e1] + n_alphas = len(alphas) + + scoring_ = make_scorer(scoring) if callable(scoring) else scoring + + r = RidgeClassifierCV( + alphas=alphas, cv=None, store_cv_results=True, scoring=scoring_ + ) + + # with len(y.shape) == 1 + n_targets = 1 + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_targets, n_alphas) + + # with len(y.shape) == 2 + y = np.array( + [[1, 1, 1, -1, -1], [1, -1, 1, -1, 1], [-1, -1, 1, -1, -1]] + ).transpose() + n_targets = y.shape[1] + r.fit(x, y) + assert r.cv_results_.shape == (n_samples, n_targets, n_alphas) + + +@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +def test_ridgecv_alphas_conversion(Estimator): + rng = np.random.RandomState(0) + alphas = (0.1, 1.0, 10.0) + + n_samples, n_features = 5, 5 + if Estimator is RidgeCV: + y = rng.randn(n_samples) + else: + y = rng.randint(0, 2, n_samples) + X = rng.randn(n_samples, n_features) + + ridge_est = Estimator(alphas=alphas) + assert ridge_est.alphas is alphas, ( + f"`alphas` was mutated in `{Estimator.__name__}.__init__`" + ) + + ridge_est.fit(X, y) + assert_array_equal(ridge_est.alphas, np.asarray(alphas)) + + +@pytest.mark.parametrize("cv", [None, 3]) +@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +def test_ridgecv_alphas_zero(cv, Estimator): + """Check alpha=0.0 raises error only when `cv=None`.""" + rng = np.random.RandomState(0) + alphas = (0.0, 1.0, 10.0) + + n_samples, n_features = 5, 5 + if Estimator is RidgeCV: + y = rng.randn(n_samples) + else: + y = rng.randint(0, 2, n_samples) + X = rng.randn(n_samples, n_features) + + ridge_est = Estimator(alphas=alphas, cv=cv) + if cv is None: + with pytest.raises(ValueError, match=r"alphas\[0\] == 0.0, must be > 0.0."): + ridge_est.fit(X, y) + else: + ridge_est.fit(X, y) + + +def test_ridgecv_sample_weight(): + rng = np.random.RandomState(0) + alphas = (0.1, 1.0, 10.0) + + # There are different algorithms for n_samples > n_features + # and the opposite, 
so test them both. + for n_samples, n_features in ((6, 5), (5, 10)): + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + sample_weight = 1.0 + rng.rand(n_samples) + + cv = KFold(5) + ridgecv = RidgeCV(alphas=alphas, cv=cv) + ridgecv.fit(X, y, sample_weight=sample_weight) + + # Check using GridSearchCV directly + parameters = {"alpha": alphas} + gs = GridSearchCV(Ridge(), parameters, cv=cv) + gs.fit(X, y, sample_weight=sample_weight) + + assert ridgecv.alpha_ == gs.best_estimator_.alpha + assert_array_almost_equal(ridgecv.coef_, gs.best_estimator_.coef_) + + +def test_raises_value_error_if_sample_weights_greater_than_1d(): + # Sample weights must be either scalar or 1D + + n_sampless = [2, 3] + n_featuress = [3, 2] + + rng = np.random.RandomState(42) + + for n_samples, n_features in zip(n_sampless, n_featuress): + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + sample_weights_OK = rng.randn(n_samples) ** 2 + 1 + sample_weights_OK_1 = 1.0 + sample_weights_OK_2 = 2.0 + sample_weights_not_OK = sample_weights_OK[:, np.newaxis] + sample_weights_not_OK_2 = sample_weights_OK[np.newaxis, :] + + ridge = Ridge(alpha=1) + + # make sure the "OK" sample weights actually work + ridge.fit(X, y, sample_weights_OK) + ridge.fit(X, y, sample_weights_OK_1) + ridge.fit(X, y, sample_weights_OK_2) + + def fit_ridge_not_ok(): + ridge.fit(X, y, sample_weights_not_OK) + + def fit_ridge_not_ok_2(): + ridge.fit(X, y, sample_weights_not_OK_2) + + err_msg = "Sample weights must be 1D array or scalar" + with pytest.raises(ValueError, match=err_msg): + fit_ridge_not_ok() + + err_msg = "Sample weights must be 1D array or scalar" + with pytest.raises(ValueError, match=err_msg): + fit_ridge_not_ok_2() + + +@pytest.mark.parametrize("n_samples,n_features", [[2, 3], [3, 2]]) +@pytest.mark.parametrize( + "sparse_container", + COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +def test_sparse_design_with_sample_weights(n_samples, n_features, sparse_container): + # Sample weights must work with sparse matrices + rng = np.random.RandomState(42) + + sparse_ridge = Ridge(alpha=1.0, fit_intercept=False) + dense_ridge = Ridge(alpha=1.0, fit_intercept=False) + + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + sample_weights = rng.randn(n_samples) ** 2 + 1 + X_sparse = sparse_container(X) + sparse_ridge.fit(X_sparse, y, sample_weight=sample_weights) + dense_ridge.fit(X, y, sample_weight=sample_weights) + + assert_array_almost_equal(sparse_ridge.coef_, dense_ridge.coef_, decimal=6) + + +def test_ridgecv_int_alphas(): + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + # Integers + ridge = RidgeCV(alphas=(1, 10, 100)) + ridge.fit(X, y) + + +@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"alphas": (1, -1, -100)}, ValueError, r"alphas\[1\] == -1, must be > 0.0"), + ( + {"alphas": (-0.1, -1.0, -10.0)}, + ValueError, + r"alphas\[0\] == -0.1, must be > 0.0", + ), + ( + {"alphas": (1, 1.0, "1")}, + TypeError, + r"alphas\[2\] must be an instance of float, not str", + ), + ], +) +def test_ridgecv_alphas_validation(Estimator, params, err_type, err_msg): + """Check the `alphas` validation in RidgeCV and RidgeClassifierCV.""" + + n_samples, n_features = 5, 5 + X = rng.randn(n_samples, n_features) + y = rng.randint(0, 2, n_samples) + + with pytest.raises(err_type, match=err_msg): + Estimator(**params).fit(X, y) + + 
+@pytest.mark.parametrize("Estimator", [RidgeCV, RidgeClassifierCV]) +def test_ridgecv_alphas_scalar(Estimator): + """Check the case when `alphas` is a scalar. + This case was supported in the past when `alphas` where converted + into array in `__init__`. + We add this test to ensure backward compatibility. + """ + + n_samples, n_features = 5, 5 + X = rng.randn(n_samples, n_features) + if Estimator is RidgeCV: + y = rng.randn(n_samples) + else: + y = rng.randint(0, 2, n_samples) + + Estimator(alphas=1).fit(X, y) + + +def test_sparse_cg_max_iter(): + reg = Ridge(solver="sparse_cg", max_iter=1) + reg.fit(X_diabetes, y_diabetes) + assert reg.coef_.shape[0] == X_diabetes.shape[1] + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_n_iter(): + # Test that self.n_iter_ is correct. + n_targets = 2 + X, y = X_diabetes, y_diabetes + y_n = np.tile(y, (n_targets, 1)).T + + for max_iter in range(1, 4): + for solver in ("sag", "saga", "lsqr"): + reg = Ridge(solver=solver, max_iter=max_iter, tol=1e-12) + reg.fit(X, y_n) + assert_array_equal(reg.n_iter_, np.tile(max_iter, n_targets)) + + for solver in ("sparse_cg", "svd", "cholesky"): + reg = Ridge(solver=solver, max_iter=1, tol=1e-1) + reg.fit(X, y_n) + assert reg.n_iter_ is None + + +@pytest.mark.parametrize("solver", ["lsqr", "sparse_cg", "lbfgs", "auto"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ridge_fit_intercept_sparse( + solver, with_sample_weight, global_random_seed, csr_container +): + """Check that ridge finds the same coefs and intercept on dense and sparse input + in the presence of sample weights. + + For now only sparse_cg and lbfgs can correctly fit an intercept + with sparse X with default tol and max_iter. + 'sag' is tested separately in test_ridge_fit_intercept_sparse_sag because it + requires more iterations and should raise a warning if default max_iter is used. 
+ Other solvers raise an exception, as checked in + test_ridge_fit_intercept_sparse_error + """ + positive = solver == "lbfgs" + X, y = _make_sparse_offset_regression( + n_features=20, random_state=global_random_seed, positive=positive + ) + + sample_weight = None + if with_sample_weight: + rng = np.random.RandomState(global_random_seed) + sample_weight = 1.0 + rng.uniform(size=X.shape[0]) + + # "auto" should switch to "sparse_cg" when X is sparse + # so the reference we use for both ("auto" and "sparse_cg") is + # Ridge(solver="sparse_cg"), fitted using the dense representation (note + # that "sparse_cg" can fit sparse or dense data) + dense_solver = "sparse_cg" if solver == "auto" else solver + dense_ridge = Ridge(solver=dense_solver, tol=1e-12, positive=positive) + sparse_ridge = Ridge(solver=solver, tol=1e-12, positive=positive) + + dense_ridge.fit(X, y, sample_weight=sample_weight) + sparse_ridge.fit(csr_container(X), y, sample_weight=sample_weight) + + assert_allclose(dense_ridge.intercept_, sparse_ridge.intercept_) + assert_allclose(dense_ridge.coef_, sparse_ridge.coef_, rtol=5e-7) + + +@pytest.mark.parametrize("solver", ["saga", "svd", "cholesky"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ridge_fit_intercept_sparse_error(solver, csr_container): + X, y = _make_sparse_offset_regression(n_features=20, random_state=0) + X_csr = csr_container(X) + sparse_ridge = Ridge(solver=solver) + err_msg = "solver='{}' does not support".format(solver) + with pytest.raises(ValueError, match=err_msg): + sparse_ridge.fit(X_csr, y) + + +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ridge_fit_intercept_sparse_sag( + with_sample_weight, global_random_seed, csr_container +): + X, y = _make_sparse_offset_regression( + n_features=5, n_samples=20, random_state=global_random_seed, X_offset=5.0 + ) + if with_sample_weight: + rng = np.random.RandomState(global_random_seed) + sample_weight = 1.0 + rng.uniform(size=X.shape[0]) + else: + sample_weight = None + X_csr = csr_container(X) + + params = dict( + alpha=1.0, solver="sag", fit_intercept=True, tol=1e-10, max_iter=100000 + ) + dense_ridge = Ridge(**params) + sparse_ridge = Ridge(**params) + dense_ridge.fit(X, y, sample_weight=sample_weight) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + sparse_ridge.fit(X_csr, y, sample_weight=sample_weight) + assert_allclose(dense_ridge.intercept_, sparse_ridge.intercept_, rtol=1e-4) + assert_allclose(dense_ridge.coef_, sparse_ridge.coef_, rtol=1e-4) + with pytest.warns(UserWarning, match='"sag" solver requires.*'): + Ridge(solver="sag", fit_intercept=True, tol=1e-3, max_iter=None).fit(X_csr, y) + + +@pytest.mark.parametrize("return_intercept", [False, True]) +@pytest.mark.parametrize("sample_weight", [None, np.ones(1000)]) +@pytest.mark.parametrize("container", [np.array] + CSR_CONTAINERS) +@pytest.mark.parametrize( + "solver", ["auto", "sparse_cg", "cholesky", "lsqr", "sag", "saga", "lbfgs"] +) +def test_ridge_regression_check_arguments_validity( + return_intercept, sample_weight, container, solver +): + """check if all combinations of arguments give valid estimations""" + + # test excludes 'svd' solver because it raises exception for sparse inputs + + rng = check_random_state(42) + X = rng.rand(1000, 3) + true_coefs = [1, 2, 0.1] + y = np.dot(X, true_coefs) + true_intercept = 0.0 + if return_intercept: + true_intercept = 10000.0 + y += true_intercept + X_testing = container(X) 
+ + alpha, tol = 1e-3, 1e-6 + atol = 1e-3 if _IS_32BIT else 1e-4 + + positive = solver == "lbfgs" + + if solver not in ["sag", "auto"] and return_intercept: + with pytest.raises(ValueError, match="In Ridge, only 'sag' solver"): + ridge_regression( + X_testing, + y, + alpha=alpha, + solver=solver, + sample_weight=sample_weight, + return_intercept=return_intercept, + positive=positive, + tol=tol, + ) + return + + out = ridge_regression( + X_testing, + y, + alpha=alpha, + solver=solver, + sample_weight=sample_weight, + positive=positive, + return_intercept=return_intercept, + tol=tol, + ) + + if return_intercept: + coef, intercept = out + assert_allclose(coef, true_coefs, rtol=0, atol=atol) + assert_allclose(intercept, true_intercept, rtol=0, atol=atol) + else: + assert_allclose(out, true_coefs, rtol=0, atol=atol) + + +@pytest.mark.parametrize( + "solver", ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga", "lbfgs"] +) +def test_dtype_match(solver): + rng = np.random.RandomState(0) + alpha = 1.0 + positive = solver == "lbfgs" + + n_samples, n_features = 6, 5 + X_64 = rng.randn(n_samples, n_features) + y_64 = rng.randn(n_samples) + X_32 = X_64.astype(np.float32) + y_32 = y_64.astype(np.float32) + + tol = 2 * np.finfo(np.float32).resolution + # Check type consistency 32bits + ridge_32 = Ridge( + alpha=alpha, solver=solver, max_iter=500, tol=tol, positive=positive + ) + ridge_32.fit(X_32, y_32) + coef_32 = ridge_32.coef_ + + # Check type consistency 64 bits + ridge_64 = Ridge( + alpha=alpha, solver=solver, max_iter=500, tol=tol, positive=positive + ) + ridge_64.fit(X_64, y_64) + coef_64 = ridge_64.coef_ + + # Do the actual checks at once for easier debug + assert coef_32.dtype == X_32.dtype + assert coef_64.dtype == X_64.dtype + assert ridge_32.predict(X_32).dtype == X_32.dtype + assert ridge_64.predict(X_64).dtype == X_64.dtype + assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4, atol=5e-4) + + +def test_dtype_match_cholesky(): + # Test different alphas in cholesky solver to ensure full coverage. + # This test is separated from test_dtype_match for clarity. 
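+    # Passing an array of alphas assigns a separate penalty to each target,
+    # which, roughly, goes through a dedicated per-target branch of the cholesky
+    # solver; the float32/float64 comparison below covers that branch too.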
+    rng = np.random.RandomState(0)
+    alpha = np.array([1.0, 0.5])
+
+    n_samples, n_features, n_target = 6, 7, 2
+    X_64 = rng.randn(n_samples, n_features)
+    y_64 = rng.randn(n_samples, n_target)
+    X_32 = X_64.astype(np.float32)
+    y_32 = y_64.astype(np.float32)
+
+    # Check type consistency 32bits
+    ridge_32 = Ridge(alpha=alpha, solver="cholesky")
+    ridge_32.fit(X_32, y_32)
+    coef_32 = ridge_32.coef_
+
+    # Check type consistency 64 bits
+    ridge_64 = Ridge(alpha=alpha, solver="cholesky")
+    ridge_64.fit(X_64, y_64)
+    coef_64 = ridge_64.coef_
+
+    # Do all the checks at once; this is easier to debug
+    assert coef_32.dtype == X_32.dtype
+    assert coef_64.dtype == X_64.dtype
+    assert ridge_32.predict(X_32).dtype == X_32.dtype
+    assert ridge_64.predict(X_64).dtype == X_64.dtype
+    assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
+
+
+@pytest.mark.parametrize(
+    "solver", ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"]
+)
+@pytest.mark.parametrize("seed", range(1))
+def test_ridge_regression_dtype_stability(solver, seed):
+    random_state = np.random.RandomState(seed)
+    n_samples, n_features = 6, 5
+    X = random_state.randn(n_samples, n_features)
+    coef = random_state.randn(n_features)
+    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
+    alpha = 1.0
+    positive = solver == "lbfgs"
+    results = dict()
+    # XXX: Sparse CG seems to be far less numerically stable than the
+    # others, maybe we should not enable float32 for this one.
+    atol = 1e-3 if solver == "sparse_cg" else 1e-5
+    for current_dtype in (np.float32, np.float64):
+        results[current_dtype] = ridge_regression(
+            X.astype(current_dtype),
+            y.astype(current_dtype),
+            alpha=alpha,
+            solver=solver,
+            random_state=random_state,
+            sample_weight=None,
+            positive=positive,
+            max_iter=500,
+            tol=1e-10,
+            return_n_iter=False,
+            return_intercept=False,
+        )
+
+    assert results[np.float32].dtype == np.float32
+    assert results[np.float64].dtype == np.float64
+    assert_allclose(results[np.float32], results[np.float64], atol=atol)
+
+
+def test_ridge_sag_with_X_fortran():
+    # check that Fortran arrays are converted when using the SAG solver
+    X, y = make_regression(random_state=42)
+    # make X Fortran-ordered and non-contiguous so that it is not a C-ordered array
+    X = np.asfortranarray(X)
+    X = X[::2, :]
+    y = y[::2]
+    Ridge(solver="sag").fit(X, y)
+
+
+@pytest.mark.parametrize(
+    "Classifier, params",
+    [
+        (RidgeClassifier, {}),
+        (RidgeClassifierCV, {"cv": None}),
+        (RidgeClassifierCV, {"cv": 3}),
+    ],
+)
+def test_ridgeclassifier_multilabel(Classifier, params):
+    """Check that multilabel classification is supported and gives meaningful
+    results."""
+    X, y = make_multilabel_classification(n_classes=1, random_state=0)
+    y = y.reshape(-1, 1)
+    Y = np.concatenate([y, y], axis=1)
+    clf = Classifier(**params).fit(X, Y)
+    Y_pred = clf.predict(X)
+
+    assert Y_pred.shape == Y.shape
+    assert_array_equal(Y_pred[:, 0], Y_pred[:, 1])
+
+
+@pytest.mark.parametrize("solver", ["auto", "lbfgs"])
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0])
+def test_ridge_positive_regression_test(solver, fit_intercept, alpha):
+    """Test that positive Ridge finds true positive coefficients."""
+    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
+    coef = np.array([1, -10])
+    if fit_intercept:
+        intercept = 20
+        y = X.dot(coef) + intercept
+    else:
+        y = X.dot(coef)
+
+    model = Ridge(
+        alpha=alpha, positive=True, solver=solver, fit_intercept=fit_intercept
+    )
+    model.fit(X, y)
+    assert
np.all(model.coef_ >= 0) + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0]) +def test_ridge_ground_truth_positive_test(fit_intercept, alpha): + """Test that Ridge w/wo positive converges to the same solution. + + Ridge with positive=True and positive=False must give the same + when the ground truth coefs are all positive. + """ + rng = np.random.RandomState(42) + X = rng.randn(300, 100) + coef = rng.uniform(0.1, 1.0, size=X.shape[1]) + if fit_intercept: + intercept = 1 + y = X @ coef + intercept + else: + y = X @ coef + y += rng.normal(size=X.shape[0]) * 0.01 + + results = [] + for positive in [True, False]: + model = Ridge( + alpha=alpha, positive=positive, fit_intercept=fit_intercept, tol=1e-10 + ) + results.append(model.fit(X, y).coef_) + assert_allclose(*results, atol=1e-6, rtol=0) + + +@pytest.mark.parametrize( + "solver", ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"] +) +def test_ridge_positive_error_test(solver): + """Test input validation for positive argument in Ridge.""" + alpha = 0.1 + X = np.array([[1, 2], [3, 4]]) + coef = np.array([1, -1]) + y = X @ coef + + model = Ridge(alpha=alpha, positive=True, solver=solver, fit_intercept=False) + with pytest.raises(ValueError, match="does not support positive"): + model.fit(X, y) + + with pytest.raises(ValueError, match="only 'lbfgs' solver can be used"): + _, _ = ridge_regression( + X, y, alpha, positive=True, solver=solver, return_intercept=False + ) + + +@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0]) +def test_positive_ridge_loss(alpha): + """Check ridge loss consistency when positive argument is enabled.""" + X, y = make_regression(n_samples=300, n_features=300, random_state=42) + alpha = 0.10 + n_checks = 100 + + def ridge_loss(model, random_state=None, noise_scale=1e-8): + intercept = model.intercept_ + if random_state is not None: + rng = np.random.RandomState(random_state) + coef = model.coef_ + rng.uniform(0, noise_scale, size=model.coef_.shape) + else: + coef = model.coef_ + + return 0.5 * np.sum((y - X @ coef - intercept) ** 2) + 0.5 * alpha * np.sum( + coef**2 + ) + + model = Ridge(alpha=alpha).fit(X, y) + model_positive = Ridge(alpha=alpha, positive=True).fit(X, y) + + # Check 1: + # Loss for solution found by Ridge(positive=False) + # is lower than that for solution found by Ridge(positive=True) + loss = ridge_loss(model) + loss_positive = ridge_loss(model_positive) + assert loss <= loss_positive + + # Check 2: + # Loss for solution found by Ridge(positive=True) + # is lower than that for small random positive perturbation + # of the positive solution. 
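+    # The perturbations are drawn from uniform(0, noise_scale), so the perturbed
+    # coefficients stay non-negative (feasible); a constrained minimum of the
+    # convex ridge loss should therefore never be beaten by any of them.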
+ for random_state in range(n_checks): + loss_perturbed = ridge_loss(model_positive, random_state=random_state) + assert loss_positive <= loss_perturbed + + +@pytest.mark.parametrize("alpha", [1e-3, 1e-2, 0.1, 1.0]) +def test_lbfgs_solver_consistency(alpha): + """Test that LBGFS gets almost the same coef of svd when positive=False.""" + X, y = make_regression(n_samples=300, n_features=300, random_state=42) + y = np.expand_dims(y, 1) + alpha = np.asarray([alpha]) + config = { + "positive": False, + "tol": 1e-16, + "max_iter": 500000, + } + + coef_lbfgs = _solve_lbfgs(X, y, alpha, **config) + coef_cholesky = _solve_svd(X, y, alpha) + assert_allclose(coef_lbfgs, coef_cholesky, atol=1e-4, rtol=0) + + +def test_lbfgs_solver_error(): + """Test that LBFGS solver raises ConvergenceWarning.""" + X = np.array([[1, -1], [1, 1]]) + y = np.array([-1e10, 1e10]) + + model = Ridge( + alpha=0.01, + solver="lbfgs", + fit_intercept=False, + tol=1e-12, + positive=True, + max_iter=1, + ) + with pytest.warns(ConvergenceWarning, match="lbfgs solver did not converge"): + model.fit(X, y) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) +@pytest.mark.parametrize("data", ["tall", "wide"]) +@pytest.mark.parametrize("solver", SOLVERS + ["lbfgs"]) +def test_ridge_sample_weight_consistency( + fit_intercept, sparse_container, data, solver, global_random_seed +): + """Test that the impact of sample_weight is consistent. + + Note that this test is stricter than the common test + check_sample_weight_equivalence alone. + """ + # filter out solver that do not support sparse input + if sparse_container is not None: + if solver == "svd" or (solver in ("cholesky", "saga") and fit_intercept): + pytest.skip("unsupported configuration") + + # XXX: this test is quite sensitive to the seed used to generate the data: + # ideally we would like the test to pass for any global_random_seed but this is not + # the case at the moment. + rng = np.random.RandomState(42) + n_samples = 12 + if data == "tall": + n_features = n_samples // 2 + else: + n_features = n_samples * 2 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + if sparse_container is not None: + X = sparse_container(X) + params = dict( + fit_intercept=fit_intercept, + alpha=1.0, + solver=solver, + positive=(solver == "lbfgs"), + random_state=global_random_seed, # for sag/saga + tol=1e-12, + ) + + # 1) sample_weight=np.ones(..) should be equivalent to sample_weight=None, + # a special case of check_sample_weight_equivalence(name, reg), but we also + # test with sparse input. 
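+    # (Unit weights leave both the data term and the penalty of the ridge
+    # objective unchanged, so the two fits should agree up to solver tolerance.)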
+ reg = Ridge(**params).fit(X, y, sample_weight=None) + coef = reg.coef_.copy() + if fit_intercept: + intercept = reg.intercept_ + sample_weight = np.ones_like(y) + reg.fit(X, y, sample_weight=sample_weight) + assert_allclose(reg.coef_, coef, rtol=1e-6) + if fit_intercept: + assert_allclose(reg.intercept_, intercept) + + # 2) setting elements of sample_weight to 0 is equivalent to removing these samples, + # another special case of check_sample_weight_equivalence(name, reg), but we + # also test with sparse input + sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0]) + sample_weight[-5:] = 0 + y[-5:] *= 1000 # to make excluding those samples important + reg.fit(X, y, sample_weight=sample_weight) + coef = reg.coef_.copy() + if fit_intercept: + intercept = reg.intercept_ + reg.fit(X[:-5, :], y[:-5], sample_weight=sample_weight[:-5]) + assert_allclose(reg.coef_, coef, rtol=1e-6) + if fit_intercept: + assert_allclose(reg.intercept_, intercept) + + # 3) scaling of sample_weight should have no effect + # Note: For models with penalty, scaling the penalty term might work. + reg2 = Ridge(**params).set_params(alpha=np.pi * params["alpha"]) + reg2.fit(X, y, sample_weight=np.pi * sample_weight) + if solver in ("sag", "saga") and not fit_intercept: + pytest.xfail(f"Solver {solver} does fail test for scaling of sample_weight.") + assert_allclose(reg2.coef_, coef, rtol=1e-6) + if fit_intercept: + assert_allclose(reg2.intercept_, intercept) + + # 4) check that multiplying sample_weight by 2 is equivalent + # to repeating corresponding samples twice + if sparse_container is not None: + X = X.toarray() + X2 = np.concatenate([X, X[: n_samples // 2]], axis=0) + y2 = np.concatenate([y, y[: n_samples // 2]]) + sample_weight_1 = sample_weight.copy() + sample_weight_1[: n_samples // 2] *= 2 + sample_weight_2 = np.concatenate( + [sample_weight, sample_weight[: n_samples // 2]], axis=0 + ) + if sparse_container is not None: + X = sparse_container(X) + X2 = sparse_container(X2) + reg1 = Ridge(**params).fit(X, y, sample_weight=sample_weight_1) + reg2 = Ridge(**params).fit(X2, y2, sample_weight=sample_weight_2) + assert_allclose(reg1.coef_, reg2.coef_) + if fit_intercept: + assert_allclose(reg1.intercept_, reg2.intercept_) + + +@pytest.mark.parametrize("with_sample_weight", [False, True]) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("n_targets", [1, 2]) +def test_ridge_cv_results_predictions(with_sample_weight, fit_intercept, n_targets): + """Check that the predictions stored in `cv_results_` are on the original scale. + + The GCV approach works on scaled data: centered by an offset and scaled by the + square root of the sample weights. Thus, prior to computing scores, the + predictions need to be scaled back to the original scale. These predictions are + the ones stored in `cv_results_` in `RidgeCV`. + + In this test, we check that the internal predictions stored in `cv_results_` are + equivalent to a naive LOO-CV grid search with a `Ridge` estimator. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/13998 + """ + X, y = make_regression( + n_samples=100, n_features=10, n_targets=n_targets, random_state=0 + ) + sample_weight = np.ones(shape=(X.shape[0],)) + if with_sample_weight: + sample_weight[::2] = 0.5 + + alphas = (0.1, 1.0, 10.0) + + # scoring should be set to store predictions and not the squared error + ridge_cv = RidgeCV( + alphas=alphas, + scoring="neg_mean_squared_error", + fit_intercept=fit_intercept, + store_cv_results=True, + ) + ridge_cv.fit(X, y, sample_weight=sample_weight) + + # manual grid-search with a `Ridge` estimator + predictions = np.empty(shape=(*y.shape, len(alphas))) + cv = LeaveOneOut() + for alpha_idx, alpha in enumerate(alphas): + for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)): + ridge = Ridge(alpha=alpha, fit_intercept=fit_intercept) + ridge.fit(X[train_idx], y[train_idx], sample_weight[train_idx]) + predictions[idx, ..., alpha_idx] = ridge.predict(X[test_idx]) + assert_allclose(ridge_cv.cv_results_, predictions) + + +def test_ridge_cv_multioutput_sample_weight(global_random_seed): + """Check that `RidgeCV` works properly with multioutput and sample_weight + when `scoring != None`. + + We check the error reported by the RidgeCV is close to a naive LOO-CV using a + Ridge estimator. + """ + X, y = make_regression(n_targets=2, random_state=global_random_seed) + sample_weight = np.ones(shape=(X.shape[0],)) + + ridge_cv = RidgeCV(scoring="neg_mean_squared_error", store_cv_results=True) + ridge_cv.fit(X, y, sample_weight=sample_weight) + + cv = LeaveOneOut() + ridge = Ridge(alpha=ridge_cv.alpha_) + y_pred_loo = np.squeeze( + [ + ridge.fit(X[train], y[train], sample_weight=sample_weight[train]).predict( + X[test] + ) + for train, test in cv.split(X) + ] + ) + assert_allclose(ridge_cv.best_score_, -mean_squared_error(y, y_pred_loo)) + + +def test_ridge_cv_custom_multioutput_scorer(): + """Check that `RidgeCV` works properly with a custom multioutput scorer.""" + X, y = make_regression(n_targets=2, random_state=0) + + def custom_error(y_true, y_pred): + errors = (y_true - y_pred) ** 2 + mean_errors = np.mean(errors, axis=0) + if mean_errors.ndim == 1: + # case of multioutput + return -np.average(mean_errors, weights=[2, 1]) + # single output - this part of the code should not be reached in the case of + # multioutput scoring + return -mean_errors # pragma: no cover + + def custom_multioutput_scorer(estimator, X, y): + """Multioutput score that give twice more importance to the second target.""" + return -custom_error(y, estimator.predict(X)) + + ridge_cv = RidgeCV(scoring=custom_multioutput_scorer) + ridge_cv.fit(X, y) + + cv = LeaveOneOut() + ridge = Ridge(alpha=ridge_cv.alpha_) + y_pred_loo = np.squeeze( + [ridge.fit(X[train], y[train]).predict(X[test]) for train, test in cv.split(X)] + ) + + assert_allclose(ridge_cv.best_score_, -custom_error(y, y_pred_loo)) + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize("metaestimator", [RidgeCV, RidgeClassifierCV]) +@config_context(enable_metadata_routing=True) +def test_metadata_routing_with_default_scoring(metaestimator): + """Test that `RidgeCV` or `RidgeClassifierCV` with default `scoring` + argument (`None`), don't enter into `RecursionError` when metadata is routed. 
+ """ + metaestimator().get_metadata_routing() + + +@pytest.mark.parametrize( + "metaestimator, make_dataset", + [ + (RidgeCV(), make_regression), + (RidgeClassifierCV(), make_classification), + ], +) +@config_context(enable_metadata_routing=True) +def test_set_score_request_with_default_scoring(metaestimator, make_dataset): + """Test that `set_score_request` is set within `RidgeCV.fit()` and + `RidgeClassifierCV.fit()` when using the default scoring and no + UnsetMetadataPassedError is raised. Regression test for the fix in PR #29634.""" + X, y = make_dataset(n_samples=100, n_features=5, random_state=42) + metaestimator.fit(X, y, sample_weight=np.ones(X.shape[0])) + + +# End of Metadata Routing Tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sag.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sag.py new file mode 100644 index 0000000000000000000000000000000000000000..575838f8e8497a01c60adbb74ddad95dadc6e662 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sag.py @@ -0,0 +1,861 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import math +import re + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.datasets import load_iris, make_blobs, make_classification +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.linear_model._sag import get_auto_step_size +from sklearn.multiclass import OneVsRestClassifier +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_random_state, compute_class_weight +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, +) +from sklearn.utils.extmath import row_norms +from sklearn.utils.fixes import CSR_CONTAINERS + +iris = load_iris() + + +# this is used for sag classification +def log_dloss(p, y): + z = p * y + # approximately equal and saves the computation of the log + if z > 18.0: + return math.exp(-z) * -y + if z < -18.0: + return -y + return -y / (math.exp(z) + 1.0) + + +def log_loss(p, y): + return np.mean(np.log(1.0 + np.exp(-y * p))) + + +# this is used for sag regression +def squared_dloss(p, y): + return p - y + + +def squared_loss(p, y): + return np.mean(0.5 * (p - y) * (p - y)) + + +# function for measuring the log loss +def get_pobj(w, alpha, myX, myy, loss): + w = w.ravel() + pred = np.dot(myX, w) + p = loss(pred, myy) + p += alpha * w.dot(w) / 2.0 + return p + + +def sag( + X, + y, + step_size, + alpha, + n_iter=1, + dloss=None, + sparse=False, + sample_weight=None, + fit_intercept=True, + saga=False, +): + n_samples, n_features = X.shape[0], X.shape[1] + + weights = np.zeros(X.shape[1]) + sum_gradient = np.zeros(X.shape[1]) + gradient_memory = np.zeros((n_samples, n_features)) + + intercept = 0.0 + intercept_sum_gradient = 0.0 + intercept_gradient_memory = np.zeros(n_samples) + + rng = np.random.RandomState(77) + decay = 1.0 + seen = set() + + # sparse data has a fixed decay of .01 + if sparse: + decay = 0.01 + + for epoch in range(n_iter): + for k in range(n_samples): + idx = int(rng.rand() * n_samples) + # idx = k + entry = X[idx] + seen.add(idx) + p = np.dot(entry, weights) + intercept + gradient = dloss(p, y[idx]) + if sample_weight is not None: + gradient *= sample_weight[idx] + update = entry * gradient + alpha * weights + gradient_correction = update - gradient_memory[idx] + sum_gradient += gradient_correction + gradient_memory[idx] = 
update + if saga: + weights -= gradient_correction * step_size * (1 - 1.0 / len(seen)) + + if fit_intercept: + gradient_correction = gradient - intercept_gradient_memory[idx] + intercept_gradient_memory[idx] = gradient + intercept_sum_gradient += gradient_correction + gradient_correction *= step_size * (1.0 - 1.0 / len(seen)) + if saga: + intercept -= ( + step_size * intercept_sum_gradient / len(seen) * decay + ) + gradient_correction + else: + intercept -= step_size * intercept_sum_gradient / len(seen) * decay + + weights -= step_size * sum_gradient / len(seen) + + return weights, intercept + + +def sag_sparse( + X, + y, + step_size, + alpha, + n_iter=1, + dloss=None, + sample_weight=None, + sparse=False, + fit_intercept=True, + saga=False, + random_state=0, +): + if step_size * alpha == 1.0: + raise ZeroDivisionError( + "Sparse sag does not handle the case step_size * alpha == 1" + ) + n_samples, n_features = X.shape[0], X.shape[1] + + weights = np.zeros(n_features) + sum_gradient = np.zeros(n_features) + last_updated = np.zeros(n_features, dtype=int) + gradient_memory = np.zeros(n_samples) + rng = check_random_state(random_state) + intercept = 0.0 + intercept_sum_gradient = 0.0 + wscale = 1.0 + decay = 1.0 + seen = set() + + c_sum = np.zeros(n_iter * n_samples) + + # sparse data has a fixed decay of .01 + if sparse: + decay = 0.01 + + counter = 0 + for epoch in range(n_iter): + for k in range(n_samples): + # idx = k + idx = int(rng.rand() * n_samples) + entry = X[idx] + seen.add(idx) + + if counter >= 1: + for j in range(n_features): + if last_updated[j] == 0: + weights[j] -= c_sum[counter - 1] * sum_gradient[j] + else: + weights[j] -= ( + c_sum[counter - 1] - c_sum[last_updated[j] - 1] + ) * sum_gradient[j] + last_updated[j] = counter + + p = (wscale * np.dot(entry, weights)) + intercept + gradient = dloss(p, y[idx]) + + if sample_weight is not None: + gradient *= sample_weight[idx] + + update = entry * gradient + gradient_correction = update - (gradient_memory[idx] * entry) + sum_gradient += gradient_correction + if saga: + for j in range(n_features): + weights[j] -= ( + gradient_correction[j] + * step_size + * (1 - 1.0 / len(seen)) + / wscale + ) + + if fit_intercept: + gradient_correction = gradient - gradient_memory[idx] + intercept_sum_gradient += gradient_correction + gradient_correction *= step_size * (1.0 - 1.0 / len(seen)) + if saga: + intercept -= ( + step_size * intercept_sum_gradient / len(seen) * decay + ) + gradient_correction + else: + intercept -= step_size * intercept_sum_gradient / len(seen) * decay + + gradient_memory[idx] = gradient + + wscale *= 1.0 - alpha * step_size + if counter == 0: + c_sum[0] = step_size / (wscale * len(seen)) + else: + c_sum[counter] = c_sum[counter - 1] + step_size / (wscale * len(seen)) + + if counter >= 1 and wscale < 1e-9: + for j in range(n_features): + if last_updated[j] == 0: + weights[j] -= c_sum[counter] * sum_gradient[j] + else: + weights[j] -= ( + c_sum[counter] - c_sum[last_updated[j] - 1] + ) * sum_gradient[j] + last_updated[j] = counter + 1 + c_sum[counter] = 0 + weights *= wscale + wscale = 1.0 + + counter += 1 + + for j in range(n_features): + if last_updated[j] == 0: + weights[j] -= c_sum[counter - 1] * sum_gradient[j] + else: + weights[j] -= ( + c_sum[counter - 1] - c_sum[last_updated[j] - 1] + ) * sum_gradient[j] + weights *= wscale + return weights, intercept + + +def get_step_size(X, alpha, fit_intercept, classification=True): + if classification: + return 4.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + 4.0 * 
alpha) + else: + return 1.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + alpha) + + +def test_classifier_matching(): + n_samples = 20 + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1) + # y must be 0 or 1 + alpha = 1.1 + fit_intercept = True + step_size = get_step_size(X, alpha, fit_intercept) + for solver in ["sag", "saga"]: + if solver == "sag": + n_iter = 80 + else: + # SAGA variance w.r.t. stream order is higher + n_iter = 300 + clf = LogisticRegression( + solver=solver, + fit_intercept=fit_intercept, + tol=1e-11, + C=1.0 / alpha / n_samples, + max_iter=n_iter, + random_state=10, + ) + clf.fit(X, y) + + weights, intercept = sag_sparse( + X, + 2 * y - 1, # y must be -1 or +1 + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + fit_intercept=fit_intercept, + saga=solver == "saga", + ) + weights2, intercept2 = sag( + X, + 2 * y - 1, # y must be -1 or +1 + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + fit_intercept=fit_intercept, + saga=solver == "saga", + ) + weights = np.atleast_2d(weights) + intercept = np.atleast_1d(intercept) + weights2 = np.atleast_2d(weights2) + intercept2 = np.atleast_1d(intercept2) + + assert_array_almost_equal(weights, clf.coef_, decimal=9) + assert_array_almost_equal(intercept, clf.intercept_, decimal=9) + assert_array_almost_equal(weights2, clf.coef_, decimal=9) + assert_array_almost_equal(intercept2, clf.intercept_, decimal=9) + + +def test_regressor_matching(): + n_samples = 10 + n_features = 5 + + rng = np.random.RandomState(10) + X = rng.normal(size=(n_samples, n_features)) + true_w = rng.normal(size=n_features) + y = X.dot(true_w) + + alpha = 1.0 + n_iter = 100 + fit_intercept = True + + step_size = get_step_size(X, alpha, fit_intercept, classification=False) + clf = Ridge( + fit_intercept=fit_intercept, + tol=0.00000000001, + solver="sag", + alpha=alpha * n_samples, + max_iter=n_iter, + ) + clf.fit(X, y) + + weights1, intercept1 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=squared_dloss, + fit_intercept=fit_intercept, + ) + weights2, intercept2 = sag( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=squared_dloss, + fit_intercept=fit_intercept, + ) + + assert_allclose(weights1, clf.coef_) + assert_allclose(intercept1, clf.intercept_) + assert_allclose(weights2, clf.coef_) + assert_allclose(intercept2, clf.intercept_) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_pobj_matches_logistic_regression(csr_container): + """tests if the sag pobj matches log reg""" + n_samples = 100 + alpha = 1.0 + max_iter = 20 + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1) + + clf1 = LogisticRegression( + solver="sag", + fit_intercept=False, + tol=0.0000001, + C=1.0 / alpha / n_samples, + max_iter=max_iter, + random_state=10, + ) + clf2 = clone(clf1) + clf3 = LogisticRegression( + fit_intercept=False, + tol=0.0000001, + C=1.0 / alpha / n_samples, + max_iter=max_iter, + random_state=10, + ) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + clf3.fit(X, y) + + pobj1 = get_pobj(clf1.coef_, alpha, X, y, log_loss) + pobj2 = get_pobj(clf2.coef_, alpha, X, y, log_loss) + pobj3 = get_pobj(clf3.coef_, alpha, X, y, log_loss) + + assert_array_almost_equal(pobj1, pobj2, decimal=4) + assert_array_almost_equal(pobj2, pobj3, decimal=4) + assert_array_almost_equal(pobj3, pobj1, decimal=4) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") 
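+# The SAG primal objective (pobj) reached on dense and on sparse input should both
+# match the objective reached by the exact lsqr solver, up to 4 decimals.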
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_pobj_matches_ridge_regression(csr_container): + """tests if the sag pobj matches ridge reg""" + n_samples = 100 + n_features = 10 + alpha = 1.0 + n_iter = 100 + fit_intercept = False + rng = np.random.RandomState(10) + X = rng.normal(size=(n_samples, n_features)) + true_w = rng.normal(size=n_features) + y = X.dot(true_w) + + clf1 = Ridge( + fit_intercept=fit_intercept, + tol=0.00000000001, + solver="sag", + alpha=alpha, + max_iter=n_iter, + random_state=42, + ) + clf2 = clone(clf1) + clf3 = Ridge( + fit_intercept=fit_intercept, + tol=0.00001, + solver="lsqr", + alpha=alpha, + max_iter=n_iter, + random_state=42, + ) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + clf3.fit(X, y) + + pobj1 = get_pobj(clf1.coef_, alpha, X, y, squared_loss) + pobj2 = get_pobj(clf2.coef_, alpha, X, y, squared_loss) + pobj3 = get_pobj(clf3.coef_, alpha, X, y, squared_loss) + + assert_array_almost_equal(pobj1, pobj2, decimal=4) + assert_array_almost_equal(pobj1, pobj3, decimal=4) + assert_array_almost_equal(pobj3, pobj2, decimal=4) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_regressor_computed_correctly(csr_container): + """tests if the sag regressor is computed correctly""" + alpha = 0.1 + n_features = 10 + n_samples = 40 + max_iter = 100 + tol = 0.000001 + fit_intercept = True + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + y = np.dot(X, w) + 2.0 + step_size = get_step_size(X, alpha, fit_intercept, classification=False) + + clf1 = Ridge( + fit_intercept=fit_intercept, + tol=tol, + solver="sag", + alpha=alpha * n_samples, + max_iter=max_iter, + random_state=rng, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + spweights1, spintercept1 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=max_iter, + dloss=squared_dloss, + fit_intercept=fit_intercept, + random_state=rng, + ) + + spweights2, spintercept2 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=max_iter, + dloss=squared_dloss, + sparse=True, + fit_intercept=fit_intercept, + random_state=rng, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), spweights1.ravel(), decimal=3) + assert_almost_equal(clf1.intercept_, spintercept1, decimal=1) + + # TODO: uncomment when sparse Ridge with intercept will be fixed (#4710) + # assert_array_almost_equal(clf2.coef_.ravel(), + # spweights2.ravel(), + # decimal=3) + # assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)''' + + +def test_get_auto_step_size(): + X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64) + alpha = 1.2 + fit_intercept = False + # sum the squares of the second sample because that's the largest + max_squared_sum = 4 + 9 + 16 + max_squared_sum_ = row_norms(X, squared=True).max() + n_samples = X.shape[0] + assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4) + + for saga in [True, False]: + for fit_intercept in (True, False): + if saga: + L_sqr = max_squared_sum + alpha + int(fit_intercept) + L_log = (max_squared_sum + 4.0 * alpha + int(fit_intercept)) / 4.0 + mun_sqr = min(2 * n_samples * alpha, L_sqr) + mun_log = min(2 * n_samples * alpha, L_log) + step_size_sqr = 1 / (2 * L_sqr + mun_sqr) + step_size_log = 1 / (2 * L_log + mun_log) + else: + step_size_sqr = 1.0 / (max_squared_sum + alpha + int(fit_intercept)) + step_size_log = 4.0 / ( + max_squared_sum + 4.0 * alpha + int(fit_intercept) + 
) + + step_size_sqr_ = get_auto_step_size( + max_squared_sum_, + alpha, + "squared", + fit_intercept, + n_samples=n_samples, + is_saga=saga, + ) + step_size_log_ = get_auto_step_size( + max_squared_sum_, + alpha, + "log", + fit_intercept, + n_samples=n_samples, + is_saga=saga, + ) + + assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4) + assert_almost_equal(step_size_log, step_size_log_, decimal=4) + + msg = "Unknown loss function for SAG solver, got wrong instead of" + with pytest.raises(ValueError, match=msg): + get_auto_step_size(max_squared_sum_, alpha, "wrong", fit_intercept) + + +@pytest.mark.parametrize("seed", range(3)) # locally tested with 1000 seeds +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_regressor(seed, csr_container): + """tests if the sag regressor performs well""" + xmin, xmax = -5, 5 + n_samples = 300 + tol = 0.001 + max_iter = 100 + alpha = 0.1 + rng = np.random.RandomState(seed) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf1 = Ridge( + tol=tol, + solver="sag", + max_iter=max_iter, + alpha=alpha * n_samples, + random_state=rng, + ) + clf2 = clone(clf1) + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + score1 = clf1.score(X, y) + score2 = clf2.score(X, y) + assert score1 > 0.98 + assert score2 > 0.98 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf1 = Ridge(tol=tol, solver="sag", max_iter=max_iter, alpha=alpha * n_samples) + clf2 = clone(clf1) + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + score1 = clf1.score(X, y) + score2 = clf2.score(X, y) + assert score1 > 0.45 + assert score2 > 0.45 + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_classifier_computed_correctly(csr_container): + """tests if the binary classifier is computed correctly""" + alpha = 0.1 + n_samples = 50 + n_iter = 50 + tol = 0.00001 + fit_intercept = True + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1) + step_size = get_step_size(X, alpha, fit_intercept, classification=True) + classes = np.unique(y) + y_tmp = np.ones(n_samples) + y_tmp[y != classes[1]] = -1 + y = y_tmp + + clf1 = LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=n_iter, + tol=tol, + random_state=77, + fit_intercept=fit_intercept, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + spweights, spintercept = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + fit_intercept=fit_intercept, + ) + spweights2, spintercept2 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + sparse=True, + fit_intercept=fit_intercept, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(), decimal=2) + assert_almost_equal(clf1.intercept_, spintercept, decimal=1) + + assert_array_almost_equal(clf2.coef_.ravel(), spweights2.ravel(), decimal=2) + assert_almost_equal(clf2.intercept_, spintercept2, decimal=1) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sag_multiclass_computed_correctly(csr_container): + """tests if the multiclass classifier is computed correctly""" + alpha = 0.1 + n_samples = 20 + tol = 1e-5 + max_iter = 70 + fit_intercept = True + X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1) + 
step_size = get_step_size(X, alpha, fit_intercept, classification=True) + classes = np.unique(y) + + clf1 = OneVsRestClassifier( + LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=max_iter, + tol=tol, + random_state=77, + fit_intercept=fit_intercept, + ) + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + coef1 = [] + intercept1 = [] + coef2 = [] + intercept2 = [] + for cl in classes: + y_encoded = np.ones(n_samples) + y_encoded[y != cl] = -1 + + spweights1, spintercept1 = sag_sparse( + X, + y_encoded, + step_size, + alpha, + dloss=log_dloss, + n_iter=max_iter, + fit_intercept=fit_intercept, + ) + spweights2, spintercept2 = sag_sparse( + X, + y_encoded, + step_size, + alpha, + dloss=log_dloss, + n_iter=max_iter, + sparse=True, + fit_intercept=fit_intercept, + ) + coef1.append(spweights1) + intercept1.append(spintercept1) + + coef2.append(spweights2) + intercept2.append(spintercept2) + + coef1 = np.vstack(coef1) + intercept1 = np.array(intercept1) + coef2 = np.vstack(coef2) + intercept2 = np.array(intercept2) + + for i, cl in enumerate(classes): + assert_allclose(clf1.estimators_[i].coef_.ravel(), coef1[i], rtol=1e-2) + assert_allclose(clf1.estimators_[i].intercept_, intercept1[i], rtol=1e-1) + + assert_allclose(clf2.estimators_[i].coef_.ravel(), coef2[i], rtol=1e-2) + # Note the very crude accuracy, i.e. high rtol. + assert_allclose(clf2.estimators_[i].intercept_, intercept2[i], rtol=5e-1) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_classifier_results(csr_container): + """tests if classifier results match target""" + alpha = 0.1 + n_features = 20 + n_samples = 10 + tol = 0.01 + max_iter = 200 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + y = np.dot(X, w) + y = np.sign(y) + clf1 = LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=max_iter, + tol=tol, + random_state=77, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + pred1 = clf1.predict(X) + pred2 = clf2.predict(X) + assert_almost_equal(pred1, y, decimal=12) + assert_almost_equal(pred2, y, decimal=12) + + +@pytest.mark.filterwarnings("ignore:The max_iter was reached") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_binary_classifier_class_weight(csr_container): + """tests binary classifier with classweights for each class""" + alpha = 0.1 + n_samples = 50 + n_iter = 20 + tol = 0.00001 + fit_intercept = True + X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10, cluster_std=0.1) + step_size = get_step_size(X, alpha, fit_intercept, classification=True) + classes = np.unique(y) + y_tmp = np.ones(n_samples) + y_tmp[y != classes[1]] = -1 + y = y_tmp + + class_weight = {1: 0.45, -1: 0.55} + clf1 = LogisticRegression( + solver="sag", + C=1.0 / alpha / n_samples, + max_iter=n_iter, + tol=tol, + random_state=77, + fit_intercept=fit_intercept, + class_weight=class_weight, + ) + clf2 = clone(clf1) + + clf1.fit(X, y) + clf2.fit(csr_container(X), y) + + le = LabelEncoder() + class_weight_ = compute_class_weight(class_weight, classes=np.unique(y), y=y) + sample_weight = class_weight_[le.fit_transform(y)] + spweights, spintercept = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + sample_weight=sample_weight, + fit_intercept=fit_intercept, + ) + spweights2, spintercept2 = sag_sparse( + X, + y, + step_size, + alpha, + n_iter=n_iter, + dloss=log_dloss, + sparse=True, + 
sample_weight=sample_weight, + fit_intercept=fit_intercept, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(), decimal=2) + assert_almost_equal(clf1.intercept_, spintercept, decimal=1) + + assert_array_almost_equal(clf2.coef_.ravel(), spweights2.ravel(), decimal=2) + assert_almost_equal(clf2.intercept_, spintercept2, decimal=1) + + +def test_classifier_single_class(): + """tests if ValueError is thrown with only one class""" + X = [[1, 2], [3, 4]] + y = [1, 1] + + msg = "This solver needs samples of at least 2 classes in the data" + with pytest.raises(ValueError, match=msg): + LogisticRegression(solver="sag").fit(X, y) + + +def test_step_size_alpha_error(): + X = [[0, 0], [0, 0]] + y = [1, -1] + fit_intercept = False + alpha = 1.0 + msg = re.escape( + "Current sag implementation does not handle the case" + " step_size * alpha_scaled == 1" + ) + + clf1 = LogisticRegression(solver="sag", C=1.0 / alpha, fit_intercept=fit_intercept) + with pytest.raises(ZeroDivisionError, match=msg): + clf1.fit(X, y) + + clf2 = Ridge(fit_intercept=fit_intercept, solver="sag", alpha=alpha) + with pytest.raises(ZeroDivisionError, match=msg): + clf2.fit(X, y) + + +@pytest.mark.parametrize("solver", ["sag", "saga"]) +def test_sag_classifier_raises_error(solver): + # Following #13316, the error handling behavior changed in cython sag. This + # is simply a non-regression test to make sure numerical errors are + # properly raised. + + # Train a classifier on a simple problem + rng = np.random.RandomState(42) + X, y = make_classification(random_state=rng) + clf = LogisticRegression(solver=solver, random_state=rng, warm_start=True) + clf.fit(X, y) + + # Trigger a numerical error by: + # - corrupting the fitted coefficients of the classifier + # - fit it again starting from its current state thanks to warm_start + clf.coef_[:] = np.nan + + with pytest.raises(ValueError, match="Floating-point under-/overflow"): + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sgd.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..26d138ae3649b22c4848dcacae1391a399e72fcd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_sgd.py @@ -0,0 +1,2195 @@ +import pickle +from unittest.mock import Mock + +import joblib +import numpy as np +import pytest +import scipy.sparse as sp + +from sklearn import datasets, linear_model, metrics +from sklearn.base import clone, is_classifier +from sklearn.exceptions import ConvergenceWarning +from sklearn.kernel_approximation import Nystroem +from sklearn.linear_model import _sgd_fast as sgd_fast +from sklearn.linear_model import _stochastic_gradient +from sklearn.model_selection import ( + RandomizedSearchCV, + ShuffleSplit, + StratifiedShuffleSplit, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, scale +from sklearn.svm import OneClassSVM +from sklearn.utils import get_tags +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + + +def _update_kwargs(kwargs): + if "random_state" not in kwargs: + kwargs["random_state"] = 42 + + if "tol" not in kwargs: + kwargs["tol"] = None + if "max_iter" not in kwargs: + kwargs["max_iter"] = 5 + + +class _SparseSGDClassifier(linear_model.SGDClassifier): + def fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return 
super().fit(X, y, *args, **kw) + + def partial_fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return super().partial_fit(X, y, *args, **kw) + + def decision_function(self, X): + X = sp.csr_matrix(X) + return super().decision_function(X) + + def predict_proba(self, X): + X = sp.csr_matrix(X) + return super().predict_proba(X) + + +class _SparseSGDRegressor(linear_model.SGDRegressor): + def fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDRegressor.fit(self, X, y, *args, **kw) + + def partial_fit(self, X, y, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDRegressor.partial_fit(self, X, y, *args, **kw) + + def decision_function(self, X, *args, **kw): + # XXX untested as of v0.22 + X = sp.csr_matrix(X) + return linear_model.SGDRegressor.decision_function(self, X, *args, **kw) + + +class _SparseSGDOneClassSVM(linear_model.SGDOneClassSVM): + def fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.fit(self, X, *args, **kw) + + def partial_fit(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.partial_fit(self, X, *args, **kw) + + def decision_function(self, X, *args, **kw): + X = sp.csr_matrix(X) + return linear_model.SGDOneClassSVM.decision_function(self, X, *args, **kw) + + +def SGDClassifier(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDClassifier(**kwargs) + + +def SGDRegressor(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDRegressor(**kwargs) + + +def SGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return linear_model.SGDOneClassSVM(**kwargs) + + +def SparseSGDClassifier(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDClassifier(**kwargs) + + +def SparseSGDRegressor(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDRegressor(**kwargs) + + +def SparseSGDOneClassSVM(**kwargs): + _update_kwargs(kwargs) + return _SparseSGDOneClassSVM(**kwargs) + + +# Test Data + +# test sample 1 +X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +Y = [1, 1, 1, 2, 2, 2] +T = np.array([[-1, -1], [2, 2], [3, 2]]) +true_result = [1, 2, 2] + +# test sample 2; string class labels +X2 = np.array( + [ + [-1, 1], + [-0.75, 0.5], + [-1.5, 1.5], + [1, 1], + [0.75, 0.5], + [1.5, 1.5], + [-1, -1], + [0, -0.5], + [1, -1], + ] +) +Y2 = ["one"] * 3 + ["two"] * 3 + ["three"] * 3 +T2 = np.array([[-1.5, 0.5], [1, 2], [0, -2]]) +true_result2 = ["one", "two", "three"] + +# test sample 3 +X3 = np.array( + [ + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 1], + [0, 0, 0, 0, 1, 1], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 0], + ] +) +Y3 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + +# test sample 4 - two more or less redundant feature groups +X4 = np.array( + [ + [1, 0.9, 0.8, 0, 0, 0], + [1, 0.84, 0.98, 0, 0, 0], + [1, 0.96, 0.88, 0, 0, 0], + [1, 0.91, 0.99, 0, 0, 0], + [0, 0, 0, 0.89, 0.91, 1], + [0, 0, 0, 0.79, 0.84, 1], + [0, 0, 0, 0.91, 0.95, 1], + [0, 0, 0, 0.93, 1, 1], + ] +) +Y4 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + +iris = datasets.load_iris() + +# test sample 5 - test sample 1 as binary classification problem +X5 = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +Y5 = [1, 1, 1, 2, 2, 2] +true_result5 = [0, 1, 1] + + +############################################################################### +# Common Test Case to classification and regression + + +# a simple implementation of ASGD to use for testing +# uses squared loss to find the gradient +def asgd(klass, X, y, eta, alpha, 
weight_init=None, intercept_init=0.0): + if weight_init is None: + weights = np.zeros(X.shape[1]) + else: + weights = weight_init + + average_weights = np.zeros(X.shape[1]) + intercept = intercept_init + average_intercept = 0.0 + decay = 1.0 + + # sparse data has a fixed decay of .01 + if klass in (SparseSGDClassifier, SparseSGDRegressor): + decay = 0.01 + + for i, entry in enumerate(X): + p = np.dot(entry, weights) + p += intercept + gradient = p - y[i] + weights *= 1.0 - (eta * alpha) + weights += -(eta * gradient * entry) + intercept += -(eta * gradient) * decay + + average_weights *= i + average_weights += weights + average_weights /= i + 1.0 + + average_intercept *= i + average_intercept += intercept + average_intercept /= i + 1.0 + + return average_weights, average_intercept + + +def _test_warm_start(klass, X, Y, lr): + # Test that explicit warm restart... + clf = klass(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) + clf.fit(X, Y) + + clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) + clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy()) + + # ... and implicit warm restart are equivalent. + clf3 = klass( + alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr + ) + clf3.fit(X, Y) + + assert clf3.t_ == clf.t_ + assert_array_almost_equal(clf3.coef_, clf.coef_) + + clf3.set_params(alpha=0.001) + clf3.fit(X, Y) + + assert clf3.t_ == clf2.t_ + assert_array_almost_equal(clf3.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_warm_start(klass, lr): + _test_warm_start(klass, X, Y, lr) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_input_format(klass): + # Input format tests. + clf = klass(alpha=0.01, shuffle=False) + clf.fit(X, Y) + Y_ = np.array(Y)[:, np.newaxis] + + Y_ = np.c_[Y_, Y_] + with pytest.raises(ValueError): + clf.fit(X, Y_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_clone(klass): + # Test whether clone works ok. 
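+    # clone() copies constructor parameters but no fitted state, so a clone that is
+    # re-parameterised with set_params and refit should match a freshly built estimator.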
+ clf = klass(alpha=0.01, penalty="l1") + clf = clone(clf) + clf.set_params(penalty="l2") + clf.fit(X, Y) + + clf2 = klass(alpha=0.01, penalty="l2") + clf2.fit(X, Y) + + assert_array_equal(clf.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "klass", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +def test_plain_has_no_average_attr(klass): + clf = klass(average=True, eta0=0.01) + clf.fit(X, Y) + + assert hasattr(clf, "_average_coef") + assert hasattr(clf, "_average_intercept") + assert hasattr(clf, "_standard_intercept") + assert hasattr(clf, "_standard_coef") + + clf = klass() + clf.fit(X, Y) + + assert not hasattr(clf, "_average_coef") + assert not hasattr(clf, "_average_intercept") + assert not hasattr(clf, "_standard_intercept") + assert not hasattr(clf, "_standard_coef") + + +@pytest.mark.parametrize( + "klass", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +def test_late_onset_averaging_not_reached(klass): + clf1 = klass(average=600) + clf2 = klass() + for _ in range(100): + if is_classifier(clf1): + clf1.partial_fit(X, Y, classes=np.unique(Y)) + clf2.partial_fit(X, Y, classes=np.unique(Y)) + else: + clf1.partial_fit(X, Y) + clf2.partial_fit(X, Y) + + assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) + if klass in [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]: + assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) + elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + assert_allclose(clf1.offset_, clf2.offset_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_late_onset_averaging_reached(klass): + eta0 = 0.001 + alpha = 0.0001 + Y_encode = np.array(Y) + Y_encode[Y_encode == 1] = -1.0 + Y_encode[Y_encode == 2] = 1.0 + + clf1 = klass( + average=7, + learning_rate="constant", + loss="squared_error", + eta0=eta0, + alpha=alpha, + max_iter=2, + shuffle=False, + ) + clf2 = klass( + average=False, + learning_rate="constant", + loss="squared_error", + eta0=eta0, + alpha=alpha, + max_iter=1, + shuffle=False, + ) + + clf1.fit(X, Y_encode) + clf2.fit(X, Y_encode) + + average_weights, average_intercept = asgd( + klass, + X, + Y_encode, + eta0, + alpha, + weight_init=clf2.coef_.ravel(), + intercept_init=clf2.intercept_, + ) + + assert_array_almost_equal(clf1.coef_.ravel(), average_weights.ravel(), decimal=16) + assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_early_stopping(klass): + X = iris.data[iris.target > 0] + Y = iris.target[iris.target > 0] + for early_stopping in [True, False]: + max_iter = 1000 + clf = klass(early_stopping=early_stopping, tol=1e-3, max_iter=max_iter).fit( + X, Y + ) + assert clf.n_iter_ < max_iter + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_adaptive_longer_than_constant(klass): + clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100) + clf1.fit(iris.data, iris.target) + clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100) + clf2.fit(iris.data, iris.target) + assert clf1.n_iter_ > clf2.n_iter_ + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) 
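+# With shuffle=False and a fixed validation split, early stopping must hold out the
+# validation rows: manually fitting on only the train indices should give identical coef_.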
+def test_validation_set_not_used_for_training(klass): + X, Y = iris.data, iris.target + validation_fraction = 0.4 + seed = 42 + shuffle = False + max_iter = 10 + clf1 = klass( + early_stopping=True, + random_state=np.random.RandomState(seed), + validation_fraction=validation_fraction, + learning_rate="constant", + eta0=0.01, + tol=None, + max_iter=max_iter, + shuffle=shuffle, + ) + clf1.fit(X, Y) + assert clf1.n_iter_ == max_iter + + clf2 = klass( + early_stopping=False, + random_state=np.random.RandomState(seed), + learning_rate="constant", + eta0=0.01, + tol=None, + max_iter=max_iter, + shuffle=shuffle, + ) + + if is_classifier(clf2): + cv = StratifiedShuffleSplit(test_size=validation_fraction, random_state=seed) + else: + cv = ShuffleSplit(test_size=validation_fraction, random_state=seed) + idx_train, idx_val = next(cv.split(X, Y)) + idx_train = np.sort(idx_train) # remove shuffling + clf2.fit(X[idx_train], Y[idx_train]) + assert clf2.n_iter_ == max_iter + + assert_array_equal(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_n_iter_no_change(klass): + X, Y = iris.data, iris.target + # test that n_iter_ increases monotonically with n_iter_no_change + for early_stopping in [True, False]: + n_iter_list = [ + klass( + early_stopping=early_stopping, + n_iter_no_change=n_iter_no_change, + tol=1e-4, + max_iter=1000, + ) + .fit(X, Y) + .n_iter_ + for n_iter_no_change in [2, 3, 10] + ] + assert_array_equal(n_iter_list, sorted(n_iter_list)) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_not_enough_sample_for_early_stopping(klass): + # test an error is raised if the training or validation set is empty + clf = klass(early_stopping=True, validation_fraction=0.99) + with pytest.raises(ValueError): + clf.fit(X3, Y3) + + +@pytest.mark.parametrize("Estimator", [SGDClassifier, SGDRegressor]) +@pytest.mark.parametrize("l1_ratio", [0, 0.7, 1]) +def test_sgd_l1_ratio_not_used(Estimator, l1_ratio): + """Check that l1_ratio is not used when penalty is not 'elasticnet'""" + clf1 = Estimator(penalty="l1", l1_ratio=None, random_state=0).fit(X, Y) + clf2 = Estimator(penalty="l1", l1_ratio=l1_ratio, random_state=0).fit(X, Y) + + assert_allclose(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize( + "Estimator", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_sgd_failing_penalty_validation(Estimator): + clf = Estimator(penalty="elasticnet", l1_ratio=None) + with pytest.raises( + ValueError, match="l1_ratio must be set when penalty is 'elasticnet'" + ): + clf.fit(X, Y) + + +############################################################################### +# Classification Test Case + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_clf(klass): + # Check that SGD gives any results :-) + + for loss in ("hinge", "squared_hinge", "log_loss", "modified_huber"): + clf = klass( + penalty="l2", + alpha=0.01, + fit_intercept=True, + loss=loss, + max_iter=10, + shuffle=True, + ) + clf.fit(X, Y) + # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) + assert_array_equal(clf.predict(T), true_result) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] +) +def test_provide_coef(klass): + """Check that the shape of `coef_init` is validated.""" + with pytest.raises(ValueError, match="Provided coef_init does 
not match dataset"): + klass().fit(X, Y, coef_init=np.zeros((3,))) + + +@pytest.mark.parametrize( + "klass, fit_params", + [ + (SGDClassifier, {"intercept_init": np.zeros((3,))}), + (SparseSGDClassifier, {"intercept_init": np.zeros((3,))}), + (SGDOneClassSVM, {"offset_init": np.zeros((3,))}), + (SparseSGDOneClassSVM, {"offset_init": np.zeros((3,))}), + ], +) +def test_set_intercept_offset(klass, fit_params): + """Check that `intercept_init` or `offset_init` is validated.""" + sgd_estimator = klass() + with pytest.raises(ValueError, match="does not match dataset"): + sgd_estimator.fit(X, Y, **fit_params) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor] +) +def test_sgd_early_stopping_with_partial_fit(klass): + """Check that we raise an error for `early_stopping` used with + `partial_fit`. + """ + err_msg = "early_stopping should be False with partial_fit" + with pytest.raises(ValueError, match=err_msg): + klass(early_stopping=True).partial_fit(X, Y) + + +@pytest.mark.parametrize( + "klass, fit_params", + [ + (SGDClassifier, {"intercept_init": 0}), + (SparseSGDClassifier, {"intercept_init": 0}), + (SGDOneClassSVM, {"offset_init": 0}), + (SparseSGDOneClassSVM, {"offset_init": 0}), + ], +) +def test_set_intercept_offset_binary(klass, fit_params): + """Check that we can pass a scaler with binary classification to + `intercept_init` or `offset_init`.""" + klass().fit(X5, Y5, **fit_params) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_average_binary_computed_correctly(klass): + # Checks the SGDClassifier correctly computes the average weights + eta = 0.1 + alpha = 2.0 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + # simple linear function without noise + y = np.dot(X, w) + y = np.sign(y) + + clf.fit(X, y) + + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + average_weights = average_weights.reshape(1, -1) + assert_array_almost_equal(clf.coef_, average_weights, decimal=14) + assert_almost_equal(clf.intercept_, average_intercept, decimal=14) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_set_intercept_to_intercept(klass): + # Checks intercept_ shape consistency for the warm starts + # Inconsistent intercept_ shape. + clf = klass().fit(X5, Y5) + klass().fit(X5, Y5, intercept_init=clf.intercept_) + clf = klass().fit(X, Y) + klass().fit(X, Y, intercept_init=clf.intercept_) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_at_least_two_labels(klass): + # Target must have at least two labels + clf = klass(alpha=0.01, max_iter=20) + with pytest.raises(ValueError): + clf.fit(X2, np.ones(9)) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_weight_class_balanced(klass): + # partial_fit with class_weight='balanced' not supported""" + regex = ( + r"class_weight 'balanced' is not supported for " + r"partial_fit\. In order to use 'balanced' weights, " + r"use compute_class_weight\('balanced', classes=classes, y=y\). " + r"In place of y you can use a large enough sample " + r"of the full training set target to properly " + r"estimate the class frequency distributions\. 
" + r"Pass the resulting weights as the class_weight " + r"parameter\." + ) + with pytest.raises(ValueError, match=regex): + klass(class_weight="balanced").partial_fit(X, Y, classes=np.unique(Y)) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass(klass): + # Multi-class test case + clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2) + assert clf.coef_.shape == (3, 2) + assert clf.intercept_.shape == (3,) + assert clf.decision_function([[0, 0]]).shape == (1, 3) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_average(klass): + eta = 0.001 + alpha = 0.01 + # Multi-class average test case + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + np_Y2 = np.array(Y2) + clf.fit(X2, np_Y2) + classes = np.unique(np_Y2) + + for i, cl in enumerate(classes): + y_i = np.ones(np_Y2.shape[0]) + y_i[np_Y2 != cl] = -1 + average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha) + assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) + assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_with_init_coef(klass): + # Multi-class test case + clf = klass(alpha=0.01, max_iter=20) + clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3)) + assert clf.coef_.shape == (3, 2) + assert clf.intercept_.shape, (3,) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_multiclass_njobs(klass): + # Multi-class test case with multi-core support + clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) + assert clf.coef_.shape == (3, 2) + assert clf.intercept_.shape == (3,) + assert clf.decision_function([[0, 0]]).shape == (1, 3) + pred = clf.predict(T2) + assert_array_equal(pred, true_result2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_set_coef_multiclass(klass): + # Checks coef_init and intercept_init shape for multi-class + # problems + # Provided coef_ does not match dataset + clf = klass() + with pytest.raises(ValueError): + clf.fit(X2, Y2, coef_init=np.zeros((2, 2))) + + # Provided coef_ does match dataset + clf = klass().fit(X2, Y2, coef_init=np.zeros((3, 2))) + + # Provided intercept_ does not match dataset + clf = klass() + with pytest.raises(ValueError): + clf.fit(X2, Y2, intercept_init=np.zeros((1,))) + + # Provided intercept_ does match dataset. + clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,))) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_predict_proba_method_access(klass): + # Checks that SGDClassifier predict_proba and predict_log_proba methods + # can either be accessed or raise an appropriate error message + # otherwise. See + # https://github.com/scikit-learn/scikit-learn/issues/10938 for more + # details. 
+ for loss in linear_model.SGDClassifier.loss_functions: + clf = SGDClassifier(loss=loss) + if loss in ("log_loss", "modified_huber"): + assert hasattr(clf, "predict_proba") + assert hasattr(clf, "predict_log_proba") + else: + inner_msg = "probability estimates are not available for loss={!r}".format( + loss + ) + assert not hasattr(clf, "predict_proba") + assert not hasattr(clf, "predict_log_proba") + with pytest.raises( + AttributeError, match="has no attribute 'predict_proba'" + ) as exec_info: + clf.predict_proba + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + with pytest.raises( + AttributeError, match="has no attribute 'predict_log_proba'" + ) as exec_info: + clf.predict_log_proba + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_proba(klass): + # Check SGD.predict_proba + + # Hinge loss does not allow for conditional prob estimate. + # We cannot use the factory here, because it defines predict_proba + # anyway. + clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=10, tol=None).fit(X, Y) + assert not hasattr(clf, "predict_proba") + assert not hasattr(clf, "predict_log_proba") + + # log and modified_huber losses can output probability estimates + # binary case + for loss in ["log_loss", "modified_huber"]: + clf = klass(loss=loss, alpha=0.01, max_iter=10) + clf.fit(X, Y) + p = clf.predict_proba([[3, 2]]) + assert p[0, 1] > 0.5 + p = clf.predict_proba([[-1, -1]]) + assert p[0, 1] < 0.5 + + # If predict_proba is 0, we get "RuntimeWarning: divide by zero encountered + # in log". We avoid it here. + with np.errstate(divide="ignore"): + p = clf.predict_log_proba([[3, 2]]) + assert p[0, 1] > p[0, 0] + p = clf.predict_log_proba([[-1, -1]]) + assert p[0, 1] < p[0, 0] + + # log loss multiclass probability estimates + clf = klass(loss="log_loss", alpha=0.01, max_iter=10).fit(X2, Y2) + + d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]]) + p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]]) + assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) + assert_almost_equal(p[0].sum(), 1) + assert np.all(p[0] >= 0) + + p = clf.predict_proba([[-1, -1]]) + d = clf.decision_function([[-1, -1]]) + assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) + + lp = clf.predict_log_proba([[3, 2]]) + p = clf.predict_proba([[3, 2]]) + assert_array_almost_equal(np.log(p), lp) + + lp = clf.predict_log_proba([[-1, -1]]) + p = clf.predict_proba([[-1, -1]]) + assert_array_almost_equal(np.log(p), lp) + + # Modified Huber multiclass probability estimates; requires a separate + # test because the hard zero/one probabilities may destroy the + # ordering present in decision_function output. + clf = klass(loss="modified_huber", alpha=0.01, max_iter=10) + clf.fit(X2, Y2) + d = clf.decision_function([[3, 2]]) + p = clf.predict_proba([[3, 2]]) + if klass != SparseSGDClassifier: + assert np.argmax(d, axis=1) == np.argmax(p, axis=1) + else: # XXX the sparse test gets a different X2 (?) + assert np.argmin(d, axis=1) == np.argmin(p, axis=1) + + # the following sample produces decision_function values < -1, + # which would cause naive normalization to fail (see comment + # in SGDClassifier.predict_proba) + x = X.mean(axis=0) + d = clf.decision_function([x]) + if np.all(d < -1): # XXX not true in sparse test case (why?) 
+ p = clf.predict_proba([x]) + assert_array_almost_equal(p[0], [1 / 3.0] * 3) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sgd_l1(klass): + # Test L1 regularization + n = len(X4) + rng = np.random.RandomState(13) + idx = np.arange(n) + rng.shuffle(idx) + + X = X4[idx, :] + Y = Y4[idx] + + clf = klass( + penalty="l1", + alpha=0.2, + fit_intercept=False, + max_iter=2000, + tol=None, + shuffle=False, + ) + clf.fit(X, Y) + assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + # test sparsify with dense inputs + clf.sparsify() + assert sp.issparse(clf.coef_) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + # pickle and unpickle with sparse coef_ + clf = pickle.loads(pickle.dumps(clf)) + assert sp.issparse(clf.coef_) + pred = clf.predict(X) + assert_array_equal(pred, Y) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_class_weights(klass): + # Test class weights. + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) + clf.fit(X, y) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001}) + clf.fit(X, y) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_equal_class_weight(klass): + # Test if equal class weights approx. equals no class weights. + X = [[1, 0], [1, 0], [0, 1], [0, 1]] + y = [0, 0, 1, 1] + clf = klass(alpha=0.1, max_iter=1000, class_weight=None) + clf.fit(X, y) + + X = [[1, 0], [0, 1]] + y = [0, 1] + clf_weighted = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) + clf_weighted.fit(X, y) + + # should be similar up to some epsilon due to learning rate schedule + assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_wrong_class_weight_label(klass): + # ValueError due to not existing class label. 
+ clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) + with pytest.raises(ValueError): + clf.fit(X, Y) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_weights_multiplied(klass): + # Tests that class_weight and sample_weight are multiplicative + class_weights = {1: 0.6, 2: 0.3} + rng = np.random.RandomState(0) + sample_weights = rng.random_sample(Y4.shape[0]) + multiplied_together = np.copy(sample_weights) + multiplied_together[Y4 == 1] *= class_weights[1] + multiplied_together[Y4 == 2] *= class_weights[2] + + clf1 = klass(alpha=0.1, max_iter=20, class_weight=class_weights) + clf2 = klass(alpha=0.1, max_iter=20) + + clf1.fit(X4, Y4, sample_weight=sample_weights) + clf2.fit(X4, Y4, sample_weight=multiplied_together) + + assert_almost_equal(clf1.coef_, clf2.coef_) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_balanced_weight(klass): + # Test class weights for imbalanced data""" + # compute reference metrics on iris dataset that is quite balanced by + # default + X, y = iris.data, iris.target + X = scale(X) + idx = np.arange(X.shape[0]) + rng = np.random.RandomState(6) + rng.shuffle(idx) + X = X[idx] + y = y[idx] + clf = klass(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y) + f1 = metrics.f1_score(y, clf.predict(X), average="weighted") + assert_almost_equal(f1, 0.96, decimal=1) + + # make the same prediction using balanced class_weight + clf_balanced = klass( + alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False + ).fit(X, y) + f1 = metrics.f1_score(y, clf_balanced.predict(X), average="weighted") + assert_almost_equal(f1, 0.96, decimal=1) + + # Make sure that in the balanced case it does not change anything + # to use "balanced" + assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6) + + # build an very very imbalanced dataset out of iris data + X_0 = X[y == 0, :] + y_0 = y[y == 0] + + X_imbalanced = np.vstack([X] + [X_0] * 10) + y_imbalanced = np.concatenate([y] + [y_0] * 10) + + # fit a model on the imbalanced data without class weight info + clf = klass(max_iter=1000, class_weight=None, shuffle=False) + clf.fit(X_imbalanced, y_imbalanced) + y_pred = clf.predict(X) + assert metrics.f1_score(y, y_pred, average="weighted") < 0.96 + + # fit a model with balanced class_weight enabled + clf = klass(max_iter=1000, class_weight="balanced", shuffle=False) + clf.fit(X_imbalanced, y_imbalanced) + y_pred = clf.predict(X) + assert metrics.f1_score(y, y_pred, average="weighted") > 0.96 + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_sample_weights(klass): + # Test weights on individual samples + X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) + y = [1, 1, 1, -1, -1] + + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + clf.fit(X, y) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +@pytest.mark.parametrize( + "klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM] +) +def test_wrong_sample_weights(klass): + # Test if ValueError is raised if sample_weight has wrong shape + if klass in [SGDClassifier, SparseSGDClassifier]: + clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False) + elif 
klass in [SGDOneClassSVM, SparseSGDOneClassSVM]: + clf = klass(nu=0.1, max_iter=1000, fit_intercept=False) + # provided sample_weight too long + with pytest.raises(ValueError): + clf.fit(X, Y, sample_weight=np.arange(7)) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_exception(klass): + clf = klass(alpha=0.01) + # classes was not specified + with pytest.raises(ValueError): + clf.partial_fit(X3, Y3) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_binary(klass): + third = X.shape[0] // 3 + clf = klass(alpha=0.01) + classes = np.unique(Y) + + clf.partial_fit(X[:third], Y[:third], classes=classes) + assert clf.coef_.shape == (1, X.shape[1]) + assert clf.intercept_.shape == (1,) + assert clf.decision_function([[0, 0]]).shape == (1,) + id1 = id(clf.coef_.data) + + clf.partial_fit(X[third:], Y[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + y_pred = clf.predict(T) + assert_array_equal(y_pred, true_result) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass(klass): + third = X2.shape[0] // 3 + clf = klass(alpha=0.01) + classes = np.unique(Y2) + + clf.partial_fit(X2[:third], Y2[:third], classes=classes) + assert clf.coef_.shape == (3, X2.shape[1]) + assert clf.intercept_.shape == (3,) + assert clf.decision_function([[0, 0]]).shape == (1, 3) + id1 = id(clf.coef_.data) + + clf.partial_fit(X2[third:], Y2[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_partial_fit_multiclass_average(klass): + third = X2.shape[0] // 3 + clf = klass(alpha=0.01, average=X2.shape[0]) + classes = np.unique(Y2) + + clf.partial_fit(X2[:third], Y2[:third], classes=classes) + assert clf.coef_.shape == (3, X2.shape[1]) + assert clf.intercept_.shape == (3,) + + clf.partial_fit(X2[third:], Y2[third:]) + assert clf.coef_.shape == (3, X2.shape[1]) + assert clf.intercept_.shape == (3,) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_fit_then_partial_fit(klass): + # Partial_fit should work after initial fit in the multiclass case. + # Non-regression test for #2496; fit would previously produce a + # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. 
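+ # For reference, "Fortran-ordered" means column-major memory layout; a quick
+ # way to inspect the layout of an array while debugging is its flags, e.g.
+ #     np.ones((3, 2), order="F").flags["F_CONTIGUOUS"]  # True
+ #     np.ones((3, 2)).flags["C_CONTIGUOUS"]             # True (NumPy default)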
+ clf = klass() + clf.fit(X2, Y2) + clf.partial_fit(X2, Y2) # no exception here + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit_classif(klass, lr): + for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): + clf = klass(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False) + clf.fit(X_, Y_) + y_pred = clf.decision_function(T_) + t = clf.t_ + + classes = np.unique(Y_) + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + for i in range(2): + clf.partial_fit(X_, Y_, classes=classes) + y_pred2 = clf.decision_function(T_) + + assert clf.t_ == t + assert_array_almost_equal(y_pred, y_pred2, decimal=2) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_regression_losses(klass): + random_state = np.random.RandomState(1) + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.1, + loss="epsilon_insensitive", + random_state=random_state, + ) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.1, + loss="squared_epsilon_insensitive", + random_state=random_state, + ) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + clf = klass(alpha=0.01, loss="huber", random_state=random_state) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + clf = klass( + alpha=0.01, + learning_rate="constant", + eta0=0.01, + loss="squared_error", + random_state=random_state, + ) + clf.fit(X, Y) + assert 1.0 == np.mean(clf.predict(X) == Y) + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_warm_start_multiclass(klass): + _test_warm_start(klass, X2, Y2, "optimal") + + +@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier]) +def test_multiple_fit(klass): + # Test multiple calls of fit w/ different shaped inputs. + clf = klass(alpha=0.01, shuffle=False) + clf.fit(X, Y) + assert hasattr(clf, "coef_") + + # Non-regression test: try fitting with a different label set. + y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)] + clf.fit(X[:, :-1], y) + + +############################################################################### +# Regression Test Case + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_reg(klass): + # Check that SGD gives any results. 
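+ # The toy data below is perfectly symmetric in its two features (x1 == x2 for
+ # every sample), so each gradient update changes both weights by the same
+ # amount and the learned coefficients should come out exactly equal.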
+ clf = klass(alpha=0.1, max_iter=2, fit_intercept=False) + clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) + assert clf.coef_[0] == clf.coef_[1] + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_computed_correctly(klass): + # Tests the average regressor matches the naive implementation + + eta = 0.001 + alpha = 0.01 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + # simple linear function without noise + y = np.dot(X, w) + + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.fit(X, y) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_almost_equal(clf.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_averaged_partial_fit(klass): + # Tests whether the partial fit yields the same average as the fit + eta = 0.001 + alpha = 0.01 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + w = rng.normal(size=n_features) + + # simple linear function without noise + y = np.dot(X, w) + + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.partial_fit(X[: int(n_samples / 2)][:], y[: int(n_samples / 2)]) + clf.partial_fit(X[int(n_samples / 2) :][:], y[int(n_samples / 2) :]) + average_weights, average_intercept = asgd(klass, X, y, eta, alpha) + + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_average_sparse(klass): + # Checks the average weights on data with 0s + + eta = 0.001 + alpha = 0.01 + clf = klass( + loss="squared_error", + learning_rate="constant", + eta0=eta, + alpha=alpha, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + n_samples = Y3.shape[0] + + clf.partial_fit(X3[: int(n_samples / 2)][:], Y3[: int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2) :][:], Y3[int(n_samples / 2) :]) + average_weights, average_intercept = asgd(klass, X3, Y3, eta, alpha) + + assert_array_almost_equal(clf.coef_, average_weights, decimal=16) + assert_almost_equal(clf.intercept_, average_intercept, decimal=16) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_least_squares_fit(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.99 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.5 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_epsilon_insensitive(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + 
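+ # For reference, the epsilon-insensitive loss exercised here is
+ #     loss(p, y) = max(0, |y - p| - epsilon)
+ # i.e. residuals smaller than epsilon are ignored entirely, so with a tiny
+ # epsilon=0.01 the model can still recover the noise-free line below almost
+ # exactly.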
X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass( + loss="epsilon_insensitive", + epsilon=0.01, + alpha=0.1, + max_iter=20, + fit_intercept=False, + ) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.99 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass( + loss="epsilon_insensitive", + epsilon=0.01, + alpha=0.1, + max_iter=20, + fit_intercept=False, + ) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.5 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_sgd_huber_fit(klass): + xmin, xmax = -5, 5 + n_samples = 100 + rng = np.random.RandomState(0) + X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) + + # simple linear function without noise + y = 0.5 * X.ravel() + + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.99 + + # simple linear function with noise + y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() + + clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) + clf.fit(X, y) + score = clf.score(X, y) + assert score > 0.5 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_elasticnet_convergence(klass): + # Check that the SGD output is consistent with coordinate descent + + n_samples, n_features = 1000, 5 + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + # ground_truth linear model that generate y from X and to which the + # models should converge if the regularizer would be set to 0.0 + ground_truth_coef = rng.randn(n_features) + y = np.dot(X, ground_truth_coef) + + # XXX: alpha = 0.1 seems to cause convergence problems + for alpha in [0.01, 0.001]: + for l1_ratio in [0.5, 0.8, 1.0]: + cd = linear_model.ElasticNet( + alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False + ) + cd.fit(X, y) + sgd = klass( + penalty="elasticnet", + max_iter=50, + alpha=alpha, + l1_ratio=l1_ratio, + fit_intercept=False, + ) + sgd.fit(X, y) + err_msg = ( + "cd and sgd did not converge to comparable " + "results for alpha=%f and l1_ratio=%f" % (alpha, l1_ratio) + ) + assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, err_msg=err_msg) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_partial_fit(klass): + third = X.shape[0] // 3 + clf = klass(alpha=0.01) + + clf.partial_fit(X[:third], Y[:third]) + assert clf.coef_.shape == (X.shape[1],) + assert clf.intercept_.shape == (1,) + assert clf.predict([[0, 0]]).shape == (1,) + id1 = id(clf.coef_.data) + + clf.partial_fit(X[third:], Y[third:]) + id2 = id(clf.coef_.data) + # check that coef_ haven't been re-allocated + assert id1, id2 + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit(klass, lr): + clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) + clf.fit(X, Y) + y_pred = clf.predict(T) + t = clf.t_ + + clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) + for i in range(2): + clf.partial_fit(X, Y) + y_pred2 = clf.predict(T) + + assert clf.t_ == t + assert_array_almost_equal(y_pred, y_pred2, decimal=2) + + +@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor]) +def test_loss_function_epsilon(klass): + clf = klass(epsilon=0.9) + 
clf.set_params(epsilon=0.1) + assert clf.loss_functions["huber"][1] == 0.1 + + +############################################################################### +# SGD One Class SVM Test Case + + +# a simple implementation of ASGD to use for testing SGDOneClassSVM +def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0): + if coef_init is None: + coef = np.zeros(X.shape[1]) + else: + coef = coef_init + + average_coef = np.zeros(X.shape[1]) + offset = offset_init + intercept = 1 - offset + average_intercept = 0.0 + decay = 1.0 + + # sparse data has a fixed decay of .01 + if klass == SparseSGDOneClassSVM: + decay = 0.01 + + for i, entry in enumerate(X): + p = np.dot(entry, coef) + p += intercept + if p <= 1.0: + gradient = -1 + else: + gradient = 0 + coef *= max(0, 1.0 - (eta * nu / 2)) + coef += -(eta * gradient * entry) + intercept += -(eta * (nu + gradient)) * decay + + average_coef *= i + average_coef += coef + average_coef /= i + 1.0 + + average_intercept *= i + average_intercept += intercept + average_intercept /= i + 1.0 + + return average_coef, 1 - average_intercept + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def _test_warm_start_oneclass(klass, X, lr): + # Test that explicit warm restart... + clf = klass(nu=0.5, eta0=0.01, shuffle=False, learning_rate=lr) + clf.fit(X) + + clf2 = klass(nu=0.1, eta0=0.01, shuffle=False, learning_rate=lr) + clf2.fit(X, coef_init=clf.coef_.copy(), offset_init=clf.offset_.copy()) + + # ... and implicit warm restart are equivalent. + clf3 = klass(nu=0.5, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr) + clf3.fit(X) + + assert clf3.t_ == clf.t_ + assert_allclose(clf3.coef_, clf.coef_) + + clf3.set_params(nu=0.1) + clf3.fit(X) + + assert clf3.t_ == clf2.t_ + assert_allclose(clf3.coef_, clf2.coef_) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_warm_start_oneclass(klass, lr): + _test_warm_start_oneclass(klass, X, lr) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_clone_oneclass(klass): + # Test whether clone works ok. 
+ clf = klass(nu=0.5) + clf = clone(clf) + clf.set_params(nu=0.1) + clf.fit(X) + + clf2 = klass(nu=0.1) + clf2.fit(X) + + assert_array_equal(clf.coef_, clf2.coef_) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_partial_fit_oneclass(klass): + third = X.shape[0] // 3 + clf = klass(nu=0.1) + + clf.partial_fit(X[:third]) + assert clf.coef_.shape == (X.shape[1],) + assert clf.offset_.shape == (1,) + assert clf.predict([[0, 0]]).shape == (1,) + previous_coefs = clf.coef_ + + clf.partial_fit(X[third:]) + # check that coef_ haven't been re-allocated + assert clf.coef_ is previous_coefs + + # raises ValueError if number of features does not match previous data + with pytest.raises(ValueError): + clf.partial_fit(X[:, 1]) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"]) +def test_partial_fit_equal_fit_oneclass(klass, lr): + clf = klass(nu=0.05, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) + clf.fit(X) + y_scores = clf.decision_function(T) + t = clf.t_ + coef = clf.coef_ + offset = clf.offset_ + + clf = klass(nu=0.05, eta0=0.01, max_iter=1, learning_rate=lr, shuffle=False) + for _ in range(2): + clf.partial_fit(X) + y_scores2 = clf.decision_function(T) + + assert clf.t_ == t + assert_allclose(y_scores, y_scores2) + assert_allclose(clf.coef_, coef) + assert_allclose(clf.offset_, offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_late_onset_averaging_reached_oneclass(klass): + # Test average + eta0 = 0.001 + nu = 0.05 + + # 2 passes over the training set but average only at second pass + clf1 = klass( + average=7, learning_rate="constant", eta0=eta0, nu=nu, max_iter=2, shuffle=False + ) + # 1 pass over the training set with no averaging + clf2 = klass( + average=False, + learning_rate="constant", + eta0=eta0, + nu=nu, + max_iter=1, + shuffle=False, + ) + + clf1.fit(X) + clf2.fit(X) + + # Start from clf2 solution, compute averaging using asgd function and + # compare with clf1 solution + average_coef, average_offset = asgd_oneclass( + klass, X, eta0, nu, coef_init=clf2.coef_.ravel(), offset_init=clf2.offset_ + ) + + assert_allclose(clf1.coef_.ravel(), average_coef.ravel()) + assert_allclose(clf1.offset_, average_offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_computed_correctly_oneclass(klass): + # Tests the average SGD One-Class SVM matches the naive implementation + eta = 0.001 + nu = 0.05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass( + learning_rate="constant", + eta0=eta, + nu=nu, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.fit(X) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_sgd_averaged_partial_fit_oneclass(klass): + # Tests whether the partial fit yields the same average as the fit + eta = 0.001 + nu = 0.05 + n_samples = 20 + n_features = 10 + rng = np.random.RandomState(0) + X = rng.normal(size=(n_samples, n_features)) + + clf = klass( + learning_rate="constant", + eta0=eta, + nu=nu, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + clf.partial_fit(X[: int(n_samples / 2)][:]) + 
clf.partial_fit(X[int(n_samples / 2) :][:]) + average_coef, average_offset = asgd_oneclass(klass, X, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM]) +def test_average_sparse_oneclass(klass): + # Checks the average coef on data with 0s + eta = 0.001 + nu = 0.01 + clf = klass( + learning_rate="constant", + eta0=eta, + nu=nu, + fit_intercept=True, + max_iter=1, + average=True, + shuffle=False, + ) + + n_samples = X3.shape[0] + + clf.partial_fit(X3[: int(n_samples / 2)]) + clf.partial_fit(X3[int(n_samples / 2) :]) + average_coef, average_offset = asgd_oneclass(klass, X3, eta, nu) + + assert_allclose(clf.coef_, average_coef) + assert_allclose(clf.offset_, average_offset) + + +def test_sgd_oneclass(): + # Test fit, decision_function, predict and score_samples on a toy + # dataset + X_train = np.array([[-2, -1], [-1, -1], [1, 1]]) + X_test = np.array([[0.5, -2], [2, 2]]) + clf = SGDOneClassSVM( + nu=0.5, eta0=1, learning_rate="constant", shuffle=False, max_iter=1 + ) + clf.fit(X_train) + assert_allclose(clf.coef_, np.array([-0.125, 0.4375])) + assert clf.offset_[0] == -0.5 + + scores = clf.score_samples(X_test) + assert_allclose(scores, np.array([-0.9375, 0.625])) + + dec = clf.score_samples(X_test) - clf.offset_ + assert_allclose(clf.decision_function(X_test), dec) + + pred = clf.predict(X_test) + assert_array_equal(pred, np.array([-1, 1])) + + +def test_ocsvm_vs_sgdocsvm(): + # Checks SGDOneClass SVM gives a good approximation of kernelized + # One-Class SVM + nu = 0.05 + gamma = 2.0 + random_state = 42 + + # Generate train and test data + rng = np.random.RandomState(random_state) + X = 0.3 * rng.randn(500, 2) + X_train = np.r_[X + 2, X - 2] + X = 0.3 * rng.randn(100, 2) + X_test = np.r_[X + 2, X - 2] + + # One-Class SVM + clf = OneClassSVM(gamma=gamma, kernel="rbf", nu=nu) + clf.fit(X_train) + y_pred_ocsvm = clf.predict(X_test) + dec_ocsvm = clf.decision_function(X_test).reshape(1, -1) + + # SGDOneClassSVM using kernel approximation + max_iter = 15 + transform = Nystroem(gamma=gamma, random_state=random_state) + clf_sgd = SGDOneClassSVM( + nu=nu, + shuffle=True, + fit_intercept=True, + max_iter=max_iter, + random_state=random_state, + tol=None, + ) + pipe_sgd = make_pipeline(transform, clf_sgd) + pipe_sgd.fit(X_train) + y_pred_sgdocsvm = pipe_sgd.predict(X_test) + dec_sgdocsvm = pipe_sgd.decision_function(X_test).reshape(1, -1) + + assert np.mean(y_pred_sgdocsvm == y_pred_ocsvm) >= 0.99 + corrcoef = np.corrcoef(np.concatenate((dec_ocsvm, dec_sgdocsvm)))[0, 1] + assert corrcoef >= 0.9 + + +def test_l1_ratio(): + # Test if l1 ratio extremes match L1 and L2 penalty settings. 
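+ # For reference, the elasticnet penalty is a convex combination of the two
+ # base penalties, roughly
+ #     l1_ratio * L1(w) + (1 - l1_ratio) * L2(w)
+ # so l1_ratio ~= 1 should reproduce penalty="l1" and l1_ratio ~= 0 should
+ # reproduce penalty="l2", up to floating-point effects, which is exactly what
+ # the two comparisons below check.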
+ X, y = datasets.make_classification( + n_samples=1000, n_features=100, n_informative=20, random_state=1234 + ) + + # test if elasticnet with l1_ratio near 1 gives same result as pure l1 + est_en = SGDClassifier( + alpha=0.001, + penalty="elasticnet", + tol=None, + max_iter=6, + l1_ratio=0.9999999999, + random_state=42, + ).fit(X, y) + est_l1 = SGDClassifier( + alpha=0.001, penalty="l1", max_iter=6, random_state=42, tol=None + ).fit(X, y) + assert_array_almost_equal(est_en.coef_, est_l1.coef_) + + # test if elasticnet with l1_ratio near 0 gives same result as pure l2 + est_en = SGDClassifier( + alpha=0.001, + penalty="elasticnet", + tol=None, + max_iter=6, + l1_ratio=0.0000000001, + random_state=42, + ).fit(X, y) + est_l2 = SGDClassifier( + alpha=0.001, penalty="l2", max_iter=6, random_state=42, tol=None + ).fit(X, y) + assert_array_almost_equal(est_en.coef_, est_l2.coef_) + + +def test_underflow_or_overlow(): + with np.errstate(all="raise"): + # Generate some weird data with hugely unscaled features + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 10 + + X = rng.normal(size=(n_samples, n_features)) + X[:, :2] *= 1e300 + assert np.isfinite(X).all() + + # Use MinMaxScaler to scale the data without introducing a numerical + # instability (computing the standard deviation naively is not possible + # on this data) + X_scaled = MinMaxScaler().fit_transform(X) + assert np.isfinite(X_scaled).all() + + # Define a ground truth on the scaled data + ground_truth = rng.normal(size=n_features) + y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32) + assert_array_equal(np.unique(y), [0, 1]) + + model = SGDClassifier(alpha=0.1, loss="squared_hinge", max_iter=500) + + # smoke test: model is stable on scaled data + model.fit(X_scaled, y) + assert np.isfinite(model.coef_).all() + + # model is numerically unstable on unscaled data + msg_regxp = ( + r"Floating-point under-/overflow occurred at epoch #.*" + " Scaling input data with StandardScaler or MinMaxScaler" + " might help." 
+ ) + with pytest.raises(ValueError, match=msg_regxp): + model.fit(X, y) + + +def test_numerical_stability_large_gradient(): + # Non regression test case for numerical stability on scaled problems + # where the gradient can still explode with some losses + model = SGDClassifier( + loss="squared_hinge", + max_iter=10, + shuffle=True, + penalty="elasticnet", + l1_ratio=0.3, + alpha=0.01, + eta0=0.001, + random_state=0, + tol=None, + ) + with np.errstate(all="raise"): + model.fit(iris.data, iris.target) + assert np.isfinite(model.coef_).all() + + +@pytest.mark.parametrize("penalty", ["l2", "l1", "elasticnet"]) +def test_large_regularization(penalty): + # Non regression tests for numerical stability issues caused by large + # regularization parameters + model = SGDClassifier( + alpha=1e5, + learning_rate="constant", + eta0=0.1, + penalty=penalty, + shuffle=False, + tol=None, + max_iter=6, + ) + with np.errstate(all="raise"): + model.fit(iris.data, iris.target) + assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) + + +def test_tol_parameter(): + # Test that the tol parameter behaves as expected + X = StandardScaler().fit_transform(iris.data) + y = iris.target == 1 + + # With tol is None, the number of iteration should be equal to max_iter + max_iter = 42 + model_0 = SGDClassifier(tol=None, random_state=0, max_iter=max_iter) + model_0.fit(X, y) + assert max_iter == model_0.n_iter_ + + # If tol is not None, the number of iteration should be less than max_iter + max_iter = 2000 + model_1 = SGDClassifier(tol=0, random_state=0, max_iter=max_iter) + model_1.fit(X, y) + assert max_iter > model_1.n_iter_ + assert model_1.n_iter_ > 5 + + # A larger tol should yield a smaller number of iteration + model_2 = SGDClassifier(tol=0.1, random_state=0, max_iter=max_iter) + model_2.fit(X, y) + assert model_1.n_iter_ > model_2.n_iter_ + assert model_2.n_iter_ > 3 + + # Strict tolerance and small max_iter should trigger a warning + model_3 = SGDClassifier(max_iter=3, tol=1e-3, random_state=0) + warning_message = ( + "Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit." 
+ ) + with pytest.warns(ConvergenceWarning, match=warning_message): + model_3.fit(X, y) + assert model_3.n_iter_ == 3 + + +def _test_loss_common(loss_function, cases): + # Test the different loss functions + # cases is a list of (p, y, expected) + for p, y, expected_loss, expected_dloss in cases: + assert_almost_equal(loss_function.py_loss(p, y), expected_loss) + assert_almost_equal(loss_function.py_dloss(p, y), expected_dloss) + + +def test_loss_hinge(): + # Test Hinge (hinge / perceptron) + # hinge + loss = sgd_fast.Hinge(1.0) + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.1, 1.0, 0.0, 0.0), + (-2.0, -1.0, 0.0, 0.0), + (1.0, 1.0, 0.0, -1.0), + (-1.0, -1.0, 0.0, 1.0), + (0.5, 1.0, 0.5, -1.0), + (2.0, -1.0, 3.0, 1.0), + (-0.5, -1.0, 0.5, 1.0), + (0.0, 1.0, 1, -1.0), + ] + _test_loss_common(loss, cases) + + # perceptron + loss = sgd_fast.Hinge(0.0) + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.0, 1.0, 0.0, 0.0), + (-0.1, -1.0, 0.0, 0.0), + (0.0, 1.0, 0.0, -1.0), + (0.0, -1.0, 0.0, 1.0), + (0.5, -1.0, 0.5, 1.0), + (2.0, -1.0, 2.0, 1.0), + (-0.5, 1.0, 0.5, -1.0), + (-1.0, 1.0, 1.0, -1.0), + ] + _test_loss_common(loss, cases) + + +def test_gradient_squared_hinge(): + # Test SquaredHinge + loss = sgd_fast.SquaredHinge(1.0) + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.0, 1.0, 0.0, 0.0), + (-2.0, -1.0, 0.0, 0.0), + (1.0, -1.0, 4.0, 4.0), + (-1.0, 1.0, 4.0, -4.0), + (0.5, 1.0, 0.25, -1.0), + (0.5, -1.0, 2.25, 3.0), + ] + _test_loss_common(loss, cases) + + +def test_loss_modified_huber(): + # (p, y, expected_loss, expected_dloss) + loss = sgd_fast.ModifiedHuber() + cases = [ + # (p, y, expected_loss, expected_dloss) + (1.0, 1.0, 0.0, 0.0), + (-1.0, -1.0, 0.0, 0.0), + (2.0, 1.0, 0.0, 0.0), + (0.0, 1.0, 1.0, -2.0), + (-1.0, 1.0, 4.0, -4.0), + (0.5, -1.0, 2.25, 3.0), + (-2.0, 1.0, 8, -4.0), + (-3.0, 1.0, 12, -4.0), + ] + _test_loss_common(loss, cases) + + +def test_loss_epsilon_insensitive(): + # Test EpsilonInsensitive + loss = sgd_fast.EpsilonInsensitive(0.1) + cases = [ + # (p, y, expected_loss, expected_dloss) + (0.0, 0.0, 0.0, 0.0), + (0.1, 0.0, 0.0, 0.0), + (-2.05, -2.0, 0.0, 0.0), + (3.05, 3.0, 0.0, 0.0), + (2.2, 2.0, 0.1, 1.0), + (2.0, -1.0, 2.9, 1.0), + (2.0, 2.2, 0.1, -1.0), + (-2.0, 1.0, 2.9, -1.0), + ] + _test_loss_common(loss, cases) + + +def test_loss_squared_epsilon_insensitive(): + # Test SquaredEpsilonInsensitive + loss = sgd_fast.SquaredEpsilonInsensitive(0.1) + cases = [ + # (p, y, expected_loss, expected_dloss) + (0.0, 0.0, 0.0, 0.0), + (0.1, 0.0, 0.0, 0.0), + (-2.05, -2.0, 0.0, 0.0), + (3.05, 3.0, 0.0, 0.0), + (2.2, 2.0, 0.01, 0.2), + (2.0, -1.0, 8.41, 5.8), + (2.0, 2.2, 0.01, -0.2), + (-2.0, 1.0, 8.41, -5.8), + ] + _test_loss_common(loss, cases) + + +def test_multi_thread_multi_class_and_early_stopping(): + # This is a non-regression test for a bad interaction between + # early stopping internal attribute and thread-based parallelism. + clf = SGDClassifier( + alpha=1e-3, + tol=1e-3, + max_iter=1000, + early_stopping=True, + n_iter_no_change=100, + random_state=0, + n_jobs=2, + ) + clf.fit(iris.data, iris.target) + assert clf.n_iter_ > clf.n_iter_no_change + assert clf.n_iter_ < clf.n_iter_no_change + 20 + assert clf.score(iris.data, iris.target) > 0.8 + + +def test_multi_core_gridsearch_and_early_stopping(): + # This is a non-regression test for a bad interaction between + # early stopping internal attribute and process-based multi-core + # parallelism. 
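+ # For reference: with n_jobs=2, RandomizedSearchCV dispatches the candidate
+ # fits through joblib, which defaults to a process-based (loky) backend, and
+ # early_stopping=True makes each SGDClassifier carve out an internal
+ # validation split; the combination of the two is what this test exercises.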
+ param_grid = { + "alpha": np.logspace(-4, 4, 9), + "n_iter_no_change": [5, 10, 50], + } + + clf = SGDClassifier(tol=1e-2, max_iter=1000, early_stopping=True, random_state=0) + search = RandomizedSearchCV(clf, param_grid, n_iter=5, n_jobs=2, random_state=0) + search.fit(iris.data, iris.target) + assert search.best_score_ > 0.8 + + +@pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"]) +def test_SGDClassifier_fit_for_all_backends(backend): + # This is a non-regression smoke test. In the multi-class case, + # SGDClassifier.fit fits each class in a one-versus-all fashion using + # joblib.Parallel. However, each OvA step updates the coef_ attribute of + # the estimator in-place. Internally, SGDClassifier calls Parallel using + # require='sharedmem'. This test makes sure SGDClassifier.fit works + # consistently even when the user asks for a backend that does not provide + # sharedmem semantics. + + # We further test a case where memmapping would have been used if + # SGDClassifier.fit was called from a loky or multiprocessing backend. In + # this specific case, in-place modification of clf.coef_ would have caused + # a segmentation fault when trying to write in a readonly memory mapped + # buffer. + + random_state = np.random.RandomState(42) + + # Create a classification problem with 50000 features and 20 classes. Using + # loky or multiprocessing this make the clf.coef_ exceed the threshold + # above which memmaping is used in joblib and loky (1MB as of 2018/11/1). + X = sp.random(500, 2000, density=0.02, format="csr", random_state=random_state) + y = random_state.choice(20, 500) + + # Begin by fitting a SGD classifier sequentially + clf_sequential = SGDClassifier(max_iter=1000, n_jobs=1, random_state=42) + clf_sequential.fit(X, y) + + # Fit a SGDClassifier using the specified backend, and make sure the + # coefficients are equal to those obtained using a sequential fit + clf_parallel = SGDClassifier(max_iter=1000, n_jobs=4, random_state=42) + with joblib.parallel_backend(backend=backend): + clf_parallel.fit(X, y) + assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_) + + +@pytest.mark.parametrize( + "Estimator", [linear_model.SGDClassifier, linear_model.SGDRegressor] +) +def test_sgd_random_state(Estimator, global_random_seed): + # Train the same model on the same data without converging and check that we + # get reproducible results by fixing the random seed. + if Estimator == linear_model.SGDRegressor: + X, y = datasets.make_regression(random_state=global_random_seed) + else: + X, y = datasets.make_classification(random_state=global_random_seed) + + # Fitting twice a model with the same hyper-parameters on the same training + # set with the same seed leads to the same results deterministically. + + est = Estimator(random_state=global_random_seed, max_iter=1) + with pytest.warns(ConvergenceWarning): + coef_same_seed_a = est.fit(X, y).coef_ + assert est.n_iter_ == 1 + + est = Estimator(random_state=global_random_seed, max_iter=1) + with pytest.warns(ConvergenceWarning): + coef_same_seed_b = est.fit(X, y).coef_ + assert est.n_iter_ == 1 + + assert_allclose(coef_same_seed_a, coef_same_seed_b) + + # Fitting twice a model with the same hyper-parameters on the same training + # set but with different random seed leads to different results after one + # epoch because of the random shuffling of the dataset. 
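+ # (shuffle=True is the default for both estimators, so the seed also controls
+ # the order in which samples are visited during the single training epoch.)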
+ + est = Estimator(random_state=global_random_seed + 1, max_iter=1) + with pytest.warns(ConvergenceWarning): + coef_other_seed = est.fit(X, y).coef_ + assert est.n_iter_ == 1 + + assert np.abs(coef_same_seed_a - coef_other_seed).max() > 1.0 + + +def test_validation_mask_correctly_subsets(monkeypatch): + """Test that data passed to validation callback correctly subsets. + + Non-regression test for #23255. + """ + X, Y = iris.data, iris.target + n_samples = X.shape[0] + validation_fraction = 0.2 + clf = linear_model.SGDClassifier( + early_stopping=True, + tol=1e-3, + max_iter=1000, + validation_fraction=validation_fraction, + ) + + mock = Mock(side_effect=_stochastic_gradient._ValidationScoreCallback) + monkeypatch.setattr(_stochastic_gradient, "_ValidationScoreCallback", mock) + clf.fit(X, Y) + + X_val, y_val = mock.call_args[0][1:3] + assert X_val.shape[0] == int(n_samples * validation_fraction) + assert y_val.shape[0] == int(n_samples * validation_fraction) + + +def test_sgd_error_on_zero_validation_weight(): + # Test that SGDClassifier raises error when all the validation samples + # have zero sample_weight. Non-regression test for #17229. + X, Y = iris.data, iris.target + sample_weight = np.zeros_like(Y) + validation_fraction = 0.4 + + clf = linear_model.SGDClassifier( + early_stopping=True, validation_fraction=validation_fraction, random_state=0 + ) + + error_message = ( + "The sample weights for validation set are all zero, consider using a" + " different random state." + ) + with pytest.raises(ValueError, match=error_message): + clf.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize("Estimator", [SGDClassifier, SGDRegressor]) +def test_sgd_verbose(Estimator): + """non-regression test for gh #25249""" + Estimator(verbose=1).fit(X, Y) + + +@pytest.mark.parametrize( + "SGDEstimator", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +@pytest.mark.parametrize("data_type", (np.float32, np.float64)) +def test_sgd_dtype_match(SGDEstimator, data_type): + _X = X.astype(data_type) + _Y = np.array(Y, dtype=data_type) + sgd_model = SGDEstimator() + sgd_model.fit(_X, _Y) + assert sgd_model.coef_.dtype == data_type + + +@pytest.mark.parametrize( + "SGDEstimator", + [ + SGDClassifier, + SparseSGDClassifier, + SGDRegressor, + SparseSGDRegressor, + SGDOneClassSVM, + SparseSGDOneClassSVM, + ], +) +def test_sgd_numerical_consistency(SGDEstimator): + X_64 = X.astype(dtype=np.float64) + Y_64 = np.array(Y, dtype=np.float64) + + X_32 = X.astype(dtype=np.float32) + Y_32 = np.array(Y, dtype=np.float32) + + sgd_64 = SGDEstimator(max_iter=20) + sgd_64.fit(X_64, Y_64) + + sgd_32 = SGDEstimator(max_iter=20) + sgd_32.fit(X_32, Y_32) + + assert_allclose(sgd_64.coef_, sgd_32.coef_) + + +def test_sgd_one_class_svm_estimator_type(): + """Check that SGDOneClassSVM has the correct estimator type. + + Non-regression test for if the mixin was not on the left. 
+ """ + sgd_ocsvm = SGDOneClassSVM() + assert get_tags(sgd_ocsvm).estimator_type == "outlier_detector" diff --git a/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_theil_sen.py b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_theil_sen.py new file mode 100644 index 0000000000000000000000000000000000000000..216415f2ee9277e618c457afc0a7280c8a2a4b8a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/linear_model/tests/test_theil_sen.py @@ -0,0 +1,303 @@ +""" +Testing for Theil-Sen module (sklearn.linear_model.theil_sen) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import os +import re +import sys +from contextlib import contextmanager + +import numpy as np +import pytest +from numpy.testing import ( + assert_array_almost_equal, + assert_array_equal, + assert_array_less, +) +from scipy.linalg import norm +from scipy.optimize import fmin_bfgs + +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import LinearRegression, TheilSenRegressor +from sklearn.linear_model._theil_sen import ( + _breakdown_point, + _modified_weiszfeld_step, + _spatial_median, +) +from sklearn.utils._testing import assert_almost_equal + + +@contextmanager +def no_stdout_stderr(): + old_stdout = sys.stdout + old_stderr = sys.stderr + with open(os.devnull, "w") as devnull: + sys.stdout = devnull + sys.stderr = devnull + yield + devnull.flush() + sys.stdout = old_stdout + sys.stderr = old_stderr + + +def gen_toy_problem_1d(intercept=True): + random_state = np.random.RandomState(0) + # Linear model y = 3*x + N(2, 0.1**2) + w = 3.0 + if intercept: + c = 2.0 + n_samples = 50 + else: + c = 0.1 + n_samples = 100 + x = random_state.normal(size=n_samples) + noise = 0.1 * random_state.normal(size=n_samples) + y = w * x + c + noise + # Add some outliers + if intercept: + x[42], y[42] = (-2, 4) + x[43], y[43] = (-2.5, 8) + x[33], y[33] = (2.5, 1) + x[49], y[49] = (2.1, 2) + else: + x[42], y[42] = (-2, 4) + x[43], y[43] = (-2.5, 8) + x[53], y[53] = (2.5, 1) + x[60], y[60] = (2.1, 2) + x[72], y[72] = (1.8, -7) + return x[:, np.newaxis], y, w, c + + +def gen_toy_problem_2d(): + random_state = np.random.RandomState(0) + n_samples = 100 + # Linear model y = 5*x_1 + 10*x_2 + N(1, 0.1**2) + X = random_state.normal(size=(n_samples, 2)) + w = np.array([5.0, 10.0]) + c = 1.0 + noise = 0.1 * random_state.normal(size=n_samples) + y = np.dot(X, w) + c + noise + # Add some outliers + n_outliers = n_samples // 10 + ix = random_state.randint(0, n_samples, size=n_outliers) + y[ix] = 50 * random_state.normal(size=n_outliers) + return X, y, w, c + + +def gen_toy_problem_4d(): + random_state = np.random.RandomState(0) + n_samples = 10000 + # Linear model y = 5*x_1 + 10*x_2 + 42*x_3 + 7*x_4 + N(1, 0.1**2) + X = random_state.normal(size=(n_samples, 4)) + w = np.array([5.0, 10.0, 42.0, 7.0]) + c = 1.0 + noise = 0.1 * random_state.normal(size=n_samples) + y = np.dot(X, w) + c + noise + # Add some outliers + n_outliers = n_samples // 10 + ix = random_state.randint(0, n_samples, size=n_outliers) + y[ix] = 50 * random_state.normal(size=n_outliers) + return X, y, w, c + + +def test_modweiszfeld_step_1d(): + X = np.array([1.0, 2.0, 3.0]).reshape(3, 1) + # Check startvalue is element of X and solution + median = 2.0 + new_y = _modified_weiszfeld_step(X, median) + assert_array_almost_equal(new_y, median) + # Check startvalue is not the solution + y = 2.5 + new_y = _modified_weiszfeld_step(X, y) + assert_array_less(median, new_y) + 
assert_array_less(new_y, y) + # Check startvalue is not the solution but element of X + y = 3.0 + new_y = _modified_weiszfeld_step(X, y) + assert_array_less(median, new_y) + assert_array_less(new_y, y) + # Check that a single vector is identity + X = np.array([1.0, 2.0, 3.0]).reshape(1, 3) + y = X[0] + new_y = _modified_weiszfeld_step(X, y) + assert_array_equal(y, new_y) + + +def test_modweiszfeld_step_2d(): + X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2) + y = np.array([0.5, 0.5]) + # Check first two iterations + new_y = _modified_weiszfeld_step(X, y) + assert_array_almost_equal(new_y, np.array([1 / 3, 2 / 3])) + new_y = _modified_weiszfeld_step(X, new_y) + assert_array_almost_equal(new_y, np.array([0.2792408, 0.7207592])) + # Check fix point + y = np.array([0.21132505, 0.78867497]) + new_y = _modified_weiszfeld_step(X, y) + assert_array_almost_equal(new_y, y) + + +def test_spatial_median_1d(): + X = np.array([1.0, 2.0, 3.0]).reshape(3, 1) + true_median = 2.0 + _, median = _spatial_median(X) + assert_array_almost_equal(median, true_median) + # Test larger problem and for exact solution in 1d case + random_state = np.random.RandomState(0) + X = random_state.randint(100, size=(1000, 1)) + true_median = np.median(X.ravel()) + _, median = _spatial_median(X) + assert_array_equal(median, true_median) + + +def test_spatial_median_2d(): + X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2) + _, median = _spatial_median(X, max_iter=100, tol=1.0e-6) + + def cost_func(y): + dists = np.array([norm(x - y) for x in X]) + return np.sum(dists) + + # Check if median is solution of the Fermat-Weber location problem + fermat_weber = fmin_bfgs(cost_func, median, disp=False) + assert_array_almost_equal(median, fermat_weber) + # Check when maximum iteration is exceeded a warning is emitted + warning_message = "Maximum number of iterations 30 reached in spatial median." 
+ with pytest.warns(ConvergenceWarning, match=warning_message): + _spatial_median(X, max_iter=30, tol=0.0) + + +def test_theil_sen_1d(): + X, y, w, c = gen_toy_problem_1d() + # Check that Least Squares fails + lstq = LinearRegression().fit(X, y) + assert np.abs(lstq.coef_ - w) > 0.9 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_theil_sen_1d_no_intercept(): + X, y, w, c = gen_toy_problem_1d(intercept=False) + # Check that Least Squares fails + lstq = LinearRegression(fit_intercept=False).fit(X, y) + assert np.abs(lstq.coef_ - w - c) > 0.5 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w + c, 1) + assert_almost_equal(theil_sen.intercept_, 0.0) + + # non-regression test for #18104 + theil_sen.score(X, y) + + +def test_theil_sen_2d(): + X, y, w, c = gen_toy_problem_2d() + # Check that Least Squares fails + lstq = LinearRegression().fit(X, y) + assert norm(lstq.coef_ - w) > 1.0 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(max_subpopulation=1e3, random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_calc_breakdown_point(): + bp = _breakdown_point(1e10, 2) + assert np.abs(bp - 1 + 1 / (np.sqrt(2))) < 1.0e-6 + + +@pytest.mark.parametrize( + "param, ExceptionCls, match", + [ + ( + {"n_subsamples": 1}, + ValueError, + re.escape("Invalid parameter since n_features+1 > n_subsamples (2 > 1)"), + ), + ( + {"n_subsamples": 101}, + ValueError, + re.escape("Invalid parameter since n_subsamples > n_samples (101 > 50)"), + ), + ], +) +def test_checksubparams_invalid_input(param, ExceptionCls, match): + X, y, w, c = gen_toy_problem_1d() + theil_sen = TheilSenRegressor(**param, random_state=0) + with pytest.raises(ExceptionCls, match=match): + theil_sen.fit(X, y) + + +def test_checksubparams_n_subsamples_if_less_samples_than_features(): + random_state = np.random.RandomState(0) + n_samples, n_features = 10, 20 + X = random_state.normal(size=(n_samples, n_features)) + y = random_state.normal(size=n_samples) + theil_sen = TheilSenRegressor(n_subsamples=9, random_state=0) + with pytest.raises(ValueError): + theil_sen.fit(X, y) + + +def test_subpopulation(): + X, y, w, c = gen_toy_problem_4d() + theil_sen = TheilSenRegressor(max_subpopulation=250, random_state=0).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_subsamples(): + X, y, w, c = gen_toy_problem_4d() + theil_sen = TheilSenRegressor(n_subsamples=X.shape[0], random_state=0).fit(X, y) + lstq = LinearRegression().fit(X, y) + # Check for exact the same results as Least Squares + assert_array_almost_equal(theil_sen.coef_, lstq.coef_, 9) + + +def test_verbosity(): + X, y, w, c = gen_toy_problem_1d() + # Check that Theil-Sen can be verbose + with no_stdout_stderr(): + TheilSenRegressor(verbose=True, random_state=0).fit(X, y) + TheilSenRegressor(verbose=True, max_subpopulation=10, random_state=0).fit(X, y) + + +def test_theil_sen_parallel(): + X, y, w, c = gen_toy_problem_2d() + # Check that Least Squares fails + lstq = LinearRegression().fit(X, y) + assert norm(lstq.coef_ - w) > 1.0 + # Check that Theil-Sen works + theil_sen = TheilSenRegressor(n_jobs=2, random_state=0, max_subpopulation=2e3).fit( + X, y + 
) + assert_array_almost_equal(theil_sen.coef_, w, 1) + assert_array_almost_equal(theil_sen.intercept_, c, 1) + + +def test_less_samples_than_features(): + random_state = np.random.RandomState(0) + n_samples, n_features = 10, 20 + X = random_state.normal(size=(n_samples, n_features)) + y = random_state.normal(size=n_samples) + # Check that Theil-Sen falls back to Least Squares if fit_intercept=False + theil_sen = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y) + lstq = LinearRegression(fit_intercept=False).fit(X, y) + assert_array_almost_equal(theil_sen.coef_, lstq.coef_, 12) + # Check fit_intercept=True case. This will not be equal to the Least + # Squares solution since the intercept is calculated differently. + theil_sen = TheilSenRegressor(fit_intercept=True, random_state=0).fit(X, y) + y_pred = theil_sen.predict(X) + assert_array_almost_equal(y_pred, y, 12) + + +# TODO(1.8): Remove +def test_copy_X_deprecated(): + X, y, _, _ = gen_toy_problem_1d() + theil_sen = TheilSenRegressor(copy_X=True, random_state=0) + with pytest.warns(FutureWarning, match="`copy_X` was deprecated"): + theil_sen.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..349f7c1a4a7c41a053e3ae35228dc654dc6b63fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/__init__.py @@ -0,0 +1,22 @@ +"""Data embedding techniques.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._isomap import Isomap +from ._locally_linear import LocallyLinearEmbedding, locally_linear_embedding +from ._mds import MDS, smacof +from ._spectral_embedding import SpectralEmbedding, spectral_embedding +from ._t_sne import TSNE, trustworthiness + +__all__ = [ + "MDS", + "TSNE", + "Isomap", + "LocallyLinearEmbedding", + "SpectralEmbedding", + "locally_linear_embedding", + "smacof", + "spectral_embedding", + "trustworthiness", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_barnes_hut_tsne.pyx b/.venv/lib/python3.12/site-packages/sklearn/manifold/_barnes_hut_tsne.pyx new file mode 100644 index 0000000000000000000000000000000000000000..e84df4a9074b220d2a5dc01b203559d4a0945e6c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_barnes_hut_tsne.pyx @@ -0,0 +1,295 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# See http://homepage.tudelft.nl/19j49/t-SNE.html for reference +# implementations and papers describing the technique + + +import numpy as np +cimport numpy as cnp +from libc.stdio cimport printf +from libc.math cimport log +from libc.stdlib cimport malloc, free +from libc.time cimport clock, clock_t +from cython.parallel cimport prange, parallel + +from ..neighbors._quad_tree cimport _QuadTree + +cnp.import_array() + + +cdef char* EMPTY_STRING = "" + +# Smallest strictly positive value that can be represented by floating +# point numbers for different precision levels. This is useful to avoid +# taking the log of zero when computing the KL divergence. +cdef float FLOAT32_TINY = np.finfo(np.float32).tiny + +# Useful to void division by zero or divergence to +inf. 
+cdef float FLOAT64_EPS = np.finfo(np.float64).eps + +# This is effectively an ifdef statement in Cython +# It allows us to write printf debugging lines +# and remove them at compile time +cdef enum: + DEBUGFLAG = 0 + +cdef float compute_gradient(float[:] val_P, + float[:, :] pos_reference, + cnp.int64_t[:] neighbors, + cnp.int64_t[:] indptr, + float[:, :] tot_force, + _QuadTree qt, + float theta, + int dof, + long start, + bint compute_error, + int num_threads) noexcept nogil: + # Having created the tree, calculate the gradient + # in two components, the positive and negative forces + cdef: + long i, coord + int ax + long n_samples = pos_reference.shape[0] + int n_dimensions = qt.n_dimensions + clock_t t1 = 0, t2 = 0 + double sQ + float error + int take_timing = 1 if qt.verbose > 15 else 0 + + if qt.verbose > 11: + printf("[t-SNE] Allocating %li elements in force arrays\n", + n_samples * n_dimensions * 2) + cdef float* neg_f = malloc(sizeof(float) * n_samples * n_dimensions) + cdef float* pos_f = malloc(sizeof(float) * n_samples * n_dimensions) + + if take_timing: + t1 = clock() + sQ = compute_gradient_negative(pos_reference, neg_f, qt, dof, theta, start, + num_threads) + if take_timing: + t2 = clock() + printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1))) + + if take_timing: + t1 = clock() + error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr, + pos_f, n_dimensions, dof, sQ, start, + qt.verbose, compute_error, num_threads) + if take_timing: + t2 = clock() + printf("[t-SNE] Computing positive gradient: %e ticks\n", + ((float) (t2 - t1))) + for i in prange(start, n_samples, nogil=True, num_threads=num_threads, + schedule='static'): + for ax in range(n_dimensions): + coord = i * n_dimensions + ax + tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ) + + free(neg_f) + free(pos_f) + return error + + +cdef float compute_gradient_positive(float[:] val_P, + float[:, :] pos_reference, + cnp.int64_t[:] neighbors, + cnp.int64_t[:] indptr, + float* pos_f, + int n_dimensions, + int dof, + double sum_Q, + cnp.int64_t start, + int verbose, + bint compute_error, + int num_threads) noexcept nogil: + # Sum over the following expression for i not equal to j + # grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j) + # This is equivalent to compute_edge_forces in the authors' code + # It just goes over the nearest neighbors instead of all the data points + # (unlike the non-nearest neighbors version of `compute_gradient_positive') + cdef: + int ax + long i, j, k + long n_samples = indptr.shape[0] - 1 + float C = 0.0 + float dij, qij, pij + float exponent = (dof + 1.0) / 2.0 + float float_dof = (float) (dof) + float* buff + clock_t t1 = 0, t2 = 0 + float dt + + if verbose > 10: + t1 = clock() + + with nogil, parallel(num_threads=num_threads): + # Define private buffer variables + buff = malloc(sizeof(float) * n_dimensions) + + for i in prange(start, n_samples, schedule='static'): + # Init the gradient vector + for ax in range(n_dimensions): + pos_f[i * n_dimensions + ax] = 0.0 + # Compute the positive interaction for the nearest neighbors + for k in range(indptr[i], indptr[i+1]): + j = neighbors[k] + dij = 0.0 + pij = val_P[k] + for ax in range(n_dimensions): + buff[ax] = pos_reference[i, ax] - pos_reference[j, ax] + dij += buff[ax] * buff[ax] + qij = float_dof / (float_dof + dij) + if dof != 1: # i.e. 
exponent != 1 + qij = qij ** exponent + dij = pij * qij + + # only compute the error when needed + if compute_error: + qij = qij / sum_Q + C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY)) + for ax in range(n_dimensions): + pos_f[i * n_dimensions + ax] += dij * buff[ax] + + free(buff) + if verbose > 10: + t2 = clock() + dt = ((float) (t2 - t1)) + printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt) + return C + + +cdef double compute_gradient_negative(float[:, :] pos_reference, + float* neg_f, + _QuadTree qt, + int dof, + float theta, + long start, + int num_threads) noexcept nogil: + cdef: + int ax + int n_dimensions = qt.n_dimensions + int offset = n_dimensions + 2 + long i, j, idx + long n_samples = pos_reference.shape[0] + long n = n_samples - start + long dta = 0 + long dtb = 0 + float size, dist2s, mult + float exponent = (dof + 1.0) / 2.0 + float float_dof = (float) (dof) + double qijZ, sum_Q = 0.0 + float* force + float* neg_force + float* pos + clock_t t1 = 0, t2 = 0, t3 = 0 + int take_timing = 1 if qt.verbose > 20 else 0 + + with nogil, parallel(num_threads=num_threads): + # Define thread-local buffers + summary = malloc(sizeof(float) * n * offset) + pos = malloc(sizeof(float) * n_dimensions) + force = malloc(sizeof(float) * n_dimensions) + neg_force = malloc(sizeof(float) * n_dimensions) + + for i in prange(start, n_samples, schedule='static'): + # Clear the arrays + for ax in range(n_dimensions): + force[ax] = 0.0 + neg_force[ax] = 0.0 + pos[ax] = pos_reference[i, ax] + + # Find which nodes are summarizing and collect their centers of mass + # deltas, and sizes, into vectorized arrays + if take_timing: + t1 = clock() + idx = qt.summarize(pos, summary, theta*theta) + if take_timing: + t2 = clock() + # Compute the t-SNE negative force + # for the digits dataset, walking the tree + # is about 10-15x more expensive than the + # following for loop + for j in range(idx // offset): + + dist2s = summary[j * offset + n_dimensions] + size = summary[j * offset + n_dimensions + 1] + qijZ = float_dof / (float_dof + dist2s) # 1/(1+dist) + if dof != 1: # i.e. 
exponent != 1 + qijZ = qijZ ** exponent + + sum_Q += size * qijZ # size of the node * q + mult = size * qijZ * qijZ + for ax in range(n_dimensions): + neg_force[ax] += mult * summary[j * offset + ax] + if take_timing: + t3 = clock() + for ax in range(n_dimensions): + neg_f[i * n_dimensions + ax] = neg_force[ax] + if take_timing: + dta += t2 - t1 + dtb += t3 - t2 + free(pos) + free(force) + free(neg_force) + free(summary) + if take_timing: + printf("[t-SNE] Tree: %li clock ticks | ", dta) + printf("Force computation: %li clock ticks\n", dtb) + + # Put sum_Q to machine EPSILON to avoid divisions by 0 + sum_Q = max(sum_Q, FLOAT64_EPS) + return sum_Q + + +def gradient(float[:] val_P, + float[:, :] pos_output, + cnp.int64_t[:] neighbors, + cnp.int64_t[:] indptr, + float[:, :] forces, + float theta, + int n_dimensions, + int verbose, + int dof=1, + long skip_num_points=0, + bint compute_error=1, + int num_threads=1): + # This function is designed to be called from external Python + # it passes the 'forces' array by reference and fills that's array + # up in-place + cdef float C + cdef int n + n = pos_output.shape[0] + assert val_P.itemsize == 4 + assert pos_output.itemsize == 4 + assert forces.itemsize == 4 + m = "Forces array and pos_output shapes are incompatible" + assert n == forces.shape[0], m + m = "Pij and pos_output shapes are incompatible" + assert n == indptr.shape[0] - 1, m + if verbose > 10: + printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions) + cdef _QuadTree qt = _QuadTree(pos_output.shape[1], verbose) + if verbose > 10: + printf("[t-SNE] Inserting %li points\n", pos_output.shape[0]) + qt.build_tree(pos_output) + if verbose > 10: + # XXX: format hack to workaround lack of `const char *` type + # in the generated C code that triggers error with gcc 4.9 + # and -Werror=format-security + printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING) + + C = compute_gradient(val_P, pos_output, neighbors, indptr, forces, + qt, theta, dof, skip_num_points, compute_error, + num_threads) + + if verbose > 10: + # XXX: format hack to workaround lack of `const char *` type + # in the generated C code + # and -Werror=format-security + printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING) + m = "Tree consistency failed: unexpected number of points on the tree" + assert qt.cells[0].cumulative_size == qt.n_points, m + if not compute_error: + C = np.nan + return C diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py new file mode 100644 index 0000000000000000000000000000000000000000..90154470c18a486a250ea112cb31e57167d2eb43 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py @@ -0,0 +1,442 @@ +"""Isomap for manifold learning""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.sparse import issparse +from scipy.sparse.csgraph import connected_components, shortest_path + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import KernelPCA +from ..metrics.pairwise import _VALID_METRICS +from ..neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph +from ..preprocessing import KernelCenterer +from ..utils._param_validation import Interval, StrOptions +from ..utils.graph import _fix_connected_components +from ..utils.validation import 
check_is_fitted + + +class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Isomap Embedding. + + Non-linear dimensionality reduction through Isometric Mapping + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int or None, default=5 + Number of neighbors to consider for each point. If `n_neighbors` is an int, + then `radius` must be `None`. + + radius : float or None, default=None + Limiting distance of neighbors to return. If `radius` is a float, + then `n_neighbors` must be set to `None`. + + .. versionadded:: 1.1 + + n_components : int, default=2 + Number of coordinates for the manifold. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='auto' + 'auto' : Attempt to choose the most efficient solver + for the given problem. + + 'arpack' : Use Arnoldi decomposition to find the eigenvalues + and eigenvectors. + + 'dense' : Use a direct solver (i.e. LAPACK) + for the eigenvalue decomposition. + + tol : float, default=0 + Convergence tolerance passed to arpack or lobpcg. + not used if eigen_solver == 'dense'. + + max_iter : int, default=None + Maximum number of iterations for the arpack solver. + not used if eigen_solver == 'dense'. + + path_method : {'auto', 'FW', 'D'}, default='auto' + Method to use in finding shortest path. + + 'auto' : attempt to choose the best algorithm automatically. + + 'FW' : Floyd-Warshall algorithm. + + 'D' : Dijkstra's algorithm. + + neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \ + default='auto' + Algorithm to use for nearest neighbors search, + passed to neighbors.NearestNeighbors instance. + + n_jobs : int or None, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + metric : str, or callable, default="minkowski" + The metric to use when calculating distance between instances in a + feature array. If metric is a string or callable, it must be one of + the options allowed by :func:`sklearn.metrics.pairwise_distances` for + its metric parameter. + If metric is "precomputed", X is assumed to be a distance matrix and + must be square. X may be a :term:`Glossary `. + + .. versionadded:: 0.22 + + p : float, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + .. versionadded:: 0.22 + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 0.22 + + Attributes + ---------- + embedding_ : array-like, shape (n_samples, n_components) + Stores the embedding vectors. + + kernel_pca_ : object + :class:`~sklearn.decomposition.KernelPCA` object used to implement the + embedding. + + nbrs_ : sklearn.neighbors.NearestNeighbors instance + Stores nearest neighbors instance, including BallTree or KDtree + if applicable. + + dist_matrix_ : array-like, shape (n_samples, n_samples) + Stores the geodesic distance matrix of training data. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + sklearn.decomposition.PCA : Principal component analysis that is a linear + dimensionality reduction method. + sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using + kernels and PCA. + MDS : Manifold learning using multidimensional scaling. + TSNE : T-distributed Stochastic Neighbor Embedding. + LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. + SpectralEmbedding : Spectral embedding for non-linear dimensionality. + + References + ---------- + + .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric + framework for nonlinear dimensionality reduction. Science 290 (5500) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import Isomap + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = Isomap(n_components=2) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + """ + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "radius": [Interval(Real, 0, None, closed="both"), None], + "n_components": [Interval(Integral, 1, None, closed="left")], + "eigen_solver": [StrOptions({"auto", "arpack", "dense"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left"), None], + "path_method": [StrOptions({"auto", "FW", "D"})], + "neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], + "n_jobs": [Integral, None], + "p": [Interval(Real, 1, None, closed="left")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "metric_params": [dict, None], + } + + def __init__( + self, + *, + n_neighbors=5, + radius=None, + n_components=2, + eigen_solver="auto", + tol=0, + max_iter=None, + path_method="auto", + neighbors_algorithm="auto", + n_jobs=None, + metric="minkowski", + p=2, + metric_params=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.n_components = n_components + self.eigen_solver = eigen_solver + self.tol = tol + self.max_iter = max_iter + self.path_method = path_method + self.neighbors_algorithm = neighbors_algorithm + self.n_jobs = n_jobs + self.metric = metric + self.p = p + self.metric_params = metric_params + + def _fit_transform(self, X): + if self.n_neighbors is not None and self.radius is not None: + raise ValueError( + "Both n_neighbors and radius are provided. 
Use" + f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use" + " radius-based neighbors" + ) + + self.nbrs_ = NearestNeighbors( + n_neighbors=self.n_neighbors, + radius=self.radius, + algorithm=self.neighbors_algorithm, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + n_jobs=self.n_jobs, + ) + self.nbrs_.fit(X) + self.n_features_in_ = self.nbrs_.n_features_in_ + if hasattr(self.nbrs_, "feature_names_in_"): + self.feature_names_in_ = self.nbrs_.feature_names_in_ + + self.kernel_pca_ = KernelPCA( + n_components=self.n_components, + kernel="precomputed", + eigen_solver=self.eigen_solver, + tol=self.tol, + max_iter=self.max_iter, + n_jobs=self.n_jobs, + ).set_output(transform="default") + + if self.n_neighbors is not None: + nbg = kneighbors_graph( + self.nbrs_, + self.n_neighbors, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + mode="distance", + n_jobs=self.n_jobs, + ) + else: + nbg = radius_neighbors_graph( + self.nbrs_, + radius=self.radius, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + mode="distance", + n_jobs=self.n_jobs, + ) + + # Compute the number of connected components, and connect the different + # components to be able to compute a shortest path between all pairs + # of samples in the graph. + # Similar fix to cluster._agglomerative._fix_connectivity. + n_connected_components, labels = connected_components(nbg) + if n_connected_components > 1: + if self.metric == "precomputed" and issparse(X): + raise RuntimeError( + "The number of connected components of the neighbors graph" + f" is {n_connected_components} > 1. The graph cannot be " + "completed with metric='precomputed', and Isomap cannot be" + "fitted. Increase the number of neighbors to avoid this " + "issue, or precompute the full distance matrix instead " + "of passing a sparse neighbors graph." + ) + warnings.warn( + ( + "The number of connected components of the neighbors graph " + f"is {n_connected_components} > 1. Completing the graph to fit" + " Isomap might be slow. Increase the number of neighbors to " + "avoid this issue." + ), + stacklevel=2, + ) + + # use array validated by NearestNeighbors + nbg = _fix_connected_components( + X=self.nbrs_._fit_X, + graph=nbg, + n_connected_components=n_connected_components, + component_labels=labels, + mode="distance", + metric=self.nbrs_.effective_metric_, + **self.nbrs_.effective_metric_params_, + ) + + self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False) + + if self.nbrs_._fit_X.dtype == np.float32: + self.dist_matrix_ = self.dist_matrix_.astype( + self.nbrs_._fit_X.dtype, copy=False + ) + + G = self.dist_matrix_**2 + G *= -0.5 + + self.embedding_ = self.kernel_pca_.fit_transform(G) + self._n_features_out = self.embedding_.shape[1] + + def reconstruction_error(self): + """Compute the reconstruction error for the embedding. + + Returns + ------- + reconstruction_error : float + Reconstruction error. 
+ + Notes + ----- + The cost function of an isomap embedding is + + ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples`` + + Where D is the matrix of distances for the input data X, + D_fit is the matrix of distances for the output embedding X_fit, + and K is the isomap kernel: + + ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)`` + """ + G = -0.5 * self.dist_matrix_**2 + G_center = KernelCenterer().fit_transform(G) + evals = self.kernel_pca_.eigenvalues_ + return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0] + + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Compute the embedding vectors for data X. + + Parameters + ---------- + X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors} + Sample data, shape = (n_samples, n_features), in the form of a + numpy array, sparse matrix, precomputed tree, or NearestNeighbors + object. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns a fitted instance of self. + """ + self._fit_transform(X) + return self + + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None): + """Fit the model from data in X and transform X. + + Parameters + ---------- + X : {array-like, sparse matrix, BallTree, KDTree} + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + X transformed in the new space. + """ + self._fit_transform(X) + return self.embedding_ + + def transform(self, X): + """Transform X. + + This is implemented by linking the points X into the graph of geodesic + distances of the training data. First the `n_neighbors` nearest + neighbors of X are found in the training data, and from these the + shortest geodesic distances from each point in X to each point in + the training data are computed in order to construct the kernel. + The embedding of X is the projection of this kernel onto the + embedding vectors of the training set. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_queries, n_features) + If neighbors_algorithm='precomputed', X is assumed to be a + distance matrix or a sparse graph of shape + (n_queries, n_samples_fit). + + Returns + ------- + X_new : array-like, shape (n_queries, n_components) + X transformed in the new space. + """ + check_is_fitted(self) + if self.n_neighbors is not None: + distances, indices = self.nbrs_.kneighbors(X, return_distance=True) + else: + distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True) + + # Create the graph of shortest distances from X to + # training data via the nearest neighbors of X. + # This can be done as a single array operation, but it potentially + # takes a lot of memory. 
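        # Editor's note: the "single array operation" alluded to just above would be,
        # in the k-neighbors case where `indices` is a regular 2-D integer array
        # (sketch only, not used here because of its
        # (n_queries, n_neighbors, n_samples_fit) memory cost):
        # >> G_X = np.min(self.dist_matrix_[indices] + distances[:, :, None], axis=1)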
To avoid that, use a loop: + + n_samples_fit = self.nbrs_.n_samples_fit_ + n_queries = distances.shape[0] + + if hasattr(X, "dtype") and X.dtype == np.float32: + dtype = np.float32 + else: + dtype = np.float64 + + G_X = np.zeros((n_queries, n_samples_fit), dtype) + for i in range(n_queries): + G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0) + + G_X **= 2 + G_X *= -0.5 + + return self.kernel_pca_.transform(G_X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_locally_linear.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_locally_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3f456f7ca57e0a5ef4ba3aaf847475aacadfab --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_locally_linear.py @@ -0,0 +1,879 @@ +"""Locally Linear Embedding""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np +from scipy.linalg import eigh, qr, solve, svd +from scipy.sparse import csr_matrix, eye, lil_matrix +from scipy.sparse.linalg import eigsh + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, + _UnstableArchMixin, +) +from ..neighbors import NearestNeighbors +from ..utils import check_array, check_random_state +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import stable_cumsum +from ..utils.validation import FLOAT_DTYPES, check_is_fitted, validate_data + + +def barycenter_weights(X, Y, indices, reg=1e-3): + """Compute barycenter weights of X from Y along the first axis + + We estimate the weights to assign to each point in Y[indices] to recover + the point X[i]. The barycenter weights sum to 1. + + Parameters + ---------- + X : array-like, shape (n_samples, n_dim) + + Y : array-like, shape (n_samples, n_dim) + + indices : array-like, shape (n_samples, n_dim) + Indices of the points in Y used to compute the barycenter + + reg : float, default=1e-3 + Amount of regularization to add for the problem to be + well-posed in the case of n_neighbors > n_dim + + Returns + ------- + B : array-like, shape (n_samples, n_neighbors) + + Notes + ----- + See developers note for more information. + """ + X = check_array(X, dtype=FLOAT_DTYPES) + Y = check_array(Y, dtype=FLOAT_DTYPES) + indices = check_array(indices, dtype=int) + + n_samples, n_neighbors = indices.shape + assert X.shape[0] == n_samples + + B = np.empty((n_samples, n_neighbors), dtype=X.dtype) + v = np.ones(n_neighbors, dtype=X.dtype) + + # this might raise a LinalgError if G is singular and has trace + # zero + for i, ind in enumerate(indices): + A = Y[ind] + C = A - X[i] # broadcasting + G = np.dot(C, C.T) + trace = np.trace(G) + if trace > 0: + R = reg * trace + else: + R = reg + G.flat[:: n_neighbors + 1] += R + w = solve(G, v, assume_a="pos") + B[i, :] = w / np.sum(w) + return B + + +def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): + """Computes the barycenter weighted graph of k-Neighbors for points in X + + Parameters + ---------- + X : {array-like, NearestNeighbors} + Sample data, shape = (n_samples, n_features), in the form of a + numpy array or a NearestNeighbors object. 
+ + n_neighbors : int + Number of neighbors for each sample. + + reg : float, default=1e-3 + Amount of regularization when solving the least-squares + problem. Only relevant if mode='barycenter'. If None, use the + default. + + n_jobs : int or None, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A[i, j] is assigned the weight of edge that connects i to j. + + See Also + -------- + sklearn.neighbors.kneighbors_graph + sklearn.neighbors.radius_neighbors_graph + """ + knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X) + X = knn._fit_X + n_samples = knn.n_samples_fit_ + ind = knn.kneighbors(X, return_distance=False)[:, 1:] + data = barycenter_weights(X, X, ind, reg=reg) + indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors) + return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples)) + + +def null_space( + M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None +): + """ + Find the null space of a matrix M. + + Parameters + ---------- + M : {array, matrix, sparse matrix, LinearOperator} + Input covariance matrix: should be symmetric positive semi-definite + + k : int + Number of eigenvalues/vectors to return + + k_skip : int, default=1 + Number of low eigenvalues to skip. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack' + auto : algorithm will attempt to choose the best method for input data + arpack : use arnoldi iteration in shift-invert mode. + For this method, M may be a dense matrix, sparse matrix, + or general linear operator. + Warning: ARPACK can be unstable for some problems. It is + best to try several random seeds in order to check results. + dense : use standard dense matrix operations for the eigenvalue + decomposition. For this method, M must be an array + or matrix type. This method should be avoided for + large problems. + + tol : float, default=1e-6 + Tolerance for 'arpack' method. + Not used if eigen_solver=='dense'. + + max_iter : int, default=100 + Maximum number of iterations for 'arpack' method. + Not used if eigen_solver=='dense' + + random_state : int, RandomState instance, default=None + Determines the random number generator when ``solver`` == 'arpack'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + """ + if eigen_solver == "auto": + if M.shape[0] > 200 and k + k_skip < 10: + eigen_solver = "arpack" + else: + eigen_solver = "dense" + + if eigen_solver == "arpack": + v0 = _init_arpack_v0(M.shape[0], random_state) + try: + eigen_values, eigen_vectors = eigsh( + M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0 + ) + except RuntimeError as e: + raise ValueError( + "Error in determining null-space with ARPACK. Error message: " + "'%s'. Note that eigen_solver='arpack' can fail when the " + "weight matrix is singular or otherwise ill-behaved. In that " + "case, eigen_solver='dense' is recommended. See online " + "documentation for more information." 
% e + ) from e + + return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:]) + elif eigen_solver == "dense": + if hasattr(M, "toarray"): + M = M.toarray() + eigen_values, eigen_vectors = eigh( + M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True + ) + index = np.argsort(np.abs(eigen_values)) + return eigen_vectors[:, index], np.sum(eigen_values) + else: + raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver) + + +def _locally_linear_embedding( + X, + *, + n_neighbors, + n_components, + reg=1e-3, + eigen_solver="auto", + tol=1e-6, + max_iter=100, + method="standard", + hessian_tol=1e-4, + modified_tol=1e-12, + random_state=None, + n_jobs=None, +): + nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs) + nbrs.fit(X) + X = nbrs._fit_X + + N, d_in = X.shape + + if n_components > d_in: + raise ValueError( + "output dimension must be less than or equal to input dimension" + ) + if n_neighbors >= N: + raise ValueError( + "Expected n_neighbors < n_samples, but n_samples = %d, n_neighbors = %d" + % (N, n_neighbors) + ) + + M_sparse = eigen_solver != "dense" + M_container_constructor = lil_matrix if M_sparse else np.zeros + + if method == "standard": + W = barycenter_kneighbors_graph( + nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs + ) + + # we'll compute M = (I-W)'(I-W) + # depending on the solver, we'll do this differently + if M_sparse: + M = eye(*W.shape, format=W.format) - W + M = M.T @ M + else: + M = (W.T @ W - W.T - W).toarray() + M.flat[:: M.shape[0] + 1] += 1 # M = W' W - W' - W + I + + elif method == "hessian": + dp = n_components * (n_components + 1) // 2 + + if n_neighbors <= n_components + dp: + raise ValueError( + "for method='hessian', n_neighbors must be " + "greater than " + "[n_components * (n_components + 3) / 2]" + ) + + neighbors = nbrs.kneighbors( + X, n_neighbors=n_neighbors + 1, return_distance=False + ) + neighbors = neighbors[:, 1:] + + Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64) + Yi[:, 0] = 1 + + M = M_container_constructor((N, N), dtype=np.float64) + + use_svd = n_neighbors > d_in + + for i in range(N): + Gi = X[neighbors[i]] + Gi -= Gi.mean(0) + + # build Hessian estimator + if use_svd: + U = svd(Gi, full_matrices=0)[0] + else: + Ci = np.dot(Gi, Gi.T) + U = eigh(Ci)[1][:, ::-1] + + Yi[:, 1 : 1 + n_components] = U[:, :n_components] + + j = 1 + n_components + for k in range(n_components): + Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components] + j += n_components - k + + Q, R = qr(Yi) + + w = Q[:, n_components + 1 :] + S = w.sum(0) + + S[np.where(abs(S) < hessian_tol)] = 1 + w /= S + + nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) + M[nbrs_x, nbrs_y] += np.dot(w, w.T) + + elif method == "modified": + if n_neighbors < n_components: + raise ValueError("modified LLE requires n_neighbors >= n_components") + + neighbors = nbrs.kneighbors( + X, n_neighbors=n_neighbors + 1, return_distance=False + ) + neighbors = neighbors[:, 1:] + + # find the eigenvectors and eigenvalues of each local covariance + # matrix. 
We want V[i] to be a [n_neighbors x n_neighbors] matrix, + # where the columns are eigenvectors + V = np.zeros((N, n_neighbors, n_neighbors)) + nev = min(d_in, n_neighbors) + evals = np.zeros([N, nev]) + + # choose the most efficient way to find the eigenvectors + use_svd = n_neighbors > d_in + + if use_svd: + for i in range(N): + X_nbrs = X[neighbors[i]] - X[i] + V[i], evals[i], _ = svd(X_nbrs, full_matrices=True) + evals **= 2 + else: + for i in range(N): + X_nbrs = X[neighbors[i]] - X[i] + C_nbrs = np.dot(X_nbrs, X_nbrs.T) + evi, vi = eigh(C_nbrs) + evals[i] = evi[::-1] + V[i] = vi[:, ::-1] + + # find regularized weights: this is like normal LLE. + # because we've already computed the SVD of each covariance matrix, + # it's faster to use this rather than np.linalg.solve + reg = 1e-3 * evals.sum(1) + + tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors)) + tmp[:, :nev] /= evals + reg[:, None] + tmp[:, nev:] /= reg[:, None] + + w_reg = np.zeros((N, n_neighbors)) + for i in range(N): + w_reg[i] = np.dot(V[i], tmp[i]) + w_reg /= w_reg.sum(1)[:, None] + + # calculate eta: the median of the ratio of small to large eigenvalues + # across the points. This is used to determine s_i, below + rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1) + eta = np.median(rho) + + # find s_i, the size of the "almost null space" for each point: + # this is the size of the largest set of eigenvalues + # such that Sum[v; v in set]/Sum[v; v not in set] < eta + s_range = np.zeros(N, dtype=int) + evals_cumsum = stable_cumsum(evals, 1) + eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1 + for i in range(N): + s_range[i] = np.searchsorted(eta_range[i, ::-1], eta) + s_range += n_neighbors - nev # number of zero eigenvalues + + # Now calculate M. 
+ # This is the [N x N] matrix whose null space is the desired embedding + M = M_container_constructor((N, N), dtype=np.float64) + + for i in range(N): + s_i = s_range[i] + + # select bottom s_i eigenvectors and calculate alpha + Vi = V[i, :, n_neighbors - s_i :] + alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i) + + # compute Householder matrix which satisfies + # Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s) + # using prescription from paper + h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors)) + + norm_h = np.linalg.norm(h) + if norm_h < modified_tol: + h *= 0 + else: + h /= norm_h + + # Householder matrix is + # >> Hi = np.identity(s_i) - 2*np.outer(h,h) + # Then the weight matrix is + # >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None] + # We do this much more efficiently: + Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None] + + # Update M as follows: + # >> W_hat = np.zeros( (N,s_i) ) + # >> W_hat[neighbors[i],:] = Wi + # >> W_hat[i] -= 1 + # >> M += np.dot(W_hat,W_hat.T) + # We can do this much more efficiently: + nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) + M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T) + Wi_sum1 = Wi.sum(1) + M[i, neighbors[i]] -= Wi_sum1 + M[neighbors[i], [i]] -= Wi_sum1 + M[i, i] += s_i + + elif method == "ltsa": + neighbors = nbrs.kneighbors( + X, n_neighbors=n_neighbors + 1, return_distance=False + ) + neighbors = neighbors[:, 1:] + + M = M_container_constructor((N, N), dtype=np.float64) + + use_svd = n_neighbors > d_in + + for i in range(N): + Xi = X[neighbors[i]] + Xi -= Xi.mean(0) + + # compute n_components largest eigenvalues of Xi @ Xi^T + if use_svd: + v = svd(Xi, full_matrices=True)[0] + else: + Ci = np.dot(Xi, Xi.T) + v = eigh(Ci)[1][:, ::-1] + + Gi = np.zeros((n_neighbors, n_components + 1)) + Gi[:, 1:] = v[:, :n_components] + Gi[:, 0] = 1.0 / np.sqrt(n_neighbors) + + GiGiT = np.dot(Gi, Gi.T) + + nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i]) + M[nbrs_x, nbrs_y] -= GiGiT + + M[neighbors[i], neighbors[i]] += np.ones(shape=n_neighbors) + + if M_sparse: + M = M.tocsr() + + return null_space( + M, + n_components, + k_skip=1, + eigen_solver=eigen_solver, + tol=tol, + max_iter=max_iter, + random_state=random_state, + ) + + +@validate_params( + { + "X": ["array-like", NearestNeighbors], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "n_components": [Interval(Integral, 1, None, closed="left")], + "reg": [Interval(Real, 0, None, closed="left")], + "eigen_solver": [StrOptions({"auto", "arpack", "dense"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "method": [StrOptions({"standard", "hessian", "modified", "ltsa"})], + "hessian_tol": [Interval(Real, 0, None, closed="left")], + "modified_tol": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + "n_jobs": [None, Integral], + }, + prefer_skip_nested_validation=True, +) +def locally_linear_embedding( + X, + *, + n_neighbors, + n_components, + reg=1e-3, + eigen_solver="auto", + tol=1e-6, + max_iter=100, + method="standard", + hessian_tol=1e-4, + modified_tol=1e-12, + random_state=None, + n_jobs=None, +): + """Perform a Locally Linear Embedding analysis on the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, NearestNeighbors} + Sample data, shape = (n_samples, n_features), in the form of a + numpy array or a NearestNeighbors object. + + n_neighbors : int + Number of neighbors to consider for each point. 
+ + n_components : int + Number of coordinates for the manifold. + + reg : float, default=1e-3 + Regularization constant, multiplies the trace of the local covariance + matrix of the distances. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='auto' + auto : algorithm will attempt to choose the best method for input data + + arpack : use arnoldi iteration in shift-invert mode. + For this method, M may be a dense matrix, sparse matrix, + or general linear operator. + Warning: ARPACK can be unstable for some problems. It is + best to try several random seeds in order to check results. + + dense : use standard dense matrix operations for the eigenvalue + decomposition. For this method, M must be an array + or matrix type. This method should be avoided for + large problems. + + tol : float, default=1e-6 + Tolerance for 'arpack' method + Not used if eigen_solver=='dense'. + + max_iter : int, default=100 + Maximum number of iterations for the arpack solver. + + method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard' + standard : use the standard locally linear embedding algorithm. + see reference [1]_ + hessian : use the Hessian eigenmap method. This method requires + n_neighbors > n_components * (1 + (n_components + 1) / 2. + see reference [2]_ + modified : use the modified locally linear embedding algorithm. + see reference [3]_ + ltsa : use local tangent space alignment algorithm + see reference [4]_ + + hessian_tol : float, default=1e-4 + Tolerance for Hessian eigenmapping method. + Only used if method == 'hessian'. + + modified_tol : float, default=1e-12 + Tolerance for modified LLE method. + Only used if method == 'modified'. + + random_state : int, RandomState instance, default=None + Determines the random number generator when ``solver`` == 'arpack'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + Y : ndarray of shape (n_samples, n_components) + Embedding vectors. + + squared_error : float + Reconstruction error for the embedding vectors. Equivalent to + ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights. + + References + ---------- + + .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction + by locally linear embedding. Science 290:2323 (2000). + .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally + linear embedding techniques for high-dimensional data. + Proc Natl Acad Sci U S A. 100:5591 (2003). + .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear + Embedding Using Multiple Weights. + `_ + .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear + dimensionality reduction via tangent space alignment. + Journal of Shanghai Univ. 
8:406 (2004) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import locally_linear_embedding + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding, _ = locally_linear_embedding(X[:100],n_neighbors=5, n_components=2) + >>> embedding.shape + (100, 2) + """ + return _locally_linear_embedding( + X=X, + n_neighbors=n_neighbors, + n_components=n_components, + reg=reg, + eigen_solver=eigen_solver, + tol=tol, + max_iter=max_iter, + method=method, + hessian_tol=hessian_tol, + modified_tol=modified_tol, + random_state=random_state, + n_jobs=n_jobs, + ) + + +class LocallyLinearEmbedding( + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _UnstableArchMixin, + BaseEstimator, +): + """Locally Linear Embedding. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to consider for each point. + + n_components : int, default=2 + Number of coordinates for the manifold. + + reg : float, default=1e-3 + Regularization constant, multiplies the trace of the local covariance + matrix of the distances. + + eigen_solver : {'auto', 'arpack', 'dense'}, default='auto' + The solver used to compute the eigenvectors. The available options are: + + - `'auto'` : algorithm will attempt to choose the best method for input + data. + - `'arpack'` : use arnoldi iteration in shift-invert mode. For this + method, M may be a dense matrix, sparse matrix, or general linear + operator. + - `'dense'` : use standard dense matrix operations for the eigenvalue + decomposition. For this method, M must be an array or matrix type. + This method should be avoided for large problems. + + .. warning:: + ARPACK can be unstable for some problems. It is best to try several + random seeds in order to check results. + + tol : float, default=1e-6 + Tolerance for 'arpack' method + Not used if eigen_solver=='dense'. + + max_iter : int, default=100 + Maximum number of iterations for the arpack solver. + Not used if eigen_solver=='dense'. + + method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard' + - `standard`: use the standard locally linear embedding algorithm. see + reference [1]_ + - `hessian`: use the Hessian eigenmap method. This method requires + ``n_neighbors > n_components * (1 + (n_components + 1) / 2``. see + reference [2]_ + - `modified`: use the modified locally linear embedding algorithm. + see reference [3]_ + - `ltsa`: use local tangent space alignment algorithm. see + reference [4]_ + + hessian_tol : float, default=1e-4 + Tolerance for Hessian eigenmapping method. + Only used if ``method == 'hessian'``. + + modified_tol : float, default=1e-12 + Tolerance for modified LLE method. + Only used if ``method == 'modified'``. + + neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \ + default='auto' + Algorithm to use for nearest neighbors search, passed to + :class:`~sklearn.neighbors.NearestNeighbors` instance. + + random_state : int, RandomState instance, default=None + Determines the random number generator when + ``eigen_solver`` == 'arpack'. Pass an int for reproducible results + across multiple function calls. See :term:`Glossary `. + + n_jobs : int or None, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. 
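    As an editor's aside (not part of the upstream docstring), a compact sketch of
    selecting one of the variants listed under ``method`` above; the dataset is only
    illustrative:

    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import LocallyLinearEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2, method="hessian")
    >>> lle.fit_transform(X[:100]).shape
    (100, 2)

    Here ``n_neighbors=10`` satisfies the Hessian requirement
    ``n_neighbors > n_components * (n_components + 3) / 2 = 5``.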
+ + Attributes + ---------- + embedding_ : array-like, shape [n_samples, n_components] + Stores the embedding vectors + + reconstruction_error_ : float + Reconstruction error associated with `embedding_` + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + nbrs_ : NearestNeighbors object + Stores nearest neighbors instance, including BallTree or KDtree + if applicable. + + See Also + -------- + SpectralEmbedding : Spectral embedding for non-linear dimensionality + reduction. + TSNE : Distributed Stochastic Neighbor Embedding. + + References + ---------- + + .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction + by locally linear embedding. Science 290:2323 (2000). + .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally + linear embedding techniques for high-dimensional data. + Proc Natl Acad Sci U S A. 100:5591 (2003). + .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear + Embedding Using Multiple Weights. + `_ + .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear + dimensionality reduction via tangent space alignment. + Journal of Shanghai Univ. 8:406 (2004) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import LocallyLinearEmbedding + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = LocallyLinearEmbedding(n_components=2) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + """ + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "n_components": [Interval(Integral, 1, None, closed="left")], + "reg": [Interval(Real, 0, None, closed="left")], + "eigen_solver": [StrOptions({"auto", "arpack", "dense"})], + "tol": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "method": [StrOptions({"standard", "hessian", "modified", "ltsa"})], + "hessian_tol": [Interval(Real, 0, None, closed="left")], + "modified_tol": [Interval(Real, 0, None, closed="left")], + "neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})], + "random_state": ["random_state"], + "n_jobs": [None, Integral], + } + + def __init__( + self, + *, + n_neighbors=5, + n_components=2, + reg=1e-3, + eigen_solver="auto", + tol=1e-6, + max_iter=100, + method="standard", + hessian_tol=1e-4, + modified_tol=1e-12, + neighbors_algorithm="auto", + random_state=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.n_components = n_components + self.reg = reg + self.eigen_solver = eigen_solver + self.tol = tol + self.max_iter = max_iter + self.method = method + self.hessian_tol = hessian_tol + self.modified_tol = modified_tol + self.random_state = random_state + self.neighbors_algorithm = neighbors_algorithm + self.n_jobs = n_jobs + + def _fit_transform(self, X): + self.nbrs_ = NearestNeighbors( + n_neighbors=self.n_neighbors, + algorithm=self.neighbors_algorithm, + n_jobs=self.n_jobs, + ) + + random_state = check_random_state(self.random_state) + X = validate_data(self, X, dtype=float) + self.nbrs_.fit(X) + self.embedding_, self.reconstruction_error_ = _locally_linear_embedding( + X=self.nbrs_, + n_neighbors=self.n_neighbors, + n_components=self.n_components, + eigen_solver=self.eigen_solver, + tol=self.tol, + 
max_iter=self.max_iter, + method=self.method, + hessian_tol=self.hessian_tol, + modified_tol=self.modified_tol, + random_state=random_state, + reg=self.reg, + n_jobs=self.n_jobs, + ) + self._n_features_out = self.embedding_.shape[1] + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the embedding vectors for data X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted `LocallyLinearEmbedding` class instance. + """ + self._fit_transform(X) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Compute the embedding vectors for data X and transform X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + X_new : array-like, shape (n_samples, n_components) + Returns the instance itself. + """ + self._fit_transform(X) + return self.embedding_ + + def transform(self, X): + """ + Transform new points into embedding space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Returns the instance itself. + + Notes + ----- + Because of scaling performed by this method, it is discouraged to use + it together with methods that are not scale-invariant (like SVMs). + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False) + ind = self.nbrs_.kneighbors( + X, n_neighbors=self.n_neighbors, return_distance=False + ) + weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg) + X_new = np.empty((X.shape[0], self.n_components)) + for i in range(X.shape[0]): + X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i]) + return X_new diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py new file mode 100644 index 0000000000000000000000000000000000000000..6c31c72f7ef59e782be2476971e28b7f487dd644 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py @@ -0,0 +1,714 @@ +""" +Multi-dimensional Scaling (MDS). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs + +from ..base import BaseEstimator, _fit_context +from ..isotonic import IsotonicRegression +from ..metrics import euclidean_distances +from ..utils import check_array, check_random_state, check_symmetric +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.parallel import Parallel, delayed +from ..utils.validation import validate_data + + +def _smacof_single( + dissimilarities, + metric=True, + n_components=2, + init=None, + max_iter=300, + verbose=0, + eps=1e-6, + random_state=None, + normalized_stress=False, +): + """Computes multidimensional scaling using SMACOF algorithm. + + Parameters + ---------- + dissimilarities : ndarray of shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Must be symmetric. + + metric : bool, default=True + Compute metric or nonmetric SMACOF algorithm. + When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as + missing values. 
+ + n_components : int, default=2 + Number of dimensions in which to immerse the dissimilarities. If an + ``init`` array is provided, this option is overridden and the shape of + ``init`` is used to determine the dimensionality of the embedding + space. + + init : ndarray of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the algorithm. By + default, the algorithm is initialized with a randomly chosen array. + + max_iter : int, default=300 + Maximum number of iterations of the SMACOF algorithm for a single run. + + verbose : int, default=0 + Level of verbosity. + + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. + + .. versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + normalized_stress : bool, default=False + Whether to return normalized stress value (Stress-1) instead of raw + stress. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.7 + Normalized stress is now supported for metric MDS as well. + + Returns + ------- + X : ndarray of shape (n_samples, n_components) + Coordinates of the points in a ``n_components``-space. + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + If `normalized_stress=True`, returns Stress-1. + A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, + 0.1 fair, and 0.2 poor [1]_. + + n_iter : int + The number of iterations corresponding to the best stress. + + References + ---------- + .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. + Psychometrika, 29 (1964) + + .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + + .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. Springer Series in Statistics (1997) + """ + dissimilarities = check_symmetric(dissimilarities, raise_exception=True) + + n_samples = dissimilarities.shape[0] + random_state = check_random_state(random_state) + + dissimilarities_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel() + dissimilarities_flat_w = dissimilarities_flat[dissimilarities_flat != 0] + if init is None: + # Randomly choose initial configuration + X = random_state.uniform(size=n_samples * n_components) + X = X.reshape((n_samples, n_components)) + else: + # overrides the parameter p + n_components = init.shape[1] + if n_samples != init.shape[0]: + raise ValueError( + "init matrix should be of shape (%d, %d)" % (n_samples, n_components) + ) + X = init + distances = euclidean_distances(X) + + # Out of bounds condition cannot happen because we are transforming + # the training set here, but does sometimes get triggered in + # practice due to machine precision issues. Hence "clip". 
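    # Editor's note: a tiny, hypothetical illustration of the "clip" behaviour
    # (comment only, not executed; the numbers are made up):
    # >> IsotonicRegression(out_of_bounds="clip").fit([1., 2., 3.], [1., 2., 3.]).predict([0., 4.])
    # >> # -> array([1., 3.]): queries outside the fitted range are clipped to the boundary values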
+ ir = IsotonicRegression(out_of_bounds="clip") + + old_stress = None + for it in range(max_iter): + # Compute distance and monotonic regression + if metric: + disparities = dissimilarities + else: + distances_flat = distances.ravel() + # dissimilarities with 0 are considered as missing values + distances_flat_w = distances_flat[dissimilarities_flat != 0] + + # Compute the disparities using isotonic regression. + # For the first SMACOF iteration, use scaled original dissimilarities. + # (This choice follows the R implementation described in this paper: + # https://www.jstatsoft.org/article/view/v102i10) + if it < 1: + disparities_flat = dissimilarities_flat_w + else: + disparities_flat = ir.fit_transform( + dissimilarities_flat_w, distances_flat_w + ) + disparities = np.zeros_like(distances_flat) + disparities[dissimilarities_flat != 0] = disparities_flat + disparities = disparities.reshape((n_samples, n_samples)) + disparities *= np.sqrt( + (n_samples * (n_samples - 1) / 2) / (disparities**2).sum() + ) + disparities = disparities + disparities.T + + # Update X using the Guttman transform + distances[distances == 0] = 1e-5 + ratio = disparities / distances + B = -ratio + B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1) + X = 1.0 / n_samples * np.dot(B, X) + + # Compute stress + distances = euclidean_distances(X) + stress = ((distances.ravel() - disparities.ravel()) ** 2).sum() / 2 + + if verbose >= 2: # pragma: no cover + print(f"Iteration {it}, stress {stress:.4f}") + if old_stress is not None: + sum_squared_distances = (distances.ravel() ** 2).sum() + if ((old_stress - stress) / (sum_squared_distances / 2)) < eps: + if verbose: # pragma: no cover + print("Convergence criterion reached.") + break + old_stress = stress + + if normalized_stress: + sum_squared_distances = (distances.ravel() ** 2).sum() + stress = np.sqrt(stress / (sum_squared_distances / 2)) + + return X, stress, it + 1 + + +# TODO(1.9): change default `n_init` to 1, see PR #31117 +@validate_params( + { + "dissimilarities": ["array-like"], + "metric": ["boolean"], + "n_components": [Interval(Integral, 1, None, closed="left")], + "init": ["array-like", None], + "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})], + "n_jobs": [Integral, None], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "verbose": ["verbose"], + "eps": [Interval(Real, 0, None, closed="left")], + "random_state": ["random_state"], + "return_n_iter": ["boolean"], + "normalized_stress": ["boolean", StrOptions({"auto"})], + }, + prefer_skip_nested_validation=True, +) +def smacof( + dissimilarities, + *, + metric=True, + n_components=2, + init=None, + n_init="warn", + n_jobs=None, + max_iter=300, + verbose=0, + eps=1e-6, + random_state=None, + return_n_iter=False, + normalized_stress="auto", +): + """Compute multidimensional scaling using the SMACOF algorithm. + + The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a + multidimensional scaling algorithm which minimizes an objective function + (the *stress*) using a majorization technique. Stress majorization, also + known as the Guttman Transform, guarantees a monotone convergence of + stress, and is more powerful than traditional techniques such as gradient + descent. + + The SMACOF algorithm for metric MDS can be summarized by the following + steps: + + 1. Set an initial start configuration, randomly or not. + 2. Compute the stress + 3. Compute the Guttman Transform + 4. Iterate 2 and 3 until convergence. 
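    As an editor's illustration of step 3 (one Guttman-transform update written in
    plain NumPy, mirroring the update inside ``_smacof_single``; the data and variable
    names are made up):

    >>> import numpy as np
    >>> from sklearn.metrics import euclidean_distances
    >>> rng = np.random.RandomState(0)
    >>> disparities = euclidean_distances(rng.rand(4, 3))  # stand-in for the disparities
    >>> X = rng.rand(4, 2)                                  # current configuration
    >>> distances = euclidean_distances(X)
    >>> distances[distances == 0] = 1e-5
    >>> ratio = disparities / distances
    >>> B = -ratio
    >>> B[np.arange(4), np.arange(4)] += ratio.sum(axis=1)
    >>> (np.dot(B, X) / 4).shape                            # updated configuration, n_samples = 4
    (4, 2)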
+ + The nonmetric algorithm adds a monotonic regression step before computing + the stress. + + Parameters + ---------- + dissimilarities : array-like of shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Must be symmetric. + + metric : bool, default=True + Compute metric or nonmetric SMACOF algorithm. + When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as + missing values. + + n_components : int, default=2 + Number of dimensions in which to immerse the dissimilarities. If an + ``init`` array is provided, this option is overridden and the shape of + ``init`` is used to determine the dimensionality of the embedding + space. + + init : array-like of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the algorithm. By + default, the algorithm is initialized with a randomly chosen array. + + n_init : int, default=8 + Number of times the SMACOF algorithm will be run with different + initializations. The final results will be the best output of the runs, + determined by the run with the smallest final stress. If ``init`` is + provided, this option is overridden and a single run is performed. + + .. versionchanged:: 1.9 + The default value for `n_iter` will change from 8 to 1 in version 1.9. + + n_jobs : int, default=None + The number of jobs to use for the computation. If multiple + initializations are used (``n_init``), each run of the algorithm is + computed in parallel. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + max_iter : int, default=300 + Maximum number of iterations of the SMACOF algorithm for a single run. + + verbose : int, default=0 + Level of verbosity. + + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. + + .. versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + return_n_iter : bool, default=False + Whether or not to return the number of iterations. + + normalized_stress : bool or "auto", default="auto" + Whether to return normalized stress value (Stress-1) instead of raw + stress. By default, metric MDS returns raw stress while non-metric MDS + returns normalized stress. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.4 + The default value changed from `False` to `"auto"` in version 1.4. + + .. versionchanged:: 1.7 + Normalized stress is now supported for metric MDS as well. + + Returns + ------- + X : ndarray of shape (n_samples, n_components) + Coordinates of the points in a ``n_components``-space. + + stress : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + If `normalized_stress=True`, returns Stress-1. + A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, + 0.1 fair, and 0.2 poor [1]_. + + n_iter : int + The number of iterations corresponding to the best stress. Returned + only if ``return_n_iter`` is set to ``True``. + + References + ---------- + .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. 
+ Psychometrika, 29 (1964) + + .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + + .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. Springer Series in Statistics (1997) + + Examples + -------- + >>> import numpy as np + >>> from sklearn.manifold import smacof + >>> from sklearn.metrics import euclidean_distances + >>> X = np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]]) + >>> dissimilarities = euclidean_distances(X) + >>> Z, stress = smacof( + ... dissimilarities, n_components=2, n_init=1, eps=1e-6, random_state=42 + ... ) + >>> Z.shape + (3, 2) + >>> np.round(stress, 6).item() + 3.2e-05 + """ + + if n_init == "warn": + warnings.warn( + "The default value of `n_init` will change from 8 to 1 in 1.9.", + FutureWarning, + ) + n_init = 8 + + dissimilarities = check_array(dissimilarities) + random_state = check_random_state(random_state) + + if normalized_stress == "auto": + normalized_stress = not metric + + if hasattr(init, "__array__"): + init = np.asarray(init).copy() + if not n_init == 1: + warnings.warn( + "Explicit initial positions passed: " + "performing only one init of the MDS instead of %d" % n_init + ) + n_init = 1 + + best_pos, best_stress = None, None + + if effective_n_jobs(n_jobs) == 1: + for it in range(n_init): + pos, stress, n_iter_ = _smacof_single( + dissimilarities, + metric=metric, + n_components=n_components, + init=init, + max_iter=max_iter, + verbose=verbose, + eps=eps, + random_state=random_state, + normalized_stress=normalized_stress, + ) + if best_stress is None or stress < best_stress: + best_stress = stress + best_pos = pos.copy() + best_iter = n_iter_ + else: + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))( + delayed(_smacof_single)( + dissimilarities, + metric=metric, + n_components=n_components, + init=init, + max_iter=max_iter, + verbose=verbose, + eps=eps, + random_state=seed, + normalized_stress=normalized_stress, + ) + for seed in seeds + ) + positions, stress, n_iters = zip(*results) + best = np.argmin(stress) + best_stress = stress[best] + best_pos = positions[best] + best_iter = n_iters[best] + + if return_n_iter: + return best_pos, best_stress, best_iter + else: + return best_pos, best_stress + + +# TODO(1.9): change default `n_init` to 1, see PR #31117 +class MDS(BaseEstimator): + """Multidimensional scaling. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Number of dimensions in which to immerse the dissimilarities. + + metric : bool, default=True + If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. + When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as + missing values. + + n_init : int, default=4 + Number of times the SMACOF algorithm will be run with different + initializations. The final results will be the best output of the runs, + determined by the run with the smallest final stress. + + .. versionchanged:: 1.9 + The default value for `n_init` will change from 4 to 1 in version 1.9. + + max_iter : int, default=300 + Maximum number of iterations of the SMACOF algorithm for a single run. + + verbose : int, default=0 + Level of verbosity. + + eps : float, default=1e-6 + The tolerance with respect to stress (normalized by the sum of squared + embedding distances) at which to declare convergence. + + .. 
versionchanged:: 1.7 + The default value for `eps` has changed from 1e-3 to 1e-6, as a result + of a bugfix in the computation of the convergence criterion. + + n_jobs : int, default=None + The number of jobs to use for the computation. If multiple + initializations are used (``n_init``), each run of the algorithm is + computed in parallel. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator used to initialize the centers. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + dissimilarity : {'euclidean', 'precomputed'}, default='euclidean' + Dissimilarity measure to use: + + - 'euclidean': + Pairwise Euclidean distances between points in the dataset. + + - 'precomputed': + Pre-computed dissimilarities are passed directly to ``fit`` and + ``fit_transform``. + + normalized_stress : bool or "auto" default="auto" + Whether to return normalized stress value (Stress-1) instead of raw + stress. By default, metric MDS returns raw stress while non-metric MDS + returns normalized stress. + + .. versionadded:: 1.2 + + .. versionchanged:: 1.4 + The default value changed from `False` to `"auto"` in version 1.4. + + .. versionchanged:: 1.7 + Normalized stress is now supported for metric MDS as well. + + Attributes + ---------- + embedding_ : ndarray of shape (n_samples, n_components) + Stores the position of the dataset in the embedding space. + + stress_ : float + The final value of the stress (sum of squared distance of the + disparities and the distances for all constrained points). + If `normalized_stress=True`, returns Stress-1. + A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, + 0.1 fair, and 0.2 poor [1]_. + + dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples) + Pairwise dissimilarities between the points. Symmetric matrix that: + + - either uses a custom dissimilarity matrix by setting `dissimilarity` + to 'precomputed'; + - or constructs a dissimilarity matrix from data using + Euclidean distances. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of iterations corresponding to the best stress. + + See Also + -------- + sklearn.decomposition.PCA : Principal component analysis that is a linear + dimensionality reduction method. + sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using + kernels and PCA. + TSNE : T-distributed Stochastic Neighbor Embedding. + Isomap : Manifold learning based on Isometric Mapping. + LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. + SpectralEmbedding : Spectral embedding for non-linear dimensionality. + + References + ---------- + .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. + Psychometrika, 29 (1964) + + .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric + hypothesis" Kruskal, J. Psychometrika, 29, (1964) + + .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; + Groenen P. 
Springer Series in Statistics (1997) + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import MDS + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = MDS(n_components=2, n_init=1) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + + For a more detailed example of usage, see + :ref:`sphx_glr_auto_examples_manifold_plot_mds.py`. + + For a comparison of manifold learning techniques, see + :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py`. + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "metric": ["boolean"], + "n_init": [Interval(Integral, 1, None, closed="left"), StrOptions({"warn"})], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "verbose": ["verbose"], + "eps": [Interval(Real, 0.0, None, closed="left")], + "n_jobs": [None, Integral], + "random_state": ["random_state"], + "dissimilarity": [StrOptions({"euclidean", "precomputed"})], + "normalized_stress": ["boolean", StrOptions({"auto"})], + } + + def __init__( + self, + n_components=2, + *, + metric=True, + n_init="warn", + max_iter=300, + verbose=0, + eps=1e-6, + n_jobs=None, + random_state=None, + dissimilarity="euclidean", + normalized_stress="auto", + ): + self.n_components = n_components + self.dissimilarity = dissimilarity + self.metric = metric + self.n_init = n_init + self.max_iter = max_iter + self.eps = eps + self.verbose = verbose + self.n_jobs = n_jobs + self.random_state = random_state + self.normalized_stress = normalized_stress + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.dissimilarity == "precomputed" + return tags + + def fit(self, X, y=None, init=None): + """ + Compute the position of the points in the embedding space. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Input data. If ``dissimilarity=='precomputed'``, the input should + be the dissimilarity matrix. + + y : Ignored + Not used, present for API consistency by convention. + + init : ndarray of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the SMACOF + algorithm. By default, the algorithm is initialized with a randomly + chosen array. + + Returns + ------- + self : object + Fitted estimator. + """ + self.fit_transform(X, init=init) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None, init=None): + """ + Fit the data from `X`, and returns the embedded coordinates. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) + Input data. If ``dissimilarity=='precomputed'``, the input should + be the dissimilarity matrix. + + y : Ignored + Not used, present for API consistency by convention. + + init : ndarray of shape (n_samples, n_components), default=None + Starting configuration of the embedding to initialize the SMACOF + algorithm. By default, the algorithm is initialized with a randomly + chosen array. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + X transformed in the new space. 
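        A minimal sketch (editor's illustration, not part of the upstream docstring) of
        the ``dissimilarity='precomputed'`` path mentioned above; the data are made up:

        >>> import numpy as np
        >>> from sklearn.metrics import euclidean_distances
        >>> from sklearn.manifold import MDS
        >>> D = euclidean_distances(np.array([[0., 1.], [1., 0.], [2., 2.]]))
        >>> mds = MDS(n_components=2, dissimilarity="precomputed", n_init=1, random_state=0)
        >>> mds.fit_transform(D).shape
        (3, 2)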
+ """ + + if self.n_init == "warn": + warnings.warn( + "The default value of `n_init` will change from 4 to 1 in 1.9.", + FutureWarning, + ) + self._n_init = 4 + else: + self._n_init = self.n_init + + X = validate_data(self, X) + if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": + warnings.warn( + "The MDS API has changed. ``fit`` now constructs a" + " dissimilarity matrix from data. To use a custom " + "dissimilarity matrix, set " + "``dissimilarity='precomputed'``." + ) + + if self.dissimilarity == "precomputed": + self.dissimilarity_matrix_ = X + elif self.dissimilarity == "euclidean": + self.dissimilarity_matrix_ = euclidean_distances(X) + + self.embedding_, self.stress_, self.n_iter_ = smacof( + self.dissimilarity_matrix_, + metric=self.metric, + n_components=self.n_components, + init=init, + n_init=self._n_init, + n_jobs=self.n_jobs, + max_iter=self.max_iter, + verbose=self.verbose, + eps=self.eps, + random_state=self.random_state, + return_n_iter=True, + normalized_stress=self.normalized_stress, + ) + + return self.embedding_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_spectral_embedding.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_spectral_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..1a3b95e023897567bd49cc5c0e969a240a1e2afd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_spectral_embedding.py @@ -0,0 +1,776 @@ +"""Spectral Embedding.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import sparse +from scipy.linalg import eigh +from scipy.sparse.csgraph import connected_components +from scipy.sparse.linalg import eigsh, lobpcg + +from ..base import BaseEstimator, _fit_context +from ..metrics.pairwise import rbf_kernel +from ..neighbors import NearestNeighbors, kneighbors_graph +from ..utils import ( + check_array, + check_random_state, + check_symmetric, +) +from ..utils._arpack import _init_arpack_v0 +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import _deterministic_vector_sign_flip +from ..utils.fixes import laplacian as csgraph_laplacian +from ..utils.fixes import parse_version, sp_version +from ..utils.validation import validate_data + + +def _graph_connected_component(graph, node_id): + """Find the largest graph connected components that contains one + given node. + + Parameters + ---------- + graph : array-like of shape (n_samples, n_samples) + Adjacency matrix of the graph, non-zero weight means an edge + between the nodes. + + node_id : int + The index of the query node of the graph. + + Returns + ------- + connected_components_matrix : array-like of shape (n_samples,) + An array of bool value indicating the indexes of the nodes + belonging to the largest connected components of the given query + node. 
+ """ + n_node = graph.shape[0] + if sparse.issparse(graph): + # speed up row-wise access to boolean connection mask + graph = graph.tocsr() + connected_nodes = np.zeros(n_node, dtype=bool) + nodes_to_explore = np.zeros(n_node, dtype=bool) + nodes_to_explore[node_id] = True + for _ in range(n_node): + last_num_component = connected_nodes.sum() + np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes) + if last_num_component >= connected_nodes.sum(): + break + indices = np.where(nodes_to_explore)[0] + nodes_to_explore.fill(False) + for i in indices: + if sparse.issparse(graph): + # scipy not yet implemented 1D sparse slices; can be changed back to + # `neighbors = graph[i].toarray().ravel()` once implemented + neighbors = graph[[i], :].toarray().ravel() + else: + neighbors = graph[i] + np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore) + return connected_nodes + + +def _graph_is_connected(graph): + """Return whether the graph is connected (True) or Not (False). + + Parameters + ---------- + graph : {array-like, sparse matrix} of shape (n_samples, n_samples) + Adjacency matrix of the graph, non-zero weight means an edge + between the nodes. + + Returns + ------- + is_connected : bool + True means the graph is fully connected and False means not. + """ + if sparse.issparse(graph): + # Before Scipy 1.11.3, `connected_components` only supports 32-bit indices. + # PR: https://github.com/scipy/scipy/pull/18913 + # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279 + # TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use + # `accept_large_sparse=True`. + accept_large_sparse = sp_version >= parse_version("1.11.3") + graph = check_array( + graph, accept_sparse=True, accept_large_sparse=accept_large_sparse + ) + # sparse graph, find all the connected components + n_connected_components, _ = connected_components(graph) + return n_connected_components == 1 + else: + # dense graph, find all connected components start from node 0 + return _graph_connected_component(graph, 0).sum() == graph.shape[0] + + +def _set_diag(laplacian, value, norm_laplacian): + """Set the diagonal of the laplacian matrix and convert it to a + sparse format well suited for eigenvalue decomposition. + + Parameters + ---------- + laplacian : {ndarray, sparse matrix} + The graph laplacian. + + value : float + The value of the diagonal. + + norm_laplacian : bool + Whether the value of the diagonal should be changed or not. + + Returns + ------- + laplacian : {array, sparse matrix} + An array of matrix in a form that is well suited to fast + eigenvalue decomposition, depending on the band width of the + matrix. 
+ """ + n_nodes = laplacian.shape[0] + # We need all entries in the diagonal to values + if not sparse.issparse(laplacian): + if norm_laplacian: + laplacian.flat[:: n_nodes + 1] = value + else: + laplacian = laplacian.tocoo() + if norm_laplacian: + diag_idx = laplacian.row == laplacian.col + laplacian.data[diag_idx] = value + # If the matrix has a small number of diagonals (as in the + # case of structured matrices coming from images), the + # dia format might be best suited for matvec products: + n_diags = np.unique(laplacian.row - laplacian.col).size + if n_diags <= 7: + # 3 or less outer diagonals on each side + laplacian = laplacian.todia() + else: + # csr has the fastest matvec and is thus best suited to + # arpack + laplacian = laplacian.tocsr() + return laplacian + + +@validate_params( + { + "adjacency": ["array-like", "sparse matrix"], + "n_components": [Interval(Integral, 1, None, closed="left")], + "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None], + "random_state": ["random_state"], + "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})], + "norm_laplacian": ["boolean"], + "drop_first": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def spectral_embedding( + adjacency, + *, + n_components=8, + eigen_solver=None, + random_state=None, + eigen_tol="auto", + norm_laplacian=True, + drop_first=True, +): + """Project the sample on the first eigenvectors of the graph Laplacian. + + The adjacency matrix is used to compute a normalized graph Laplacian + whose spectrum (especially the eigenvectors associated to the + smallest eigenvalues) has an interpretation in terms of minimal + number of cuts necessary to split the graph into comparably sized + components. + + This embedding can also 'work' even if the ``adjacency`` variable is + not strictly the adjacency matrix of a graph but more generally + an affinity or similarity matrix between samples (for instance the + heat kernel of a euclidean distance matrix or a k-NN matrix). + + However care must taken to always make the affinity matrix symmetric + so that the eigenvector decomposition works as expected. + + Note : Laplacian Eigenmaps is the actual algorithm implemented here. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + adjacency : {array-like, sparse graph} of shape (n_samples, n_samples) + The adjacency matrix of the graph to embed. + + n_components : int, default=8 + The dimension of the projection subspace. + + eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems, + but may also lead to instabilities. If None, then ``'arpack'`` is + used. + + random_state : int, RandomState instance or None, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + eigen_tol : float, default="auto" + Stopping criterion for eigendecomposition of the Laplacian matrix. 
+ If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead + to convergence issues and should be avoided. + + .. versionadded:: 1.2 + Added 'auto' option. + + norm_laplacian : bool, default=True + If True, then compute symmetric normalized Laplacian. + + drop_first : bool, default=True + Whether to drop the first eigenvector. For spectral embedding, this + should be True as the first eigenvector should be constant vector for + connected graph, but for spectral clustering, this should be kept as + False to retain the first eigenvector. + + Returns + ------- + embedding : ndarray of shape (n_samples, n_components) + The reduced samples. + + Notes + ----- + Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph + has one connected component. If there graph has many components, the first + few eigenvectors will simply uncover the connected components of the graph. + + References + ---------- + * https://en.wikipedia.org/wiki/LOBPCG + + * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal + Block Preconditioned Conjugate Gradient Method", + Andrew V. Knyazev + <10.1137/S1064827500366124>` + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.neighbors import kneighbors_graph + >>> from sklearn.manifold import spectral_embedding + >>> X, _ = load_digits(return_X_y=True) + >>> X = X[:100] + >>> affinity_matrix = kneighbors_graph( + ... X, n_neighbors=int(X.shape[0] / 10), include_self=True + ... ) + >>> # make the matrix symmetric + >>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T) + >>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42) + >>> embedding.shape + (100, 2) + """ + random_state = check_random_state(random_state) + + return _spectral_embedding( + adjacency, + n_components=n_components, + eigen_solver=eigen_solver, + random_state=random_state, + eigen_tol=eigen_tol, + norm_laplacian=norm_laplacian, + drop_first=drop_first, + ) + + +def _spectral_embedding( + adjacency, + *, + n_components=8, + eigen_solver=None, + random_state=None, + eigen_tol="auto", + norm_laplacian=True, + drop_first=True, +): + adjacency = check_symmetric(adjacency) + + if eigen_solver == "amg": + try: + from pyamg import smoothed_aggregation_solver + except ImportError as e: + raise ValueError( + "The eigen_solver was set to 'amg', but pyamg is not available." + ) from e + + if eigen_solver is None: + eigen_solver = "arpack" + + n_nodes = adjacency.shape[0] + # Whether to drop the first eigenvector + if drop_first: + n_components = n_components + 1 + + if not _graph_is_connected(adjacency): + warnings.warn( + "Graph is not fully connected, spectral embedding may not work as expected." 
+ ) + + laplacian, dd = csgraph_laplacian( + adjacency, normed=norm_laplacian, return_diag=True + ) + if eigen_solver == "arpack" or ( + eigen_solver != "lobpcg" + and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components) + ): + # lobpcg used with eigen_solver='amg' has bugs for low number of nodes + # for details see the source code in scipy: + # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen + # /lobpcg/lobpcg.py#L237 + # or matlab: + # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m + laplacian = _set_diag(laplacian, 1, norm_laplacian) + + # Here we'll use shift-invert mode for fast eigenvalues + # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html + # for a short explanation of what this means) + # Because the normalized Laplacian has eigenvalues between 0 and 2, + # I - L has eigenvalues between -1 and 1. ARPACK is most efficient + # when finding eigenvalues of largest magnitude (keyword which='LM') + # and when these eigenvalues are very large compared to the rest. + # For very large, very sparse graphs, I - L can have many, many + # eigenvalues very near 1.0. This leads to slow convergence. So + # instead, we'll use ARPACK's shift-invert mode, asking for the + # eigenvalues near 1.0. This effectively spreads-out the spectrum + # near 1.0 and leads to much faster convergence: potentially an + # orders-of-magnitude speedup over simply using keyword which='LA' + # in standard mode. + try: + # We are computing the opposite of the laplacian inplace so as + # to spare a memory allocation of a possibly very large array + tol = 0 if eigen_tol == "auto" else eigen_tol + laplacian *= -1 + v0 = _init_arpack_v0(laplacian.shape[0], random_state) + laplacian = check_array( + laplacian, accept_sparse="csr", accept_large_sparse=False + ) + _, diffusion_map = eigsh( + laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0 + ) + embedding = diffusion_map.T[n_components::-1] + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + except RuntimeError: + # When submatrices are exactly singular, an LU decomposition + # in arpack fails. We fallback to lobpcg + eigen_solver = "lobpcg" + # Revert the laplacian to its opposite to have lobpcg work + laplacian *= -1 + + elif eigen_solver == "amg": + # Use AMG to get a preconditioner and speed up the eigenvalue + # problem. + if not sparse.issparse(laplacian): + warnings.warn("AMG works better for sparse matrices") + laplacian = check_array( + laplacian, dtype=[np.float64, np.float32], accept_sparse=True + ) + laplacian = _set_diag(laplacian, 1, norm_laplacian) + + # The Laplacian matrix is always singular, having at least one zero + # eigenvalue, corresponding to the trivial eigenvector, which is a + # constant. Using a singular matrix for preconditioning may result in + # random failures in LOBPCG and is not supported by the existing + # theory: + # see https://doi.org/10.1007/s10208-015-9297-1 + # Shift the Laplacian so its diagononal is not all ones. The shift + # does change the eigenpairs however, so we'll feed the shifted + # matrix to the solver and afterward set it back to the original. + diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) + laplacian += diag_shift + if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array): + # `pyamg` does not work with `csr_array` and we need to convert it to a + # `csr_matrix` object. 
+ laplacian = sparse.csr_matrix(laplacian) + ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr")) + laplacian -= diag_shift + + M = ml.aspreconditioner() + # Create initial approximation X to eigenvectors + X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1)) + X[:, 0] = dd.ravel() + X = X.astype(laplacian.dtype) + + tol = None if eigen_tol == "auto" else eigen_tol + _, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False) + embedding = diffusion_map.T + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + if embedding.shape[0] == 1: + raise ValueError + + if eigen_solver == "lobpcg": + laplacian = check_array( + laplacian, dtype=[np.float64, np.float32], accept_sparse=True + ) + if n_nodes < 5 * n_components + 1: + # see note above under arpack why lobpcg has problems with small + # number of nodes + # lobpcg will fallback to eigh, so we short circuit it + if sparse.issparse(laplacian): + laplacian = laplacian.toarray() + _, diffusion_map = eigh(laplacian, check_finite=False) + embedding = diffusion_map.T[:n_components] + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + else: + laplacian = _set_diag(laplacian, 1, norm_laplacian) + # We increase the number of eigenvectors requested, as lobpcg + # doesn't behave well in low dimension and create initial + # approximation X to eigenvectors + X = random_state.standard_normal( + size=(laplacian.shape[0], n_components + 1) + ) + X[:, 0] = dd.ravel() + X = X.astype(laplacian.dtype) + tol = None if eigen_tol == "auto" else eigen_tol + _, diffusion_map = lobpcg( + laplacian, X, tol=tol, largest=False, maxiter=2000 + ) + embedding = diffusion_map.T[:n_components] + if norm_laplacian: + # recover u = D^-1/2 x from the eigenvector output x + embedding = embedding / dd + if embedding.shape[0] == 1: + raise ValueError + + embedding = _deterministic_vector_sign_flip(embedding) + if drop_first: + return embedding[1:n_components].T + else: + return embedding[:n_components].T + + +class SpectralEmbedding(BaseEstimator): + """Spectral embedding for non-linear dimensionality reduction. + + Forms an affinity matrix given by the specified function and + applies spectral decomposition to the corresponding graph laplacian. + The resulting transformation is given by the value of the + eigenvectors for each data point. + + Note : Laplacian Eigenmaps is the actual algorithm implemented here. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + The dimension of the projected subspace. + + affinity : {'nearest_neighbors', 'rbf', 'precomputed', \ + 'precomputed_nearest_neighbors'} or callable, \ + default='nearest_neighbors' + How to construct the affinity matrix. + - 'nearest_neighbors' : construct the affinity matrix by computing a + graph of nearest neighbors. + - 'rbf' : construct the affinity matrix by computing a radial basis + function (RBF) kernel. + - 'precomputed' : interpret ``X`` as a precomputed affinity matrix. + - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph + of precomputed nearest neighbors, and constructs the affinity matrix + by selecting the ``n_neighbors`` nearest neighbors. + - callable : use passed in function as affinity + the function takes in data matrix (n_samples, n_features) + and return affinity matrix (n_samples, n_samples). + + gamma : float, default=None + Kernel coefficient for rbf kernel. 
If None, gamma will be set to + 1/n_features. + + random_state : int, RandomState instance or None, default=None + A pseudo random number generator used for the initialization + of the lobpcg eigen vectors decomposition when `eigen_solver == + 'amg'`, and for the K-Means initialization. Use an int to make + the results deterministic across calls (See + :term:`Glossary `). + + .. note:: + When using `eigen_solver == 'amg'`, + it is necessary to also fix the global numpy seed with + `np.random.seed(int)` to get deterministic results. See + https://github.com/pyamg/pyamg/issues/139 for further + information. + + eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None + The eigenvalue decomposition strategy to use. AMG requires pyamg + to be installed. It can be faster on very large, sparse problems. + If None, then ``'arpack'`` is used. + + eigen_tol : float, default="auto" + Stopping criterion for eigendecomposition of the Laplacian matrix. + If `eigen_tol="auto"` then the passed tolerance will depend on the + `eigen_solver`: + + - If `eigen_solver="arpack"`, then `eigen_tol=0.0`; + - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then + `eigen_tol=None` which configures the underlying `lobpcg` solver to + automatically resolve the value according to their heuristics. See, + :func:`scipy.sparse.linalg.lobpcg` for details. + + Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"` + values of `tol<1e-5` may lead to convergence issues and should be + avoided. + + .. versionadded:: 1.2 + + n_neighbors : int, default=None + Number of nearest neighbors for nearest_neighbors graph building. + If None, n_neighbors will be set to max(n_samples/10, 1). + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + embedding_ : ndarray of shape (n_samples, n_components) + Spectral embedding of the training matrix. + + affinity_matrix_ : ndarray of shape (n_samples, n_samples) + Affinity_matrix constructed from samples or precomputed. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_neighbors_ : int + Number of nearest neighbors effectively used. + + See Also + -------- + Isomap : Non-linear dimensionality reduction through Isometric Mapping. + + References + ---------- + + - :doi:`A Tutorial on Spectral Clustering, 2007 + Ulrike von Luxburg + <10.1007/s11222-007-9033-z>` + + - `On Spectral Clustering: Analysis and an algorithm, 2001 + Andrew Y. Ng, Michael I. 
Jordan, Yair Weiss + `_ + + - :doi:`Normalized cuts and image segmentation, 2000 + Jianbo Shi, Jitendra Malik + <10.1109/34.868688>` + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.manifold import SpectralEmbedding + >>> X, _ = load_digits(return_X_y=True) + >>> X.shape + (1797, 64) + >>> embedding = SpectralEmbedding(n_components=2) + >>> X_transformed = embedding.fit_transform(X[:100]) + >>> X_transformed.shape + (100, 2) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "affinity": [ + StrOptions( + { + "nearest_neighbors", + "rbf", + "precomputed", + "precomputed_nearest_neighbors", + }, + ), + callable, + ], + "gamma": [Interval(Real, 0, None, closed="left"), None], + "random_state": ["random_state"], + "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None], + "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})], + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "n_jobs": [None, Integral], + } + + def __init__( + self, + n_components=2, + *, + affinity="nearest_neighbors", + gamma=None, + random_state=None, + eigen_solver=None, + eigen_tol="auto", + n_neighbors=None, + n_jobs=None, + ): + self.n_components = n_components + self.affinity = affinity + self.gamma = gamma + self.random_state = random_state + self.eigen_solver = eigen_solver + self.eigen_tol = eigen_tol + self.n_neighbors = n_neighbors + self.n_jobs = n_jobs + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.pairwise = self.affinity in [ + "precomputed", + "precomputed_nearest_neighbors", + ] + return tags + + def _get_affinity_matrix(self, X, Y=None): + """Calculate the affinity matrix from data + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + If affinity is "precomputed" + X : array-like of shape (n_samples, n_samples), + Interpret X as precomputed adjacency graph computed from + samples. 
+ + Y: Ignored + + Returns + ------- + affinity_matrix of shape (n_samples, n_samples) + """ + if self.affinity == "precomputed": + self.affinity_matrix_ = X + return self.affinity_matrix_ + if self.affinity == "precomputed_nearest_neighbors": + estimator = NearestNeighbors( + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed" + ).fit(X) + connectivity = estimator.kneighbors_graph(X=X, mode="connectivity") + self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T) + return self.affinity_matrix_ + if self.affinity == "nearest_neighbors": + if sparse.issparse(X): + warnings.warn( + "Nearest neighbors affinity currently does " + "not support sparse input, falling back to " + "rbf affinity" + ) + self.affinity = "rbf" + else: + self.n_neighbors_ = ( + self.n_neighbors + if self.n_neighbors is not None + else max(int(X.shape[0] / 10), 1) + ) + self.affinity_matrix_ = kneighbors_graph( + X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs + ) + # currently only symmetric affinity_matrix supported + self.affinity_matrix_ = 0.5 * ( + self.affinity_matrix_ + self.affinity_matrix_.T + ) + return self.affinity_matrix_ + if self.affinity == "rbf": + self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1] + self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_) + return self.affinity_matrix_ + self.affinity_matrix_ = self.affinity(X) + return self.affinity_matrix_ + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model from data in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + If affinity is "precomputed" + X : {array-like, sparse matrix}, shape (n_samples, n_samples), + Interpret X as precomputed adjacency graph computed from + samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, accept_sparse="csr", ensure_min_samples=2) + + random_state = check_random_state(self.random_state) + + affinity_matrix = self._get_affinity_matrix(X) + self.embedding_ = _spectral_embedding( + affinity_matrix, + n_components=self.n_components, + eigen_solver=self.eigen_solver, + eigen_tol=self.eigen_tol, + random_state=random_state, + ) + return self + + def fit_transform(self, X, y=None): + """Fit the model from data in X and transform X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples + and `n_features` is the number of features. + + If affinity is "precomputed" + X : {array-like, sparse matrix} of shape (n_samples, n_samples), + Interpret X as precomputed adjacency graph computed from + samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : array-like of shape (n_samples, n_components) + Spectral embedding of the training matrix. 
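+
+        Examples
+        --------
+        A minimal illustrative sketch with a precomputed affinity matrix
+        (this assumes ``affinity='precomputed'`` was set on the estimator;
+        the data below are arbitrary):
+
+        >>> import numpy as np
+        >>> from sklearn.manifold import SpectralEmbedding
+        >>> from sklearn.metrics.pairwise import rbf_kernel
+        >>> rng = np.random.RandomState(0)
+        >>> X = rng.rand(20, 5)
+        >>> affinity = rbf_kernel(X)
+        >>> se = SpectralEmbedding(n_components=2, affinity='precomputed',
+        ...                        random_state=0)
+        >>> se.fit_transform(affinity).shape
+        (20, 2)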
+ """ + self.fit(X) + return self.embedding_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py new file mode 100644 index 0000000000000000000000000000000000000000..51882a5b38abdec7b60c26c1794dafedeef4f666 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py @@ -0,0 +1,1184 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# This is the exact and Barnes-Hut t-SNE implementation. There are other +# modifications of the algorithm: +# * Fast Optimization for t-SNE: +# https://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf + +from numbers import Integral, Real +from time import time + +import numpy as np +from scipy import linalg +from scipy.sparse import csr_matrix, issparse +from scipy.spatial.distance import pdist, squareform + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import PCA +from ..metrics.pairwise import _VALID_METRICS, pairwise_distances +from ..neighbors import NearestNeighbors +from ..utils import check_random_state +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.validation import _num_samples, check_non_negative, validate_data + +# mypy error: Module 'sklearn.manifold' has no attribute '_utils' +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from . import _barnes_hut_tsne, _utils # type: ignore[attr-defined] + +MACHINE_EPSILON = np.finfo(np.double).eps + + +def _joint_probabilities(distances, desired_perplexity, verbose): + """Compute joint probabilities p_ij from distances. + + Parameters + ---------- + distances : ndarray of shape (n_samples * (n_samples-1) / 2,) + Distances of samples are stored as condensed matrices, i.e. + we omit the diagonal and duplicate entries and store everything + in a one-dimensional array. + + desired_perplexity : float + Desired perplexity of the joint probability distributions. + + verbose : int + Verbosity level. + + Returns + ------- + P : ndarray of shape (n_samples * (n_samples-1) / 2,) + Condensed joint probability matrix. + """ + # Compute conditional probabilities such that they approximately match + # the desired perplexity + distances = distances.astype(np.float32, copy=False) + conditional_P = _utils._binary_search_perplexity( + distances, desired_perplexity, verbose + ) + P = conditional_P + conditional_P.T + sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) + P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) + return P + + +def _joint_probabilities_nn(distances, desired_perplexity, verbose): + """Compute joint probabilities p_ij from distances using just nearest + neighbors. + + This method is approximately equal to _joint_probabilities. The latter + is O(N), but limiting the joint probability to nearest neighbors improves + this substantially to O(uN). + + Parameters + ---------- + distances : sparse matrix of shape (n_samples, n_samples) + Distances of samples to its n_neighbors nearest neighbors. All other + distances are left to zero (and are not materialized in memory). + Matrix should be of CSR format. + + desired_perplexity : float + Desired perplexity of the joint probability distributions. + + verbose : int + Verbosity level. 
+ + Returns + ------- + P : sparse matrix of shape (n_samples, n_samples) + Condensed joint probability matrix with only nearest neighbors. Matrix + will be of CSR format. + """ + t0 = time() + # Compute conditional probabilities such that they approximately match + # the desired perplexity + distances.sort_indices() + n_samples = distances.shape[0] + distances_data = distances.data.reshape(n_samples, -1) + distances_data = distances_data.astype(np.float32, copy=False) + conditional_P = _utils._binary_search_perplexity( + distances_data, desired_perplexity, verbose + ) + assert np.all(np.isfinite(conditional_P)), "All probabilities should be finite" + + # Symmetrize the joint probability distribution using sparse operations + P = csr_matrix( + (conditional_P.ravel(), distances.indices, distances.indptr), + shape=(n_samples, n_samples), + ) + P = P + P.T + + # Normalize the joint probability distribution + sum_P = np.maximum(P.sum(), MACHINE_EPSILON) + P /= sum_P + + assert np.all(np.abs(P.data) <= 1.0) + if verbose >= 2: + duration = time() - t0 + print("[t-SNE] Computed conditional probabilities in {:.3f}s".format(duration)) + return P + + +def _kl_divergence( + params, + P, + degrees_of_freedom, + n_samples, + n_components, + skip_num_points=0, + compute_error=True, +): + """t-SNE objective function: gradient of the KL divergence + of p_ijs and q_ijs and the absolute error. + + Parameters + ---------- + params : ndarray of shape (n_params,) + Unraveled embedding. + + P : ndarray of shape (n_samples * (n_samples-1) / 2,) + Condensed joint probability matrix. + + degrees_of_freedom : int + Degrees of freedom of the Student's-t distribution. + + n_samples : int + Number of samples. + + n_components : int + Dimension of the embedded space. + + skip_num_points : int, default=0 + This does not compute the gradient for points with indices below + `skip_num_points`. This is useful when computing transforms of new + data where you'd like to keep the old data fixed. + + compute_error: bool, default=True + If False, the kl_divergence is not computed and returns NaN. + + Returns + ------- + kl_divergence : float + Kullback-Leibler divergence of p_ij and q_ij. + + grad : ndarray of shape (n_params,) + Unraveled gradient of the Kullback-Leibler divergence with respect to + the embedding. + """ + X_embedded = params.reshape(n_samples, n_components) + + # Q is a heavy-tailed distribution: Student's t-distribution + dist = pdist(X_embedded, "sqeuclidean") + dist /= degrees_of_freedom + dist += 1.0 + dist **= (degrees_of_freedom + 1.0) / -2.0 + Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON) + + # Optimization trick below: np.dot(x, y) is faster than + # np.sum(x * y) because it calls BLAS + + # Objective: C (Kullback-Leibler divergence of P and Q) + if compute_error: + kl_divergence = 2.0 * np.dot(P, np.log(np.maximum(P, MACHINE_EPSILON) / Q)) + else: + kl_divergence = np.nan + + # Gradient: dC/dY + # pdist always returns double precision distances. 
Thus we need to take + grad = np.ndarray((n_samples, n_components), dtype=params.dtype) + PQd = squareform((P - Q) * dist) + for i in range(skip_num_points, n_samples): + grad[i] = np.dot(np.ravel(PQd[i], order="K"), X_embedded[i] - X_embedded) + grad = grad.ravel() + c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom + grad *= c + + return kl_divergence, grad + + +def _kl_divergence_bh( + params, + P, + degrees_of_freedom, + n_samples, + n_components, + angle=0.5, + skip_num_points=0, + verbose=False, + compute_error=True, + num_threads=1, +): + """t-SNE objective function: KL divergence of p_ijs and q_ijs. + + Uses Barnes-Hut tree methods to calculate the gradient that + runs in O(NlogN) instead of O(N^2). + + Parameters + ---------- + params : ndarray of shape (n_params,) + Unraveled embedding. + + P : sparse matrix of shape (n_samples, n_sample) + Sparse approximate joint probability matrix, computed only for the + k nearest-neighbors and symmetrized. Matrix should be of CSR format. + + degrees_of_freedom : int + Degrees of freedom of the Student's-t distribution. + + n_samples : int + Number of samples. + + n_components : int + Dimension of the embedded space. + + angle : float, default=0.5 + This is the trade-off between speed and accuracy for Barnes-Hut T-SNE. + 'angle' is the angular size (referred to as theta in [3]) of a distant + node as measured from a point. If this size is below 'angle' then it is + used as a summary node of all points contained within it. + This method is not very sensitive to changes in this parameter + in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing + computation time and angle greater 0.8 has quickly increasing error. + + skip_num_points : int, default=0 + This does not compute the gradient for points with indices below + `skip_num_points`. This is useful when computing transforms of new + data where you'd like to keep the old data fixed. + + verbose : int, default=False + Verbosity level. + + compute_error: bool, default=True + If False, the kl_divergence is not computed and returns NaN. + + num_threads : int, default=1 + Number of threads used to compute the gradient. This is set here to + avoid calling _openmp_effective_n_threads for each gradient step. + + Returns + ------- + kl_divergence : float + Kullback-Leibler divergence of p_ij and q_ij. + + grad : ndarray of shape (n_params,) + Unraveled gradient of the Kullback-Leibler divergence with respect to + the embedding. + """ + params = params.astype(np.float32, copy=False) + X_embedded = params.reshape(n_samples, n_components) + + val_P = P.data.astype(np.float32, copy=False) + neighbors = P.indices.astype(np.int64, copy=False) + indptr = P.indptr.astype(np.int64, copy=False) + + grad = np.zeros(X_embedded.shape, dtype=np.float32) + error = _barnes_hut_tsne.gradient( + val_P, + X_embedded, + neighbors, + indptr, + grad, + angle, + n_components, + verbose, + dof=degrees_of_freedom, + compute_error=compute_error, + num_threads=num_threads, + ) + c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom + grad = grad.ravel() + grad *= c + + return error, grad + + +def _gradient_descent( + objective, + p0, + it, + max_iter, + n_iter_check=1, + n_iter_without_progress=300, + momentum=0.8, + learning_rate=200.0, + min_gain=0.01, + min_grad_norm=1e-7, + verbose=0, + args=None, + kwargs=None, +): + """Batch gradient descent with momentum and individual gains. 
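+
+    In outline, each iteration applies the following per-parameter update
+    (a descriptive sketch of the loop body below, written with the argument
+    names of this function)::
+
+        inc = update * grad < 0.0       # descent direction agrees with last update
+        gains[inc] += 0.2               # increase the gain for these coordinates
+        gains[~inc] *= 0.8              # decrease it for the others
+        np.clip(gains, min_gain, np.inf, out=gains)
+        grad *= gains
+        update = momentum * update - learning_rate * grad
+        p += update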
+ + Parameters + ---------- + objective : callable + Should return a tuple of cost and gradient for a given parameter + vector. When expensive to compute, the cost can optionally + be None and can be computed every n_iter_check steps using + the objective_error function. + + p0 : array-like of shape (n_params,) + Initial parameter vector. + + it : int + Current number of iterations (this function will be called more than + once during the optimization). + + max_iter : int + Maximum number of gradient descent iterations. + + n_iter_check : int, default=1 + Number of iterations before evaluating the global error. If the error + is sufficiently low, we abort the optimization. + + n_iter_without_progress : int, default=300 + Maximum number of iterations without progress before we abort the + optimization. + + momentum : float within (0.0, 1.0), default=0.8 + The momentum generates a weight for previous gradients that decays + exponentially. + + learning_rate : float, default=200.0 + The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If + the learning rate is too high, the data may look like a 'ball' with any + point approximately equidistant from its nearest neighbours. If the + learning rate is too low, most points may look compressed in a dense + cloud with few outliers. + + min_gain : float, default=0.01 + Minimum individual gain for each parameter. + + min_grad_norm : float, default=1e-7 + If the gradient norm is below this threshold, the optimization will + be aborted. + + verbose : int, default=0 + Verbosity level. + + args : sequence, default=None + Arguments to pass to objective function. + + kwargs : dict, default=None + Keyword arguments to pass to objective function. + + Returns + ------- + p : ndarray of shape (n_params,) + Optimum parameters. + + error : float + Optimum. + + i : int + Last iteration. + """ + if args is None: + args = [] + if kwargs is None: + kwargs = {} + + p = p0.copy().ravel() + update = np.zeros_like(p) + gains = np.ones_like(p) + error = np.finfo(float).max + best_error = np.finfo(float).max + best_iter = i = it + + tic = time() + for i in range(it, max_iter): + check_convergence = (i + 1) % n_iter_check == 0 + # only compute the error when needed + kwargs["compute_error"] = check_convergence or i == max_iter - 1 + + error, grad = objective(p, *args, **kwargs) + + inc = update * grad < 0.0 + dec = np.invert(inc) + gains[inc] += 0.2 + gains[dec] *= 0.8 + np.clip(gains, min_gain, np.inf, out=gains) + grad *= gains + update = momentum * update - learning_rate * grad + p += update + + if check_convergence: + toc = time() + duration = toc - tic + tic = toc + grad_norm = linalg.norm(grad) + + if verbose >= 2: + print( + "[t-SNE] Iteration %d: error = %.7f," + " gradient norm = %.7f" + " (%s iterations in %0.3fs)" + % (i + 1, error, grad_norm, n_iter_check, duration) + ) + + if error < best_error: + best_error = error + best_iter = i + elif i - best_iter > n_iter_without_progress: + if verbose >= 2: + print( + "[t-SNE] Iteration %d: did not make any progress " + "during the last %d episodes. Finished." + % (i + 1, n_iter_without_progress) + ) + break + if grad_norm <= min_grad_norm: + if verbose >= 2: + print( + "[t-SNE] Iteration %d: gradient norm %f. Finished." 
+ % (i + 1, grad_norm) + ) + break + + return p, error, i + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "X_embedded": ["array-like", "sparse matrix"], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + }, + prefer_skip_nested_validation=True, +) +def trustworthiness(X, X_embedded, *, n_neighbors=5, metric="euclidean"): + r"""Indicate to what extent the local structure is retained. + + The trustworthiness is within [0, 1]. It is defined as + + .. math:: + + T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1} + \sum_{j \in \mathcal{N}_{i}^{k}} \max(0, (r(i, j) - k)) + + where for each sample i, :math:`\mathcal{N}_{i}^{k}` are its k nearest + neighbors in the output space, and every sample j is its :math:`r(i, j)`-th + nearest neighbor in the input space. In other words, any unexpected nearest + neighbors in the output space are penalised in proportion to their rank in + the input space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + If the metric is 'precomputed' X must be a square distance + matrix. Otherwise it contains a sample per row. + + X_embedded : {array-like, sparse matrix} of shape (n_samples, n_components) + Embedding of the training data in low-dimensional space. + + n_neighbors : int, default=5 + The number of neighbors that will be considered. Should be fewer than + `n_samples / 2` to ensure the trustworthiness to lies within [0, 1], as + mentioned in [1]_. An error will be raised otherwise. + + metric : str or callable, default='euclidean' + Which metric to use for computing pairwise distances between samples + from the original input space. If metric is 'precomputed', X must be a + matrix of pairwise distances or squared distances. Otherwise, for a list + of available metrics, see the documentation of argument metric in + `sklearn.pairwise.pairwise_distances` and metrics listed in + `sklearn.metrics.pairwise.PAIRWISE_DISTANCE_FUNCTIONS`. Note that the + "cosine" metric uses :func:`~sklearn.metrics.pairwise.cosine_distances`. + + .. versionadded:: 0.20 + + Returns + ------- + trustworthiness : float + Trustworthiness of the low-dimensional embedding. + + References + ---------- + .. [1] Jarkko Venna and Samuel Kaski. 2001. Neighborhood + Preservation in Nonlinear Projection Methods: An Experimental Study. + In Proceedings of the International Conference on Artificial Neural Networks + (ICANN '01). Springer-Verlag, Berlin, Heidelberg, 485-491. + + .. [2] Laurens van der Maaten. Learning a Parametric Embedding by Preserving + Local Structure. Proceedings of the Twelfth International Conference on + Artificial Intelligence and Statistics, PMLR 5:384-391, 2009. 
+ + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> from sklearn.decomposition import PCA + >>> from sklearn.manifold import trustworthiness + >>> X, _ = make_blobs(n_samples=100, n_features=10, centers=3, random_state=42) + >>> X_embedded = PCA(n_components=2).fit_transform(X) + >>> print(f"{trustworthiness(X, X_embedded, n_neighbors=5):.2f}") + 0.92 + """ + n_samples = _num_samples(X) + if n_neighbors >= n_samples / 2: + raise ValueError( + f"n_neighbors ({n_neighbors}) should be less than n_samples / 2" + f" ({n_samples / 2})" + ) + dist_X = pairwise_distances(X, metric=metric) + if metric == "precomputed": + dist_X = dist_X.copy() + # we set the diagonal to np.inf to exclude the points themselves from + # their own neighborhood + np.fill_diagonal(dist_X, np.inf) + ind_X = np.argsort(dist_X, axis=1) + # `ind_X[i]` is the index of sorted distances between i and other samples + ind_X_embedded = ( + NearestNeighbors(n_neighbors=n_neighbors) + .fit(X_embedded) + .kneighbors(return_distance=False) + ) + + # We build an inverted index of neighbors in the input space: For sample i, + # we define `inverted_index[i]` as the inverted index of sorted distances: + # inverted_index[i][ind_X[i]] = np.arange(1, n_sample + 1) + inverted_index = np.zeros((n_samples, n_samples), dtype=int) + ordered_indices = np.arange(n_samples + 1) + inverted_index[ordered_indices[:-1, np.newaxis], ind_X] = ordered_indices[1:] + ranks = ( + inverted_index[ordered_indices[:-1, np.newaxis], ind_X_embedded] - n_neighbors + ) + t = np.sum(ranks[ranks > 0]) + t = 1.0 - t * ( + 2.0 / (n_samples * n_neighbors * (2.0 * n_samples - 3.0 * n_neighbors - 1.0)) + ) + return t + + +class TSNE(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """T-distributed Stochastic Neighbor Embedding. + + t-SNE [1] is a tool to visualize high-dimensional data. It converts + similarities between data points to joint probabilities and tries + to minimize the Kullback-Leibler divergence between the joint + probabilities of the low-dimensional embedding and the + high-dimensional data. t-SNE has a cost function that is not convex, + i.e. with different initializations we can get different results. + + It is highly recommended to use another dimensionality reduction + method (e.g. PCA for dense data or TruncatedSVD for sparse data) + to reduce the number of dimensions to a reasonable amount (e.g. 50) + if the number of features is very high. This will suppress some + noise and speed up the computation of pairwise distances between + samples. For more tips see Laurens van der Maaten's FAQ [2]. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=2 + Dimension of the embedded space. + + perplexity : float, default=30.0 + The perplexity is related to the number of nearest neighbors that + is used in other manifold learning algorithms. Larger datasets + usually require a larger perplexity. Consider selecting a value + between 5 and 50. Different values can result in significantly + different results. The perplexity must be less than the number + of samples. + + early_exaggeration : float, default=12.0 + Controls how tight natural clusters in the original space are in + the embedded space and how much space will be between them. For + larger values, the space between natural clusters will be larger + in the embedded space. Again, the choice of this parameter is not + very critical. 
If the cost function increases during initial + optimization, the early exaggeration factor or the learning rate + might be too high. + + learning_rate : float or "auto", default="auto" + The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If + the learning rate is too high, the data may look like a 'ball' with any + point approximately equidistant from its nearest neighbours. If the + learning rate is too low, most points may look compressed in a dense + cloud with few outliers. If the cost function gets stuck in a bad local + minimum increasing the learning rate may help. + Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE, + etc.) use a definition of learning_rate that is 4 times smaller than + ours. So our learning_rate=200 corresponds to learning_rate=800 in + those other implementations. The 'auto' option sets the learning_rate + to `max(N / early_exaggeration / 4, 50)` where N is the sample size, + following [4] and [5]. + + .. versionchanged:: 1.2 + The default value changed to `"auto"`. + + max_iter : int, default=1000 + Maximum number of iterations for the optimization. Should be at + least 250. + + .. versionchanged:: 1.5 + Parameter name changed from `n_iter` to `max_iter`. + + n_iter_without_progress : int, default=300 + Maximum number of iterations without progress before we abort the + optimization, used after 250 initial iterations with early + exaggeration. Note that progress is only checked every 50 iterations so + this value is rounded to the next multiple of 50. + + .. versionadded:: 0.17 + parameter *n_iter_without_progress* to control stopping criteria. + + min_grad_norm : float, default=1e-7 + If the gradient norm is below this threshold, the optimization will + be stopped. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by scipy.spatial.distance.pdist for its metric parameter, or + a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. The default is "euclidean" which is + interpreted as squared euclidean distance. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + .. versionadded:: 1.1 + + init : {"random", "pca"} or ndarray of shape (n_samples, n_components), \ + default="pca" + Initialization of embedding. + PCA initialization cannot be used with precomputed distances and is + usually more globally stable than random initialization. + + .. versionchanged:: 1.2 + The default value changed to `"pca"`. + + verbose : int, default=0 + Verbosity level. + + random_state : int, RandomState instance or None, default=None + Determines the random number generator. Pass an int for reproducible + results across multiple function calls. Note that different + initializations might result in different local minima of the cost + function. See :term:`Glossary `. + + method : {'barnes_hut', 'exact'}, default='barnes_hut' + By default the gradient calculation algorithm uses Barnes-Hut + approximation running in O(NlogN) time. method='exact' + will run on the slower, but exact, algorithm in O(N^2) time. 
The + exact algorithm should be used when nearest-neighbor errors need + to be better than 3%. However, the exact method cannot scale to + millions of examples. + + .. versionadded:: 0.17 + Approximate optimization *method* via the Barnes-Hut. + + angle : float, default=0.5 + Only used if method='barnes_hut' + This is the trade-off between speed and accuracy for Barnes-Hut T-SNE. + 'angle' is the angular size (referred to as theta in [3]) of a distant + node as measured from a point. If this size is below 'angle' then it is + used as a summary node of all points contained within it. + This method is not very sensitive to changes in this parameter + in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing + computation time and angle greater 0.8 has quickly increasing error. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. This parameter + has no impact when ``metric="precomputed"`` or + (``metric="euclidean"`` and ``method="exact"``). + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionadded:: 0.22 + + Attributes + ---------- + embedding_ : array-like of shape (n_samples, n_components) + Stores the embedding vectors. + + kl_divergence_ : float + Kullback-Leibler divergence after optimization. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + learning_rate_ : float + Effective learning rate. + + .. versionadded:: 1.2 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + sklearn.decomposition.PCA : Principal component analysis that is a linear + dimensionality reduction method. + sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using + kernels and PCA. + MDS : Manifold learning using multidimensional scaling. + Isomap : Manifold learning based on Isometric Mapping. + LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding. + SpectralEmbedding : Spectral embedding for non-linear dimensionality. + + Notes + ----- + For an example of using :class:`~sklearn.manifold.TSNE` in combination with + :class:`~sklearn.neighbors.KNeighborsTransformer` see + :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`. + + References + ---------- + + [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data + Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008. + + [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding + https://lvdmaaten.github.io/tsne/ + + [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms. + Journal of Machine Learning Research 15(Oct):3221-3245, 2014. + https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf + + [4] Belkina, A. C., Ciccolella, C. O., Anno, R., Halpert, R., Spidlen, J., + & Snyder-Cappione, J. E. (2019). Automated optimized parameters for + T-distributed stochastic neighbor embedding improve visualization + and analysis of large datasets. Nature Communications, 10(1), 1-12. + + [5] Kobak, D., & Berens, P. (2019). The art of using t-SNE for single-cell + transcriptomics. Nature Communications, 10(1), 1-14. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.manifold import TSNE + >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) + >>> X_embedded = TSNE(n_components=2, learning_rate='auto', + ... init='random', perplexity=3).fit_transform(X) + >>> X_embedded.shape + (4, 2) + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "perplexity": [Interval(Real, 0, None, closed="neither")], + "early_exaggeration": [Interval(Real, 1, None, closed="left")], + "learning_rate": [ + StrOptions({"auto"}), + Interval(Real, 0, None, closed="neither"), + ], + "max_iter": [Interval(Integral, 250, None, closed="left")], + "n_iter_without_progress": [Interval(Integral, -1, None, closed="left")], + "min_grad_norm": [Interval(Real, 0, None, closed="left")], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "metric_params": [dict, None], + "init": [ + StrOptions({"pca", "random"}), + np.ndarray, + ], + "verbose": ["verbose"], + "random_state": ["random_state"], + "method": [StrOptions({"barnes_hut", "exact"})], + "angle": [Interval(Real, 0, 1, closed="both")], + "n_jobs": [None, Integral], + } + + # Control the number of exploration iterations with early_exaggeration on + _EXPLORATION_MAX_ITER = 250 + + # Control the number of iterations between progress checks + _N_ITER_CHECK = 50 + + def __init__( + self, + n_components=2, + *, + perplexity=30.0, + early_exaggeration=12.0, + learning_rate="auto", + max_iter=1000, + n_iter_without_progress=300, + min_grad_norm=1e-7, + metric="euclidean", + metric_params=None, + init="pca", + verbose=0, + random_state=None, + method="barnes_hut", + angle=0.5, + n_jobs=None, + ): + self.n_components = n_components + self.perplexity = perplexity + self.early_exaggeration = early_exaggeration + self.learning_rate = learning_rate + self.max_iter = max_iter + self.n_iter_without_progress = n_iter_without_progress + self.min_grad_norm = min_grad_norm + self.metric = metric + self.metric_params = metric_params + self.init = init + self.verbose = verbose + self.random_state = random_state + self.method = method + self.angle = angle + self.n_jobs = n_jobs + + def _check_params_vs_input(self, X): + if self.perplexity >= X.shape[0]: + raise ValueError( + f"perplexity ({self.perplexity}) must be less " + f"than n_samples ({X.shape[0]})" + ) + + def _fit(self, X, skip_num_points=0): + """Private function to fit the model using X as training data.""" + + if isinstance(self.init, str) and self.init == "pca" and issparse(X): + raise TypeError( + "PCA initialization is currently not supported " + "with the sparse input matrix. Use " + 'init="random" instead.' + ) + + if self.learning_rate == "auto": + # See issue #18018 + self.learning_rate_ = X.shape[0] / self.early_exaggeration / 4 + self.learning_rate_ = np.maximum(self.learning_rate_, 50) + else: + self.learning_rate_ = self.learning_rate + + if self.method == "barnes_hut": + X = validate_data( + self, + X, + accept_sparse=["csr"], + ensure_min_samples=2, + dtype=[np.float32, np.float64], + ) + else: + X = validate_data( + self, + X, + accept_sparse=["csr", "csc", "coo"], + dtype=[np.float32, np.float64], + ) + if self.metric == "precomputed": + if isinstance(self.init, str) and self.init == "pca": + raise ValueError( + 'The parameter init="pca" cannot be used with metric="precomputed".' + ) + if X.shape[0] != X.shape[1]: + raise ValueError("X should be a square distance matrix") + + check_non_negative( + X, + ( + "TSNE.fit(). 
With metric='precomputed', X " + "should contain positive distances." + ), + ) + + if self.method == "exact" and issparse(X): + raise TypeError( + 'TSNE with method="exact" does not accept sparse ' + 'precomputed distance matrix. Use method="barnes_hut" ' + "or provide the dense distance matrix." + ) + + if self.method == "barnes_hut" and self.n_components > 3: + raise ValueError( + "'n_components' should be inferior to 4 for the " + "barnes_hut algorithm as it relies on " + "quad-tree or oct-tree." + ) + random_state = check_random_state(self.random_state) + + n_samples = X.shape[0] + + neighbors_nn = None + if self.method == "exact": + # Retrieve the distance matrix, either using the precomputed one or + # computing it. + if self.metric == "precomputed": + distances = X + else: + if self.verbose: + print("[t-SNE] Computing pairwise distances...") + + if self.metric == "euclidean": + # Euclidean is squared here, rather than using **= 2, + # because euclidean_distances already calculates + # squared distances, and returns np.sqrt(dist) for + # squared=False. + # Also, Euclidean is slower for n_jobs>1, so don't set here + distances = pairwise_distances(X, metric=self.metric, squared=True) + else: + metric_params_ = self.metric_params or {} + distances = pairwise_distances( + X, metric=self.metric, n_jobs=self.n_jobs, **metric_params_ + ) + + if np.any(distances < 0): + raise ValueError( + "All distances should be positive, the metric given is not correct" + ) + + if self.metric != "euclidean": + distances **= 2 + + # compute the joint probability distribution for the input space + P = _joint_probabilities(distances, self.perplexity, self.verbose) + assert np.all(np.isfinite(P)), "All probabilities should be finite" + assert np.all(P >= 0), "All probabilities should be non-negative" + assert np.all(P <= 1), ( + "All probabilities should be less or then equal to one" + ) + + else: + # Compute the number of nearest neighbors to find. + # LvdM uses 3 * perplexity as the number of neighbors. + # In the event that we have very small # of points + # set the neighbors to n - 1. + n_neighbors = min(n_samples - 1, int(3.0 * self.perplexity + 1)) + + if self.verbose: + print("[t-SNE] Computing {} nearest neighbors...".format(n_neighbors)) + + # Find the nearest neighbors for every point + knn = NearestNeighbors( + algorithm="auto", + n_jobs=self.n_jobs, + n_neighbors=n_neighbors, + metric=self.metric, + metric_params=self.metric_params, + ) + t0 = time() + knn.fit(X) + duration = time() - t0 + if self.verbose: + print( + "[t-SNE] Indexed {} samples in {:.3f}s...".format( + n_samples, duration + ) + ) + + t0 = time() + distances_nn = knn.kneighbors_graph(mode="distance") + duration = time() - t0 + if self.verbose: + print( + "[t-SNE] Computed neighbors for {} samples in {:.3f}s...".format( + n_samples, duration + ) + ) + + # Free the memory used by the ball_tree + del knn + + # knn return the euclidean distance but we need it squared + # to be consistent with the 'exact' method. Note that the + # the method was derived using the euclidean method as in the + # input space. Not sure of the implication of using a different + # metric. 
+ distances_nn.data **= 2 + + # compute the joint probability distribution for the input space + P = _joint_probabilities_nn(distances_nn, self.perplexity, self.verbose) + + if isinstance(self.init, np.ndarray): + X_embedded = self.init + elif self.init == "pca": + pca = PCA( + n_components=self.n_components, + svd_solver="randomized", + random_state=random_state, + ) + # Always output a numpy array, no matter what is configured globally + pca.set_output(transform="default") + X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) + # PCA is rescaled so that PC1 has standard deviation 1e-4 which is + # the default value for random initialization. See issue #18018. + X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4 + elif self.init == "random": + # The embedding is initialized with iid samples from Gaussians with + # standard deviation 1e-4. + X_embedded = 1e-4 * random_state.standard_normal( + size=(n_samples, self.n_components) + ).astype(np.float32) + + # Degrees of freedom of the Student's t-distribution. The suggestion + # degrees_of_freedom = n_components - 1 comes from + # "Learning a Parametric Embedding by Preserving Local Structure" + # Laurens van der Maaten, 2009. + degrees_of_freedom = max(self.n_components - 1, 1) + + return self._tsne( + P, + degrees_of_freedom, + n_samples, + X_embedded=X_embedded, + neighbors=neighbors_nn, + skip_num_points=skip_num_points, + ) + + def _tsne( + self, + P, + degrees_of_freedom, + n_samples, + X_embedded, + neighbors=None, + skip_num_points=0, + ): + """Runs t-SNE.""" + # t-SNE minimizes the Kullback-Leiber divergence of the Gaussians P + # and the Student's t-distributions Q. The optimization algorithm that + # we use is batch gradient descent with two stages: + # * initial optimization with early exaggeration and momentum at 0.5 + # * final optimization with momentum at 0.8 + params = X_embedded.ravel() + + opt_args = { + "it": 0, + "n_iter_check": self._N_ITER_CHECK, + "min_grad_norm": self.min_grad_norm, + "learning_rate": self.learning_rate_, + "verbose": self.verbose, + "kwargs": dict(skip_num_points=skip_num_points), + "args": [P, degrees_of_freedom, n_samples, self.n_components], + "n_iter_without_progress": self._EXPLORATION_MAX_ITER, + "max_iter": self._EXPLORATION_MAX_ITER, + "momentum": 0.5, + } + if self.method == "barnes_hut": + obj_func = _kl_divergence_bh + opt_args["kwargs"]["angle"] = self.angle + # Repeat verbose argument for _kl_divergence_bh + opt_args["kwargs"]["verbose"] = self.verbose + # Get the number of threads for gradient computation here to + # avoid recomputing it at each iteration. 
+ opt_args["kwargs"]["num_threads"] = _openmp_effective_n_threads() + else: + obj_func = _kl_divergence + + # Learning schedule (part 1): do 250 iteration with lower momentum but + # higher learning rate controlled via the early exaggeration parameter + P *= self.early_exaggeration + params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) + if self.verbose: + print( + "[t-SNE] KL divergence after %d iterations with early exaggeration: %f" + % (it + 1, kl_divergence) + ) + + # Learning schedule (part 2): disable early exaggeration and finish + # optimization with a higher momentum at 0.8 + P /= self.early_exaggeration + remaining = self.max_iter - self._EXPLORATION_MAX_ITER + if it < self._EXPLORATION_MAX_ITER or remaining > 0: + opt_args["max_iter"] = self.max_iter + opt_args["it"] = it + 1 + opt_args["momentum"] = 0.8 + opt_args["n_iter_without_progress"] = self.n_iter_without_progress + params, kl_divergence, it = _gradient_descent(obj_func, params, **opt_args) + + # Save the final number of iterations + self.n_iter_ = it + + if self.verbose: + print( + "[t-SNE] KL divergence after %d iterations: %f" + % (it + 1, kl_divergence) + ) + + X_embedded = params.reshape(n_samples, self.n_components) + self.kl_divergence_ = kl_divergence + + return X_embedded + + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit_transform(self, X, y=None): + """Fit X into an embedded space and return that transformed output. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + If the metric is 'precomputed' X must be a square distance + matrix. Otherwise it contains a sample per row. If the method + is 'exact', X may be a sparse matrix of type 'csr', 'csc' + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. + + y : None + Ignored. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_components) + Embedding of the training data in low-dimensional space. + """ + self._check_params_vs_input(X) + embedding = self._fit(X) + self.embedding_ = embedding + return self.embedding_ + + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit X into an embedded space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) + If the metric is 'precomputed' X must be a square distance + matrix. Otherwise it contains a sample per row. If the method + is 'exact', X may be a sparse matrix of type 'csr', 'csc' + or 'coo'. If the method is 'barnes_hut' and the metric is + 'precomputed', X may be a precomputed sparse graph. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted estimator. 
+ """ + self.fit_transform(X) + return self + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.embedding_.shape[1] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.metric == "precomputed" + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..079767b633d4d757170fe1227300694510c704bd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx new file mode 100644 index 0000000000000000000000000000000000000000..be3a1d2f91f6670cea8eee130990becc3fc4b8bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx @@ -0,0 +1,120 @@ +import numpy as np + +from libc cimport math +from libc.math cimport INFINITY + +from ..utils._typedefs cimport float32_t, float64_t + + +cdef float EPSILON_DBL = 1e-8 +cdef float PERPLEXITY_TOLERANCE = 1e-5 + + +# TODO: have this function support float32 and float64 and preserve inputs' dtypes. +def _binary_search_perplexity( + const float32_t[:, :] sqdistances, + float desired_perplexity, + int verbose): + """Binary search for sigmas of conditional Gaussians. + + This approximation reduces the computational complexity from O(N^2) to + O(uN). + + Parameters + ---------- + sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32 + Distances between training samples and their k nearest neighbors. + When using the exact method, this is a square (n_samples, n_samples) + distance matrix. The TSNE default metric is "euclidean" which is + interpreted as squared euclidean distance. + + desired_perplexity : float + Desired perplexity (2^entropy) of the conditional Gaussians. + + verbose : int + Verbosity level. + + Returns + ------- + P : ndarray of shape (n_samples, n_samples), dtype=np.float64 + Probabilities of conditional Gaussian distributions p_i|j. + """ + # Maximum number of binary search steps + cdef long n_steps = 100 + + cdef long n_samples = sqdistances.shape[0] + cdef long n_neighbors = sqdistances.shape[1] + cdef int using_neighbors = n_neighbors < n_samples + # Precisions of conditional Gaussian distributions + cdef double beta + cdef double beta_min + cdef double beta_max + cdef double beta_sum = 0.0 + + # Use log scale + cdef double desired_entropy = math.log(desired_perplexity) + cdef double entropy_diff + + cdef double entropy + cdef double sum_Pi + cdef double sum_disti_Pi + cdef long i, j, l + + # This array is later used as a 32bit array. 
It has multiple intermediate + # floating point additions that benefit from the extra precision + cdef float64_t[:, :] P = np.zeros( + (n_samples, n_neighbors), dtype=np.float64) + + for i in range(n_samples): + beta_min = -INFINITY + beta_max = INFINITY + beta = 1.0 + + # Binary search of precision for i-th conditional distribution + for l in range(n_steps): + # Compute current entropy and corresponding probabilities + # computed just over the nearest neighbors or over all data + # if we're not using neighbors + sum_Pi = 0.0 + for j in range(n_neighbors): + if j != i or using_neighbors: + P[i, j] = math.exp(-sqdistances[i, j] * beta) + sum_Pi += P[i, j] + + if sum_Pi == 0.0: + sum_Pi = EPSILON_DBL + sum_disti_Pi = 0.0 + + for j in range(n_neighbors): + P[i, j] /= sum_Pi + sum_disti_Pi += sqdistances[i, j] * P[i, j] + + entropy = math.log(sum_Pi) + beta * sum_disti_Pi + entropy_diff = entropy - desired_entropy + + if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE: + break + + if entropy_diff > 0.0: + beta_min = beta + if beta_max == INFINITY: + beta *= 2.0 + else: + beta = (beta + beta_max) / 2.0 + else: + beta_max = beta + if beta_min == -INFINITY: + beta /= 2.0 + else: + beta = (beta + beta_min) / 2.0 + + beta_sum += beta + + if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples): + print("[t-SNE] Computed conditional probabilities for sample " + "%d / %d" % (i + 1, n_samples)) + + if verbose: + print("[t-SNE] Mean sigma: %f" + % np.mean(math.sqrt(n_samples / beta_sum))) + return np.asarray(P) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/meson.build b/.venv/lib/python3.12/site-packages/sklearn/manifold/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..c060590410d63ff06ca8b0f062b08cd1581a07de --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/meson.build @@ -0,0 +1,14 @@ +py.extension_module( + '_utils', + [cython_gen.process('_utils.pyx'), utils_cython_tree], + subdir: 'sklearn/manifold', + install: true +) + +py.extension_module( + '_barnes_hut_tsne', + cython_gen.process('_barnes_hut_tsne.pyx'), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/manifold', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_isomap.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_isomap.py new file mode 100644 index 0000000000000000000000000000000000000000..e38b92442e58d9881726bdee85073ad38a7c95e1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_isomap.py @@ -0,0 +1,348 @@ +import math +from itertools import product + +import numpy as np +import pytest +from scipy.sparse import rand as sparse_rand + +from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing +from sklearn.datasets import make_blobs +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +eigen_solvers = ["auto", "dense", "arpack"] +path_methods = ["auto", "FW", "D"] + + +def create_sample_data(dtype, n_pts=25, add_noise=False): + # grid of equidistant points in 2D, n_components = n_dim + n_per_side = 
int(math.sqrt(n_pts)) + X = np.array(list(product(range(n_per_side), repeat=2))).astype(dtype, copy=False) + if add_noise: + # add noise in a third dimension + rng = np.random.RandomState(0) + noise = 0.1 * rng.randn(n_pts, 1).astype(dtype, copy=False) + X = np.concatenate((X, noise), 1) + return X + + +@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)]) +@pytest.mark.parametrize("eigen_solver", eigen_solvers) +@pytest.mark.parametrize("path_method", path_methods) +def test_isomap_simple_grid( + global_dtype, n_neighbors, radius, eigen_solver, path_method +): + # Isomap should preserve distances when all neighbors are used + n_pts = 25 + X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False) + + # distances from each point to all others + if n_neighbors is not None: + G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance") + else: + G = neighbors.radius_neighbors_graph(X, radius, mode="distance") + + clf = manifold.Isomap( + n_neighbors=n_neighbors, + radius=radius, + n_components=2, + eigen_solver=eigen_solver, + path_method=path_method, + ) + clf.fit(X) + + if n_neighbors is not None: + G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance") + else: + G_iso = neighbors.radius_neighbors_graph( + clf.embedding_, radius, mode="distance" + ) + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose_dense_sparse(G, G_iso, atol=atol) + + +@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)]) +@pytest.mark.parametrize("eigen_solver", eigen_solvers) +@pytest.mark.parametrize("path_method", path_methods) +def test_isomap_reconstruction_error( + global_dtype, n_neighbors, radius, eigen_solver, path_method +): + if global_dtype is np.float32: + pytest.skip( + "Skipping test due to numerical instabilities on float32 data" + "from KernelCenterer used in the reconstruction_error method" + ) + + # Same setup as in test_isomap_simple_grid, with an added dimension + n_pts = 25 + X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True) + + # compute input kernel + if n_neighbors is not None: + G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray() + else: + G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray() + centerer = preprocessing.KernelCenterer() + K = centerer.fit_transform(-0.5 * G**2) + + clf = manifold.Isomap( + n_neighbors=n_neighbors, + radius=radius, + n_components=2, + eigen_solver=eigen_solver, + path_method=path_method, + ) + clf.fit(X) + + # compute output kernel + if n_neighbors is not None: + G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance") + else: + G_iso = neighbors.radius_neighbors_graph( + clf.embedding_, radius, mode="distance" + ) + G_iso = G_iso.toarray() + K_iso = centerer.fit_transform(-0.5 * G_iso**2) + + # make sure error agrees + reconstruction_error = np.linalg.norm(K - K_iso) / n_pts + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol) + + +@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)]) +def test_transform(global_dtype, n_neighbors, radius): + n_samples = 200 + n_components = 10 + noise_scale = 0.01 + + # Create S-curve dataset + X, y = datasets.make_s_curve(n_samples, random_state=0) + + X = X.astype(global_dtype, copy=False) + + # Compute isomap embedding + iso = manifold.Isomap( + n_components=n_components, n_neighbors=n_neighbors, radius=radius + ) + X_iso = iso.fit_transform(X) + + 
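    # (Context sketch: fit_transform above has learned the geodesic kernel on
    #  the clean points, and Isomap.transform embeds new points by linking them
    #  to the stored training neighborhood graph rather than refitting, which
    #  is what makes the noisy re-embedding check below meaningful.)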
# Re-embed a noisy version of the points + rng = np.random.RandomState(0) + noise = noise_scale * rng.randn(*X.shape) + X_iso2 = iso.transform(X + noise) + + # Make sure the rms error on re-embedding is comparable to noise_scale + assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale + + +@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)]) +def test_pipeline(n_neighbors, radius, global_dtype): + # check that Isomap works fine as a transformer in a Pipeline + # only checks that no error is raised. + # TODO check that it actually does something useful + X, y = datasets.make_blobs(random_state=0) + X = X.astype(global_dtype, copy=False) + clf = pipeline.Pipeline( + [ + ("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)), + ("clf", neighbors.KNeighborsClassifier()), + ] + ) + clf.fit(X, y) + assert 0.9 < clf.score(X, y) + + +def test_pipeline_with_nearest_neighbors_transformer(global_dtype): + # Test chaining NearestNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = "auto" + n_neighbors = 10 + + X, _ = datasets.make_blobs(random_state=0) + X2, _ = datasets.make_blobs(random_state=1) + + X = X.astype(global_dtype, copy=False) + X2 = X2.astype(global_dtype, copy=False) + + # compare the chained version and the compact version + est_chain = pipeline.make_pipeline( + neighbors.KNeighborsTransformer( + n_neighbors=n_neighbors, algorithm=algorithm, mode="distance" + ), + manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"), + ) + est_compact = manifold.Isomap( + n_neighbors=n_neighbors, neighbors_algorithm=algorithm + ) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_allclose(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = est_compact.transform(X2) + assert_allclose(Xt_chain, Xt_compact) + + +@pytest.mark.parametrize( + "metric, p, is_euclidean", + [ + ("euclidean", 2, True), + ("manhattan", 1, False), + ("minkowski", 1, False), + ("minkowski", 2, True), + (lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False), + ], +) +def test_different_metric(global_dtype, metric, p, is_euclidean): + # Isomap must work on various metric parameters work correctly + # and must default to euclidean. 
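    # (For context: "minkowski" with p=2 and the default "euclidean" metric are
    #  the same distance, so those parametrizations are expected to reproduce
    #  the reference embedding, while the remaining metrics should not.)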
+ X, _ = datasets.make_blobs(random_state=0) + X = X.astype(global_dtype, copy=False) + + reference = manifold.Isomap().fit_transform(X) + embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X) + + if is_euclidean: + assert_allclose(embedding, reference) + else: + with pytest.raises(AssertionError, match="Not equal to tolerance"): + assert_allclose(embedding, reference) + + +def test_isomap_clone_bug(): + # regression test for bug reported in #6062 + model = manifold.Isomap() + for n_neighbors in [10, 15, 20]: + model.set_params(n_neighbors=n_neighbors) + model.fit(np.random.rand(50, 2)) + assert model.nbrs_.n_neighbors == n_neighbors + + +@pytest.mark.parametrize("eigen_solver", eigen_solvers) +@pytest.mark.parametrize("path_method", path_methods) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_input( + global_dtype, eigen_solver, path_method, global_random_seed, csr_container +): + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + X = csr_container( + sparse_rand( + 100, + 3, + density=0.1, + format="csr", + dtype=global_dtype, + random_state=global_random_seed, + ) + ) + + iso_dense = manifold.Isomap( + n_components=2, + eigen_solver=eigen_solver, + path_method=path_method, + n_neighbors=8, + ) + iso_sparse = clone(iso_dense) + + X_trans_dense = iso_dense.fit_transform(X.toarray()) + X_trans_sparse = iso_sparse.fit_transform(X) + + assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4) + + +def test_isomap_fit_precomputed_radius_graph(global_dtype): + # Isomap.fit_transform must yield similar result when using + # a precomputed distance matrix. + + X, y = datasets.make_s_curve(200, random_state=0) + X = X.astype(global_dtype, copy=False) + radius = 10 + + g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance") + isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed") + isomap.fit(g) + precomputed_result = isomap.embedding_ + + isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski") + result = isomap.fit_transform(X) + atol = 1e-5 if global_dtype == np.float32 else 0 + assert_allclose(precomputed_result, result, atol=atol) + + +def test_isomap_fitted_attributes_dtype(global_dtype): + """Check that the fitted attributes are stored accordingly to the + data type of X.""" + iso = manifold.Isomap(n_neighbors=2) + + X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype) + + iso.fit(X) + + assert iso.dist_matrix_.dtype == global_dtype + assert iso.embedding_.dtype == global_dtype + + +def test_isomap_dtype_equivalence(): + """Check the equivalence of the results with 32 and 64 bits input.""" + iso_32 = manifold.Isomap(n_neighbors=2) + X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32) + iso_32.fit(X_32) + + iso_64 = manifold.Isomap(n_neighbors=2) + X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64) + iso_64.fit(X_64) + + assert_allclose(iso_32.dist_matrix_, iso_64.dist_matrix_) + + +def test_isomap_raise_error_when_neighbor_and_radius_both_set(): + # Isomap.fit_transform must raise a ValueError if + # radius and n_neighbors are provided. 
+ + X, _ = datasets.load_digits(return_X_y=True) + isomap = manifold.Isomap(n_neighbors=3, radius=5.5) + msg = "Both n_neighbors and radius are provided" + with pytest.raises(ValueError, match=msg): + isomap.fit_transform(X) + + +def test_multiple_connected_components(): + # Test that a warning is raised when the graph has multiple components + X = np.array([0, 1, 2, 5, 6, 7])[:, None] + with pytest.warns(UserWarning, match="number of connected components"): + manifold.Isomap(n_neighbors=2).fit(X) + + +def test_multiple_connected_components_metric_precomputed(global_dtype): + # Test that an error is raised when the graph has multiple components + # and when X is a precomputed neighbors graph. + X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False) + + # works with a precomputed distance matrix (dense) + X_distances = pairwise_distances(X) + with pytest.warns(UserWarning, match="number of connected components"): + manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances) + + # does not work with a precomputed neighbors graph (sparse) + X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance") + with pytest.raises(RuntimeError, match="number of connected components"): + manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph) + + +def test_get_feature_names_out(): + """Check get_feature_names_out for Isomap.""" + X, y = make_blobs(random_state=0, n_features=4) + n_components = 2 + + iso = manifold.Isomap(n_components=n_components) + iso.fit_transform(X) + names = iso.get_feature_names_out() + assert_array_equal([f"isomap{i}" for i in range(n_components)], names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_locally_linear.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_locally_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..835aa20fd1d32ace684eea9afd451bcdcf695f79 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_locally_linear.py @@ -0,0 +1,171 @@ +from itertools import product + +import numpy as np +import pytest +from scipy import linalg + +from sklearn import manifold, neighbors +from sklearn.datasets import make_blobs +from sklearn.manifold._locally_linear import barycenter_kneighbors_graph +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, + ignore_warnings, +) + +eigen_solvers = ["dense", "arpack"] + + +# ---------------------------------------------------------------------- +# Test utility routines +def test_barycenter_kneighbors_graph(global_dtype): + X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype) + + graph = barycenter_kneighbors_graph(X, 1) + expected_graph = np.array( + [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype + ) + + assert graph.dtype == global_dtype + + assert_allclose(graph.toarray(), expected_graph) + + graph = barycenter_kneighbors_graph(X, 2) + # check that columns sum to one + assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3)) + pred = np.dot(graph.toarray(), X) + assert linalg.norm(pred - X) / X.shape[0] < 1 + + +# ---------------------------------------------------------------------- +# Test LLE by computing the reconstruction error on some manifolds. + + +def test_lle_simple_grid(global_dtype): + # note: ARPACK is numerically unstable, so this test will fail for + # some random seeds. We choose 42 because the tests pass. + # for arm64 platforms 2 makes the test fail. 
+ # TODO: rewrite this test to make less sensitive to the random seed, + # irrespective of the platform. + rng = np.random.RandomState(42) + + # grid of equidistant points in 2D, n_components = n_dim + X = np.array(list(product(range(5), repeat=2))) + X = X + 1e-10 * rng.uniform(size=X.shape) + X = X.astype(global_dtype, copy=False) + + n_components = 2 + clf = manifold.LocallyLinearEmbedding( + n_neighbors=5, n_components=n_components, random_state=rng + ) + tol = 0.1 + + N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray() + reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro") + assert reconstruction_error < tol + + for solver in eigen_solvers: + clf.set_params(eigen_solver=solver) + clf.fit(X) + assert clf.embedding_.shape[1] == n_components + reconstruction_error = ( + linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2 + ) + + assert reconstruction_error < tol + assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1) + + # re-embed a noisy version of X using the transform method + noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100 + X_reembedded = clf.transform(X + noise) + assert linalg.norm(X_reembedded - clf.embedding_) < tol + + +@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"]) +@pytest.mark.parametrize("solver", eigen_solvers) +def test_lle_manifold(global_dtype, method, solver): + rng = np.random.RandomState(0) + # similar test on a slightly more complex manifold + X = np.array(list(product(np.arange(18), repeat=2))) + X = np.c_[X, X[:, 0] ** 2 / 18] + X = X + 1e-10 * rng.uniform(size=X.shape) + X = X.astype(global_dtype, copy=False) + n_components = 2 + + clf = manifold.LocallyLinearEmbedding( + n_neighbors=6, n_components=n_components, method=method, random_state=0 + ) + tol = 1.5 if method == "standard" else 3 + + N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray() + reconstruction_error = linalg.norm(np.dot(N, X) - X) + assert reconstruction_error < tol + + clf.set_params(eigen_solver=solver) + clf.fit(X) + assert clf.embedding_.shape[1] == n_components + reconstruction_error = ( + linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2 + ) + details = "solver: %s, method: %s" % (solver, method) + assert reconstruction_error < tol, details + assert ( + np.abs(clf.reconstruction_error_ - reconstruction_error) + < tol * reconstruction_error + ), details + + +def test_pipeline(): + # check that LocallyLinearEmbedding works fine as a Pipeline + # only checks that no error is raised. 
+ # TODO check that it actually does something useful + from sklearn import datasets, pipeline + + X, y = datasets.make_blobs(random_state=0) + clf = pipeline.Pipeline( + [ + ("filter", manifold.LocallyLinearEmbedding(random_state=0)), + ("clf", neighbors.KNeighborsClassifier()), + ] + ) + clf.fit(X, y) + assert 0.9 < clf.score(X, y) + + +# Test the error raised when the weight matrix is singular +def test_singular_matrix(): + M = np.ones((200, 3)) + f = ignore_warnings + with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"): + f( + manifold.locally_linear_embedding( + M, + n_neighbors=2, + n_components=1, + method="standard", + eigen_solver="arpack", + ) + ) + + +# regression test for #6033 +def test_integer_input(): + rand = np.random.RandomState(0) + X = rand.randint(0, 100, size=(20, 3)) + + for method in ["standard", "hessian", "modified", "ltsa"]: + clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10) + clf.fit(X) # this previously raised a TypeError + + +def test_get_feature_names_out(): + """Check get_feature_names_out for LocallyLinearEmbedding.""" + X, y = make_blobs(random_state=0, n_features=4) + n_components = 2 + + iso = manifold.LocallyLinearEmbedding(n_components=n_components) + iso.fit(X) + names = iso.get_feature_names_out() + assert_array_equal( + [f"locallylinearembedding{i}" for i in range(n_components)], names + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_mds.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_mds.py new file mode 100644 index 0000000000000000000000000000000000000000..88dc842a1d5fc4168a3cc9003c929f1770e839bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_mds.py @@ -0,0 +1,234 @@ +from unittest.mock import Mock + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal + +from sklearn.datasets import load_digits +from sklearn.manifold import _mds as mds +from sklearn.metrics import euclidean_distances + + +def test_smacof(): + # test metric smacof using the data of "Modern Multidimensional Scaling", + # Borg & Groenen, p 154 + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) + X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1) + X_true = np.array( + [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]] + ) + assert_array_almost_equal(X, X_true, decimal=3) + + +def test_nonmetric_lower_normalized_stress(): + # Testing that nonmetric MDS results in lower normalized stress compared + # compared to metric MDS (non-regression test for issue 27028) + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]]) + + _, stress1 = mds.smacof( + sim, init=Z, n_components=2, max_iter=1000, n_init=1, normalized_stress=True + ) + + _, stress2 = mds.smacof( + sim, + init=Z, + n_components=2, + max_iter=1000, + n_init=1, + normalized_stress=True, + metric=False, + ) + assert stress1 > stress2 + + +def test_nonmetric_mds_optimization(): + # Test that stress is decreasing during nonmetric MDS optimization + # (non-regression test for issue 27028) + X, _ = load_digits(return_X_y=True) + rng = np.random.default_rng(seed=42) + ind_subset = rng.choice(len(X), size=200, replace=False) + X = X[ind_subset] + + mds_est = mds.MDS( + n_components=2, + 
n_init=1, + max_iter=2, + metric=False, + random_state=42, + ).fit(X) + stress_after_2_iter = mds_est.stress_ + + mds_est = mds.MDS( + n_components=2, + n_init=1, + max_iter=3, + metric=False, + random_state=42, + ).fit(X) + stress_after_3_iter = mds_est.stress_ + + assert stress_after_2_iter > stress_after_3_iter + + +@pytest.mark.parametrize("metric", [True, False]) +def test_mds_recovers_true_data(metric): + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + mds_est = mds.MDS( + n_components=2, + n_init=1, + eps=1e-15, + max_iter=1000, + metric=metric, + random_state=42, + ).fit(X) + stress = mds_est.stress_ + assert_allclose(stress, 0, atol=1e-6) + + +def test_smacof_error(): + # Not symmetric similarity matrix: + sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.smacof(sim, n_init=1) + + # Not squared similarity matrix: + sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]]) + + with pytest.raises(ValueError): + mds.smacof(sim, n_init=1) + + # init not None and not correct format: + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]]) + with pytest.raises(ValueError): + mds.smacof(sim, init=Z, n_init=1) + + +def test_MDS(): + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + mds_clf = mds.MDS( + metric=False, + n_jobs=3, + n_init=3, + dissimilarity="precomputed", + ) + mds_clf.fit(sim) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("k", [0.5, 1.5, 2]) +def test_normed_stress(k): + """Test that non-metric MDS normalized stress is scale-invariant.""" + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0) + X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0) + + assert_allclose(stress1, stress2, rtol=1e-5) + assert_allclose(X1, X2, rtol=1e-5) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("metric", [True, False]) +def test_normalized_stress_auto(metric, monkeypatch): + rng = np.random.RandomState(0) + X = rng.randn(4, 3) + dist = euclidean_distances(X) + + mock = Mock(side_effect=mds._smacof_single) + monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock) + + est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng) + est.fit_transform(X) + assert mock.call_args[1]["normalized_stress"] != metric + + mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng) + assert mock.call_args[1]["normalized_stress"] != metric + + +def test_isotonic_outofbounds(): + # This particular configuration can trigger out of bounds error + # in the isotonic regression (non-regression test for issue 26999) + dis = np.array( + [ + [0.0, 1.732050807568877, 1.7320508075688772], + [1.732050807568877, 0.0, 6.661338147750939e-16], + [1.7320508075688772, 6.661338147750939e-16, 0.0], + ] + ) + init = np.array( + [ + [0.08665881585055124, 0.7939114643387546], + [0.9959834154297658, 0.7555546025640025], + [0.8766008278401566, 0.4227358815811242], + ] + ) + mds.smacof(dis, init=init, metric=False, n_init=1) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("normalized_stress", [True, False]) +def test_returned_stress(normalized_stress): + # Test that the final stress 
corresponds to the final embedding + # (non-regression test for issue 16846) + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + D = euclidean_distances(X) + + mds_est = mds.MDS( + n_components=2, + random_state=42, + normalized_stress=normalized_stress, + ).fit(X) + + Z = mds_est.embedding_ + stress = mds_est.stress_ + + D_mds = euclidean_distances(Z) + stress_Z = ((D_mds.ravel() - D.ravel()) ** 2).sum() / 2 + + if normalized_stress: + stress_Z = np.sqrt(stress_Z / ((D_mds.ravel() ** 2).sum() / 2)) + + assert_allclose(stress, stress_Z) + + +# TODO(1.9): remove warning filter +@pytest.mark.filterwarnings("ignore::FutureWarning") +@pytest.mark.parametrize("metric", [True, False]) +def test_convergence_does_not_depend_on_scale(metric): + # Test that the number of iterations until convergence does not depend on + # the scale of the input data + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + + mds_est = mds.MDS( + n_components=2, + random_state=42, + metric=metric, + ) + + mds_est.fit(X * 100) + n_iter1 = mds_est.n_iter_ + + mds_est.fit(X / 100) + n_iter2 = mds_est.n_iter_ + + assert_equal(n_iter1, n_iter2) + + +# TODO(1.9): delete this test +def test_future_warning_n_init(): + X = np.array([[1, 1], [1, 4], [1, 5], [3, 3]]) + sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]]) + + with pytest.warns(FutureWarning): + mds.smacof(sim) + + with pytest.warns(FutureWarning): + mds.MDS().fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_spectral_embedding.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_spectral_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..4c4115734a404360d0d4ce507d18df9e4b2b5396 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_spectral_embedding.py @@ -0,0 +1,503 @@ +import itertools +from unittest.mock import Mock + +import numpy as np +import pytest +from scipy import sparse +from scipy.linalg import eigh +from scipy.sparse.linalg import eigsh, lobpcg + +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs +from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding +from sklearn.manifold._spectral_embedding import ( + _graph_connected_component, + _graph_is_connected, +) +from sklearn.metrics import normalized_mutual_info_score, pairwise_distances +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.neighbors import NearestNeighbors +from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.extmath import _deterministic_vector_sign_flip +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + parse_version, + sp_version, +) +from sklearn.utils.fixes import laplacian as csgraph_laplacian + +try: + from pyamg import smoothed_aggregation_solver # noqa: F401 + + pyamg_available = True +except ImportError: + pyamg_available = False +skip_if_no_pyamg = pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." 
+) + +# non centered, sparse centers to check the +centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] +) +n_samples = 1000 +n_clusters, n_features = centers.shape +S, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 +) + + +def _assert_equal_with_sign_flipping(A, B, tol=0.0): + """Check array A and B are equal with possible sign flipping on + each column""" + tol_squared = tol**2 + for A_col, B_col in zip(A.T, B.T): + assert ( + np.max((A_col - B_col) ** 2) <= tol_squared + or np.max((A_col + B_col) ** 2) <= tol_squared + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_sparse_graph_connected_component(coo_container): + rng = np.random.RandomState(42) + n_samples = 300 + boundaries = [0, 42, 121, 200, n_samples] + p = rng.permutation(n_samples) + connections = [] + + for start, stop in itertools.pairwise(boundaries): + group = p[start:stop] + # Connect all elements within the group at least once via an + # arbitrary path that spans the group. + for i in range(len(group) - 1): + connections.append((group[i], group[i + 1])) + + # Add some more random connections within the group + min_idx, max_idx = 0, len(group) - 1 + n_random_connections = 1000 + source = rng.randint(min_idx, max_idx, size=n_random_connections) + target = rng.randint(min_idx, max_idx, size=n_random_connections) + connections.extend(zip(group[source], group[target])) + + # Build a symmetric affinity matrix + row_idx, column_idx = tuple(np.array(connections).T) + data = rng.uniform(0.1, 42, size=len(connections)) + affinity = coo_container((data, (row_idx, column_idx))) + affinity = 0.5 * (affinity + affinity.T) + + for start, stop in itertools.pairwise(boundaries): + component_1 = _graph_connected_component(affinity, p[start]) + component_size = stop - start + assert component_1.sum() == component_size + + # We should retrieve the same component mask by starting by both ends + # of the group + component_2 = _graph_connected_component(affinity, p[stop - 1]) + assert component_2.sum() == component_size + assert_array_equal(component_1, component_2) + + +# TODO: investigate why this test is seed-sensitive on 32-bit Python +# runtimes. Is this revealing a numerical stability problem ? Or is it +# expected from the test numerical design ? In the latter case the test +# should be made less seed-sensitive instead. 
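# (Throughout these tests, embeddings are compared with
#  _assert_equal_with_sign_flipping because spectral coordinates are
#  eigenvectors, which are only determined up to a per-column sign; A and -A
#  describe the same embedding.)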
+@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0): + # Test spectral embedding with two components + random_state = np.random.RandomState(seed) + n_sample = 100 + affinity = np.zeros(shape=[n_sample * 2, n_sample * 2]) + # first component + affinity[0:n_sample, 0:n_sample] = ( + np.abs(random_state.randn(n_sample, n_sample)) + 2 + ) + # second component + affinity[n_sample::, n_sample::] = ( + np.abs(random_state.randn(n_sample, n_sample)) + 2 + ) + + # Test of internal _graph_connected_component before connection + component = _graph_connected_component(affinity, 0) + assert component[:n_sample].all() + assert not component[n_sample:].any() + component = _graph_connected_component(affinity, -1) + assert not component[:n_sample].any() + assert component[n_sample:].all() + + # connection + affinity[0, n_sample + 1] = 1 + affinity[n_sample + 1, 0] = 1 + affinity.flat[:: 2 * n_sample + 1] = 0 + affinity = 0.5 * (affinity + affinity.T) + + true_label = np.zeros(shape=2 * n_sample) + true_label[0:n_sample] = 1 + + se_precomp = SpectralEmbedding( + n_components=1, + affinity="precomputed", + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, + ) + + embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype)) + # thresholding on the first components using 0. + label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64) + assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0) + + +@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS]) +@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_spectral_embedding_precomputed_affinity( + sparse_container, eigen_solver, dtype, seed=36 +): + # Test spectral embedding with precomputed kernel + gamma = 1.0 + X = S if sparse_container is None else sparse_container(S) + + se_precomp = SpectralEmbedding( + n_components=2, + affinity="precomputed", + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, + ) + se_rbf = SpectralEmbedding( + n_components=2, + affinity="rbf", + gamma=gamma, + random_state=np.random.RandomState(seed), + eigen_solver=eigen_solver, + ) + embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma)) + embed_rbf = se_rbf.fit_transform(X.astype(dtype)) + assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_) + _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05) + + +def test_precomputed_nearest_neighbors_filtering(): + # Test precomputed graph filtering when containing too many neighbors + n_neighbors = 2 + results = [] + for additional_neighbors in [0, 10]: + nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S) + graph = nn.kneighbors_graph(S, mode="connectivity") + embedding = ( + SpectralEmbedding( + random_state=0, + n_components=2, + affinity="precomputed_nearest_neighbors", + n_neighbors=n_neighbors, + ) + .fit(graph) + .embedding_ + ) + results.append(embedding) + + assert_array_equal(results[0], results[1]) + + +@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS]) +def test_spectral_embedding_callable_affinity(sparse_container, seed=36): + # Test spectral embedding with callable affinity + 
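    # (The callable must return a full (n_samples, n_samples) affinity matrix;
    #  here it is an rbf_kernel closure with the same gamma as the "rbf" string
    #  affinity, so the two fits should agree up to per-column sign flips.)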
gamma = 0.9 + kern = rbf_kernel(S, gamma=gamma) + X = S if sparse_container is None else sparse_container(S) + + se_callable = SpectralEmbedding( + n_components=2, + affinity=(lambda x: rbf_kernel(x, gamma=gamma)), + gamma=gamma, + random_state=np.random.RandomState(seed), + ) + se_rbf = SpectralEmbedding( + n_components=2, + affinity="rbf", + gamma=gamma, + random_state=np.random.RandomState(seed), + ) + embed_rbf = se_rbf.fit_transform(X) + embed_callable = se_callable.fit_transform(X) + assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_) + assert_array_almost_equal(kern, se_rbf.affinity_matrix_) + _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05) + + +@pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." +) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36): + se_amg = SpectralEmbedding( + n_components=2, + affinity="nearest_neighbors", + eigen_solver="amg", + n_neighbors=5, + random_state=np.random.RandomState(seed), + ) + se_arpack = SpectralEmbedding( + n_components=2, + affinity="nearest_neighbors", + eigen_solver="arpack", + n_neighbors=5, + random_state=np.random.RandomState(seed), + ) + embed_amg = se_amg.fit_transform(S.astype(dtype)) + embed_arpack = se_arpack.fit_transform(S.astype(dtype)) + _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5) + + # same with special case in which amg is not actually used + # regression test for #10715 + # affinity between nodes + row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32) + col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32) + val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64) + + affinity = coo_container( + (np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))), + shape=(6, 6), + ) + se_amg.affinity = "precomputed" + se_arpack.affinity = "precomputed" + embed_amg = se_amg.fit_transform(affinity.astype(dtype)) + embed_arpack = se_arpack.fit_transform(affinity.astype(dtype)) + _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5) + + # Check that passing a sparse matrix with `np.int64` indices dtype raises an error + # or is successful based on the version of SciPy which is installed. + # Use a CSR matrix to avoid any conversion during the validation + affinity = affinity.tocsr() + affinity.indptr = affinity.indptr.astype(np.int64) + affinity.indices = affinity.indices.astype(np.int64) + + # PR: https://github.com/scipy/scipy/pull/18913 + # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279 + scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3") + if scipy_graph_traversal_supports_int64_index: + se_amg.fit_transform(affinity) + else: + err_msg = "Only sparse matrices with 32-bit integer indices are accepted" + with pytest.raises(ValueError, match=err_msg): + se_amg.fit_transform(affinity) + + +@pytest.mark.skipif( + not pyamg_available, reason="PyAMG is required for the tests in this function." 
+) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_spectral_embedding_amg_solver_failure(dtype, seed=36): + # Non-regression test for amg solver failure (issue #13393 on github) + num_nodes = 100 + X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed) + X = X.astype(dtype) + upper = sparse.triu(X) - sparse.diags(X.diagonal()) + sym_matrix = upper + upper.T + embedding = spectral_embedding( + sym_matrix, n_components=10, eigen_solver="amg", random_state=0 + ) + + # Check that the learned embedding is stable w.r.t. random solver init: + for i in range(3): + new_embedding = spectral_embedding( + sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1 + ) + _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05) + + +def test_pipeline_spectral_clustering(seed=36): + # Test using pipeline to do spectral clustering + random_state = np.random.RandomState(seed) + se_rbf = SpectralEmbedding( + n_components=n_clusters, affinity="rbf", random_state=random_state + ) + se_knn = SpectralEmbedding( + n_components=n_clusters, + affinity="nearest_neighbors", + n_neighbors=5, + random_state=random_state, + ) + for se in [se_rbf, se_knn]: + km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10) + km.fit(se.fit_transform(S)) + assert_array_almost_equal( + normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2 + ) + + +def test_connectivity(seed=36): + # Test that graph connectivity test works as expected + graph = np.array( + [ + [1, 0, 0, 0, 0], + [0, 1, 1, 0, 0], + [0, 1, 1, 1, 0], + [0, 0, 1, 1, 1], + [0, 0, 0, 1, 1], + ] + ) + assert not _graph_is_connected(graph) + for csr_container in CSR_CONTAINERS: + assert not _graph_is_connected(csr_container(graph)) + for csc_container in CSC_CONTAINERS: + assert not _graph_is_connected(csc_container(graph)) + + graph = np.array( + [ + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [0, 1, 1, 1, 0], + [0, 0, 1, 1, 1], + [0, 0, 0, 1, 1], + ] + ) + assert _graph_is_connected(graph) + for csr_container in CSR_CONTAINERS: + assert _graph_is_connected(csr_container(graph)) + for csc_container in CSC_CONTAINERS: + assert _graph_is_connected(csc_container(graph)) + + +def test_spectral_embedding_deterministic(): + # Test that Spectral Embedding is deterministic + random_state = np.random.RandomState(36) + data = random_state.randn(10, 30) + sims = rbf_kernel(data) + embedding_1 = spectral_embedding(sims) + embedding_2 = spectral_embedding(sims) + assert_array_almost_equal(embedding_1, embedding_2) + + +def test_spectral_embedding_unnormalized(): + # Test that spectral_embedding is also processing unnormalized laplacian + # correctly + random_state = np.random.RandomState(36) + data = random_state.randn(10, 30) + sims = rbf_kernel(data) + n_components = 8 + embedding_1 = spectral_embedding( + sims, norm_laplacian=False, n_components=n_components, drop_first=False + ) + + # Verify using manual computation with dense eigh + laplacian, dd = csgraph_laplacian(sims, normed=False, return_diag=True) + _, diffusion_map = eigh(laplacian) + embedding_2 = diffusion_map.T[:n_components] + embedding_2 = _deterministic_vector_sign_flip(embedding_2).T + + assert_array_almost_equal(embedding_1, embedding_2) + + +def test_spectral_embedding_first_eigen_vector(): + # Test that the first eigenvector of spectral_embedding + # is constant and that the second is not (for a connected graph) + random_state = np.random.RandomState(36) + data = random_state.randn(10, 30) + sims = rbf_kernel(data) + n_components = 2 + + 
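    # (Reasoning sketch: for a connected graph the unnormalized Laplacian
    #  L = D - W satisfies L @ ones == 0, so the eigenvector attached to
    #  eigenvalue 0 is the constant vector; with drop_first=False it is kept as
    #  the first column, hence its standard deviation should be ~0 while the
    #  second column carries actual structure.)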
for seed in range(10): + embedding = spectral_embedding( + sims, + norm_laplacian=False, + n_components=n_components, + drop_first=False, + random_state=seed, + ) + + assert np.std(embedding[:, 0]) == pytest.approx(0) + assert np.std(embedding[:, 1]) > 1e-3 + + +@pytest.mark.parametrize( + "eigen_solver", + [ + "arpack", + "lobpcg", + pytest.param("amg", marks=skip_if_no_pyamg), + ], +) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_spectral_embedding_preserves_dtype(eigen_solver, dtype): + """Check that `SpectralEmbedding is preserving the dtype of the fitted + attribute and transformed data. + + Ideally, this test should be covered by the common test + `check_transformer_preserve_dtypes`. However, this test only run + with transformers implementing `transform` while `SpectralEmbedding` + implements only `fit_transform`. + """ + X = S.astype(dtype) + se = SpectralEmbedding( + n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0 + ) + X_trans = se.fit_transform(X) + + assert X_trans.dtype == dtype + assert se.embedding_.dtype == dtype + assert se.affinity_matrix_.dtype == dtype + + +@pytest.mark.skipif( + pyamg_available, + reason="PyAMG is installed and we should not test for an error.", +) +def test_error_pyamg_not_available(): + se_precomp = SpectralEmbedding( + n_components=2, + affinity="rbf", + eigen_solver="amg", + ) + err_msg = "The eigen_solver was set to 'amg', but pyamg is not available." + with pytest.raises(ValueError, match=err_msg): + se_precomp.fit_transform(S) + + +@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container): + """Test that `eigen_tol="auto"` is resolved correctly""" + if solver == "amg" and not pyamg_available: + pytest.skip("PyAMG is not available.") + X, _ = make_blobs( + n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01 + ) + D = pairwise_distances(X) # Distance matrix + S = np.max(D) - D # Similarity matrix + + solver_func = eigsh if solver == "arpack" else lobpcg + default_value = 0 if solver == "arpack" else None + if solver == "amg": + S = csr_container(S) + + mocked_solver = Mock(side_effect=solver_func) + + monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver) + + spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto") + mocked_solver.assert_called() + + _, kwargs = mocked_solver.call_args + assert kwargs["tol"] == default_value diff --git a/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_t_sne.py b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_t_sne.py new file mode 100644 index 0000000000000000000000000000000000000000..4f32b889d5b1f20c758228f075e4f5541bfb3300 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/manifold/tests/test_t_sne.py @@ -0,0 +1,1187 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest +import scipy.sparse as sp +from numpy.testing import assert_allclose +from scipy.optimize import check_grad +from scipy.spatial.distance import pdist, squareform + +from sklearn import config_context +from sklearn.datasets import make_blobs + +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from sklearn.manifold import ( # type: ignore[attr-defined] + TSNE, + _barnes_hut_tsne, +) +from sklearn.manifold._t_sne import ( + _gradient_descent, + _joint_probabilities, + _joint_probabilities_nn, 
+ _kl_divergence, + _kl_divergence_bh, + trustworthiness, +) +from sklearn.manifold._utils import _binary_search_perplexity +from sklearn.metrics.pairwise import ( + cosine_distances, + manhattan_distances, + pairwise_distances, +) +from sklearn.neighbors import NearestNeighbors, kneighbors_graph +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + skip_if_32bit, +) +from sklearn.utils.fixes import CSR_CONTAINERS, LIL_CONTAINERS + +x = np.linspace(0, 1, 10) +xx, yy = np.meshgrid(x, x) +X_2d_grid = np.hstack( + [ + xx.ravel().reshape(-1, 1), + yy.ravel().reshape(-1, 1), + ] +) + + +def test_gradient_descent_stops(): + # Test stopping conditions of gradient descent. + class ObjectiveSmallGradient: + def __init__(self): + self.it = -1 + + def __call__(self, _, compute_error=True): + self.it += 1 + return (10 - self.it) / 10.0, np.array([1e-5]) + + def flat_function(_, compute_error=True): + return 0.0, np.ones(1) + + # Gradient norm + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + _, error, it = _gradient_descent( + ObjectiveSmallGradient(), + np.zeros(1), + 0, + max_iter=100, + n_iter_without_progress=100, + momentum=0.0, + learning_rate=0.0, + min_gain=0.0, + min_grad_norm=1e-5, + verbose=2, + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + assert error == 1.0 + assert it == 0 + assert "gradient norm" in out + + # Maximum number of iterations without improvement + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + _, error, it = _gradient_descent( + flat_function, + np.zeros(1), + 0, + max_iter=100, + n_iter_without_progress=10, + momentum=0.0, + learning_rate=0.0, + min_gain=0.0, + min_grad_norm=0.0, + verbose=2, + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + assert error == 0.0 + assert it == 11 + assert "did not make any progress" in out + + # Maximum number of iterations + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + _, error, it = _gradient_descent( + ObjectiveSmallGradient(), + np.zeros(1), + 0, + max_iter=11, + n_iter_without_progress=100, + momentum=0.0, + learning_rate=0.0, + min_gain=0.0, + min_grad_norm=0.0, + verbose=2, + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + assert error == 0.0 + assert it == 10 + assert "Iteration 10" in out + + +def test_binary_search(): + # Test if the binary search finds Gaussians with desired perplexity. + random_state = check_random_state(0) + data = random_state.randn(50, 5) + distances = pairwise_distances(data).astype(np.float32) + desired_perplexity = 25.0 + P = _binary_search_perplexity(distances, desired_perplexity, verbose=0) + P = np.maximum(P, np.finfo(np.double).eps) + mean_perplexity = np.mean( + [np.exp(-np.sum(P[i] * np.log(P[i]))) for i in range(P.shape[0])] + ) + assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3) + + +def test_binary_search_underflow(): + # Test if the binary search finds Gaussians with desired perplexity. + # A more challenging case than the one above, producing numeric + # underflow in float precision (see issue #19471 and PR #19472). 
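    # (Perplexity is defined as 2**H(P_i), with H the Shannon entropy of the
    #  conditional distribution in bits; the assertion below recomputes it from
    #  the returned row of P to check that the binary search hit the target.)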
+ random_state = check_random_state(42) + data = random_state.randn(1, 90).astype(np.float32) + 100 + desired_perplexity = 30.0 + P = _binary_search_perplexity(data, desired_perplexity, verbose=0) + perplexity = 2 ** -np.nansum(P[0, 1:] * np.log2(P[0, 1:])) + assert_almost_equal(perplexity, desired_perplexity, decimal=3) + + +def test_binary_search_neighbors(): + # Binary perplexity search approximation. + # Should be approximately equal to the slow method when we use + # all points as neighbors. + n_samples = 200 + desired_perplexity = 25.0 + random_state = check_random_state(0) + data = random_state.randn(n_samples, 2).astype(np.float32, copy=False) + distances = pairwise_distances(data) + P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0) + + # Test that when we use all the neighbors the results are identical + n_neighbors = n_samples - 1 + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, n_neighbors) + P2 = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0) + + indptr = distance_graph.indptr + P1_nn = np.array( + [ + P1[k, distance_graph.indices[indptr[k] : indptr[k + 1]]] + for k in range(n_samples) + ] + ) + assert_array_almost_equal(P1_nn, P2, decimal=4) + + # Test that the highest P_ij are the same when fewer neighbors are used + for k in np.linspace(150, n_samples - 1, 5): + k = int(k) + topn = k * 10 # check the top 10 * k entries out of k * k entries + distance_graph = nn.kneighbors_graph(n_neighbors=k, mode="distance") + distances_nn = distance_graph.data.astype(np.float32, copy=False) + distances_nn = distances_nn.reshape(n_samples, k) + P2k = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0) + assert_array_almost_equal(P1_nn, P2, decimal=2) + idx = np.argsort(P1.ravel())[::-1] + P1top = P1.ravel()[idx][:topn] + idx = np.argsort(P2k.ravel())[::-1] + P2top = P2k.ravel()[idx][:topn] + assert_array_almost_equal(P1top, P2top, decimal=2) + + +def test_binary_perplexity_stability(): + # Binary perplexity search should be stable. + # The binary_search_perplexity had a bug wherein the P array + # was uninitialized, leading to sporadically failing tests. + n_neighbors = 10 + n_samples = 100 + random_state = check_random_state(0) + data = random_state.randn(n_samples, 5) + nn = NearestNeighbors().fit(data) + distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + distances = distance_graph.data.astype(np.float32, copy=False) + distances = distances.reshape(n_samples, n_neighbors) + last_P = None + desired_perplexity = 3 + for _ in range(100): + P = _binary_search_perplexity(distances.copy(), desired_perplexity, verbose=0) + P1 = _joint_probabilities_nn(distance_graph, desired_perplexity, verbose=0) + # Convert the sparse matrix to a dense one for testing + P1 = P1.toarray() + if last_P is None: + last_P = P + last_P1 = P1 + else: + assert_array_almost_equal(P, last_P, decimal=4) + assert_array_almost_equal(P1, last_P1, decimal=4) + + +def test_gradient(): + # Test gradient of Kullback-Leibler divergence. 
+ random_state = check_random_state(0) + + n_samples = 50 + n_features = 2 + n_components = 2 + alpha = 1.0 + + distances = random_state.randn(n_samples, n_features).astype(np.float32) + distances = np.abs(distances.dot(distances.T)) + np.fill_diagonal(distances, 0.0) + X_embedded = random_state.randn(n_samples, n_components).astype(np.float32) + + P = _joint_probabilities(distances, desired_perplexity=25.0, verbose=0) + + def fun(params): + return _kl_divergence(params, P, alpha, n_samples, n_components)[0] + + def grad(params): + return _kl_divergence(params, P, alpha, n_samples, n_components)[1] + + assert_almost_equal(check_grad(fun, grad, X_embedded.ravel()), 0.0, decimal=5) + + +def test_trustworthiness(): + # Test trustworthiness score. + random_state = check_random_state(0) + + # Affine transformation + X = random_state.randn(100, 2) + assert trustworthiness(X, 5.0 + X / 10.0) == 1.0 + + # Randomly shuffled + X = np.arange(100).reshape(-1, 1) + X_embedded = X.copy() + random_state.shuffle(X_embedded) + assert trustworthiness(X, X_embedded) < 0.6 + + # Completely different + X = np.arange(5).reshape(-1, 1) + X_embedded = np.array([[0], [2], [4], [1], [3]]) + assert_almost_equal(trustworthiness(X, X_embedded, n_neighbors=1), 0.2) + + +def test_trustworthiness_n_neighbors_error(): + """Raise an error when n_neighbors >= n_samples / 2. + + Non-regression test for #18567. + """ + regex = "n_neighbors .+ should be less than .+" + rng = np.random.RandomState(42) + X = rng.rand(7, 4) + X_embedded = rng.rand(7, 2) + with pytest.raises(ValueError, match=regex): + trustworthiness(X, X_embedded, n_neighbors=5) + + trust = trustworthiness(X, X_embedded, n_neighbors=3) + assert 0 <= trust <= 1 + + +@pytest.mark.parametrize("method", ["exact", "barnes_hut"]) +@pytest.mark.parametrize("init", ("random", "pca")) +def test_preserve_trustworthiness_approximately(method, init): + # Nearest neighbors should be preserved approximately. + random_state = check_random_state(0) + n_components = 2 + X = random_state.randn(50, n_components).astype(np.float32) + tsne = TSNE( + n_components=n_components, + init=init, + random_state=0, + method=method, + max_iter=700, + learning_rate="auto", + ) + X_embedded = tsne.fit_transform(X) + t = trustworthiness(X, X_embedded, n_neighbors=1) + assert t > 0.85 + + +def test_optimization_minimizes_kl_divergence(): + """t-SNE should give a lower KL divergence with more iterations.""" + random_state = check_random_state(0) + X, _ = make_blobs(n_features=3, random_state=random_state) + kl_divergences = [] + for max_iter in [250, 300, 350]: + tsne = TSNE( + n_components=2, + init="random", + perplexity=10, + learning_rate=100.0, + max_iter=max_iter, + random_state=0, + ) + tsne.fit_transform(X) + kl_divergences.append(tsne.kl_divergence_) + assert kl_divergences[1] <= kl_divergences[0] + assert kl_divergences[2] <= kl_divergences[1] + + +@pytest.mark.parametrize("method", ["exact", "barnes_hut"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_fit_transform_csr_matrix(method, csr_container): + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + # X can be a sparse matrix. 
+ rng = check_random_state(0) + X = rng.randn(50, 2) + X[(rng.randint(0, 50, 25), rng.randint(0, 2, 25))] = 0.0 + X_csr = csr_container(X) + tsne = TSNE( + n_components=2, + init="random", + perplexity=10, + learning_rate=100.0, + random_state=0, + method=method, + max_iter=750, + ) + X_embedded = tsne.fit_transform(X_csr) + assert_allclose(trustworthiness(X_csr, X_embedded, n_neighbors=1), 1.0, rtol=1.1e-1) + + +def test_preserve_trustworthiness_approximately_with_precomputed_distances(): + # Nearest neighbors should be preserved approximately. + random_state = check_random_state(0) + for i in range(3): + X = random_state.randn(80, 2) + D = squareform(pdist(X), "sqeuclidean") + tsne = TSNE( + n_components=2, + perplexity=2, + learning_rate=100.0, + early_exaggeration=2.0, + metric="precomputed", + random_state=i, + verbose=0, + max_iter=500, + init="random", + ) + X_embedded = tsne.fit_transform(D) + t = trustworthiness(D, X_embedded, n_neighbors=1, metric="precomputed") + assert t > 0.95 + + +def test_trustworthiness_not_euclidean_metric(): + # Test trustworthiness with a metric different from 'euclidean' and + # 'precomputed' + random_state = check_random_state(0) + X = random_state.randn(100, 2) + assert trustworthiness(X, X, metric="cosine") == trustworthiness( + pairwise_distances(X, metric="cosine"), X, metric="precomputed" + ) + + +@pytest.mark.parametrize( + "method, retype", + [ + ("exact", np.asarray), + ("barnes_hut", np.asarray), + *[("barnes_hut", csr_container) for csr_container in CSR_CONTAINERS], + ], +) +@pytest.mark.parametrize( + "D, message_regex", + [ + ([[0.0], [1.0]], ".* square distance matrix"), + ([[0.0, -1.0], [1.0, 0.0]], ".* positive.*"), + ], +) +def test_bad_precomputed_distances(method, D, retype, message_regex): + tsne = TSNE( + metric="precomputed", + method=method, + init="random", + random_state=42, + perplexity=1, + ) + with pytest.raises(ValueError, match=message_regex): + tsne.fit_transform(retype(D)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_exact_no_precomputed_sparse(csr_container): + tsne = TSNE( + metric="precomputed", + method="exact", + init="random", + random_state=42, + perplexity=1, + ) + with pytest.raises(TypeError, match="sparse"): + tsne.fit_transform(csr_container([[0, 5], [5, 0]])) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_high_perplexity_precomputed_sparse_distances(csr_container): + # Perplexity should be less than 50 + dist = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]]) + bad_dist = csr_container(dist) + tsne = TSNE(metric="precomputed", init="random", random_state=42, perplexity=1) + msg = "3 neighbors per samples are required, but some samples have only 1" + with pytest.raises(ValueError, match=msg): + tsne.fit_transform(bad_dist) + + +@pytest.mark.filterwarnings( + "ignore:Precomputed sparse input was not sorted by " + "row values:sklearn.exceptions.EfficiencyWarning" +) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +def test_sparse_precomputed_distance(sparse_container): + """Make sure that TSNE works identically for sparse and dense matrix""" + random_state = check_random_state(0) + X = random_state.randn(100, 2) + + D_sparse = kneighbors_graph(X, n_neighbors=100, mode="distance", include_self=True) + D = pairwise_distances(X) + assert sp.issparse(D_sparse) + assert_almost_equal(D_sparse.toarray(), D) + + tsne = TSNE( + metric="precomputed", random_state=0, init="random", learning_rate="auto" + ) + Xt_dense = 
tsne.fit_transform(D) + + Xt_sparse = tsne.fit_transform(sparse_container(D_sparse)) + assert_almost_equal(Xt_dense, Xt_sparse) + + +def test_non_positive_computed_distances(): + # Computed distance matrices must be positive. + def metric(x, y): + return -1 + + # Negative computed distances should be caught even if result is squared + tsne = TSNE(metric=metric, method="exact", perplexity=1) + X = np.array([[0.0, 0.0], [1.0, 1.0]]) + with pytest.raises(ValueError, match="All distances .*metric given.*"): + tsne.fit_transform(X) + + +def test_init_ndarray(): + # Initialize TSNE with ndarray and test fit + tsne = TSNE(init=np.zeros((100, 2)), learning_rate="auto") + X_embedded = tsne.fit_transform(np.ones((100, 5))) + assert_array_equal(np.zeros((100, 2)), X_embedded) + + +def test_init_ndarray_precomputed(): + # Initialize TSNE with ndarray and metric 'precomputed' + # Make sure no FutureWarning is thrown from _fit + tsne = TSNE( + init=np.zeros((100, 2)), + metric="precomputed", + learning_rate=50.0, + ) + tsne.fit(np.zeros((100, 100))) + + +def test_pca_initialization_not_compatible_with_precomputed_kernel(): + # Precomputed distance matrices cannot use PCA initialization. + tsne = TSNE(metric="precomputed", init="pca", perplexity=1) + with pytest.raises( + ValueError, + match='The parameter init="pca" cannot be used with metric="precomputed".', + ): + tsne.fit_transform(np.array([[0.0], [1.0]])) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pca_initialization_not_compatible_with_sparse_input(csr_container): + # Sparse input matrices cannot use PCA initialization. + tsne = TSNE(init="pca", learning_rate=100.0, perplexity=1) + with pytest.raises(TypeError, match="PCA initialization.*"): + tsne.fit_transform(csr_container([[0, 5], [5, 0]])) + + +def test_n_components_range(): + # barnes_hut method should only be used with n_components <= 3 + tsne = TSNE(n_components=4, method="barnes_hut", perplexity=1) + with pytest.raises(ValueError, match="'n_components' should be .*"): + tsne.fit_transform(np.array([[0.0], [1.0]])) + + +def test_early_exaggeration_used(): + # check that the ``early_exaggeration`` parameter has an effect + random_state = check_random_state(0) + n_components = 2 + methods = ["exact", "barnes_hut"] + X = random_state.randn(25, n_components).astype(np.float32) + for method in methods: + tsne = TSNE( + n_components=n_components, + perplexity=1, + learning_rate=100.0, + init="pca", + random_state=0, + method=method, + early_exaggeration=1.0, + max_iter=250, + ) + X_embedded1 = tsne.fit_transform(X) + tsne = TSNE( + n_components=n_components, + perplexity=1, + learning_rate=100.0, + init="pca", + random_state=0, + method=method, + early_exaggeration=10.0, + max_iter=250, + ) + X_embedded2 = tsne.fit_transform(X) + + assert not np.allclose(X_embedded1, X_embedded2) + + +def test_max_iter_used(): + # check that the ``max_iter`` parameter has an effect + random_state = check_random_state(0) + n_components = 2 + methods = ["exact", "barnes_hut"] + X = random_state.randn(25, n_components).astype(np.float32) + for method in methods: + for max_iter in [251, 500]: + tsne = TSNE( + n_components=n_components, + perplexity=1, + learning_rate=0.5, + init="random", + random_state=0, + method=method, + early_exaggeration=1.0, + max_iter=max_iter, + ) + tsne.fit_transform(X) + + assert tsne.n_iter_ == max_iter - 1 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_answer_gradient_two_points(csr_container): + # Test the tree with only a single 
set of children. + # + # These tests & answers have been checked against the reference + # implementation by LvdM. + pos_input = np.array([[1.0, 0.0], [0.0, 1.0]]) + pos_output = np.array( + [[-4.961291e-05, -1.072243e-04], [9.259460e-05, 2.702024e-04]] + ) + neighbors = np.array([[1], [0]]) + grad_output = np.array( + [[-2.37012478e-05, -6.29044398e-05], [2.37012478e-05, 6.29044398e-05]] + ) + _run_answer_test(pos_input, pos_output, neighbors, grad_output, csr_container) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_answer_gradient_four_points(csr_container): + # Four points tests the tree with multiple levels of children. + # + # These tests & answers have been checked against the reference + # implementation by LvdM. + pos_input = np.array([[1.0, 0.0], [0.0, 1.0], [5.0, 2.0], [7.3, 2.2]]) + pos_output = np.array( + [ + [6.080564e-05, -7.120823e-05], + [-1.718945e-04, -4.000536e-05], + [-2.271720e-04, 8.663310e-05], + [-1.032577e-04, -3.582033e-05], + ] + ) + neighbors = np.array([[1, 2, 3], [0, 2, 3], [1, 0, 3], [1, 2, 0]]) + grad_output = np.array( + [ + [5.81128448e-05, -7.78033454e-06], + [-5.81526851e-05, 7.80976444e-06], + [4.24275173e-08, -3.69569698e-08], + [-2.58720939e-09, 7.52706374e-09], + ] + ) + _run_answer_test(pos_input, pos_output, neighbors, grad_output, csr_container) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_skip_num_points_gradient(csr_container): + # Test the kwargs option skip_num_points. + # + # Skip num points should make it such that the Barnes_hut gradient + # is not calculated for indices below skip_num_point. + # Aside from skip_num_points=2 and the first two gradient rows + # being set to zero, these data points are the same as in + # test_answer_gradient_four_points() + pos_input = np.array([[1.0, 0.0], [0.0, 1.0], [5.0, 2.0], [7.3, 2.2]]) + pos_output = np.array( + [ + [6.080564e-05, -7.120823e-05], + [-1.718945e-04, -4.000536e-05], + [-2.271720e-04, 8.663310e-05], + [-1.032577e-04, -3.582033e-05], + ] + ) + neighbors = np.array([[1, 2, 3], [0, 2, 3], [1, 0, 3], [1, 2, 0]]) + grad_output = np.array( + [ + [0.0, 0.0], + [0.0, 0.0], + [4.24275173e-08, -3.69569698e-08], + [-2.58720939e-09, 7.52706374e-09], + ] + ) + _run_answer_test( + pos_input, pos_output, neighbors, grad_output, csr_container, False, 0.1, 2 + ) + + +def _run_answer_test( + pos_input, + pos_output, + neighbors, + grad_output, + csr_container, + verbose=False, + perplexity=0.1, + skip_num_points=0, +): + distances = pairwise_distances(pos_input).astype(np.float32) + args = distances, perplexity, verbose + pos_output = pos_output.astype(np.float32) + neighbors = neighbors.astype(np.int64, copy=False) + pij_input = _joint_probabilities(*args) + pij_input = squareform(pij_input).astype(np.float32) + grad_bh = np.zeros(pos_output.shape, dtype=np.float32) + + P = csr_container(pij_input) + + neighbors = P.indices.astype(np.int64) + indptr = P.indptr.astype(np.int64) + + _barnes_hut_tsne.gradient( + P.data, pos_output, neighbors, indptr, grad_bh, 0.5, 2, 1, skip_num_points=0 + ) + assert_array_almost_equal(grad_bh, grad_output, decimal=4) + + +def test_verbose(): + # Verbose options write to stdout. + random_state = check_random_state(0) + tsne = TSNE(verbose=2, perplexity=4) + X = random_state.randn(5, 2) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + assert "[t-SNE]" in out + assert "nearest neighbors..." 
in out + assert "Computed conditional probabilities" in out + assert "Mean sigma" in out + assert "early exaggeration" in out + + +def test_chebyshev_metric(): + # t-SNE should allow metrics that cannot be squared (issue #3526). + random_state = check_random_state(0) + tsne = TSNE(metric="chebyshev", perplexity=4) + X = random_state.randn(5, 2) + tsne.fit_transform(X) + + +def test_reduction_to_one_component(): + # t-SNE should allow reduction to one component (issue #4154). + random_state = check_random_state(0) + tsne = TSNE(n_components=1, perplexity=4) + X = random_state.randn(5, 2) + X_embedded = tsne.fit(X).embedding_ + assert np.all(np.isfinite(X_embedded)) + + +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +@pytest.mark.parametrize("dt", [np.float32, np.float64]) +def test_64bit(method, dt): + # Ensure 64bit arrays are handled correctly. + random_state = check_random_state(0) + + X = random_state.randn(10, 2).astype(dt, copy=False) + tsne = TSNE( + n_components=2, + perplexity=2, + learning_rate=100.0, + random_state=0, + method=method, + verbose=0, + max_iter=300, + init="random", + ) + X_embedded = tsne.fit_transform(X) + effective_type = X_embedded.dtype + + # tsne cython code is only single precision, so the output will + # always be single precision, irrespectively of the input dtype + assert effective_type == np.float32 + + +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +def test_kl_divergence_not_nan(method): + # Ensure kl_divergence_ is computed at last iteration + # even though max_iter % n_iter_check != 0, i.e. 1003 % 50 != 0 + random_state = check_random_state(0) + + X = random_state.randn(50, 2) + tsne = TSNE( + n_components=2, + perplexity=2, + learning_rate=100.0, + random_state=0, + method=method, + verbose=0, + max_iter=503, + init="random", + ) + tsne.fit_transform(X) + + assert not np.isnan(tsne.kl_divergence_) + + +def test_barnes_hut_angle(): + # When Barnes-Hut's angle=0 this corresponds to the exact method. 
+ angle = 0.0 + perplexity = 10 + n_samples = 100 + for n_components in [2, 3]: + n_features = 5 + degrees_of_freedom = float(n_components - 1.0) + + random_state = check_random_state(0) + data = random_state.randn(n_samples, n_features) + distances = pairwise_distances(data) + params = random_state.randn(n_samples, n_components) + P = _joint_probabilities(distances, perplexity, verbose=0) + kl_exact, grad_exact = _kl_divergence( + params, P, degrees_of_freedom, n_samples, n_components + ) + + n_neighbors = n_samples - 1 + distances_csr = ( + NearestNeighbors() + .fit(data) + .kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + ) + P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0) + kl_bh, grad_bh = _kl_divergence_bh( + params, + P_bh, + degrees_of_freedom, + n_samples, + n_components, + angle=angle, + skip_num_points=0, + verbose=0, + ) + + P = squareform(P) + P_bh = P_bh.toarray() + assert_array_almost_equal(P_bh, P, decimal=5) + assert_almost_equal(kl_exact, kl_bh, decimal=3) + + +@skip_if_32bit +def test_n_iter_without_progress(): + # Use a dummy negative n_iter_without_progress and check output on stdout + random_state = check_random_state(0) + X = random_state.randn(100, 10) + for method in ["barnes_hut", "exact"]: + tsne = TSNE( + n_iter_without_progress=-1, + verbose=2, + learning_rate=1e8, + random_state=0, + method=method, + max_iter=351, + init="random", + ) + tsne._N_ITER_CHECK = 1 + tsne._EXPLORATION_MAX_ITER = 0 + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + # The output needs to contain the value of n_iter_without_progress + assert "did not make any progress during the last -1 episodes. Finished." 
in out + + +def test_min_grad_norm(): + # Make sure that the parameter min_grad_norm is used correctly + random_state = check_random_state(0) + X = random_state.randn(100, 2) + min_grad_norm = 0.002 + tsne = TSNE(min_grad_norm=min_grad_norm, verbose=2, random_state=0, method="exact") + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + lines_out = out.split("\n") + + # extract the gradient norm from the verbose output + gradient_norm_values = [] + for line in lines_out: + # When the computation is Finished just an old gradient norm value + # is repeated that we do not need to store + if "Finished" in line: + break + + start_grad_norm = line.find("gradient norm") + if start_grad_norm >= 0: + line = line[start_grad_norm:] + line = line.replace("gradient norm = ", "").split(" ")[0] + gradient_norm_values.append(float(line)) + + # Compute how often the gradient norm is smaller than min_grad_norm + gradient_norm_values = np.array(gradient_norm_values) + n_smaller_gradient_norms = len( + gradient_norm_values[gradient_norm_values <= min_grad_norm] + ) + + # The gradient norm can be smaller than min_grad_norm at most once, + # because in the moment it becomes smaller the optimization stops + assert n_smaller_gradient_norms <= 1 + + +def test_accessible_kl_divergence(): + # Ensures that the accessible kl_divergence matches the computed value + random_state = check_random_state(0) + X = random_state.randn(50, 2) + tsne = TSNE( + n_iter_without_progress=2, + verbose=2, + random_state=0, + method="exact", + max_iter=500, + ) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + tsne.fit_transform(X) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + # The output needs to contain the accessible kl_divergence as the error at + # the last iteration + for line in out.split("\n")[::-1]: + if "Iteration" in line: + _, _, error = line.partition("error = ") + if error: + error, _, _ = error.partition(",") + break + assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5) + + +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +def test_uniform_grid(method): + """Make sure that TSNE can approximately recover a uniform 2D grid + + Due to ties in distances between point in X_2d_grid, this test is platform + dependent for ``method='barnes_hut'`` due to numerical imprecision. + + Also, t-SNE is not assured to converge to the right solution because bad + initialization can lead to convergence to bad local minimum (the + optimization problem is non-convex). To avoid breaking the test too often, + we re-run t-SNE from the final point when the convergence is not good + enough. + """ + seeds = range(3) + max_iter = 500 + for seed in seeds: + tsne = TSNE( + n_components=2, + init="random", + random_state=seed, + perplexity=50, + max_iter=max_iter, + method=method, + learning_rate="auto", + ) + Y = tsne.fit_transform(X_2d_grid) + + try_name = "{}_{}".format(method, seed) + try: + assert_uniform_grid(Y, try_name) + except AssertionError: + # If the test fails a first time, re-run with init=Y to see if + # this was caused by a bad initialization. Note that this will + # also run an early_exaggeration step. 
+ try_name += ":rerun" + tsne.init = Y + Y = tsne.fit_transform(X_2d_grid) + assert_uniform_grid(Y, try_name) + + +def assert_uniform_grid(Y, try_name=None): + # Ensure that the resulting embedding leads to approximately + # uniformly spaced points: the distance to the closest neighbors + # should be non-zero and approximately constant. + nn = NearestNeighbors(n_neighbors=1).fit(Y) + dist_to_nn = nn.kneighbors(return_distance=True)[0].ravel() + assert dist_to_nn.min() > 0.1 + + smallest_to_mean = dist_to_nn.min() / np.mean(dist_to_nn) + largest_to_mean = dist_to_nn.max() / np.mean(dist_to_nn) + + assert smallest_to_mean > 0.5, try_name + assert largest_to_mean < 2, try_name + + +def test_bh_match_exact(): + # check that the ``barnes_hut`` method match the exact one when + # ``angle = 0`` and ``perplexity > n_samples / 3`` + random_state = check_random_state(0) + n_features = 10 + X = random_state.randn(30, n_features).astype(np.float32) + X_embeddeds = {} + max_iter = {} + for method in ["exact", "barnes_hut"]: + tsne = TSNE( + n_components=2, + method=method, + learning_rate=1.0, + init="random", + random_state=0, + max_iter=251, + perplexity=29.5, + angle=0, + ) + # Kill the early_exaggeration + tsne._EXPLORATION_MAX_ITER = 0 + X_embeddeds[method] = tsne.fit_transform(X) + max_iter[method] = tsne.n_iter_ + + assert max_iter["exact"] == max_iter["barnes_hut"] + assert_allclose(X_embeddeds["exact"], X_embeddeds["barnes_hut"], rtol=1e-4) + + +def test_gradient_bh_multithread_match_sequential(): + # check that the bh gradient with different num_threads gives the same + # results + + n_features = 10 + n_samples = 30 + n_components = 2 + degrees_of_freedom = 1 + + angle = 3 + perplexity = 5 + + random_state = check_random_state(0) + data = random_state.randn(n_samples, n_features).astype(np.float32) + params = random_state.randn(n_samples, n_components) + + n_neighbors = n_samples - 1 + distances_csr = ( + NearestNeighbors() + .fit(data) + .kneighbors_graph(n_neighbors=n_neighbors, mode="distance") + ) + P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0) + kl_sequential, grad_sequential = _kl_divergence_bh( + params, + P_bh, + degrees_of_freedom, + n_samples, + n_components, + angle=angle, + skip_num_points=0, + verbose=0, + num_threads=1, + ) + for num_threads in [2, 4]: + kl_multithread, grad_multithread = _kl_divergence_bh( + params, + P_bh, + degrees_of_freedom, + n_samples, + n_components, + angle=angle, + skip_num_points=0, + verbose=0, + num_threads=num_threads, + ) + + assert_allclose(kl_multithread, kl_sequential, rtol=1e-6) + assert_allclose(grad_multithread, grad_multithread) + + +@pytest.mark.parametrize( + "metric, dist_func", + [("manhattan", manhattan_distances), ("cosine", cosine_distances)], +) +@pytest.mark.parametrize("method", ["barnes_hut", "exact"]) +def test_tsne_with_different_distance_metrics(metric, dist_func, method): + """Make sure that TSNE works for different distance metrics""" + + if method == "barnes_hut" and metric == "manhattan": + # The distances computed by `manhattan_distances` differ slightly from those + # computed internally by NearestNeighbors via the PairwiseDistancesReduction + # Cython code-based. This in turns causes T-SNE to converge to a different + # solution but this should not impact the qualitative results as both + # methods. + # NOTE: it's probably not valid from a mathematical point of view to use the + # Manhattan distance for T-SNE... 
+ # TODO: re-enable this test if/when `manhattan_distances` is refactored to + # reuse the same underlying Cython code NearestNeighbors. + # For reference, see: + # https://github.com/scikit-learn/scikit-learn/pull/23865/files#r925721573 + pytest.xfail( + "Distance computations are different for method == 'barnes_hut' and metric" + " == 'manhattan', but this is expected." + ) + + random_state = check_random_state(0) + n_components_original = 3 + n_components_embedding = 2 + X = random_state.randn(50, n_components_original).astype(np.float32) + X_transformed_tsne = TSNE( + metric=metric, + method=method, + n_components=n_components_embedding, + random_state=0, + max_iter=300, + init="random", + learning_rate="auto", + ).fit_transform(X) + X_transformed_tsne_precomputed = TSNE( + metric="precomputed", + method=method, + n_components=n_components_embedding, + random_state=0, + max_iter=300, + init="random", + learning_rate="auto", + ).fit_transform(dist_func(X)) + assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed) + + +@pytest.mark.parametrize("method", ["exact", "barnes_hut"]) +def test_tsne_n_jobs(method): + """Make sure that the n_jobs parameter doesn't impact the output""" + random_state = check_random_state(0) + n_features = 10 + X = random_state.randn(30, n_features) + X_tr_ref = TSNE( + n_components=2, + method=method, + perplexity=25.0, + angle=0, + n_jobs=1, + random_state=0, + init="random", + learning_rate="auto", + ).fit_transform(X) + X_tr = TSNE( + n_components=2, + method=method, + perplexity=25.0, + angle=0, + n_jobs=2, + random_state=0, + init="random", + learning_rate="auto", + ).fit_transform(X) + + assert_allclose(X_tr_ref, X_tr) + + +def test_tsne_with_mahalanobis_distance(): + """Make sure that method_parameters works with mahalanobis distance.""" + random_state = check_random_state(0) + n_samples, n_features = 300, 10 + X = random_state.randn(n_samples, n_features) + default_params = { + "perplexity": 40, + "max_iter": 250, + "learning_rate": "auto", + "init": "random", + "n_components": 3, + "random_state": 0, + } + + tsne = TSNE(metric="mahalanobis", **default_params) + msg = "Must provide either V or VI for Mahalanobis distance" + with pytest.raises(ValueError, match=msg): + tsne.fit_transform(X) + + precomputed_X = squareform(pdist(X, metric="mahalanobis"), checks=True) + X_trans_expected = TSNE(metric="precomputed", **default_params).fit_transform( + precomputed_X + ) + + X_trans = TSNE( + metric="mahalanobis", metric_params={"V": np.cov(X.T)}, **default_params + ).fit_transform(X) + assert_allclose(X_trans, X_trans_expected) + + +@pytest.mark.parametrize("perplexity", (20, 30)) +def test_tsne_perplexity_validation(perplexity): + """Make sure that perplexity > n_samples results in a ValueError""" + + random_state = check_random_state(0) + X = random_state.randn(20, 2) + est = TSNE( + learning_rate="auto", + init="pca", + perplexity=perplexity, + random_state=random_state, + ) + msg = re.escape(f"perplexity ({perplexity}) must be less than n_samples (20)") + with pytest.raises(ValueError, match=msg): + est.fit_transform(X) + + +def test_tsne_works_with_pandas_output(): + """Make sure that TSNE works when the output is set to "pandas". + + Non-regression test for gh-25365. 
+ """ + pytest.importorskip("pandas") + with config_context(transform_output="pandas"): + arr = np.arange(35 * 4).reshape(35, 4) + TSNE(n_components=2).fit_transform(arr) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce86525acc368f681af3c1fd635fbe37ed2815c3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/__init__.py @@ -0,0 +1,181 @@ +"""Score functions, performance metrics, pairwise metrics and distance computations.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from . import cluster +from ._classification import ( + accuracy_score, + balanced_accuracy_score, + brier_score_loss, + class_likelihood_ratios, + classification_report, + cohen_kappa_score, + confusion_matrix, + d2_log_loss_score, + f1_score, + fbeta_score, + hamming_loss, + hinge_loss, + jaccard_score, + log_loss, + matthews_corrcoef, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, + zero_one_loss, +) +from ._dist_metrics import DistanceMetric +from ._plot.confusion_matrix import ConfusionMatrixDisplay +from ._plot.det_curve import DetCurveDisplay +from ._plot.precision_recall_curve import PrecisionRecallDisplay +from ._plot.regression import PredictionErrorDisplay +from ._plot.roc_curve import RocCurveDisplay +from ._ranking import ( + auc, + average_precision_score, + coverage_error, + dcg_score, + det_curve, + label_ranking_average_precision_score, + label_ranking_loss, + ndcg_score, + precision_recall_curve, + roc_auc_score, + roc_curve, + top_k_accuracy_score, +) +from ._regression import ( + d2_absolute_error_score, + d2_pinball_score, + d2_tweedie_score, + explained_variance_score, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_gamma_deviance, + mean_pinball_loss, + mean_poisson_deviance, + mean_squared_error, + mean_squared_log_error, + mean_tweedie_deviance, + median_absolute_error, + r2_score, + root_mean_squared_error, + root_mean_squared_log_error, +) +from ._scorer import check_scoring, get_scorer, get_scorer_names, make_scorer +from .cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + calinski_harabasz_score, + completeness_score, + consensus_score, + davies_bouldin_score, + fowlkes_mallows_score, + homogeneity_completeness_v_measure, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + pair_confusion_matrix, + rand_score, + silhouette_samples, + silhouette_score, + v_measure_score, +) +from .pairwise import ( + euclidean_distances, + nan_euclidean_distances, + pairwise_distances, + pairwise_distances_argmin, + pairwise_distances_argmin_min, + pairwise_distances_chunked, + pairwise_kernels, +) + +__all__ = [ + "ConfusionMatrixDisplay", + "DetCurveDisplay", + "DistanceMetric", + "PrecisionRecallDisplay", + "PredictionErrorDisplay", + "RocCurveDisplay", + "accuracy_score", + "adjusted_mutual_info_score", + "adjusted_rand_score", + "auc", + "average_precision_score", + "balanced_accuracy_score", + "brier_score_loss", + "calinski_harabasz_score", + "check_scoring", + "class_likelihood_ratios", + "classification_report", + "cluster", + "cohen_kappa_score", + "completeness_score", + "confusion_matrix", + "consensus_score", + "coverage_error", + "d2_absolute_error_score", + "d2_log_loss_score", + "d2_pinball_score", + "d2_tweedie_score", + "davies_bouldin_score", + 
"dcg_score", + "det_curve", + "euclidean_distances", + "explained_variance_score", + "f1_score", + "fbeta_score", + "fowlkes_mallows_score", + "get_scorer", + "get_scorer_names", + "hamming_loss", + "hinge_loss", + "homogeneity_completeness_v_measure", + "homogeneity_score", + "jaccard_score", + "label_ranking_average_precision_score", + "label_ranking_loss", + "log_loss", + "make_scorer", + "matthews_corrcoef", + "max_error", + "mean_absolute_error", + "mean_absolute_percentage_error", + "mean_gamma_deviance", + "mean_pinball_loss", + "mean_poisson_deviance", + "mean_squared_error", + "mean_squared_log_error", + "mean_tweedie_deviance", + "median_absolute_error", + "multilabel_confusion_matrix", + "mutual_info_score", + "nan_euclidean_distances", + "ndcg_score", + "normalized_mutual_info_score", + "pair_confusion_matrix", + "pairwise_distances", + "pairwise_distances_argmin", + "pairwise_distances_argmin_min", + "pairwise_distances_chunked", + "pairwise_kernels", + "precision_recall_curve", + "precision_recall_fscore_support", + "precision_score", + "r2_score", + "rand_score", + "recall_score", + "roc_auc_score", + "roc_curve", + "root_mean_squared_error", + "root_mean_squared_log_error", + "silhouette_samples", + "silhouette_score", + "top_k_accuracy_score", + "v_measure_score", + "zero_one_loss", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43850ec0986c9616c11dbb02410d667fe3f2d05e Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_base.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b4dd7f01788628a18b39b4c23ab23221d3d26ef Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_base.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_ranking.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_ranking.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84384539144e1b2578f3a8d38139a2f6b4a0e267 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_ranking.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_regression.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_regression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d34d10666b22a07a2a5652052bea31d6445baf54 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_regression.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_scorer.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_scorer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99425fc0fc069e3c3b3897999463aaade7759cb6 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/_scorer.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/pairwise.cpython-312.pyc 
b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/pairwise.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e9c141c115ab136474b17ff00f9b29e2e8db9da Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/__pycache__/pairwise.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_base.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..aa4150c88a9783aee51d6bf9e89172806728c97f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_base.py @@ -0,0 +1,193 @@ +""" +Common code for all metrics. + +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import combinations + +import numpy as np + +from ..utils import check_array, check_consistent_length +from ..utils.multiclass import type_of_target + + +def _average_binary_score(binary_metric, y_true, y_score, average, sample_weight=None): + """Average a binary metric for multilabel classification. + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels in binary label indicators. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or binary decisions. + + average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro' + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + binary_metric : callable, returns shape [n_classes] + The binary metric function to use. + + Returns + ------- + score : float or array of shape [n_classes] + If ``average`` is not ``None``, return the averaged score; otherwise + return the score for each class.
+ + """ + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options: + raise ValueError("average has to be one of {0}".format(average_options)) + + y_type = type_of_target(y_true) + if y_type not in ("binary", "multilabel-indicator"): + raise ValueError("{0} format is not supported".format(y_type)) + + if y_type == "binary": + return binary_metric(y_true, y_score, sample_weight=sample_weight) + + check_consistent_length(y_true, y_score, sample_weight) + y_true = check_array(y_true) + y_score = check_array(y_score) + + not_average_axis = 1 + score_weight = sample_weight + average_weight = None + + if average == "micro": + if score_weight is not None: + score_weight = np.repeat(score_weight, y_true.shape[1]) + y_true = y_true.ravel() + y_score = y_score.ravel() + + elif average == "weighted": + if score_weight is not None: + average_weight = np.sum( + np.multiply(y_true, np.reshape(score_weight, (-1, 1))), axis=0 + ) + else: + average_weight = np.sum(y_true, axis=0) + if np.isclose(average_weight.sum(), 0.0): + return 0 + + elif average == "samples": + # swap average_weight <-> score_weight + average_weight = score_weight + score_weight = None + not_average_axis = 0 + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if y_score.ndim == 1: + y_score = y_score.reshape((-1, 1)) + + n_classes = y_score.shape[not_average_axis] + score = np.zeros((n_classes,)) + for c in range(n_classes): + y_true_c = y_true.take([c], axis=not_average_axis).ravel() + y_score_c = y_score.take([c], axis=not_average_axis).ravel() + score[c] = binary_metric(y_true_c, y_score_c, sample_weight=score_weight) + + # Average the results + if average is not None: + if average_weight is not None: + # Scores with 0 weights are forced to be 0, preventing the average + # score from being affected by 0-weighted NaN elements. + average_weight = np.asarray(average_weight) + score[average_weight == 0] = 0 + return float(np.average(score, weights=average_weight)) + else: + return score + + +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average="macro"): + """Average one-versus-one scores for multiclass classification. + + Uses the binary metric for one-vs-one multiclass classification, + where the score is computed according to the Hand & Till (2001) algorithm. + + Parameters + ---------- + binary_metric : callable + The binary metric function to use that accepts the following as input: + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. + y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + y_true : array-like of shape (n_samples,) + True multiclass labels. + + y_score : array-like of shape (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class. + + average : {'macro', 'weighted'}, default='macro' + Determines the type of averaging performed on the pairwise binary + metric scores: + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + Returns + ------- + score : float + Average of the pairwise binary metric scores. 
+ """ + check_consistent_length(y_true, y_score) + + y_true_unique = np.unique(y_true) + n_classes = y_true_unique.shape[0] + n_pairs = n_classes * (n_classes - 1) // 2 + pair_scores = np.empty(n_pairs) + + is_weighted = average == "weighted" + prevalence = np.empty(n_pairs) if is_weighted else None + + # Compute scores treating a as positive class and b as negative class, + # then b as positive class and a as negative class + for ix, (a, b) in enumerate(combinations(y_true_unique, 2)): + a_mask = y_true == a + b_mask = y_true == b + ab_mask = np.logical_or(a_mask, b_mask) + + if is_weighted: + prevalence[ix] = np.average(ab_mask) + + a_true = a_mask[ab_mask] + b_true = b_mask[ab_mask] + + a_true_score = binary_metric(a_true, y_score[ab_mask, a]) + b_true_score = binary_metric(b_true, y_score[ab_mask, b]) + pair_scores[ix] = (a_true_score + b_true_score) / 2 + + return np.average(pair_scores, weights=prevalence) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_classification.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..06503046790beacc11e0a40df39ec9aeb89d0cac --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_classification.py @@ -0,0 +1,3730 @@ +"""Metrics to assess performance on classification task given class prediction. + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better. + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy.sparse import coo_matrix, csr_matrix, issparse +from scipy.special import xlogy + +from ..exceptions import UndefinedMetricWarning +from ..preprocessing import LabelBinarizer, LabelEncoder +from ..utils import ( + assert_all_finite, + check_array, + check_consistent_length, + check_scalar, + column_or_1d, +) +from ..utils._array_api import ( + _average, + _bincount, + _count_nonzero, + _find_matching_floating_dtype, + _is_numpy_namespace, + _max_precision_float_dtype, + _searchsorted, + _tolist, + _union1d, + get_namespace, + get_namespace_and_device, + xpx, +) +from ..utils._param_validation import ( + Hidden, + Interval, + Options, + StrOptions, + validate_params, +) +from ..utils._unique import attach_unique +from ..utils.extmath import _nanaverage +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.validation import ( + _check_pos_label_consistency, + _check_sample_weight, + _num_samples, +) + + +def _check_zero_division(zero_division): + if isinstance(zero_division, str) and zero_division == "warn": + return np.float64(0.0) + elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]: + return np.float64(zero_division) + else: # np.isnan(zero_division) + return np.nan + + +def _check_targets(y_true, y_pred): + """Check that y_true and y_pred belong to the same classification task. + + This converts multiclass or binary types to a common shape, and raises a + ValueError for a mix of multilabel and multiclass targets, a mix of + multilabel formats, for the presence of continuous-valued or multioutput + targets, or for targets of different lengths. + + Column vectors are squeezed to 1d, while multilabel formats are returned + as CSR sparse label indicators. 
+ + Parameters + ---------- + y_true : array-like + + y_pred : array-like + + Returns + ------- + type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} + The type of the true target data, as output by + ``utils.multiclass.type_of_target``. + + y_true : array or indicator matrix + + y_pred : array or indicator matrix + """ + xp, _ = get_namespace(y_true, y_pred) + check_consistent_length(y_true, y_pred) + type_true = type_of_target(y_true, input_name="y_true") + type_pred = type_of_target(y_pred, input_name="y_pred") + + y_type = {type_true, type_pred} + if y_type == {"binary", "multiclass"}: + y_type = {"multiclass"} + + if len(y_type) > 1: + raise ValueError( + "Classification metrics can't handle a mix of {0} and {1} targets".format( + type_true, type_pred + ) + ) + + # We can't have more than one value on y_type => The set is no more needed + y_type = y_type.pop() + + # No metrics support "multiclass-multioutput" format + if y_type not in ["binary", "multiclass", "multilabel-indicator"]: + raise ValueError("{0} is not supported".format(y_type)) + + if y_type in ["binary", "multiclass"]: + xp, _ = get_namespace(y_true, y_pred) + y_true = column_or_1d(y_true) + y_pred = column_or_1d(y_pred) + if y_type == "binary": + try: + unique_values = _union1d(y_true, y_pred, xp) + except TypeError as e: + # We expect y_true and y_pred to be of the same data type. + # If `y_true` was provided to the classifier as strings, + # `y_pred` given by the classifier will also be encoded with + # strings. So we raise a meaningful error + raise TypeError( + "Labels in y_true and y_pred should be of the same type. " + f"Got y_true={xp.unique(y_true)} and " + f"y_pred={xp.unique(y_pred)}. Make sure that the " + "predictions provided by the classifier coincides with " + "the true labels." + ) from e + if unique_values.shape[0] > 2: + y_type = "multiclass" + + if y_type.startswith("multilabel"): + if _is_numpy_namespace(xp): + # XXX: do we really want to sparse-encode multilabel indicators when + # they are passed as a dense arrays? This is not possible for array + # API inputs in general hence we only do it for NumPy inputs. But even + # for NumPy the usefulness is questionable. + y_true = csr_matrix(y_true) + y_pred = csr_matrix(y_pred) + y_type = "multilabel-indicator" + + return y_type, y_true, y_pred + + +def _validate_multiclass_probabilistic_prediction( + y_true, y_prob, sample_weight, labels +): + r"""Convert y_true and y_prob to shape (n_samples, n_classes) + + 1. Verify that y_true, y_prob, and sample_weights have the same first dim + 2. Ensure 2 or more classes in y_true i.e. valid classification task. The + classes are provided by the labels argument, or inferred using y_true. + When inferring y_true is assumed binary if it has shape (n_samples, ). + 3. Validate y_true, and y_prob have the same number of classes. Convert to + shape (n_samples, n_classes) + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_prob : array-like of float, shape=(n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If `y_prob.shape = (n_samples,)` + the probabilities provided are assumed to be that of the + positive class. The labels in `y_prob` are assumed to be + ordered lexicographically, as done by + :class:`preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. 
+ + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If `labels` + is `None` and `y_prob` has shape `(n_samples,)` the labels are + assumed to be binary and are inferred from `y_true`. + + Returns + ------- + transformed_labels : array of shape (n_samples, n_classes) + + y_prob : array of shape (n_samples, n_classes) + """ + y_prob = check_array( + y_prob, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] + ) + + if y_prob.max() > 1: + raise ValueError(f"y_prob contains values greater than 1: {y_prob.max()}") + if y_prob.min() < 0: + raise ValueError(f"y_prob contains values lower than 0: {y_prob.min()}") + + check_consistent_length(y_prob, y_true, sample_weight) + lb = LabelBinarizer() + + if labels is not None: + lb = lb.fit(labels) + # LabelBinarizer does not respect the order implied by labels, which + # can be misleading. + if not np.all(lb.classes_ == labels): + warnings.warn( + f"Labels passed were {labels}. But this function " + "assumes labels are ordered lexicographically. " + f"Pass the ordered labels={lb.classes_.tolist()} and ensure that " + "the columns of y_prob correspond to this ordering.", + UserWarning, + ) + if not np.isin(y_true, labels).all(): + undeclared_labels = set(y_true) - set(labels) + raise ValueError( + f"y_true contains values {undeclared_labels} not belonging " + f"to the passed labels {labels}." + ) + + else: + lb = lb.fit(y_true) + + if len(lb.classes_) == 1: + if labels is None: + raise ValueError( + "y_true contains only one label ({0}). Please " + "provide the list of all expected class labels explicitly through the " + "labels argument.".format(lb.classes_[0]) + ) + else: + raise ValueError( + "The labels array needs to contain at least two " + "labels, got {0}.".format(lb.classes_) + ) + + transformed_labels = lb.transform(y_true) + + if transformed_labels.shape[1] == 1: + transformed_labels = np.append( + 1 - transformed_labels, transformed_labels, axis=1 + ) + + # If y_prob is of single dimension, assume y_true to be binary + # and then check. + if y_prob.ndim == 1: + y_prob = y_prob[:, np.newaxis] + if y_prob.shape[1] == 1: + y_prob = np.append(1 - y_prob, y_prob, axis=1) + + eps = np.finfo(y_prob.dtype).eps + + # Make sure y_prob is normalized + y_prob_sum = y_prob.sum(axis=1) + if not np.allclose(y_prob_sum, 1, rtol=np.sqrt(eps)): + warnings.warn( + "The y_prob values do not sum to one. Make sure to pass probabilities.", + UserWarning, + ) + + # Check if dimensions are consistent. + transformed_labels = check_array(transformed_labels) + if len(lb.classes_) != y_prob.shape[1]: + if labels is None: + raise ValueError( + "y_true and y_prob contain different number of " + "classes: {0} vs {1}. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: {2}".format( + transformed_labels.shape[1], y_prob.shape[1], lb.classes_ + ) + ) + else: + raise ValueError( + "The number of classes in labels is different " + "from that in y_prob. Classes found in " + "labels: {0}".format(lb.classes_) + ) + + return transformed_labels, y_prob + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): + """Accuracy classification score. 
+ + In multilabel classification, this function computes subset accuracy: + the set of labels predicted for a sample must *exactly* match the + corresponding set of labels in y_true. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + normalize : bool, default=True + If ``False``, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float or int + If ``normalize == True``, return the fraction of correctly + classified samples (float), else returns the number of correctly + classified samples (int). + + The best performance is 1 with ``normalize == True`` and the number + of samples with ``normalize == False``. + + See Also + -------- + balanced_accuracy_score : Compute the balanced accuracy to deal with + imbalanced datasets. + jaccard_score : Compute the Jaccard similarity coefficient score. + hamming_loss : Compute the average Hamming loss or Hamming distance between + two sets of samples. + zero_one_loss : Compute the Zero-one classification loss. By default, the + function will return the percentage of imperfectly predicted subsets. + + Examples + -------- + >>> from sklearn.metrics import accuracy_score + >>> y_pred = [0, 2, 1, 3] + >>> y_true = [0, 1, 2, 3] + >>> accuracy_score(y_true, y_pred) + 0.5 + >>> accuracy_score(y_true, y_pred, normalize=False) + 2.0 + + In the multilabel case with binary label indicators: + + >>> import numpy as np + >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) + 0.5 + """ + xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight) + # Compute accuracy for each possible representation + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + + if y_type.startswith("multilabel"): + differing_labels = _count_nonzero(y_true - y_pred, xp=xp, device=device, axis=1) + score = xp.asarray(differing_labels == 0, device=device) + else: + score = y_true == y_pred + + return float(_average(score, weights=sample_weight, normalize=normalize)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "labels": ["array-like", None], + "sample_weight": ["array-like", None], + "normalize": [StrOptions({"true", "pred", "all"}), None], + }, + prefer_skip_nested_validation=True, +) +def confusion_matrix( + y_true, y_pred, *, labels=None, sample_weight=None, normalize=None +): + """Compute confusion matrix to evaluate the accuracy of a classification. + + By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` + is equal to the number of observations known to be in group :math:`i` and + predicted to be in group :math:`j`. + + Thus in binary classification, the count of true negatives is + :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is + :math:`C_{1,1}` and false positives is :math:`C_{0,1}`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated targets as returned by a classifier. 
+ + labels : array-like of shape (n_classes), default=None + List of labels to index the matrix. This may be used to reorder + or select a subset of labels. + If ``None`` is given, those that appear at least once + in ``y_true`` or ``y_pred`` are used in sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.18 + + normalize : {'true', 'pred', 'all'}, default=None + Normalizes confusion matrix over the true (rows), predicted (columns) + conditions or all the population. If None, confusion matrix will not be + normalized. + + Returns + ------- + C : ndarray of shape (n_classes, n_classes) + Confusion matrix whose i-th row and j-th + column entry indicates the number of + samples with true label being i-th class + and predicted label being j-th class. + + See Also + -------- + ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix + given an estimator, the data, and the label. + ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix + given the true and predicted labels. + ConfusionMatrixDisplay : Confusion Matrix visualization. + + References + ---------- + .. [1] `Wikipedia entry for the Confusion matrix + `_ + (Wikipedia and other references may use a different + convention for axes). + + Examples + -------- + >>> from sklearn.metrics import confusion_matrix + >>> y_true = [2, 0, 2, 2, 0, 1] + >>> y_pred = [0, 0, 2, 2, 0, 2] + >>> confusion_matrix(y_true, y_pred) + array([[2, 0, 0], + [0, 0, 1], + [1, 0, 2]]) + + >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] + >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] + >>> confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"]) + array([[2, 0, 0], + [0, 0, 1], + [1, 0, 2]]) + + In the binary case, we can extract true positives, etc. 
as follows: + + >>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel().tolist() + >>> (tn, fp, fn, tp) + (0, 2, 1, 1) + """ + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + if y_type not in ("binary", "multiclass"): + raise ValueError("%s is not supported" % y_type) + + if labels is None: + labels = unique_labels(y_true, y_pred) + else: + labels = np.asarray(labels) + n_labels = labels.size + if n_labels == 0: + raise ValueError("'labels' should contains at least one label.") + elif y_true.size == 0: + return np.zeros((n_labels, n_labels), dtype=int) + elif len(np.intersect1d(y_true, labels)) == 0: + raise ValueError("At least one label specified must be in y_true") + + if sample_weight is None: + sample_weight = np.ones(y_true.shape[0], dtype=np.int64) + else: + sample_weight = np.asarray(sample_weight) + + check_consistent_length(y_true, y_pred, sample_weight) + + n_labels = labels.size + # If labels are not consecutive integers starting from zero, then + # y_true and y_pred must be converted into index form + need_index_conversion = not ( + labels.dtype.kind in {"i", "u", "b"} + and np.all(labels == np.arange(n_labels)) + and y_true.min() >= 0 + and y_pred.min() >= 0 + ) + if need_index_conversion: + label_to_ind = {y: x for x, y in enumerate(labels)} + y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) + y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) + + # intersect y_pred, y_true with labels, eliminate items not in labels + ind = np.logical_and(y_pred < n_labels, y_true < n_labels) + if not np.all(ind): + y_pred = y_pred[ind] + y_true = y_true[ind] + # also eliminate weights of eliminated items + sample_weight = sample_weight[ind] + + # Choose the accumulator dtype to always have high precision + if sample_weight.dtype.kind in {"i", "u", "b"}: + dtype = np.int64 + else: + dtype = np.float64 + + cm = coo_matrix( + (sample_weight, (y_true, y_pred)), + shape=(n_labels, n_labels), + dtype=dtype, + ).toarray() + + with np.errstate(all="ignore"): + if normalize == "true": + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == "pred": + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == "all": + cm = cm / cm.sum() + cm = np.nan_to_num(cm) + + if cm.shape == (1, 1): + warnings.warn( + ( + "A single label was found in 'y_true' and 'y_pred'. For the confusion " + "matrix to have the correct shape, use the 'labels' parameter to pass " + "all known labels." + ), + UserWarning, + ) + + return cm + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + "samplewise": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def multilabel_confusion_matrix( + y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False +): + """Compute a confusion matrix for each class or sample. + + .. versionadded:: 0.21 + + Compute class-wise (default) or sample-wise (samplewise=True) multilabel + confusion matrix to evaluate the accuracy of a classification, and output + confusion matrices for each class or sample. + + In multilabel confusion matrix :math:`MCM`, the count of true negatives + is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`, + true positives is :math:`MCM_{:,1,1}` and false positives is + :math:`MCM_{:,0,1}`. + + Multiclass data will be treated as if binarized under a one-vs-rest + transformation. 
Returned confusion matrices will be in the order of + sorted unique labels in the union of (y_true, y_pred). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Ground truth (correct) target values. + + y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ + (n_samples,) + Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like of shape (n_classes,), default=None + A list of classes or column indices to select some (or to force + inclusion of classes absent from the data). + + samplewise : bool, default=False + In the multilabel case, this calculates a confusion matrix per sample. + + Returns + ------- + multi_confusion : ndarray of shape (n_outputs, 2, 2) + A 2x2 confusion matrix corresponding to each output in the input. + When calculating class-wise multi_confusion (default), then + n_outputs = n_labels; when calculating sample-wise multi_confusion + (samplewise=True), n_outputs = n_samples. If ``labels`` is defined, + the results will be returned in the order specified in ``labels``, + otherwise the results will be returned in sorted order by default. + + See Also + -------- + confusion_matrix : Compute confusion matrix to evaluate the accuracy of a + classifier. + + Notes + ----- + The `multilabel_confusion_matrix` calculates class-wise or sample-wise + multilabel confusion matrices, and in multiclass tasks, labels are + binarized under a one-vs-rest way; while + :func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix + for confusion between every two classes. + + Examples + -------- + Multilabel-indicator case: + + >>> import numpy as np + >>> from sklearn.metrics import multilabel_confusion_matrix + >>> y_true = np.array([[1, 0, 1], + ... [0, 1, 0]]) + >>> y_pred = np.array([[1, 0, 0], + ... [0, 1, 1]]) + >>> multilabel_confusion_matrix(y_true, y_pred) + array([[[1, 0], + [0, 1]], + + [[1, 0], + [0, 1]], + + [[0, 1], + [1, 0]]]) + + Multiclass case: + + >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] + >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] + >>> multilabel_confusion_matrix(y_true, y_pred, + ... labels=["ant", "bird", "cat"]) + array([[[3, 1], + [0, 2]], + + [[5, 0], + [1, 0]], + + [[2, 1], + [1, 2]]]) + """ + y_true, y_pred = attach_unique(y_true, y_pred) + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight, device=device_) + check_consistent_length(y_true, y_pred, sample_weight) + + if y_type not in ("binary", "multiclass", "multilabel-indicator"): + raise ValueError("%s is not supported" % y_type) + + present_labels = unique_labels(y_true, y_pred) + if labels is None: + labels = present_labels + n_labels = None + else: + labels = xp.asarray(labels, device=device_) + n_labels = labels.shape[0] + labels = xp.concat( + [labels, xpx.setdiff1d(present_labels, labels, assume_unique=True, xp=xp)], + axis=-1, + ) + + if y_true.ndim == 1: + if samplewise: + raise ValueError( + "Samplewise metrics are not available outside of " + "multilabel classification." 
+ ) + + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + y_pred = le.transform(y_pred) + sorted_labels = le.classes_ + + # labels are now from 0 to len(labels) - 1 -> use bincount + tp = y_true == y_pred + tp_bins = y_true[tp] + if sample_weight is not None: + tp_bins_weights = sample_weight[tp] + else: + tp_bins_weights = None + + if tp_bins.shape[0]: + tp_sum = _bincount( + tp_bins, weights=tp_bins_weights, minlength=labels.shape[0], xp=xp + ) + else: + # Pathological case + true_sum = pred_sum = tp_sum = xp.zeros(labels.shape[0]) + if y_pred.shape[0]: + pred_sum = _bincount( + y_pred, weights=sample_weight, minlength=labels.shape[0], xp=xp + ) + if y_true.shape[0]: + true_sum = _bincount( + y_true, weights=sample_weight, minlength=labels.shape[0], xp=xp + ) + + # Retain only selected labels + indices = _searchsorted(sorted_labels, labels[:n_labels], xp=xp) + tp_sum = xp.take(tp_sum, indices, axis=0) + true_sum = xp.take(true_sum, indices, axis=0) + pred_sum = xp.take(pred_sum, indices, axis=0) + + else: + sum_axis = 1 if samplewise else 0 + + # All labels are index integers for multilabel. + # Select labels: + if labels.shape != present_labels.shape or xp.any( + xp.not_equal(labels, present_labels) + ): + if xp.max(labels) > xp.max(present_labels): + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d > %d" % (xp.max(labels), xp.max(present_labels)) + ) + if xp.min(labels) < 0: + raise ValueError( + "All labels must be in [0, n labels) for " + "multilabel targets. " + "Got %d < 0" % xp.min(labels) + ) + + if n_labels is not None: + y_true = y_true[:, labels[:n_labels]] + y_pred = y_pred[:, labels[:n_labels]] + + if issparse(y_true) or issparse(y_pred): + true_and_pred = y_true.multiply(y_pred) + else: + true_and_pred = xp.multiply(y_true, y_pred) + + # calculate weighted counts + tp_sum = _count_nonzero( + true_and_pred, + axis=sum_axis, + sample_weight=sample_weight, + xp=xp, + device=device_, + ) + pred_sum = _count_nonzero( + y_pred, + axis=sum_axis, + sample_weight=sample_weight, + xp=xp, + device=device_, + ) + true_sum = _count_nonzero( + y_true, + axis=sum_axis, + sample_weight=sample_weight, + xp=xp, + device=device_, + ) + + fp = pred_sum - tp_sum + fn = true_sum - tp_sum + tp = tp_sum + + if sample_weight is not None and samplewise: + tp = xp.asarray(tp) + fp = xp.asarray(fp) + fn = xp.asarray(fn) + tn = sample_weight * y_true.shape[1] - tp - fp - fn + elif sample_weight is not None: + tn = xp.sum(sample_weight) - tp - fp - fn + elif samplewise: + tn = y_true.shape[1] - tp - fp - fn + else: + tn = y_true.shape[0] - tp - fp - fn + + return xp.reshape(xp.stack([tn, fp, fn, tp]).T, (-1, 2, 2)) + + +@validate_params( + { + "y1": ["array-like"], + "y2": ["array-like"], + "labels": ["array-like", None], + "weights": [StrOptions({"linear", "quadratic"}), None], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None): + r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement. + + This function computes Cohen's kappa [1]_, a score that expresses the level + of agreement between two annotators on a classification problem. It is + defined as + + .. 
math:: + \kappa = (p_o - p_e) / (1 - p_e) + + where :math:`p_o` is the empirical probability of agreement on the label + assigned to any sample (the observed agreement ratio), and :math:`p_e` is + the expected agreement when both annotators assign labels randomly. + :math:`p_e` is estimated using a per-annotator empirical prior over the + class labels [2]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y1 : array-like of shape (n_samples,) + Labels assigned by the first annotator. + + y2 : array-like of shape (n_samples,) + Labels assigned by the second annotator. The kappa statistic is + symmetric, so swapping ``y1`` and ``y2`` doesn't change the value. + + labels : array-like of shape (n_classes,), default=None + List of labels to index the matrix. This may be used to select a + subset of labels. If `None`, all labels that appear at least once in + ``y1`` or ``y2`` are used. Note that at least one label in `labels` must be + present in `y1`, even though this function is otherwise agnostic to the order + of `y1` and `y2`. + + weights : {'linear', 'quadratic'}, default=None + Weighting type to calculate the score. `None` means not weighted; + "linear" means linear weighting; "quadratic" means quadratic weighting. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + kappa : float + The kappa statistic, which is a number between -1 and 1. The maximum + value means complete agreement; zero or lower means chance agreement. + + References + ---------- + .. [1] :doi:`J. Cohen (1960). "A coefficient of agreement for nominal scales". + Educational and Psychological Measurement 20(1):37-46. + <10.1177/001316446002000104>` + .. [2] `R. Artstein and M. Poesio (2008). "Inter-coder agreement for + computational linguistics". Computational Linguistics 34(4):555-596 + `_. + .. [3] `Wikipedia entry for the Cohen's kappa + `_. + + Examples + -------- + >>> from sklearn.metrics import cohen_kappa_score + >>> y1 = ["negative", "positive", "negative", "neutral", "positive"] + >>> y2 = ["negative", "positive", "negative", "neutral", "negative"] + >>> cohen_kappa_score(y1, y2) + 0.6875 + """ + try: + confusion = confusion_matrix(y1, y2, labels=labels, sample_weight=sample_weight) + except ValueError as e: + if "At least one label specified must be in y_true" in str(e): + msg = ( + "At least one label in `labels` must be present in `y1` (even though " + "`cohen_kappa_score` is otherwise agnostic to the order of `y1` and " + "`y2`)." 
+ ) + raise ValueError(msg) from e + raise + + n_classes = confusion.shape[0] + sum0 = np.sum(confusion, axis=0) + sum1 = np.sum(confusion, axis=1) + expected = np.outer(sum0, sum1) / np.sum(sum0) + + if weights is None: + w_mat = np.ones([n_classes, n_classes], dtype=int) + w_mat.flat[:: n_classes + 1] = 0 + else: # "linear" or "quadratic" + w_mat = np.zeros([n_classes, n_classes], dtype=int) + w_mat += np.arange(n_classes) + if weights == "linear": + w_mat = np.abs(w_mat - w_mat.T) + else: + w_mat = (w_mat - w_mat.T) ** 2 + + k = np.sum(w_mat * confusion) / np.sum(w_mat * expected) + return float(1 - k) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0, 1}), + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def jaccard_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Jaccard similarity coefficient score. + + The Jaccard index [1], or Jaccard similarity coefficient, defined as + the size of the intersection divided by the size of the union of two label + sets, is used to compare set of predicted labels for a sample to the + corresponding set of labels in ``y_true``. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return the + Jaccard similarity coefficient for `pos_label`. If `average` is not `'binary'`, + `pos_label` is ignored and scores for both classes are computed, then averaged or + both returned (when `average=None`). Similarly, for :term:`multiclass` and + :term:`multilabel` targets, scores for all `labels` are either returned or + averaged depending on the `average` parameter. Use `labels` specify the set of + labels to calculate the score for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + labels : array-like of shape (n_classes,), default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', \ + 'binary'} or None, default='binary' + If ``None``, the scores for each class are returned. 
Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : "warn", {0.0, 1.0}, default="warn" + Sets the value to return when there is a zero division, i.e. when there + there are no negative values in predictions and labels. If set to + "warn", this acts like 0, but a warning is also raised. + + .. versionadded:: 0.24 + + Returns + ------- + score : float or ndarray of shape (n_unique_labels,), dtype=np.float64 + The Jaccard score. When `average` is not `None`, a single scalar is + returned. + + See Also + -------- + accuracy_score : Function for calculating the accuracy score. + f1_score : Function for calculating the F1 score. + multilabel_confusion_matrix : Function for computing a confusion matrix\ + for each class or sample. + + Notes + ----- + :func:`jaccard_score` may be a poor metric if there are no + positives for some samples or classes. Jaccard is undefined if there are + no true or predicted labels, and our implementation will return a score + of 0 with a warning. + + References + ---------- + .. [1] `Wikipedia entry for the Jaccard index + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import jaccard_score + >>> y_true = np.array([[0, 1, 1], + ... [1, 1, 0]]) + >>> y_pred = np.array([[1, 1, 1], + ... [1, 0, 0]]) + + In the binary case: + + >>> jaccard_score(y_true[0], y_pred[0]) + 0.6666 + + In the 2D comparison case (e.g. image similarity): + + >>> jaccard_score(y_true, y_pred, average="micro") + 0.6 + + In the multilabel case: + + >>> jaccard_score(y_true, y_pred, average='samples') + 0.5833 + >>> jaccard_score(y_true, y_pred, average='macro') + 0.6666 + >>> jaccard_score(y_true, y_pred, average=None) + array([0.5, 0.5, 1. ]) + + In the multiclass case: + + >>> y_pred = [0, 2, 1, 2] + >>> y_true = [0, 1, 2, 2] + >>> jaccard_score(y_true, y_pred, average=None) + array([1. , 0. 
, 0.33]) + """ + labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label) + samplewise = average == "samples" + MCM = multilabel_confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + samplewise=samplewise, + ) + numerator = MCM[:, 1, 1] + denominator = MCM[:, 1, 1] + MCM[:, 0, 1] + MCM[:, 1, 0] + + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + if average == "micro": + numerator = xp.asarray(xp.sum(numerator, keepdims=True), device=device_) + denominator = xp.asarray(xp.sum(denominator, keepdims=True), device=device_) + + jaccard = _prf_divide( + numerator, + denominator, + "jaccard", + "true or predicted", + average, + ("jaccard",), + zero_division=zero_division, + ) + if average is None: + return jaccard + if average == "weighted": + weights = MCM[:, 1, 0] + MCM[:, 1, 1] + if not xp.any(weights): + # numerator is 0, and warning should have already been issued + weights = None + elif average == "samples" and sample_weight is not None: + weights = sample_weight + else: + weights = None + return float(_average(jaccard, weights=weights, xp=xp)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): + """Compute the Matthews correlation coefficient (MCC). + + The Matthews correlation coefficient is used in machine learning as a + measure of the quality of binary and multiclass classifications. It takes + into account true and false positives and negatives and is generally + regarded as a balanced measure which can be used even if the classes are of + very different sizes. The MCC is in essence a correlation coefficient value + between -1 and +1. A coefficient of +1 represents a perfect prediction, 0 + an average random prediction and -1 an inverse prediction. The statistic + is also known as the phi coefficient. [source: Wikipedia] + + Binary and multiclass labels are supported. Only in the binary case does + this relate to information about true and false positives and negatives. + See references below. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.18 + + Returns + ------- + mcc : float + The Matthews correlation coefficient (+1 represents a perfect + prediction, 0 an average random prediction and -1 and inverse + prediction). + + References + ---------- + .. [1] :doi:`Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the + accuracy of prediction algorithms for classification: an overview. + <10.1093/bioinformatics/16.5.412>` + + .. [2] `Wikipedia entry for the Matthews Correlation Coefficient (phi coefficient) + `_. + + .. [3] `Gorodkin, (2004). Comparing two K-category assignments by a + K-category correlation coefficient + `_. + + .. [4] `Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN + Error Measures in MultiClass Prediction + `_. 
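+
+    For binary targets, the value returned here agrees with the textbook
+    formula written directly in terms of confusion-matrix counts. A minimal
+    sketch of that identity (the toy labels below are illustrative only)::
+
+        import numpy as np
+        from sklearn.metrics import confusion_matrix, matthews_corrcoef
+
+        y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]
+        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
+        denom = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
+        mcc = (tp * tn - fp * fn) / denom if denom else 0.0
+        # same value as the general multiclass formula used by this function
+        assert np.isclose(mcc, matthews_corrcoef(y_true, y_pred))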
+ + Examples + -------- + >>> from sklearn.metrics import matthews_corrcoef + >>> y_true = [+1, +1, +1, -1] + >>> y_pred = [+1, -1, +1, +1] + >>> matthews_corrcoef(y_true, y_pred) + -0.33 + """ + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + if y_type not in {"binary", "multiclass"}: + raise ValueError("%s is not supported" % y_type) + + lb = LabelEncoder() + lb.fit(np.hstack([y_true, y_pred])) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + + C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + t_sum = C.sum(axis=1, dtype=np.float64) + p_sum = C.sum(axis=0, dtype=np.float64) + n_correct = np.trace(C, dtype=np.float64) + n_samples = p_sum.sum() + cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) + cov_ypyp = n_samples**2 - np.dot(p_sum, p_sum) + cov_ytyt = n_samples**2 - np.dot(t_sum, t_sum) + + cov_ypyp_ytyt = cov_ypyp * cov_ytyt + if cov_ypyp_ytyt == 0: + return 0.0 + else: + return float(cov_ytyp / np.sqrt(cov_ypyp_ytyt)) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): + """Zero-one classification loss. + + If normalize is ``True``, return the fraction of misclassifications + (float), else it returns the number of misclassifications (int). The best + performance is 0. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + normalize : bool, default=True + If ``False``, return the number of misclassifications. + Otherwise, return the fraction of misclassifications. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float or int, + If ``normalize == True``, return the fraction of misclassifications + (float), else it returns the number of misclassifications (int). + + See Also + -------- + accuracy_score : Compute the accuracy score. By default, the function will + return the fraction of correct predictions divided by the total number + of predictions. + hamming_loss : Compute the average Hamming loss or Hamming distance between + two sets of samples. + jaccard_score : Compute the Jaccard similarity coefficient score. + + Notes + ----- + In multilabel classification, the zero_one_loss function corresponds to + the subset zero-one loss: for each sample, the entire set of labels must be + correctly predicted, otherwise the loss for that sample is equal to one. 
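+
+    As a quick sanity check, the normalized loss is simply the complement of
+    :func:`accuracy_score`; a minimal sketch with arbitrary toy labels::
+
+        from sklearn.metrics import accuracy_score, zero_one_loss
+
+        y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]
+        # fraction misclassified == 1 - fraction correctly classified
+        assert zero_one_loss(y_true, y_pred) == 1 - accuracy_score(y_true, y_pred)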
+ + Examples + -------- + >>> from sklearn.metrics import zero_one_loss + >>> y_pred = [1, 2, 3, 4] + >>> y_true = [2, 2, 3, 4] + >>> zero_one_loss(y_true, y_pred) + 0.25 + >>> zero_one_loss(y_true, y_pred, normalize=False) + 1.0 + + In the multilabel case with binary label indicators: + + >>> import numpy as np + >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) + 0.5 + """ + xp, _ = get_namespace(y_true, y_pred) + score = accuracy_score( + y_true, y_pred, normalize=normalize, sample_weight=sample_weight + ) + + if normalize: + return 1 - score + else: + if sample_weight is not None: + n_samples = xp.sum(sample_weight) + else: + n_samples = _num_samples(y_true) + return n_samples - score + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def f1_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the F1 score, also known as balanced F-score or F-measure. + + The F1 score can be interpreted as a harmonic mean of the precision and + recall, where an F1 score reaches its best value at 1 and worst score at 0. + The relative contribution of precision and recall to the F1 score are + equal. The formula for the F1 score is: + + .. math:: + \\text{F1} = \\frac{2 * \\text{TP}}{2 * \\text{TP} + \\text{FP} + \\text{FN}} + + Where :math:`\\text{TP}` is the number of true positives, :math:`\\text{FN}` is the + number of false negatives, and :math:`\\text{FP}` is the number of false positives. + F1 is by default + calculated as 0.0 when there are no true positives, false negatives, or + false positives. + + Support beyond :term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + F1 score for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored + and F1 score for both classes are computed, then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + F1 score for all `labels` are either returned or averaged depending on the + `average` parameter. Use `labels` specify the set of labels to calculate F1 score + for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. 
+ + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. + + Notes: + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + f1_score : float or array of float, shape = [n_unique_labels] + F1 score of the positive class in binary classification or weighted + average of the F1 scores of each class for the multiclass task. + + See Also + -------- + fbeta_score : Compute the F-beta score. + precision_recall_fscore_support : Compute the precision, recall, F-score, + and support. + jaccard_score : Compute the Jaccard similarity coefficient score. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + + Notes + ----- + When ``true positive + false positive + false negative == 0`` (i.e. a class + is completely absent from both ``y_true`` or ``y_pred``), f-score is + undefined. In such cases, by default f-score will be set to 0.0, and + ``UndefinedMetricWarning`` will be raised. This behavior can be modified by + setting the ``zero_division`` parameter. + + References + ---------- + .. [1] `Wikipedia entry for the F1-score + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import f1_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> f1_score(y_true, y_pred, average='macro') + 0.267 + >>> f1_score(y_true, y_pred, average='micro') + 0.33 + >>> f1_score(y_true, y_pred, average='weighted') + 0.267 + >>> f1_score(y_true, y_pred, average=None) + array([0.8, 0. , 0. ]) + + >>> # binary classification + >>> y_true_empty = [0, 0, 0, 0, 0, 0] + >>> y_pred_empty = [0, 0, 0, 0, 0, 0] + >>> f1_score(y_true_empty, y_pred_empty) + 0.0... + >>> f1_score(y_true_empty, y_pred_empty, zero_division=1.0) + 1.0... 
+ >>> f1_score(y_true_empty, y_pred_empty, zero_division=np.nan) + nan... + + >>> # multilabel classification + >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] + >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] + >>> f1_score(y_true, y_pred, average=None) + array([0.66666667, 1. , 0.66666667]) + """ + return fbeta_score( + y_true, + y_pred, + beta=1, + labels=labels, + pos_label=pos_label, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + ) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "beta": [Interval(Real, 0.0, None, closed="both")], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def fbeta_score( + y_true, + y_pred, + *, + beta, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the F-beta score. + + The F-beta score is the weighted harmonic mean of precision and recall, + reaching its optimal value at 1 and its worst value at 0. + + The `beta` parameter represents the ratio of recall importance to + precision importance. `beta > 1` gives more weight to recall, while + `beta < 1` favors precision. For example, `beta = 2` makes recall twice + as important as precision, while `beta = 0.5` does the opposite. + Asymptotically, `beta -> +inf` considers only recall, and `beta -> 0` + only precision. + + The formula for F-beta score is: + + .. math:: + + F_\\beta = \\frac{(1 + \\beta^2) \\text{tp}} + {(1 + \\beta^2) \\text{tp} + \\text{fp} + \\beta^2 \\text{fn}} + + Where :math:`\\text{tp}` is the number of true positives, :math:`\\text{fp}` is the + number of false positives, and :math:`\\text{fn}` is the number of false negatives. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + F-beta score for `pos_label`. If `average` is not `'binary'`, `pos_label` is + ignored and F-beta score for both classes are computed, then averaged or both + returned (when `average=None`). Similarly, for :term:`multiclass` and + :term:`multilabel` targets, F-beta score for all `labels` are either returned or + averaged depending on the `average` parameter. Use `labels` specify the set of + labels to calculate F-beta score for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float + Determines the weight of recall in the combined score. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. 
versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division, i.e. when all + predictions and labels are negative. + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + F-beta score of the positive class in binary classification or weighted + average of the F-beta score of each class for the multiclass task. + + See Also + -------- + precision_recall_fscore_support : Compute the precision, recall, F-score, + and support. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + + Notes + ----- + When ``true positive + false positive + false negative == 0``, f-score + returns 0.0 and raises ``UndefinedMetricWarning``. This behavior can be + modified by setting ``zero_division``. + + F-beta score is not implemented as a named scorer that can be passed to + the `scoring` parameter of cross-validation tools directly: it requires to be + wrapped with :func:`make_scorer` so as to specify the value of `beta`. See + examples for details. + + References + ---------- + .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). + Modern Information Retrieval. Addison Wesley, pp. 327-328. + + .. [2] `Wikipedia entry for the F1-score + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import fbeta_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) + 0.238 + >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) + 0.33 + >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) + 0.238 + >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) + array([0.71, 0. , 0. 
]) + >>> y_pred_empty = [0, 0, 0, 0, 0, 0] + >>> fbeta_score( + ... y_true, + ... y_pred_empty, + ... average="macro", + ... zero_division=np.nan, + ... beta=0.5, + ... ) + 0.128 + + In order to use :func:`fbeta_scorer` as a scorer, a callable + scorer objects needs to be created first with :func:`make_scorer`, + passing the value for the `beta` parameter. + + >>> from sklearn.metrics import fbeta_score, make_scorer + >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) + >>> from sklearn.model_selection import GridSearchCV + >>> from sklearn.svm import LinearSVC + >>> grid = GridSearchCV( + ... LinearSVC(dual="auto"), + ... param_grid={'C': [1, 10]}, + ... scoring=ftwo_scorer, + ... cv=5 + ... ) + """ + + _, _, f, _ = precision_recall_fscore_support( + y_true, + y_pred, + beta=beta, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("f-score",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return f + + +def _prf_divide( + numerator, denominator, metric, modifier, average, warn_for, zero_division="warn" +): + """Performs division and handles divide-by-zero. + + On zero-division, sets the corresponding result elements equal to + 0, 1 or np.nan (according to ``zero_division``). Plus, if + ``zero_division != "warn"`` raises a warning. + + The metric, modifier and average arguments are used only for determining + an appropriate warning. + """ + xp, _ = get_namespace(numerator, denominator) + dtype_float = _find_matching_floating_dtype(numerator, denominator, xp=xp) + mask = denominator == 0 + denominator = xp.asarray(denominator, copy=True, dtype=dtype_float) + denominator[mask] = 1 # avoid infs/nans + result = xp.asarray(numerator, dtype=dtype_float) / denominator + + if not xp.any(mask): + return result + + # set those with 0 denominator to `zero_division`, and 0 when "warn" + zero_division_value = _check_zero_division(zero_division) + result[mask] = zero_division_value + + # we assume the user will be removing warnings if zero_division is set + # to something different than "warn". If we are computing only f-score + # the warning will be raised only if precision and recall are ill-defined + if zero_division != "warn" or metric not in warn_for: + return result + + # build appropriate warning + if metric in warn_for: + _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0]) + + return result + + +def _warn_prf(average, modifier, msg_start, result_size): + axis0, axis1 = "sample", "label" + if average == "samples": + axis0, axis1 = axis1, axis0 + msg = ( + "{0} ill-defined and being set to 0.0 {{0}} " + "no {1} {2}s. Use `zero_division` parameter to control" + " this behavior.".format(msg_start, modifier, axis0) + ) + if result_size == 1: + msg = msg.format("due to") + else: + msg = msg.format("in {0}s with".format(axis1)) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + + +def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): + """Validation associated with set-wise metrics. + + Returns identified labels. + """ + average_options = (None, "micro", "macro", "weighted", "samples") + if average not in average_options and average != "binary": + raise ValueError("average has to be one of " + str(average_options)) + + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + # Convert to Python primitive type to avoid NumPy type / Python str + # comparison. 
See https://github.com/numpy/numpy/issues/6784 + present_labels = _tolist(unique_labels(y_true, y_pred)) + if average == "binary": + if y_type == "binary": + if pos_label not in present_labels: + if len(present_labels) >= 2: + raise ValueError( + f"pos_label={pos_label} is not a valid label. It " + f"should be one of {present_labels}" + ) + labels = [pos_label] + else: + average_options = list(average_options) + if y_type == "multiclass": + average_options.remove("samples") + raise ValueError( + "Target is %s but average='binary'. Please " + "choose another average setting, one of %r." % (y_type, average_options) + ) + elif pos_label not in (None, 1): + warnings.warn( + "Note that pos_label (set to %r) is ignored when " + "average != 'binary' (got %r). You may use " + "labels=[pos_label] to specify a single positive class." + % (pos_label, average), + UserWarning, + ) + return labels + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "beta": [Interval(Real, 0.0, None, closed="both")], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "warn_for": [list, tuple, set], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def precision_recall_fscore_support( + y_true, + y_pred, + *, + beta=1.0, + labels=None, + pos_label=1, + average=None, + warn_for=("precision", "recall", "f-score"), + sample_weight=None, + zero_division="warn", +): + """Compute precision, recall, F-measure and support for each class. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label a negative sample as + positive. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The F-beta score can be interpreted as a weighted harmonic mean of + the precision and recall, where an F-beta score reaches its best + value at 1 and worst score at 0. + + The F-beta score weights recall more than precision by a factor of + ``beta``. ``beta == 1.0`` means recall and precision are equally important. + + The support is the number of occurrences of each class in ``y_true``. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + metrics for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored + and metrics for both classes are computed, then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + metrics for all `labels` are either returned or averaged depending on the `average` + parameter. Use `labels` specify the set of labels to calculate metrics for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. 
+ + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + beta : float, default=1.0 + The strength of recall versus precision in the F-score. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + warn_for : list, tuple or set, for internal use + This determines which warnings will be made in the case that this + function is being used to return only one of its metrics. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division: + + - recall: when there are no positive labels + - precision: when there are no positive predictions + - f-score: both + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + precision : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Precision score. + + recall : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + Recall score. + + fbeta_score : float (if average is not None) or array of float, shape =\ + [n_unique_labels] + F-beta score. + + support : None (if average is not None) or array of int, shape =\ + [n_unique_labels] + The number of occurrences of each label in ``y_true``. + + Notes + ----- + When ``true positive + false positive == 0``, precision is undefined. 
+ When ``true positive + false negative == 0``, recall is undefined. When + ``true positive + false negative + false positive == 0``, f-score is + undefined. In such cases, by default the metric will be set to 0, and + ``UndefinedMetricWarning`` will be raised. This behavior can be modified + with ``zero_division``. + + References + ---------- + .. [1] `Wikipedia entry for the Precision and recall + `_. + + .. [2] `Wikipedia entry for the F1-score + `_. + + .. [3] `Discriminative Methods for Multi-labeled Classification Advances + in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu + Godbole, Sunita Sarawagi + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_recall_fscore_support + >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) + >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) + >>> precision_recall_fscore_support(y_true, y_pred, average='macro') + (0.222, 0.333, 0.267, None) + >>> precision_recall_fscore_support(y_true, y_pred, average='micro') + (0.33, 0.33, 0.33, None) + >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') + (0.222, 0.333, 0.267, None) + + It is possible to compute per-label precisions, recalls, F1-scores and + supports instead of averaging: + + >>> precision_recall_fscore_support(y_true, y_pred, average=None, + ... labels=['pig', 'dog', 'cat']) + (array([0. , 0. , 0.66]), + array([0., 0., 1.]), array([0. , 0. , 0.8]), + array([2, 2, 2])) + """ + _check_zero_division(zero_division) + labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label) + + # Calculate tp_sum, pred_sum, true_sum ### + samplewise = average == "samples" + MCM = multilabel_confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + samplewise=samplewise, + ) + tp_sum = MCM[:, 1, 1] + pred_sum = tp_sum + MCM[:, 0, 1] + true_sum = tp_sum + MCM[:, 1, 0] + + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + if average == "micro": + tp_sum = xp.reshape(xp.sum(tp_sum), (1,)) + pred_sum = xp.reshape(xp.sum(pred_sum), (1,)) + true_sum = xp.reshape(xp.sum(true_sum), (1,)) + + # Finally, we have all our sufficient statistics. Divide! 
# + beta2 = beta**2 + + # Divide, and on zero-division, set scores and/or warn according to + # zero_division: + precision = _prf_divide( + tp_sum, pred_sum, "precision", "predicted", average, warn_for, zero_division + ) + recall = _prf_divide( + tp_sum, true_sum, "recall", "true", average, warn_for, zero_division + ) + + if np.isposinf(beta): + f_score = recall + elif beta == 0: + f_score = precision + else: + # The score is defined as: + # score = (1 + beta**2) * precision * recall / (beta**2 * precision + recall) + # Therefore, we can express the score in terms of confusion matrix entries as: + # score = (1 + beta**2) * tp / ((1 + beta**2) * tp + beta**2 * fn + fp) + + # Array api strict requires all arrays to be of the same type so we + # need to convert true_sum, pred_sum and tp_sum to the max supported + # float dtype because beta2 is a float + max_float_type = _max_precision_float_dtype(xp=xp, device=device_) + denom = beta2 * xp.astype(true_sum, max_float_type) + xp.astype( + pred_sum, max_float_type + ) + f_score = _prf_divide( + (1 + beta2) * xp.astype(tp_sum, max_float_type), + denom, + "f-score", + "true nor predicted", + average, + warn_for, + zero_division, + ) + + # Average the results + if average == "weighted": + weights = true_sum + elif average == "samples": + weights = sample_weight + else: + weights = None + + if average is not None: + precision = float(_nanaverage(precision, weights=weights)) + recall = float(_nanaverage(recall, weights=weights)) + f_score = float(_nanaverage(f_score, weights=weights)) + true_sum = None # return no support + + return precision, recall, f_score, true_sum + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "sample_weight": ["array-like", None], + "raise_warning": ["boolean", Hidden(StrOptions({"deprecated"}))], + "replace_undefined_by": [ + Options(Real, {1.0, np.nan}), + dict, + ], + }, + prefer_skip_nested_validation=True, +) +def class_likelihood_ratios( + y_true, + y_pred, + *, + labels=None, + sample_weight=None, + raise_warning="deprecated", + replace_undefined_by=np.nan, +): + """Compute binary classification positive and negative likelihood ratios. + + The positive likelihood ratio is `LR+ = sensitivity / (1 - specificity)` + where the sensitivity or recall is the ratio `tp / (tp + fn)` and the + specificity is `tn / (tn + fp)`. The negative likelihood ratio is `LR- = (1 + - sensitivity) / specificity`. Here `tp` is the number of true positives, + `fp` the number of false positives, `tn` is the number of true negatives and + `fn` the number of false negatives. Both class likelihood ratios can be used + to obtain post-test probabilities given a pre-test probability. + + `LR+` ranges from 1.0 to infinity. A `LR+` of 1.0 indicates that the probability + of predicting the positive class is the same for samples belonging to either + class; therefore, the test is useless. The greater `LR+` is, the more a + positive prediction is likely to be a true positive when compared with the + pre-test probability. A value of `LR+` lower than 1.0 is invalid as it would + indicate that the odds of a sample being a true positive decrease with + respect to the pre-test odds. + + `LR-` ranges from 0.0 to 1.0. The closer it is to 0.0, the lower the probability + of a given sample to be a false negative. A `LR-` of 1.0 means the test is + useless because the odds of having the condition did not change after the + test. 
A value of `LR-` greater than 1.0 invalidates the classifier as it + indicates an increase in the odds of a sample belonging to the positive + class after being classified as negative. This is the case when the + classifier systematically predicts the opposite of the true label. + + A typical application in medicine is to identify the positive/negative class + to the presence/absence of a disease, respectively; the classifier being a + diagnostic test; the pre-test probability of an individual having the + disease can be the prevalence of such disease (proportion of a particular + population found to be affected by a medical condition); and the post-test + probabilities would be the probability that the condition is truly present + given a positive test result. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + List of labels to index the matrix. This may be used to select the + positive and negative classes with the ordering `labels=[negative_class, + positive_class]`. If `None` is given, those that appear at least once in + `y_true` or `y_pred` are used in sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + raise_warning : bool, default=True + Whether or not a case-specific warning message is raised when there is division + by zero. + + .. deprecated:: 1.7 + `raise_warning` was deprecated in version 1.7 and will be removed in 1.9, + when an :class:`~sklearn.exceptions.UndefinedMetricWarning` will always + raise in case of a division by zero. + + replace_undefined_by : np.nan, 1.0, or dict, default=np.nan + Sets the return values for LR+ and LR- when there is a division by zero. Can + take the following values: + + - `np.nan` to return `np.nan` for both `LR+` and `LR-` + - `1.0` to return the worst possible scores: `{"LR+": 1.0, "LR-": 1.0}` + - a dict in the format `{"LR+": value_1, "LR-": value_2}` where the values can + be non-negative floats, `np.inf` or `np.nan` in the range of the + likelihood ratios. For example, `{"LR+": 1.0, "LR-": 1.0}` can be used for + returning the worst scores, indicating a useless model, and `{"LR+": np.inf, + "LR-": 0.0}` can be used for returning the best scores, indicating a useful + model. + + If a division by zero occurs, only the affected metric is replaced with the set + value; the other metric is calculated as usual. + + .. versionadded:: 1.7 + + Returns + ------- + (positive_likelihood_ratio, negative_likelihood_ratio) : tuple + A tuple of two floats, the first containing the positive likelihood ratio (LR+) + and the second the negative likelihood ratio (LR-). + + Warns + ----- + Raises :class:`~sklearn.exceptions.UndefinedMetricWarning` when `y_true` and + `y_pred` lead to the following conditions: + + - The number of false positives is 0 and `raise_warning` is set to `True` + (default): positive likelihood ratio is undefined. + - The number of true negatives is 0 and `raise_warning` is set to `True` + (default): negative likelihood ratio is undefined. + - The sum of true positives and false negatives is 0 (no samples of the positive + class are present in `y_true`): both likelihood ratios are undefined. + + For the first two cases, an undefined metric can be defined by setting the + `replace_undefined_by` param. 
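    As an illustrative sketch of the post-test computation mentioned above (the
    pre-test probability of 0.1 is an assumed prevalence, not a value produced by
    this function), the post-test odds are the pre-test odds multiplied by `LR+`:

    >>> lr_pos = 1.5                        # e.g. the LR+ from the Examples below
    >>> pre_test_prob = 0.1                 # assumed pre-test probability
    >>> pre_test_odds = pre_test_prob / (1 - pre_test_prob)
    >>> post_test_odds = lr_pos * pre_test_odds
    >>> round(post_test_odds / (1 + post_test_odds), 3)
    0.143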
+ + References + ---------- + .. [1] `Wikipedia entry for the Likelihood ratios in diagnostic testing + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import class_likelihood_ratios + >>> class_likelihood_ratios([0, 1, 0, 1, 0], [1, 1, 0, 0, 0]) + (1.5, 0.75) + >>> y_true = np.array(["non-cat", "cat", "non-cat", "cat", "non-cat"]) + >>> y_pred = np.array(["cat", "cat", "non-cat", "non-cat", "non-cat"]) + >>> class_likelihood_ratios(y_true, y_pred) + (1.33, 0.66) + >>> y_true = np.array(["non-zebra", "zebra", "non-zebra", "zebra", "non-zebra"]) + >>> y_pred = np.array(["zebra", "zebra", "non-zebra", "non-zebra", "non-zebra"]) + >>> class_likelihood_ratios(y_true, y_pred) + (1.5, 0.75) + + To avoid ambiguities, use the notation `labels=[negative_class, + positive_class]` + + >>> y_true = np.array(["non-cat", "cat", "non-cat", "cat", "non-cat"]) + >>> y_pred = np.array(["cat", "cat", "non-cat", "non-cat", "non-cat"]) + >>> class_likelihood_ratios(y_true, y_pred, labels=["non-cat", "cat"]) + (1.5, 0.75) + """ + # TODO(1.9): When `raise_warning` is removed, the following changes need to be made: + # The checks for `raise_warning==True` need to be removed and we will always warn, + # remove `FutureWarning`, and the Warns section in the docstring should not mention + # `raise_warning` anymore. + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + if y_type != "binary": + raise ValueError( + "class_likelihood_ratios only supports binary classification " + f"problems, got targets of type: {y_type}" + ) + + msg_deprecated_param = ( + "`raise_warning` was deprecated in version 1.7 and will be removed in 1.9. An " + "`UndefinedMetricWarning` will always be raised in case of a division by zero " + "and the value set with the `replace_undefined_by` param will be returned." + ) + if raise_warning != "deprecated": + warnings.warn(msg_deprecated_param, FutureWarning) + else: + raise_warning = True + + if replace_undefined_by == 1.0: + replace_undefined_by = {"LR+": 1.0, "LR-": 1.0} + + if isinstance(replace_undefined_by, dict): + msg = ( + "The dictionary passed as `replace_undefined_by` needs to be in the form " + "`{'LR+': `value_1`, 'LR-': `value_2`}` where the value for `LR+` ranges " + "from `1.0` to `np.inf` or is `np.nan` and the value for `LR-` ranges from " + f"`0.0` to `1.0` or is `np.nan`; got `{replace_undefined_by}`." + ) + if ("LR+" in replace_undefined_by) and ("LR-" in replace_undefined_by): + try: + desired_lr_pos = replace_undefined_by.get("LR+", None) + check_scalar( + desired_lr_pos, + "positive_likelihood_ratio", + target_type=(Real), + min_val=1.0, + include_boundaries="left", + ) + desired_lr_neg = replace_undefined_by.get("LR-", None) + check_scalar( + desired_lr_neg, + "negative_likelihood_ratio", + target_type=(Real), + min_val=0.0, + max_val=1.0, + include_boundaries="both", + ) + except Exception as e: + raise ValueError(msg) from e + else: + raise ValueError(msg) + + cm = confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + ) + + tn, fp, fn, tp = cm.ravel() + support_pos = tp + fn + support_neg = tn + fp + pos_num = tp * support_neg + pos_denom = fp * support_pos + neg_num = fn * support_neg + neg_denom = tn * support_pos + + # if `support_pos == 0`a division by zero will occur + if support_pos == 0: + msg = ( + "No samples of the positive class are present in `y_true`. 
" + "`positive_likelihood_ratio` and `negative_likelihood_ratio` are both set " + "to `np.nan`. Use the `replace_undefined_by` param to control this " + "behavior. To suppress this warning or turn it into an error, see Python's " + "`warnings` module and `warnings.catch_warnings()`." + ) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + positive_likelihood_ratio = np.nan + negative_likelihood_ratio = np.nan + + # if `fp == 0`a division by zero will occur + if fp == 0: + if raise_warning: + if tp == 0: + msg_beginning = ( + "No samples were predicted for the positive class and " + "`positive_likelihood_ratio` is " + ) + else: + msg_beginning = "`positive_likelihood_ratio` is ill-defined and " + msg_end = "set to `np.nan`. Use the `replace_undefined_by` param to " + "control this behavior. To suppress this warning or turn it into an error, " + "see Python's `warnings` module and `warnings.catch_warnings()`." + warnings.warn(msg_beginning + msg_end, UndefinedMetricWarning, stacklevel=2) + if isinstance(replace_undefined_by, float) and np.isnan(replace_undefined_by): + positive_likelihood_ratio = replace_undefined_by + else: + # replace_undefined_by is a dict and + # isinstance(replace_undefined_by.get("LR+", None), Real); this includes + # `np.inf` and `np.nan` + positive_likelihood_ratio = desired_lr_pos + else: + positive_likelihood_ratio = pos_num / pos_denom + + # if `tn == 0`a division by zero will occur + if tn == 0: + if raise_warning: + msg = ( + "`negative_likelihood_ratio` is ill-defined and set to `np.nan`. " + "Use the `replace_undefined_by` param to control this behavior. To " + "suppress this warning or turn it into an error, see Python's " + "`warnings` module and `warnings.catch_warnings()`." + ) + warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) + if isinstance(replace_undefined_by, float) and np.isnan(replace_undefined_by): + negative_likelihood_ratio = replace_undefined_by + else: + # replace_undefined_by is a dict and + # isinstance(replace_undefined_by.get("LR-", None), Real); this includes + # `np.nan` + negative_likelihood_ratio = desired_lr_neg + else: + negative_likelihood_ratio = neg_num / neg_denom + + return float(positive_likelihood_ratio), float(negative_likelihood_ratio) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def precision_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the precision. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The best value is 1 and the worst value is 0. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + precision for `pos_label`. 
If `average` is not `'binary'`, `pos_label` is ignored + and precision for both classes are computed, then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + precision for all `labels` are either returned or averaged depending on the + `average` parameter. Use `labels` specify the set of labels to calculate precision + for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division. + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + precision : float (if average is not None) or array of float of shape \ + (n_unique_labels,) + Precision of the positive class in binary classification or weighted + average of the precision of each class for the multiclass task. + + See Also + -------- + precision_recall_fscore_support : Compute precision, recall, F-measure and + support for each class. 
+ recall_score : Compute the ratio ``tp / (tp + fn)`` where ``tp`` is the + number of true positives and ``fn`` the number of false negatives. + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given + an estimator and some data. + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given + binary class predictions. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + + Notes + ----- + When ``true positive + false positive == 0``, precision returns 0 and + raises ``UndefinedMetricWarning``. This behavior can be + modified with ``zero_division``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> precision_score(y_true, y_pred, average='macro') + 0.22 + >>> precision_score(y_true, y_pred, average='micro') + 0.33 + >>> precision_score(y_true, y_pred, average='weighted') + 0.22 + >>> precision_score(y_true, y_pred, average=None) + array([0.66, 0. , 0. ]) + >>> y_pred = [0, 0, 0, 0, 0, 0] + >>> precision_score(y_true, y_pred, average=None) + array([0.33, 0. , 0. ]) + >>> precision_score(y_true, y_pred, average=None, zero_division=1) + array([0.33, 1. , 1. ]) + >>> precision_score(y_true, y_pred, average=None, zero_division=np.nan) + array([0.33, nan, nan]) + + >>> # multilabel classification + >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] + >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] + >>> precision_score(y_true, y_pred, average=None) + array([0.5, 1. , 1. ]) + """ + p, _, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("precision",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return p + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "average": [ + StrOptions({"micro", "macro", "samples", "weighted", "binary"}), + None, + ], + "sample_weight": ["array-like", None], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def recall_score( + y_true, + y_pred, + *, + labels=None, + pos_label=1, + average="binary", + sample_weight=None, + zero_division="warn", +): + """Compute the recall. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The best value is 1 and the worst value is 0. + + Support beyond term:`binary` targets is achieved by treating :term:`multiclass` + and :term:`multilabel` data as a collection of binary problems, one for each + label. For the :term:`binary` case, setting `average='binary'` will return + recall for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored + and recall for both classes are computed then averaged or both returned (when + `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, + recall for all `labels` are either returned or averaged depending on the `average` + parameter. Use `labels` specify the set of labels to calculate recall for. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. 
+ + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like, default=None + The set of labels to include when `average != 'binary'`, and their + order if `average is None`. Labels present in the data can be + excluded, for example in multiclass classification to exclude a "negative + class". Labels not present in the data can be included and will be + "assigned" 0 samples. For multilabel targets, labels are column indices. + By default, all labels in `y_true` and `y_pred` are used in sorted order. + + .. versionchanged:: 0.17 + Parameter `labels` improved for multiclass problem. + + pos_label : int, float, bool or str, default=1 + The class to report if `average='binary'` and the data is binary, + otherwise this parameter is ignored. + For multiclass or multilabel targets, set `labels=[pos_label]` and + `average != 'binary'` to report metrics for one label only. + + average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ + default='binary' + This parameter is required for multiclass/multilabel targets. + If ``None``, the metrics for each class are returned. Otherwise, this + determines the type of averaging performed on the data: + + ``'binary'``: + Only report results for the class specified by ``pos_label``. + This is applicable only if targets (``y_{true,pred}``) are binary. + ``'micro'``: + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters 'macro' to account for label imbalance; it can result in an + F-score that is not between precision and recall. Weighted recall + is equal to accuracy. + ``'samples'``: + Calculate metrics for each instance, and find their average (only + meaningful for multilabel classification where this differs from + :func:`accuracy_score`). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division. + + Notes: + + - If set to "warn", this acts like 0, but a warning is also raised. + - If set to `np.nan`, such values will be excluded from the average. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + recall : float (if average is not None) or array of float of shape \ + (n_unique_labels,) + Recall of the positive class in binary classification or weighted + average of the recall of each class for the multiclass task. + + See Also + -------- + precision_recall_fscore_support : Compute precision, recall, F-measure and + support for each class. + precision_score : Compute the ratio ``tp / (tp + fp)`` where ``tp`` is the + number of true positives and ``fp`` the number of false positives. + balanced_accuracy_score : Compute balanced accuracy to deal with imbalanced + datasets. + multilabel_confusion_matrix : Compute a confusion matrix for each class or + sample. + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given + an estimator and some data. + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given + binary class predictions. 
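    A quick illustrative check of the statement above that weighted recall equals
    accuracy (reusing the multiclass data from the Examples section below):

    >>> from sklearn.metrics import accuracy_score, recall_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> round(recall_score(y_true, y_pred, average='weighted'), 3)
    0.333
    >>> round(accuracy_score(y_true, y_pred), 3)
    0.333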
+ + Notes + ----- + When ``true positive + false negative == 0``, recall returns 0 and raises + ``UndefinedMetricWarning``. This behavior can be modified with + ``zero_division``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import recall_score + >>> y_true = [0, 1, 2, 0, 1, 2] + >>> y_pred = [0, 2, 1, 0, 0, 1] + >>> recall_score(y_true, y_pred, average='macro') + 0.33 + >>> recall_score(y_true, y_pred, average='micro') + 0.33 + >>> recall_score(y_true, y_pred, average='weighted') + 0.33 + >>> recall_score(y_true, y_pred, average=None) + array([1., 0., 0.]) + >>> y_true = [0, 0, 0, 0, 0, 0] + >>> recall_score(y_true, y_pred, average=None) + array([0.5, 0. , 0. ]) + >>> recall_score(y_true, y_pred, average=None, zero_division=1) + array([0.5, 1. , 1. ]) + >>> recall_score(y_true, y_pred, average=None, zero_division=np.nan) + array([0.5, nan, nan]) + + >>> # multilabel classification + >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] + >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] + >>> recall_score(y_true, y_pred, average=None) + array([1. , 1. , 0.5]) + """ + _, r, _, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + pos_label=pos_label, + average=average, + warn_for=("recall",), + sample_weight=sample_weight, + zero_division=zero_division, + ) + return r + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "adjusted": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): + """Compute the balanced accuracy. + + The balanced accuracy in binary and multiclass classification problems to + deal with imbalanced datasets. It is defined as the average of recall + obtained on each class. + + The best value is 1 and the worst value is 0 when ``adjusted=False``. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated targets as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + adjusted : bool, default=False + When true, the result is adjusted for chance, so that random + performance would score 0, while keeping perfect performance at a score + of 1. + + Returns + ------- + balanced_accuracy : float + Balanced accuracy score. + + See Also + -------- + average_precision_score : Compute average precision (AP) from prediction + scores. + precision_score : Compute the precision score. + recall_score : Compute the recall score. + roc_auc_score : Compute Area Under the Receiver Operating Characteristic + Curve (ROC AUC) from prediction scores. + + Notes + ----- + Some literature promotes alternative definitions of balanced accuracy. Our + definition is equivalent to :func:`accuracy_score` with class-balanced + sample weights, and shares desirable properties with the binary case. + See the :ref:`User Guide `. + + References + ---------- + .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010). + The balanced accuracy and its posterior distribution. + Proceedings of the 20th International Conference on Pattern + Recognition, 3121-24. + .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015). + `Fundamentals of Machine Learning for Predictive Data Analytics: + Algorithms, Worked Examples, and Case Studies + `_. 
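    As an illustrative check of the definition above, balanced accuracy is the
    unweighted mean of per-class recall, and ``adjusted=True`` rescales it so that
    chance-level performance scores 0 (chance is 1/2 for the two classes here):

    >>> from sklearn.metrics import balanced_accuracy_score, recall_score
    >>> y_true = [0, 1, 0, 0, 1, 0]
    >>> y_pred = [0, 1, 0, 0, 0, 1]
    >>> recall_score(y_true, y_pred, average='macro')
    0.625
    >>> balanced_accuracy_score(y_true, y_pred, adjusted=True)  # (0.625 - 0.5) / 0.5
    0.25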
+ + Examples + -------- + >>> from sklearn.metrics import balanced_accuracy_score + >>> y_true = [0, 1, 0, 0, 1, 0] + >>> y_pred = [0, 1, 0, 0, 0, 1] + >>> balanced_accuracy_score(y_true, y_pred) + 0.625 + """ + C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + with np.errstate(divide="ignore", invalid="ignore"): + per_class = np.diag(C) / C.sum(axis=1) + if np.any(np.isnan(per_class)): + warnings.warn("y_pred contains classes not in y_true") + per_class = per_class[~np.isnan(per_class)] + score = np.mean(per_class) + if adjusted: + n_classes = len(per_class) + chance = 1 / n_classes + score -= chance + score /= 1 - chance + return float(score) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "labels": ["array-like", None], + "target_names": ["array-like", None], + "sample_weight": ["array-like", None], + "digits": [Interval(Integral, 0, None, closed="left")], + "output_dict": ["boolean"], + "zero_division": [ + Options(Real, {0.0, 1.0}), + "nan", + StrOptions({"warn"}), + ], + }, + prefer_skip_nested_validation=True, +) +def classification_report( + y_true, + y_pred, + *, + labels=None, + target_names=None, + sample_weight=None, + digits=2, + output_dict=False, + zero_division="warn", +): + """Build a text report showing the main classification metrics. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) target values. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Estimated targets as returned by a classifier. + + labels : array-like of shape (n_labels,), default=None + Optional list of label indices to include in the report. + + target_names : array-like of shape (n_labels,), default=None + Optional display names matching the labels (same order). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + digits : int, default=2 + Number of digits for formatting output floating point values. + When ``output_dict`` is ``True``, this will be ignored and the + returned values will not be rounded. + + output_dict : bool, default=False + If True, return output as dict. + + .. versionadded:: 0.20 + + zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" + Sets the value to return when there is a zero division. If set to + "warn", this acts as 0, but warnings are also raised. + + .. versionadded:: 1.3 + `np.nan` option was added. + + Returns + ------- + report : str or dict + Text summary of the precision, recall, F1 score for each class. + Dictionary returned if output_dict is True. Dictionary has the + following structure:: + + {'label 1': {'precision':0.5, + 'recall':1.0, + 'f1-score':0.67, + 'support':1}, + 'label 2': { ... }, + ... + } + + The reported averages include macro average (averaging the unweighted + mean per label), weighted average (averaging the support-weighted mean + per label), and sample average (only for multilabel classification). + Micro average (averaging the total true positives, false negatives and + false positives) is only shown for multi-label or multi-class + with a subset of classes, because it corresponds to accuracy + otherwise and would be the same for all metrics. + See also :func:`precision_recall_fscore_support` for more details + on averages. + + Note that in binary classification, recall of the positive class + is also known as "sensitivity"; recall of the negative class is + "specificity". 
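    An illustrative sketch of accessing the dictionary output described above
    (reusing the data from the first snippet in the Examples section below):

    >>> from sklearn.metrics import classification_report
    >>> y_true = [0, 1, 2, 2, 2]
    >>> y_pred = [0, 0, 2, 2, 1]
    >>> report = classification_report(y_true, y_pred, output_dict=True)
    >>> round(report['2']['recall'], 2)
    0.67
    >>> report['accuracy']
    0.6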
+ + See Also + -------- + precision_recall_fscore_support: Compute precision, recall, F-measure and + support for each class. + confusion_matrix: Compute confusion matrix to evaluate the accuracy of a + classification. + multilabel_confusion_matrix: Compute a confusion matrix for each class or sample. + + Examples + -------- + >>> from sklearn.metrics import classification_report + >>> y_true = [0, 1, 2, 2, 2] + >>> y_pred = [0, 0, 2, 2, 1] + >>> target_names = ['class 0', 'class 1', 'class 2'] + >>> print(classification_report(y_true, y_pred, target_names=target_names)) + precision recall f1-score support + + class 0 0.50 1.00 0.67 1 + class 1 0.00 0.00 0.00 1 + class 2 1.00 0.67 0.80 3 + + accuracy 0.60 5 + macro avg 0.50 0.56 0.49 5 + weighted avg 0.70 0.60 0.61 5 + + >>> y_pred = [1, 1, 0] + >>> y_true = [1, 1, 1] + >>> print(classification_report(y_true, y_pred, labels=[1, 2, 3])) + precision recall f1-score support + + 1 1.00 0.67 0.80 3 + 2 0.00 0.00 0.00 0 + 3 0.00 0.00 0.00 0 + + micro avg 1.00 0.67 0.80 3 + macro avg 0.33 0.22 0.27 3 + weighted avg 1.00 0.67 0.80 3 + + """ + + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + + if labels is None: + labels = unique_labels(y_true, y_pred) + labels_given = False + else: + labels = np.asarray(labels) + labels_given = True + + # labelled micro average + micro_is_accuracy = (y_type == "multiclass" or y_type == "binary") and ( + not labels_given or (set(labels) >= set(unique_labels(y_true, y_pred))) + ) + + if target_names is not None and len(labels) != len(target_names): + if labels_given: + warnings.warn( + "labels size, {0}, does not match size of target_names, {1}".format( + len(labels), len(target_names) + ) + ) + else: + raise ValueError( + "Number of classes, {0}, does not match size of " + "target_names, {1}. 
Try specifying the labels " + "parameter".format(len(labels), len(target_names)) + ) + if target_names is None: + target_names = ["%s" % l for l in labels] + + headers = ["precision", "recall", "f1-score", "support"] + # compute per-class results without averaging + p, r, f1, s = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + average=None, + sample_weight=sample_weight, + zero_division=zero_division, + ) + rows = zip(target_names, p, r, f1, s) + + if y_type.startswith("multilabel"): + average_options = ("micro", "macro", "weighted", "samples") + else: + average_options = ("micro", "macro", "weighted") + + if output_dict: + report_dict = {label[0]: label[1:] for label in rows} + for label, scores in report_dict.items(): + report_dict[label] = dict(zip(headers, [float(i) for i in scores])) + else: + longest_last_line_heading = "weighted avg" + name_width = max(len(cn) for cn in target_names) + width = max(name_width, len(longest_last_line_heading), digits) + head_fmt = "{:>{width}s} " + " {:>9}" * len(headers) + report = head_fmt.format("", *headers, width=width) + report += "\n\n" + row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n" + for row in rows: + report += row_fmt.format(*row, width=width, digits=digits) + report += "\n" + + # compute all applicable averages + for average in average_options: + if average.startswith("micro") and micro_is_accuracy: + line_heading = "accuracy" + else: + line_heading = average + " avg" + + # compute averages with specified averaging method + avg_p, avg_r, avg_f1, _ = precision_recall_fscore_support( + y_true, + y_pred, + labels=labels, + average=average, + sample_weight=sample_weight, + zero_division=zero_division, + ) + avg = [avg_p, avg_r, avg_f1, np.sum(s)] + + if output_dict: + report_dict[line_heading] = dict(zip(headers, [float(i) for i in avg])) + else: + if line_heading == "accuracy": + row_fmt_accuracy = ( + "{:>{width}s} " + + " {:>9.{digits}}" * 2 + + " {:>9.{digits}f}" + + " {:>9}\n" + ) + report += row_fmt_accuracy.format( + line_heading, "", "", *avg[2:], width=width, digits=digits + ) + else: + report += row_fmt.format(line_heading, *avg, width=width, digits=digits) + + if output_dict: + if "accuracy" in report_dict.keys(): + report_dict["accuracy"] = report_dict["accuracy"]["precision"] + return report_dict + else: + return report + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_pred": ["array-like", "sparse matrix"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def hamming_loss(y_true, y_pred, *, sample_weight=None): + """Compute the average Hamming loss. + + The Hamming loss is the fraction of labels that are incorrectly predicted. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.18 + + Returns + ------- + loss : float or int + Return the average Hamming loss between element of ``y_true`` and + ``y_pred``. + + See Also + -------- + accuracy_score : Compute the accuracy score. By default, the function will + return the fraction of correct predictions divided by the total number + of predictions. + jaccard_score : Compute the Jaccard similarity coefficient score. 
+ zero_one_loss : Compute the Zero-one classification loss. By default, the + function will return the percentage of imperfectly predicted subsets. + + Notes + ----- + In multiclass classification, the Hamming loss corresponds to the Hamming + distance between ``y_true`` and ``y_pred`` which is equivalent to the + subset ``zero_one_loss`` function, when `normalize` parameter is set to + True. + + In multilabel classification, the Hamming loss is different from the + subset zero-one loss. The zero-one loss considers the entire set of labels + for a given sample incorrect if it does not entirely match the true set of + labels. Hamming loss is more forgiving in that it penalizes only the + individual labels. + + The Hamming loss is upperbounded by the subset zero-one loss, when + `normalize` parameter is set to True. It is always between 0 and 1, + lower being better. + + References + ---------- + .. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification: + An Overview. International Journal of Data Warehousing & Mining, + 3(3), 1-13, July-September 2007. + + .. [2] `Wikipedia entry on the Hamming distance + `_. + + Examples + -------- + >>> from sklearn.metrics import hamming_loss + >>> y_pred = [1, 2, 3, 4] + >>> y_true = [2, 2, 3, 4] + >>> hamming_loss(y_true, y_pred) + 0.25 + + In the multilabel case with binary label indicators: + + >>> import numpy as np + >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2))) + 0.75 + """ + y_true, y_pred = attach_unique(y_true, y_pred) + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + + xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight) + + if sample_weight is None: + weight_average = 1.0 + else: + sample_weight = xp.asarray(sample_weight, device=device) + weight_average = _average(sample_weight, xp=xp) + + if y_type.startswith("multilabel"): + n_differences = _count_nonzero( + y_true - y_pred, xp=xp, device=device, sample_weight=sample_weight + ) + return float(n_differences) / ( + y_true.shape[0] * y_true.shape[1] * weight_average + ) + + elif y_type in ["binary", "multiclass"]: + return float(_average(y_true != y_pred, weights=sample_weight, normalize=True)) + else: + raise ValueError("{0} is not supported".format(y_type)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None): + r"""Log loss, aka logistic loss or cross-entropy loss. + + This is the loss function used in (multinomial) logistic regression + and extensions of it such as neural networks, defined as the negative + log-likelihood of a logistic model that returns ``y_pred`` probabilities + for its training data ``y_true``. + The log loss is only defined for two or more labels. + For a single sample with true label :math:`y \in \{0,1\}` and + a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, the log + loss is: + + .. math:: + L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p)) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels for n_samples samples. + + y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. 
If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + `y_pred` values are clipped to `[eps, 1-eps]` where `eps` is the machine + precision for `y_pred`'s dtype. + + normalize : bool, default=True + If true, return the mean loss per sample. + Otherwise, return the sum of the per-sample losses. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + .. versionadded:: 0.18 + + Returns + ------- + loss : float + Log loss, aka logistic loss or cross-entropy loss. + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + + References + ---------- + C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, + p. 209. + + Examples + -------- + >>> from sklearn.metrics import log_loss + >>> log_loss(["spam", "ham", "ham", "spam"], + ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) + 0.21616 + """ + transformed_labels, y_pred = _validate_multiclass_probabilistic_prediction( + y_true, y_pred, sample_weight, labels + ) + + # Clipping + eps = np.finfo(y_pred.dtype).eps + y_pred = np.clip(y_pred, eps, 1 - eps) + + loss = -xlogy(transformed_labels, y_pred).sum(axis=1) + + return float(_average(loss, weights=sample_weight, normalize=normalize)) + + +@validate_params( + { + "y_true": ["array-like"], + "pred_decision": ["array-like"], + "labels": ["array-like", None], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): + """Average hinge loss (non-regularized). + + In binary class case, assuming labels in y_true are encoded with +1 and -1, + when a prediction mistake is made, ``margin = y_true * pred_decision`` is + always negative (since the signs disagree), implying ``1 - margin`` is + always greater than 1. The cumulated hinge loss is therefore an upper + bound of the number of mistakes made by the classifier. + + In multiclass case, the function expects that either all the labels are + included in y_true or an optional labels argument is provided which + contains all the labels. The multilabel margin is calculated according + to Crammer-Singer's method. As in the binary case, the cumulated hinge loss + is an upper bound of the number of mistakes made by the classifier. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True target, consisting of integers of two values. The positive label + must be greater than the negative label. + + pred_decision : array-like of shape (n_samples,) or (n_samples, n_classes) + Predicted decisions, as output by decision_function (floats). + + labels : array-like, default=None + Contains all the labels for the problem. Used in multiclass hinge loss. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + Average hinge loss. + + References + ---------- + .. [1] `Wikipedia entry on the Hinge loss + `_. + + .. [2] Koby Crammer, Yoram Singer. On the Algorithmic + Implementation of Multiclass Kernel-based Vector + Machines. 
Journal of Machine Learning Research 2, + (2001), 265-292. + + .. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models + by Robert C. Moore, John DeNero + `_. + + Examples + -------- + >>> from sklearn import svm + >>> from sklearn.metrics import hinge_loss + >>> X = [[0], [1]] + >>> y = [-1, 1] + >>> est = svm.LinearSVC(random_state=0) + >>> est.fit(X, y) + LinearSVC(random_state=0) + >>> pred_decision = est.decision_function([[-2], [3], [0.5]]) + >>> pred_decision + array([-2.18, 2.36, 0.09]) + >>> hinge_loss([-1, 1, 1], pred_decision) + 0.30 + + In the multiclass case: + + >>> import numpy as np + >>> X = np.array([[0], [1], [2], [3]]) + >>> Y = np.array([0, 1, 2, 3]) + >>> labels = np.array([0, 1, 2, 3]) + >>> est = svm.LinearSVC() + >>> est.fit(X, Y) + LinearSVC() + >>> pred_decision = est.decision_function([[-1], [2], [3]]) + >>> y_true = [0, 2, 3] + >>> hinge_loss(y_true, pred_decision, labels=labels) + 0.56 + """ + check_consistent_length(y_true, pred_decision, sample_weight) + pred_decision = check_array(pred_decision, ensure_2d=False) + y_true = column_or_1d(y_true) + y_true_unique = np.unique(labels if labels is not None else y_true) + + if y_true_unique.size > 2: + if pred_decision.ndim <= 1: + raise ValueError( + "The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size})." + f" Got: {pred_decision.shape}" + ) + + # pred_decision.ndim > 1 is true + if y_true_unique.size != pred_decision.shape[1]: + if labels is None: + raise ValueError( + "Please include all labels in y_true " + "or pass labels as third argument" + ) + else: + raise ValueError( + "The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be " + "(n_samples, n_classes), that is " + f"({y_true.shape[0]}, {y_true_unique.size}). " + f"Got: {pred_decision.shape}" + ) + if labels is None: + labels = y_true_unique + le = LabelEncoder() + le.fit(labels) + y_true = le.transform(y_true) + mask = np.ones_like(pred_decision, dtype=bool) + mask[np.arange(y_true.shape[0]), y_true] = False + margin = pred_decision[~mask] + margin -= np.max(pred_decision[mask].reshape(y_true.shape[0], -1), axis=1) + + else: + # Handles binary class case + # this code assumes that positive and negative labels + # are encoded as +1 and -1 respectively + pred_decision = column_or_1d(pred_decision) + pred_decision = np.ravel(pred_decision) + + lbin = LabelBinarizer(neg_label=-1) + y_true = lbin.fit_transform(y_true)[:, 0] + + try: + margin = y_true * pred_decision + except TypeError: + raise TypeError("pred_decision should be an array of floats.") + + losses = 1 - margin + # The hinge_loss doesn't penalize good enough predictions. + np.clip(losses, 0, None, out=losses) + return float(np.average(losses, weights=sample_weight)) + + +def _validate_binary_probabilistic_prediction(y_true, y_prob, sample_weight, pos_label): + r"""Convert y_true and y_prob in binary classification to shape (n_samples, 2) + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_prob : array-like of shape (n_samples,) + Probabilities of the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + pos_label : int, float, bool or str, default=None + Label of the positive class. 
If None, `pos_label` will be inferred + in the following manner: + + * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; + * else if `y_true` contains string, an error will be raised and + `pos_label` should be explicitly specified; + * otherwise, `pos_label` defaults to the greater label, + i.e. `np.unique(y_true)[-1]`. + + Returns + ------- + transformed_labels : array of shape (n_samples, 2) + + y_prob : array of shape (n_samples, 2) + """ + # sanity checks on y_true and y_prob + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + + assert_all_finite(y_true) + assert_all_finite(y_prob) + + check_consistent_length(y_prob, y_true, sample_weight) + + y_type = type_of_target(y_true, input_name="y_true") + if y_type != "binary": + raise ValueError( + f"The type of the target inferred from y_true is {y_type} but should be " + "binary according to the shape of y_prob." + ) + + if y_prob.max() > 1: + raise ValueError(f"y_prob contains values greater than 1: {y_prob.max()}") + if y_prob.min() < 0: + raise ValueError(f"y_prob contains values less than 0: {y_prob.min()}") + + # check that pos_label is consistent with y_true + try: + pos_label = _check_pos_label_consistency(pos_label, y_true) + except ValueError: + classes = np.unique(y_true) + if classes.dtype.kind not in ("O", "U", "S"): + # for backward compatibility, if classes are not string then + # `pos_label` will correspond to the greater label + pos_label = classes[-1] + else: + raise + + # convert (n_samples,) to (n_samples, 2) shape + y_true = np.array(y_true == pos_label, int) + transformed_labels = np.column_stack((1 - y_true, y_true)) + y_prob = np.column_stack((1 - y_prob, y_prob)) + + return transformed_labels, y_prob + + +@validate_params( + { + "y_true": ["array-like"], + "y_proba": ["array-like"], + "sample_weight": ["array-like", None], + "pos_label": [Real, str, "boolean", None], + "labels": ["array-like", None], + "scale_by_half": ["boolean", StrOptions({"auto"})], + }, + prefer_skip_nested_validation=True, +) +def brier_score_loss( + y_true, + y_proba, + *, + sample_weight=None, + pos_label=None, + labels=None, + scale_by_half="auto", +): + r"""Compute the Brier score loss. + + The smaller the Brier score loss, the better, hence the naming with "loss". + The Brier score measures the mean squared difference between the predicted + probability and the actual outcome. The Brier score is a strictly proper scoring + rule. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True targets. + + y_proba : array-like of shape (n_samples,) or (n_samples, n_classes) + Predicted probabilities. If `y_proba.shape = (n_samples,)` + the probabilities provided are assumed to be that of the + positive class. If `y_proba.shape = (n_samples, n_classes)` + the columns in `y_proba` are assumed to correspond to the + labels in alphabetical order, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + pos_label : int, float, bool or str, default=None + Label of the positive class when `y_proba.shape = (n_samples,)`. + If not provided, `pos_label` will be inferred in the + following manner: + + * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; + * else if `y_true` contains string, an error will be raised and + `pos_label` should be explicitly specified; + * otherwise, `pos_label` defaults to the greater label, + i.e. `np.unique(y_true)[-1]`. 
+ + labels : array-like of shape (n_classes,), default=None + Class labels when `y_proba.shape = (n_samples, n_classes)`. + If not provided, labels will be inferred from `y_true`. + + .. versionadded:: 1.7 + + scale_by_half : bool or "auto", default="auto" + When True, scale the Brier score by 1/2 to lie in the [0, 1] range instead + of the [0, 2] range. The default "auto" option implements the rescaling to + [0, 1] only for binary classification (as customary) but keeps the + original [0, 2] range for multiclass classification. + + .. versionadded:: 1.7 + + Returns + ------- + score : float + Brier score loss. + + Notes + ----- + + For :math:`N` observations labeled from :math:`C` possible classes, the Brier + score is defined as: + + .. math:: + \frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(y_{ic} - \hat{p}_{ic})^{2} + + where :math:`y_{ic}` is 1 if observation `i` belongs to class `c`, + otherwise 0 and :math:`\hat{p}_{ic}` is the predicted probability for + observation `i` to belong to class `c`. + The Brier score then ranges between :math:`[0, 2]`. + + In binary classification tasks the Brier score is usually divided by + two and then ranges between :math:`[0, 1]`. It can be alternatively + written as: + + .. math:: + \frac{1}{N}\sum_{i=1}^{N}(y_{i} - \hat{p}_{i})^{2} + + where :math:`y_{i}` is the binary target and :math:`\hat{p}_{i}` + is the predicted probability of the positive class. + + References + ---------- + .. [1] `Wikipedia entry for the Brier score + `_. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import brier_score_loss + >>> y_true = np.array([0, 1, 1, 0]) + >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"]) + >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3]) + >>> brier_score_loss(y_true, y_prob) + 0.0375 + >>> brier_score_loss(y_true, 1-y_prob, pos_label=0) + 0.0375 + >>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham") + 0.0375 + >>> brier_score_loss(y_true, np.array(y_prob) > 0.5) + 0.0 + >>> brier_score_loss(y_true, y_prob, scale_by_half=False) + 0.075 + >>> brier_score_loss( + ... ["eggs", "ham", "spam"], + ... [[0.8, 0.1, 0.1], [0.2, 0.7, 0.1], [0.2, 0.2, 0.6]], + ... labels=["eggs", "ham", "spam"] + ... ) + 0.146 + """ + y_proba = check_array( + y_proba, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] + ) + + if y_proba.ndim == 1 or y_proba.shape[1] == 1: + transformed_labels, y_proba = _validate_binary_probabilistic_prediction( + y_true, y_proba, sample_weight, pos_label + ) + else: + transformed_labels, y_proba = _validate_multiclass_probabilistic_prediction( + y_true, y_proba, sample_weight, labels + ) + + brier_score = np.average( + np.sum((transformed_labels - y_proba) ** 2, axis=1), weights=sample_weight + ) + + if scale_by_half == "auto": + scale_by_half = y_proba.ndim == 1 or y_proba.shape[1] < 3 + if scale_by_half: + brier_score *= 0.5 + + return float(brier_score) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): + """ + :math:`D^2` score function, fraction of log loss explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always predicts the per-class proportions + of `y_true`, disregarding the input features, gets a D^2 score of 0.0. + + Read more in the :ref:`User Guide `. + + .. 
versionadded:: 1.5 + + Parameters + ---------- + y_true : array-like or label indicator matrix + The actuals labels for the n_samples samples. + + y_pred : array-like of shape (n_samples, n_classes) or (n_samples,) + Predicted probabilities, as returned by a classifier's + predict_proba method. If ``y_pred.shape = (n_samples,)`` + the probabilities provided are assumed to be that of the + positive class. The labels in ``y_pred`` are assumed to be + ordered alphabetically, as done by + :class:`~sklearn.preprocessing.LabelBinarizer`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + labels : array-like, default=None + If not provided, labels will be inferred from y_true. If ``labels`` + is ``None`` and ``y_pred`` has shape (n_samples,) the labels are + assumed to be binary and are inferred from ``y_true``. + + Returns + ------- + d2 : float or ndarray of floats + The D^2 score. + + Notes + ----- + This is not a symmetric function. + + Like R^2, D^2 score may be negative (it need not actually be the square of + a quantity D). + + This metric is not well-defined for a single sample and will return a NaN + value if n_samples is less than two. + """ + y_pred = check_array(y_pred, ensure_2d=False, dtype="numeric") + check_consistent_length(y_pred, y_true, sample_weight) + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + # log loss of the fitted model + numerator = log_loss( + y_true=y_true, + y_pred=y_pred, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + # Proportion of labels in the dataset + weights = _check_sample_weight(sample_weight, y_true) + + # If labels is passed, augment y_true to ensure that all labels are represented + # Use 0 weight for the new samples to not affect the counts + y_true_, weights_ = ( + ( + np.concatenate([y_true, labels]), + np.concatenate([weights, np.zeros_like(weights, shape=len(labels))]), + ) + if labels is not None + else (y_true, weights) + ) + + _, y_value_indices = np.unique(y_true_, return_inverse=True) + counts = np.bincount(y_value_indices, weights=weights_) + y_prob = counts / weights.sum() + y_pred_null = np.tile(y_prob, (len(y_true), 1)) + + # log loss of the null model + denominator = log_loss( + y_true=y_true, + y_pred=y_pred_null, + normalize=False, + sample_weight=sample_weight, + labels=labels, + ) + + return float(1 - (numerator / denominator)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd new file mode 100644 index 0000000000000000000000000000000000000000..0a249a8a9fb0a158c34c9f725891467de6041d40 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd @@ -0,0 +1,268 @@ +from libc.math cimport sqrt, exp + +from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + +cdef class DistanceMetric: + pass + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. 
This leads to faster computation for the most common case +cdef inline float64_t euclidean_dist64( + const float64_t* x1, + const float64_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return sqrt(d) + + +cdef inline float64_t euclidean_rdist64( + const float64_t* x1, + const float64_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return d + + +cdef inline float64_t euclidean_dist_to_rdist64(const float64_t dist) except -1 nogil: + return dist * dist + + +cdef inline float64_t euclidean_rdist_to_dist64(const float64_t dist) except -1 nogil: + return sqrt(dist) + + +###################################################################### +# DistanceMetric64 base class +cdef class DistanceMetric64(DistanceMetric): + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. + cdef float64_t p + cdef const float64_t[::1] vec + cdef const float64_t[:, ::1] mat + cdef intp_t size + cdef object func + cdef object kwargs + + cdef float64_t dist( + self, + const float64_t* x1, + const float64_t* x2, + intp_t size, + ) except -1 nogil + + cdef float64_t rdist( + self, + const float64_t* x1, + const float64_t* x2, + intp_t size, + ) except -1 nogil + + cdef float64_t dist_csr( + self, + const float64_t* x1_data, + const int32_t* x1_indices, + const float64_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef float64_t rdist_csr( + self, + const float64_t* x1_data, + const int32_t* x1_indices, + const float64_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef int pdist( + self, + const float64_t[:, ::1] X, + float64_t[:, ::1] D, + ) except -1 + + cdef int cdist( + self, + const float64_t[:, ::1] X, + const float64_t[:, ::1] Y, + float64_t[:, ::1] D, + ) except -1 + + cdef int pdist_csr( + self, + const float64_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + float64_t[:, ::1] D, + ) except -1 nogil + + cdef int cdist_csr( + self, + const float64_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const float64_t* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + float64_t[:, ::1] D, + ) except -1 nogil + + cdef float64_t _rdist_to_dist(self, float64_t rdist) except -1 nogil + + cdef float64_t _dist_to_rdist(self, float64_t dist) except -1 nogil + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. 
This leads to faster computation for the most common case +cdef inline float64_t euclidean_dist32( + const float32_t* x1, + const float32_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return sqrt(d) + + +cdef inline float64_t euclidean_rdist32( + const float32_t* x1, + const float32_t* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return d + + +cdef inline float64_t euclidean_dist_to_rdist32(const float32_t dist) except -1 nogil: + return dist * dist + + +cdef inline float64_t euclidean_rdist_to_dist32(const float32_t dist) except -1 nogil: + return sqrt(dist) + + +###################################################################### +# DistanceMetric32 base class +cdef class DistanceMetric32(DistanceMetric): + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. + cdef float64_t p + cdef const float64_t[::1] vec + cdef const float64_t[:, ::1] mat + cdef intp_t size + cdef object func + cdef object kwargs + + cdef float32_t dist( + self, + const float32_t* x1, + const float32_t* x2, + intp_t size, + ) except -1 nogil + + cdef float32_t rdist( + self, + const float32_t* x1, + const float32_t* x2, + intp_t size, + ) except -1 nogil + + cdef float32_t dist_csr( + self, + const float32_t* x1_data, + const int32_t* x1_indices, + const float32_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef float32_t rdist_csr( + self, + const float32_t* x1_data, + const int32_t* x1_indices, + const float32_t* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef int pdist( + self, + const float32_t[:, ::1] X, + float32_t[:, ::1] D, + ) except -1 + + cdef int cdist( + self, + const float32_t[:, ::1] X, + const float32_t[:, ::1] Y, + float32_t[:, ::1] D, + ) except -1 + + cdef int pdist_csr( + self, + const float32_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + float32_t[:, ::1] D, + ) except -1 nogil + + cdef int cdist_csr( + self, + const float32_t* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const float32_t* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + float32_t[:, ::1] D, + ) except -1 nogil + + cdef float32_t _rdist_to_dist(self, float32_t rdist) except -1 nogil + + cdef float32_t _dist_to_rdist(self, float32_t dist) except -1 nogil diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..313225088c776e8575bfb4cec47c1f17183fab03 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pxd.tp @@ -0,0 +1,152 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'float64_t', 'np.float64'), + 
('32', 'float32_t', 'np.float32') +] + +}} +from libc.math cimport sqrt, exp + +from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + +cdef class DistanceMetric: + pass + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +###################################################################### +# Inline distance functions +# +# We use these for the default (euclidean) case so that they can be +# inlined. This leads to faster computation for the most common case +cdef inline float64_t euclidean_dist{{name_suffix}}( + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return sqrt(d) + + +cdef inline float64_t euclidean_rdist{{name_suffix}}( + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, +) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = (x1[j] - x2[j]) + d += tmp * tmp + return d + + +cdef inline float64_t euclidean_dist_to_rdist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + +cdef inline float64_t euclidean_rdist_to_dist{{name_suffix}}(const {{INPUT_DTYPE_t}} dist) except -1 nogil: + return sqrt(dist) + + +###################################################################### +# DistanceMetric{{name_suffix}} base class +cdef class DistanceMetric{{name_suffix}}(DistanceMetric): + # The following attributes are required for a few of the subclasses. + # we must define them here so that cython's limited polymorphism will work. + # Because we don't expect to instantiate a lot of these objects, the + # extra memory overhead of this setup should not be an issue. 
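# The "{{py: ...}}" / "{{for ...}}" markers above are template directives (.pxd.tp and
# .pyx.tp files): at build time the loop body is emitted once per entry of
# implementation_specific_values, producing the float64 and float32 specializations.
# A rough pure-Python sketch of that expansion (not the real template engine):
implementation_specific_values = [("64", "float64_t"), ("32", "float32_t")]
body = "cdef class DistanceMetric{suffix}(DistanceMetric):  # operates on {ctype} data"

for suffix, ctype in implementation_specific_values:
    print(body.format(suffix=suffix, ctype=ctype))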
+ cdef float64_t p + cdef const float64_t[::1] vec + cdef const float64_t[:, ::1] mat + cdef intp_t size + cdef object func + cdef object kwargs + + cdef {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil + + cdef int pdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 + + cdef int cdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 + + cdef int pdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil + + cdef int cdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil + + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil + + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..b7d3d1f4d86a6b4817af36489d1846b74afe7e6d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_dist_metrics.pyx.tp @@ -0,0 +1,2811 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +}} +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +cimport numpy as cnp + +cnp.import_array() # required in order to use C-API + +from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin + +from scipy.sparse import csr_matrix, issparse +from ..utils._typedefs cimport float64_t, float32_t, int32_t, intp_t +from ..utils import check_array +from ..utils.fixes import parse_version, sp_base_version + +cdef inline double fmax(double a, double b) noexcept nogil: + return max(a, b) + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +BOOL_METRICS = [ + "hamming", + "jaccard", + "dice", + "rogerstanimoto", + "russellrao", + "sokalsneath", +] +DEPRECATED_METRICS = [] +if sp_base_version < parse_version("1.17"): + # Deprecated in 
SciPy 1.15 and removed in SciPy 1.17 + BOOL_METRICS += ["sokalmichener"] +if sp_base_version >= parse_version("1.15"): + DEPRECATED_METRICS.append("sokalmichener") +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + BOOL_METRICS += ["kulsinski"] +if sp_base_version >= parse_version("1.9"): + DEPRECATED_METRICS.append("kulsinski") +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + BOOL_METRICS += ["matching"] +if sp_base_version >= parse_version("1.0"): + DEPRECATED_METRICS.append("matching") + +def get_valid_metric_ids(L): + """Given an iterable of metric class names or class identifiers, + return a list of metric IDs which map to those classes. + + Example: + >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) + >>> sorted(L) + ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] + """ + return [key for (key, val) in METRIC_MAPPING64.items() + if (val.__name__ in L) or (val in L)] + +cdef class DistanceMetric: + """Uniform interface for fast distance metric functions. + + The `DistanceMetric` class provides a convenient way to compute pairwise distances + between samples. It supports various distance metrics, such as Euclidean distance, + Manhattan distance, and more. + + The `pairwise` method can be used to compute pairwise distances between samples in + the input arrays. It returns a distance matrix representing the distances between + all pairs of samples. + + The :meth:`get_metric` method allows you to retrieve a specific metric using its + string identifier. + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[1, 2], [3, 4], [5, 6]] + >>> Y = [[7, 8], [9, 10]] + >>> dist.pairwise(X,Y) + array([[7.81..., 10.63...] + [5.65..., 8.48...] + [1.41..., 4.24...]]) + + .. rubric:: Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. 
+ + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". In the listings below, the following + abbreviations are used: + + - N: number of dimensions + - NTT: number of dims in which both values are True + - NTF: number of dims in which the first value is True, second is False + - NFT: number of dims in which the first value is False, second is True + - NFF: number of dims in which both values are False + - NNEQ: number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ: number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ + @classmethod + def get_metric(cls, metric, dtype=np.float64, **kwargs): + """Get the given distance metric from the string identifier. + + See the docstring of DistanceMetric for a list of available metrics. + + Parameters + ---------- + metric : str or class name + The string identifier or class name of the desired distance metric. + See the documentation of the `DistanceMetric` class for a list of + available metrics. + + dtype : {np.float32, np.float64}, default=np.float64 + The data type of the input on which the metric will be applied. + This affects the precision of the computed distances. + By default, it is set to `np.float64`. + + **kwargs + Additional keyword arguments that will be passed to the requested metric. + These arguments can be used to customize the behavior of the specific + metric. + + Returns + ------- + metric_obj : instance of the requested metric + An instance of the requested distance metric class. 
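# Minimal usage sketch of the dtype switch documented above: float32 input selects the
# 32-bit specialization, the default float64 the 64-bit one. Assumes a scikit-learn
# recent enough to expose this `dtype` keyword; data is illustrative.
import numpy as np
from sklearn.metrics import DistanceMetric

X32 = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)

dist32 = DistanceMetric.get_metric("euclidean", dtype=np.float32)
dist64 = DistanceMetric.get_metric("euclidean")  # dtype=np.float64 is the default

print(dist32.pairwise(X32).dtype)                     # float32
print(dist64.pairwise(X32.astype(np.float64)).dtype)  # float64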
+ """ + if dtype == np.float32: + specialized_class = DistanceMetric32 + elif dtype == np.float64: + specialized_class = DistanceMetric64 + else: + raise ValueError( + f"Unexpected dtype {dtype} provided. Please select a dtype from" + " {np.float32, np.float64}" + ) + + return specialized_class.get_metric(metric, **kwargs) + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +###################################################################### +# metric mappings +# These map from metric id strings to class names +METRIC_MAPPING{{name_suffix}} = { + 'euclidean': EuclideanDistance{{name_suffix}}, + 'l2': EuclideanDistance{{name_suffix}}, + 'minkowski': MinkowskiDistance{{name_suffix}}, + 'p': MinkowskiDistance{{name_suffix}}, + 'manhattan': ManhattanDistance{{name_suffix}}, + 'cityblock': ManhattanDistance{{name_suffix}}, + 'l1': ManhattanDistance{{name_suffix}}, + 'chebyshev': ChebyshevDistance{{name_suffix}}, + 'infinity': ChebyshevDistance{{name_suffix}}, + 'seuclidean': SEuclideanDistance{{name_suffix}}, + 'mahalanobis': MahalanobisDistance{{name_suffix}}, + 'hamming': HammingDistance{{name_suffix}}, + 'canberra': CanberraDistance{{name_suffix}}, + 'braycurtis': BrayCurtisDistance{{name_suffix}}, + 'matching': MatchingDistance{{name_suffix}}, + 'jaccard': JaccardDistance{{name_suffix}}, + 'dice': DiceDistance{{name_suffix}}, + 'kulsinski': KulsinskiDistance{{name_suffix}}, + 'rogerstanimoto': RogersTanimotoDistance{{name_suffix}}, + 'russellrao': RussellRaoDistance{{name_suffix}}, + 'sokalmichener': SokalMichenerDistance{{name_suffix}}, + 'sokalsneath': SokalSneathDistance{{name_suffix}}, + 'haversine': HaversineDistance{{name_suffix}}, + 'pyfunc': PyFuncDistance{{name_suffix}}, +} + +cdef inline object _buffer_to_ndarray{{name_suffix}}(const {{INPUT_DTYPE_t}}* x, intp_t n): + # Wrap a memory buffer with an ndarray. Warning: this is not robust. + # In particular, if x is deallocated before the returned array goes + # out of scope, this could cause memory errors. Since there is not + # a possibility of this for our use-case, this should be safe. + + # Note: this Segfaults unless np.import_array() is called above + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + return cnp.PyArray_SimpleNewFromData(1, &n, cnp.NPY_FLOAT64, x) + + +cdef {{INPUT_DTYPE_t}} INF{{name_suffix}} = np.inf + + +###################################################################### +# Distance Metric Classes +cdef class DistanceMetric{{name_suffix}}(DistanceMetric): + """DistanceMetric class + + This class provides a uniform interface to fast distance metric + functions. The various metrics can be accessed via the :meth:`get_metric` + class method and the metric string identifier (see below). + + Examples + -------- + >>> from sklearn.metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('euclidean') + >>> X = [[0, 1, 2], + [3, 4, 5]] + >>> dist.pairwise(X) + array([[ 0. , 5.19615242], + [ 5.19615242, 0. 
]]) + + Available Metrics + + The following lists the string metric identifiers and the associated + distance metric classes: + + **Metrics intended for real-valued vector spaces:** + + ============== ==================== ======== =============================== + identifier class name args distance function + -------------- -------------------- -------- ------------------------------- + "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` + "manhattan" ManhattanDistance - ``sum(|x - y|)`` + "chebyshev" ChebyshevDistance - ``max(|x - y|)`` + "minkowski" MinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` + "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` + "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` + ============== ==================== ======== =============================== + + **Metrics intended for two-dimensional vector spaces:** Note that the haversine + distance metric requires data in the form of [latitude, longitude] and both + inputs and outputs are in units of radians. + + ============ ================== =============================================================== + identifier class name distance function + ------------ ------------------ --------------------------------------------------------------- + "haversine" HaversineDistance ``2 arcsin(sqrt(sin^2(0.5*dx) + cos(x1)cos(x2)sin^2(0.5*dy)))`` + ============ ================== =============================================================== + + + **Metrics intended for integer-valued vector spaces:** Though intended + for integer-valued vectors, these are also valid metrics in the case of + real-valued vectors. + + ============= ==================== ======================================== + identifier class name distance function + ------------- -------------------- ---------------------------------------- + "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` + "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` + "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` + ============= ==================== ======================================== + + **Metrics intended for boolean-valued vector spaces:** Any nonzero entry + is evaluated to "True". 
In the listings below, the following + abbreviations are used: + + - N: number of dimensions + - NTT: number of dims in which both values are True + - NTF: number of dims in which the first value is True, second is False + - NFT: number of dims in which the first value is False, second is True + - NFF: number of dims in which both values are False + - NNEQ: number of non-equal dimensions, NNEQ = NTF + NFT + - NNZ: number of nonzero dimensions, NNZ = NTF + NFT + NTT + + ================= ======================= =============================== + identifier class name distance function + ----------------- ----------------------- ------------------------------- + "jaccard" JaccardDistance NNEQ / NNZ + "matching" MatchingDistance NNEQ / N + "dice" DiceDistance NNEQ / (NTT + NNZ) + "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) + "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) + "russellrao" RussellRaoDistance (N - NTT) / N + "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) + "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) + ================= ======================= =============================== + + **User-defined distance:** + + =========== =============== ======= + identifier class name args + ----------- --------------- ------- + "pyfunc" PyFuncDistance func + =========== =============== ======= + + Here ``func`` is a function which takes two one-dimensional numpy + arrays, and returns a distance. Note that in order to be used within + the BallTree, the distance must be a true metric: + i.e. it must satisfy the following properties + + 1) Non-negativity: d(x, y) >= 0 + 2) Identity: d(x, y) = 0 if and only if x == y + 3) Symmetry: d(x, y) = d(y, x) + 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) + + Because of the Python object overhead involved in calling the python + function, this will be fairly slow, but it will have the same + scaling as other distances. + """ + def __cinit__(self): + self.p = 2 + self.vec = np.zeros(1, dtype=np.float64, order='C') + self.mat = np.zeros((1, 1), dtype=np.float64, order='C') + self.size = 1 + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (self.__class__,), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}": + return (float(self.p), np.asarray(self.vec), np.asarray(self.mat), self.func, self.kwargs) + return (float(self.p), np.asarray(self.vec), np.asarray(self.mat)) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.p = state[0] + self.vec = state[1] + self.mat = state[2] + if self.__class__.__name__ == "PyFuncDistance{{name_suffix}}": + self.func = state[3] + self.kwargs = state[4] + self.size = self.vec.shape[0] + + @classmethod + def get_metric(cls, metric, **kwargs): + """Get the given distance metric from the string identifier. + + See the docstring of DistanceMetric for a list of available metrics. 
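# The __reduce__/__getstate__/__setstate__ methods a few lines above exist so metric
# objects survive pickling (for example when estimators holding them are cloned or sent
# to worker processes). A small round-trip check with an illustrative metric and data:
import pickle

import numpy as np
from sklearn.metrics import DistanceMetric

metric = DistanceMetric.get_metric("minkowski", p=1.5)
X = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 0.5]])

restored = pickle.loads(pickle.dumps(metric))
assert np.allclose(metric.pairwise(X), restored.pairwise(X))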
+ + Parameters + ---------- + metric : str or class name + The distance metric to use + **kwargs + additional arguments will be passed to the requested metric + """ + if isinstance(metric, DistanceMetric{{name_suffix}}): + return metric + + if callable(metric): + return PyFuncDistance{{name_suffix}}(metric, **kwargs) + + # Map the metric string ID to the metric class + if isinstance(metric, type) and issubclass(metric, DistanceMetric{{name_suffix}}): + pass + else: + try: + metric = METRIC_MAPPING{{name_suffix}}[metric] + except: + raise ValueError("Unrecognized metric '%s'" % metric) + + # In Minkowski special cases, return more efficient methods + if metric is MinkowskiDistance{{name_suffix}}: + p = kwargs.pop('p', 2) + w = kwargs.pop('w', None) + if p == 1 and w is None: + return ManhattanDistance{{name_suffix}}(**kwargs) + elif p == 2 and w is None: + return EuclideanDistance{{name_suffix}}(**kwargs) + elif np.isinf(p) and w is None: + return ChebyshevDistance{{name_suffix}}(**kwargs) + else: + return MinkowskiDistance{{name_suffix}}(p, w, **kwargs) + else: + return metric(**kwargs) + + def __init__(self): + if self.__class__ is DistanceMetric{{name_suffix}}: + raise NotImplementedError("DistanceMetric{{name_suffix}} is an abstract class") + + def _validate_data(self, X): + """Validate the input data. + + This should be overridden in a base class if a specific input format + is required. + """ + return + + cdef {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + """Compute the distance between vectors x1 and x2 + + This should be overridden in a base class. + """ + return -999 + + cdef {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + """Compute the rank-preserving surrogate distance between vectors x1 and x2. + + This can optionally be overridden in a base class. + + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + """ + return self.dist(x1, x2, size) + + cdef int pdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1: + """Compute the pairwise distances between points in X""" + cdef intp_t i1, i2 + for i1 in range(X.shape[0]): + for i2 in range(i1, X.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) + D[i2, i1] = D[i1, i2] + return 0 + + + cdef int cdist( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1: + """Compute the cross-pairwise distances between arrays X and Y""" + cdef intp_t i1, i2 + if X.shape[1] != Y.shape[1]: + raise ValueError('X and Y must have the same second dimension') + for i1 in range(X.shape[0]): + for i2 in range(Y.shape[0]): + D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) + return 0 + + cdef {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + """Compute the distance between vectors x1 and x2 represented + under the CSR format. + + This must be overridden in a subclass. + + Notes + ----- + 0. 
The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + 1. The `data` arrays are passed using pointers to be able to support an + alternative representation of the CSR data structure for supporting + fused sparse-dense datasets pairs with minimum overhead. + + See the explanations in `SparseDenseDatasetsPair.__init__`. + + 2. An alternative signature would be: + + cdef {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + ) except -1 nogil: + + Where callers would use slicing on the original CSR data and indices + memoryviews: + + x1_start = X1_csr.indices_ptr[i] + x1_end = X1_csr.indices_ptr[i+1] + x2_start = X2_csr.indices_ptr[j] + x2_end = X2_csr.indices_ptr[j+1] + + self.dist_csr( + &x1_data[x1_start], + x1_indices[x1_start:x1_end], + &x2_data[x2_start], + x2_indices[x2_start:x2_end], + ) + + Yet, slicing on memoryview slows down execution as it takes the GIL. + See: https://github.com/scikit-learn/scikit-learn/issues/17299 + + Hence, to avoid slicing the data and indices arrays of the sparse + matrices containing respectively x1 and x2 (namely x{1,2}_{data,indices}) + are passed as well as their indices pointers (namely x{1,2}_{start,end}). + + 3. For reference about the CSR format, see section 3.4 of + Saad, Y. (2003), Iterative Methods for Sparse Linear Systems, SIAM. + https://www-users.cse.umn.edu/~saad/IterMethBook_2ndEd.pdf + """ + return -999 + + cdef {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + """Distance between rows of CSR matrices x1 and x2. + + This can optionally be overridden in a subclass. + + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + + Notes + ----- + The implementation of this method in subclasses must be robust to the + presence of explicit zeros in the CSR representation. + + More information about the motives for this method signature is given + in the docstring of dist_csr. + """ + return self.dist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + + cdef int pdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil: + """Pairwise distances between rows in CSR matrix X. + + Note that this implementation is twice faster than cdist_csr(X, X) + because it leverages the symmetry of the problem. 
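# Pure-Python sketch of the two-pointer merge used by the *_csr methods in this file:
# walk the sorted column indices of two CSR rows, pairing matching columns and treating
# columns present in only one row as zero on the other side. This mirrors the pattern of
# EuclideanDistance.rdist_csr below, without the Cython specifics.
def squared_euclidean_csr_rows(data1, indices1, data2, indices2):
    i, j, d = 0, 0, 0.0
    while i < len(indices1) and j < len(indices2):
        if indices1[i] == indices2[j]:      # column present in both rows
            diff = data1[i] - data2[j]
            d += diff * diff
            i += 1
            j += 1
        elif indices1[i] < indices2[j]:     # column only in row 1
            d += data1[i] * data1[i]
            i += 1
        else:                               # column only in row 2
            d += data2[j] * data2[j]
            j += 1
    # One of the rows may still have trailing columns left to consume.
    d += sum(v * v for v in data1[i:]) + sum(v * v for v in data2[j:])
    return d

# Rows (1, 0, 3) and (0, 2, 3) in CSR form: squared distance is 1 + 4 + 0 = 5.
print(squared_euclidean_csr_rows([1.0, 3.0], [0, 2], [2.0, 3.0], [1, 2]))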
+ """ + cdef: + intp_t i1, i2 + intp_t n_x1 = x1_indptr.shape[0] - 1 + intp_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(i1, n_x1): + x2_start = x1_indptr[i2] + x2_end = x1_indptr[i2 + 1] + D[i1, i2] = D[i2, i1] = self.dist_csr( + x1_data, + &x1_indices[0], + x1_data, + &x1_indices[0], + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + + cdef int cdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t[::1] x1_indices, + const int32_t[::1] x1_indptr, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t[::1] x2_indices, + const int32_t[::1] x2_indptr, + const intp_t size, + {{INPUT_DTYPE_t}}[:, ::1] D, + ) except -1 nogil: + """Compute the cross-pairwise distances between arrays X and Y + represented in the CSR format.""" + cdef: + intp_t i1, i2 + intp_t n_x1 = x1_indptr.shape[0] - 1 + intp_t n_x2 = x2_indptr.shape[0] - 1 + intp_t x1_start, x1_end, x2_start, x2_end + + for i1 in range(n_x1): + x1_start = x1_indptr[i1] + x1_end = x1_indptr[i1 + 1] + for i2 in range(n_x2): + x2_start = x2_indptr[i2] + x2_end = x2_indptr[i2 + 1] + + D[i1, i2] = self.dist_csr( + x1_data, + &x1_indices[0], + x2_data, + &x2_indices[0], + x1_start, + x1_end, + x2_start, + x2_end, + size, + ) + return 0 + + cdef {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + """Convert the rank-preserving surrogate distance to the distance""" + return rdist + + cdef {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + """Convert the distance to the rank-preserving surrogate distance""" + return dist + + def rdist_to_dist(self, rdist): + """Convert the rank-preserving surrogate distance to the distance. + + The surrogate distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + + Parameters + ---------- + rdist : double + Surrogate distance. + + Returns + ------- + double + True distance. + """ + return rdist + + def dist_to_rdist(self, dist): + """Convert the true distance to the rank-preserving surrogate distance. + + The surrogate distance is any measure that yields the same rank as the + distance, but is more efficient to compute. For example, the + rank-preserving surrogate distance of the Euclidean metric is the + squared-euclidean distance. + + Parameters + ---------- + dist : double + True distance. + + Returns + ------- + double + Surrogate distance. 
+ """ + return dist + + def _pairwise_dense_dense(self, X, Y): + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr + cdef const {{INPUT_DTYPE_t}}[:, ::1] Yarr + cdef {{INPUT_DTYPE_t}}[:, ::1] Darr + + Xarr = np.asarray(X, dtype={{INPUT_DTYPE}}, order='C') + self._validate_data(Xarr) + if X is Y: + Darr = np.empty((Xarr.shape[0], Xarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') + self.pdist(Xarr, Darr) + else: + Yarr = np.asarray(Y, dtype={{INPUT_DTYPE}}, order='C') + self._validate_data(Yarr) + Darr = np.empty((Xarr.shape[0], Yarr.shape[0]), dtype={{INPUT_DTYPE}}, order='C') + self.cdist(Xarr, Yarr, Darr) + return np.asarray(Darr) + + def _pairwise_sparse_sparse(self, X: csr_matrix , Y: csr_matrix): + cdef: + intp_t n_X, n_features + const {{INPUT_DTYPE_t}}[::1] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr + + intp_t n_Y + const {{INPUT_DTYPE_t}}[::1] Y_data + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr + + {{INPUT_DTYPE_t}}[:, ::1] Darr + + X_csr = X.tocsr() + n_X, n_features = X_csr.shape + X_data = np.asarray(X_csr.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X_csr.indices, dtype=np.int32) + X_indptr = np.asarray(X_csr.indptr, dtype=np.int32) + if X is Y: + Darr = np.empty((n_X, n_X), dtype={{INPUT_DTYPE}}, order='C') + self.pdist_csr( + x1_data=&X_data[0], + x1_indices=X_indices, + x1_indptr=X_indptr, + size=n_features, + D=Darr, + ) + else: + Y_csr = Y.tocsr() + n_Y, _ = Y_csr.shape + Y_data = np.asarray(Y_csr.data, dtype={{INPUT_DTYPE}}) + Y_indices = np.asarray(Y_csr.indices, dtype=np.int32) + Y_indptr = np.asarray(Y_csr.indptr, dtype=np.int32) + + Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') + self.cdist_csr( + x1_data=&X_data[0], + x1_indices=X_indices, + x1_indptr=X_indptr, + x2_data=&Y_data[0], + x2_indices=Y_indices, + x2_indptr=Y_indptr, + size=n_features, + D=Darr, + ) + return np.asarray(Darr) + + def _pairwise_sparse_dense(self, X: csr_matrix, Y): + cdef: + intp_t n_X = X.shape[0] + intp_t n_features = X.shape[1] + const {{INPUT_DTYPE_t}}[::1] X_data = np.asarray( + X.data, dtype={{INPUT_DTYPE}}, + ) + const int32_t[::1] X_indices = np.asarray( + X.indices, dtype=np.int32, + ) + const int32_t[::1] X_indptr = np.asarray( + X.indptr, dtype=np.int32, + ) + + const {{INPUT_DTYPE_t}}[:, ::1] Y_data = np.asarray( + Y, dtype={{INPUT_DTYPE}}, order="C", + ) + intp_t n_Y = Y_data.shape[0] + const int32_t[::1] Y_indices = ( + np.arange(n_features, dtype=np.int32) + ) + + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') + + intp_t i1, i2 + intp_t x1_start, x1_end + {{INPUT_DTYPE_t}} * x2_data + + with nogil: + # Use the exact same adaptation for CSR than in SparseDenseDatasetsPair + # for supporting the sparse-dense case with minimal overhead. + # Note: at this point this method is only a convenience method + # used in the tests via the DistanceMetric.pairwise method. + # Therefore, there is no need to attempt parallelization of those + # nested for-loops. + # Efficient parallel computation of pairwise distances can be + # achieved via the PairwiseDistances class instead. The latter + # internally calls into vector-wise distance computation from + # the DistanceMetric subclass while benefiting from the generic + # Cython/OpenMP parallelization template for the generic pairwise + # distance + reduction computational pattern. 
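# The loop below treats each dense row of Y as an implicit CSR row: its "data" pointer is
# the row itself and its "indices" are simply 0..n_features-1 (the Y_indices = arange
# above), so no conversion or copy of Y is needed. A SciPy sketch of that equivalence,
# with illustrative data:
import numpy as np
from scipy.sparse import csr_matrix

row = np.array([0.0, 2.5, 0.0, 1.0])
implicit_indices = np.arange(row.shape[0], dtype=np.int32)  # dense-as-CSR indices

explicit = csr_matrix(row.reshape(1, -1))  # a real CSR row drops the zeros: indices [1, 3]
assert np.allclose(explicit.toarray().ravel(), row)
print(implicit_indices, explicit.indices)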
+ for i1 in range(n_X): + x1_start = X_indptr[i1] + x1_end = X_indptr[i1 + 1] + for i2 in range(n_Y): + x2_data = &Y_data[0, 0] + i2 * n_features + + Darr[i1, i2] = self.dist_csr( + x1_data=&X_data[0], + x1_indices=&X_indices[0], + x2_data=x2_data, + x2_indices=&Y_indices[0], + x1_start=x1_start, + x1_end=x1_end, + x2_start=0, + x2_end=n_features, + size=n_features, + ) + + return np.asarray(Darr) + + def _pairwise_dense_sparse(self, X, Y: csr_matrix): + # We could have implemented this method using _pairwise_dense_sparse by + # swapping argument and by transposing the results, but this would + # have come with an extra copy to ensure C-contiguity of the result. + cdef: + intp_t n_X = X.shape[0] + intp_t n_features = X.shape[1] + + const {{INPUT_DTYPE_t}}[:, ::1] X_data = np.asarray( + X, dtype={{INPUT_DTYPE}}, order="C", + ) + const int32_t[::1] X_indices = np.arange( + n_features, dtype=np.int32, + ) + + intp_t n_Y = Y.shape[0] + const {{INPUT_DTYPE_t}}[::1] Y_data = np.asarray( + Y.data, dtype={{INPUT_DTYPE}}, + ) + const int32_t[::1] Y_indices = np.asarray( + Y.indices, dtype=np.int32, + ) + const int32_t[::1] Y_indptr = np.asarray( + Y.indptr, dtype=np.int32, + ) + + {{INPUT_DTYPE_t}}[:, ::1] Darr = np.empty((n_X, n_Y), dtype={{INPUT_DTYPE}}, order='C') + + intp_t i1, i2 + {{INPUT_DTYPE_t}} * x1_data + + intp_t x2_start, x2_end + + with nogil: + # Use the exact same adaptation for CSR than in SparseDenseDatasetsPair + # for supporting the dense-sparse case with minimal overhead. + # Note: at this point this method is only a convenience method + # used in the tests via the DistanceMetric.pairwise method. + # Therefore, there is no need to attempt parallelization of those + # nested for-loops. + # Efficient parallel computation of pairwise distances can be + # achieved via the PairwiseDistances class instead. The latter + # internally calls into vector-wise distance computation from + # the DistanceMetric subclass while benefiting from the generic + # Cython/OpenMP parallelization template for the generic pairwise + # distance + reduction computational pattern. + for i1 in range(n_X): + x1_data = &X_data[0, 0] + i1 * n_features + for i2 in range(n_Y): + x2_start = Y_indptr[i2] + x2_end = Y_indptr[i2 + 1] + + Darr[i1, i2] = self.dist_csr( + x1_data=x1_data, + x1_indices=&X_indices[0], + x2_data=&Y_data[0], + x2_indices=&Y_indices[0], + x1_start=0, + x1_end=n_features, + x2_start=x2_start, + x2_end=x2_end, + size=n_features, + ) + + return np.asarray(Darr) + + + def pairwise(self, X, Y=None): + """Compute the pairwise distances between X and Y + + This is a convenience routine for the sake of testing. For many + metrics, the utilities in scipy.spatial.distance.cdist and + scipy.spatial.distance.pdist will be faster. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + If not specified, then Y=X. + + Returns + ------- + dist : ndarray of shape (n_samples_X, n_samples_Y) + The distance matrix of pairwise distances between points in X and Y. 
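# The dispatch in the method body below routes dense/dense, CSR/CSR and the two mixed
# cases to the helpers above, and all combinations agree. Small illustrative check,
# assuming a scikit-learn version whose DistanceMetric.pairwise accepts CSR input as here:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import DistanceMetric

X = np.array([[0.0, 1.0, 0.0], [2.0, 0.0, 3.0]])
Y = np.array([[1.0, 1.0, 0.0]])
dist = DistanceMetric.get_metric("euclidean")

D_dense = dist.pairwise(X, Y)
D_mixed = dist.pairwise(csr_matrix(X), Y)
D_sparse = dist.pairwise(csr_matrix(X), csr_matrix(Y))
assert np.allclose(D_dense, D_mixed) and np.allclose(D_dense, D_sparse)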
+ """ + X = check_array(X, accept_sparse=['csr']) + + if Y is None: + Y = X + else: + Y = check_array(Y, accept_sparse=['csr']) + + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return self._pairwise_dense_dense(X, Y) + + if X_is_sparse and Y_is_sparse: + return self._pairwise_sparse_sparse(X, Y) + + if X_is_sparse and not Y_is_sparse: + return self._pairwise_sparse_dense(X, Y) + + return self._pairwise_dense_sparse(X, Y) + +#------------------------------------------------------------ +# Euclidean Distance +# d = sqrt(sum(x_i^2 - y_i^2)) +cdef class EuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Euclidean Distance metric + + .. math:: + D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } + """ + def __init__(self): + self.p = 2 + + cdef inline {{INPUT_DTYPE_t}} dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return euclidean_dist{{name_suffix}}(x1, x2, size) + + cdef inline {{INPUT_DTYPE_t}} rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return euclidean_rdist{{name_suffix}}(x1, x2, size) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return sqrt(rdist) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + float64_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = x1_data[i1] - x2_data[i2] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) + i2 = i2 + 1 + else: + while i1 < x1_end: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) + i1 = i1 + 1 + + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return sqrt( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) + +#------------------------------------------------------------ +# SEuclidean Distance +# d = sqrt(sum((x_i - y_i2)^2 / v_i)) +cdef class SEuclideanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Standardized Euclidean Distance metric + + .. 
math:: + D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } + """ + def __init__(self, V): + self.vec = np.asarray(V, dtype=np.float64) + self.size = self.vec.shape[0] + self.p = 2 + + def _validate_data(self, X): + if X.shape[1] != self.size: + raise ValueError('SEuclidean dist: size of V does not match') + + cdef inline {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t tmp, d=0 + cdef intp_t j + for j in range(size): + tmp = x1[j] - x2[j] + d += (tmp * tmp / self.vec[j]) + return d + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return sqrt(rdist) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + float64_t unsquared = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + unsquared = x1_data[i1] - x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + else: + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix2] + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + unsquared = x2_data[i2] + d = d + (unsquared * unsquared) / self.vec[ix2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + unsquared = x1_data[i1] + d = d + (unsquared * unsquared) / self.vec[ix1] + i1 = i1 + 1 + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return sqrt( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) + +#------------------------------------------------------------ +# Manhattan Distance +# d = sum(abs(x_i - y_i)) +cdef class ManhattanDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Manhattan/City-block Distance metric + + .. 
math:: + D(x, y) = \sum_i |x_i - y_i| + """ + def __init__(self): + self.p = 1 + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t d = 0 + cdef intp_t j + for j in range(size): + d += fabs(x1[j] - x2[j]) + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + {{INPUT_DTYPE_t}} d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + fabs(x1_data[i1] - x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + else: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + fabs(x2_data[i2]) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + fabs(x1_data[i1]) + i1 = i1 + 1 + + return d + + +#------------------------------------------------------------ +# Chebyshev Distance +# d = max_i(abs(x_i - y_i)) +cdef class ChebyshevDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """Chebyshev/Infinity Distance + + .. math:: + D(x, y) = max_i (|x_i - y_i|) + + Examples + -------- + >>> from sklearn.metrics.dist_metrics import DistanceMetric + >>> dist = DistanceMetric.get_metric('chebyshev') + >>> X = [[0, 1, 2], + ... [3, 4, 5]] + >>> Y = [[-1, 0, 1], + ... [3, 4, 5]] + >>> dist.pairwise(X, Y) + array([[1.732..., 5.196...], + [6.928..., 0.... ]]) + """ + def __init__(self): + self.p = INF{{name_suffix}} + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t d = 0 + cdef intp_t j + for j in range(size): + d = fmax(d, fabs(x1[j] - x2[j])) + return d + + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = fmax(d, fabs(x1_data[i1] - x2_data[i2])) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + else: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = fmax(d, fabs(x2_data[i2])) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = fmax(d, fabs(x1_data[i1])) + i1 = i1 + 1 + + return d + + +#------------------------------------------------------------ +# Minkowski Distance +cdef class MinkowskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Minkowski Distance + + .. math:: + D(x, y) = {||u-v||}_p + + when w is None. + + Here is the more general expanded expression for the weighted case: + + .. math:: + D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p) + + Parameters + ---------- + p : float + The order of the p-norm of the difference (see above). + + .. versionchanged:: 1.4.0 + Minkowski distance allows `p` to be `0 0 and finite. 
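# Worked check of the weighted Minkowski formula above,
# D(x, y) = (sum_i w_i * |x_i - y_i|**p) ** (1/p). As the constructor below enforces,
# p must be > 0 and finite (ChebyshevDistance covers p = inf), and for 0 < p < 1 the
# result is not a true metric. Data here is illustrative.
import numpy as np
from sklearn.metrics import DistanceMetric

x = np.array([[0.0, 1.0, 2.0]])
y = np.array([[3.0, 5.0, 2.0]])
w = np.array([1.0, 0.5, 2.0])
p = 3

manual = (w * np.abs(x - y) ** p).sum() ** (1.0 / p)
metric = DistanceMetric.get_metric("minkowski", p=p, w=w)
assert np.isclose(metric.pairwise(x, y)[0, 0], manual)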
+ When :math:`p \in (0,1)`, it isn't a true metric but is permissible when + the triangular inequality isn't necessary. + For p = infinity, use ChebyshevDistance. + Note that for p=1, ManhattanDistance is more efficient, and for + p=2, EuclideanDistance is more efficient. + + """ + def __init__(self, p, w=None): + if p <= 0: + raise ValueError("p must be greater than 0") + elif np.isinf(p): + raise ValueError("MinkowskiDistance requires finite p. " + "For p=inf, use ChebyshevDistance.") + + self.p = p + if w is not None: + w_array = check_array( + w, ensure_2d=False, dtype=np.float64, input_name="w" + ) + if (w_array < 0).any(): + raise ValueError("w cannot contain negative weights") + self.vec = w_array + self.size = self.vec.shape[0] + else: + self.vec = np.asarray([], dtype=np.float64) + self.size = 0 + + def _validate_data(self, X): + if self.size > 0 and X.shape[1] != self.size: + raise ValueError("MinkowskiDistance: the size of w must match " + f"the number of features ({X.shape[1]}). " + f"Currently len(w)={self.size}.") + + cdef inline {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t d=0 + cdef intp_t j + cdef bint has_w = self.size > 0 + if has_w: + for j in range(size): + d += (self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p)) + else: + for j in range(size): + d += (pow(fabs(x1[j] - x2[j]), self.p)) + return d + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return pow(self.rdist(x1, x2, size), 1. / self.p) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return pow(rdist, 1. / self.p) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return pow(dist, self.p) + + def rdist_to_dist(self, rdist): + return rdist ** (1. 
/ self.p) + + def dist_to_rdist(self, dist): + return dist ** self.p + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + bint has_w = self.size > 0 + + if has_w: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (self.vec[ix1] * pow(fabs( + x1_data[i1] - x2_data[i2] + ), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + d = d + (self.vec[ix2] * pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + d = d + (self.vec[ix1] * pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + + return d + else: + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d = d + (pow(fabs( + x1_data[i1] - x2_data[i2] + ), self.p)) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + else: + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d = d + (pow(fabs(x2_data[i2]), self.p)) + i2 = i2 + 1 + else: + while i1 < x1_end: + d = d + (pow(fabs(x1_data[i1]), self.p)) + i1 = i1 + 1 + + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return pow( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ), + 1 / self.p + ) + +#------------------------------------------------------------ +# Mahalanobis Distance +# d = sqrt( (x - y)^T V^-1 (x - y) ) +cdef class MahalanobisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """Mahalanobis Distance + + .. math:: + D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } + + Parameters + ---------- + V : array-like + Symmetric positive-definite covariance matrix. + The inverse of this matrix will be explicitly computed. + VI : array-like + optionally specify the inverse directly. If VI is passed, + then V is not referenced. 
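# The parameters above accept either the covariance matrix V (inverted internally with
# np.linalg.inv) or its inverse VI directly; both routes give the same distances and
# match the explicit formula sqrt((x - y)' VI (x - y)). Illustrative check:
import numpy as np
from sklearn.metrics import DistanceMetric

rng = np.random.default_rng(0)
A = rng.normal(size=(50, 3))
V = np.cov(A, rowvar=False)   # symmetric positive-definite covariance estimate
VI = np.linalg.inv(V)

x, y = A[0], A[1]
manual = np.sqrt((x - y) @ VI @ (x - y))

from_V = DistanceMetric.get_metric("mahalanobis", V=V)
from_VI = DistanceMetric.get_metric("mahalanobis", VI=VI)
assert np.isclose(from_V.pairwise([x], [y])[0, 0], manual)
assert np.isclose(from_VI.pairwise([x], [y])[0, 0], manual)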
+ """ + cdef float64_t[::1] buffer + + def __init__(self, V=None, VI=None): + if VI is None: + if V is None: + raise ValueError("Must provide either V or VI " + "for Mahalanobis distance") + VI = np.linalg.inv(V) + if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: + raise ValueError("V/VI must be square") + + self.mat = np.asarray(VI, dtype=np.float64, order='C') + + self.size = self.mat.shape[0] + + # We need to create a buffer to store the vectors' coordinates' differences + self.buffer = np.zeros(self.size, dtype=np.float64) + + def __setstate__(self, state): + super().__setstate__(state) + self.size = self.mat.shape[0] + self.buffer = np.zeros(self.size, dtype=np.float64) + + def _validate_data(self, X): + if X.shape[1] != self.size: + raise ValueError('Mahalanobis dist: size of V does not match') + + cdef inline {{INPUT_DTYPE_t}} rdist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t tmp, d = 0 + cdef intp_t i, j + + # compute (x1 - x2).T * VI * (x1 - x2) + for i in range(size): + self.buffer[i] = x1[i] - x2[i] + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat[i, j] * self.buffer[j] + d += tmp * self.buffer[i] + return d + + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return sqrt(self.rdist(x1, x2, size)) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return sqrt(rdist) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + return dist * dist + + def rdist_to_dist(self, rdist): + return np.sqrt(rdist) + + def dist_to_rdist(self, dist): + return dist ** 2 + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t tmp, d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + self.buffer[ix1] = x1_data[i1] - x2_data[i2] + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + self.buffer[ix1] = x1_data[i1] + i1 = i1 + 1 + else: + self.buffer[ix2] = - x2_data[i2] + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + self.buffer[ix2] = - x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + self.buffer[ix1] = x1_data[i1] + i1 = i1 + 1 + + for i in range(size): + tmp = 0 + for j in range(size): + tmp += self.mat[i, j] * self.buffer[j] + d += tmp * self.buffer[i] + + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return sqrt( + self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + )) + +#------------------------------------------------------------ +# Hamming Distance +# d = N_unequal(x, y) / N_tot +cdef class HammingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Hamming Distance + + Hamming 
distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int n_unequal = 0 + cdef intp_t j + for j in range(size): + if x1[j] != x2[j]: + n_unequal += 1 + return float(n_unequal) / size + + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += (x1_data[i1] != x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += (x1_data[i1] != 0) + i1 = i1 + 1 + else: + d += (x2_data[i2] != 0) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += (x2_data[i2] != 0) + i2 = i2 + 1 + else: + while i1 < x1_end: + d += (x1_data[i1] != 0) + i1 = i1 + 1 + + d /= size + + return d + + +#------------------------------------------------------------ +# Canberra Distance +# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] +cdef class CanberraDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Canberra Distance + + Canberra distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. math:: + D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t denom, d = 0 + cdef intp_t j + for j in range(size): + denom = fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + d += fabs(x1[j] - x2[j]) / denom + return d + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t d = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + d += ( + fabs(x1_data[i1] - x2_data[i2]) / + (fabs(x1_data[i1]) + fabs(x2_data[i2])) + ) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + d += 1. + i1 = i1 + 1 + else: + d += 1. + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + d += 1. + i2 = i2 + 1 + else: + while i1 < x1_end: + d += 1. + i1 = i1 + 1 + + return d + +#------------------------------------------------------------ +# Bray-Curtis Distance +# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] +cdef class BrayCurtisDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Bray-Curtis Distance + + Bray-Curtis distance is meant for discrete-valued vectors, though it is + a valid metric for real-valued vectors. + + .. 
math:: + D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t num = 0, denom = 0 + cdef intp_t j + for j in range(size): + num += fabs(x1[j] - x2[j]) + denom += fabs(x1[j]) + fabs(x2[j]) + if denom > 0: + return num / denom + else: + return 0.0 + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t num = 0.0 + float64_t denom = 0.0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + num += fabs(x1_data[i1] - x2_data[i2]) + denom += fabs(x1_data[i1]) + fabs(x2_data[i2]) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) + i1 = i1 + 1 + else: + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + num += fabs(x1_data[i1]) + denom += fabs(x1_data[i1]) + i2 = i2 + 1 + else: + while i1 < x1_end: + num += fabs(x2_data[i2]) + denom += fabs(x2_data[i2]) + i1 = i1 + 1 + + return num / denom + +#------------------------------------------------------------ +# Jaccard Distance (boolean) +# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) +cdef class JaccardDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Jaccard Distance + + Jaccard Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N_TF + N_FT) / (N_TT + N_TF + N_FT) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_eq = 0, nnz = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + nnz += (tf1 or tf2) + n_eq += (tf1 and tf2) + # Based on https://github.com/scipy/scipy/pull/7373 + # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric + # was changed to return 0, instead of nan. 
+ if nnz == 0: + return 0 + return (nnz - n_eq) * 1.0 / nnz + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, nnz = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + nnz += (tf1 or tf2) + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + nnz += tf1 + i1 = i1 + 1 + else: + nnz += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + nnz += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + nnz += tf1 + i1 = i1 + 1 + + # Based on https://github.com/scipy/scipy/pull/7373 + # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric + # was changed to return 0, instead of nan. + if nnz == 0: + return 0 + return (nnz - n_tt) * 1.0 / nnz + +#------------------------------------------------------------ +# Matching Distance (boolean) +# D(x, y) = n_neq / n +cdef class MatchingDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Matching Distance + + Matching Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N_TF + N_FT) / N + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return n_neq * 1. / size + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + if ix1 == ix2: + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += (x1_data[i1] != 0) + i1 = i1 + 1 + else: + n_neq += (x2_data[i2] != 0) + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + n_neq += (x2_data[i2] != 0) + i2 = i2 + 1 + else: + while i1 < x1_end: + n_neq += (x1_data[i1] != 0) + i1 = i1 + 1 + + return n_neq * 1.0 / size + +#------------------------------------------------------------ +# Dice Distance (boolean) +# D(x, y) = n_neq / (2 * ntt + n_neq) +cdef class DiceDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Dice Distance + + Dice Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. 
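+    For example, with x = [1, 0, 1, 1] and y = [1, 1, 0, 1] (interpreted as
+    booleans) the counts are N_TT = 2, N_TF = 1 and N_FT = 1, so the
+    dissimilarity defined below is (1 + 1) / (2 * 2 + 1 + 1) = 1 / 3.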
+ + D(x, y) = (N_TF + N_FT) / (2 * N_TT + N_TF + N_FT) + + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0, n_tt = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + return n_neq / (2.0 * n_tt + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return n_neq / (2.0 * n_tt + n_neq) + + +#------------------------------------------------------------ +# Kulsinski Distance (boolean) +# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) +cdef class KulsinskiDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Kulsinski Distance + + Kulsinski Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = 1 - N_TT / (N + N_TF + N_FT) + + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_tt = 0, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + n_tt += (tf1 and tf2) + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return (n_neq - n_tt + size) * 1.0 / (n_neq + size) + +#------------------------------------------------------------ +# Rogers-Tanimoto Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class RogersTanimotoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Rogers-Tanimoto Distance + + Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued + vectors. 
All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) + +#------------------------------------------------------------ +# Russell-Rao Distance (boolean) +# D(x, y) = (n - ntt) / n +cdef class RussellRaoDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Russell-Rao Distance + + Russell-Rao Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N - N_TT) / N + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_tt = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_tt += (tf1 and tf2) + return (size - n_tt) * 1. / size + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + i1 = i1 + 1 + else: + i2 = i2 + 1 + + # We don't need to go through all the longest + # vector because tf1 or tf2 will be false + # and thus n_tt won't be increased. + + return (size - n_tt) * 1. / size + + + +#------------------------------------------------------------ +# Sokal-Michener Distance (boolean) +# D(x, y) = 2 * n_neq / (n + n_neq) +cdef class SokalMichenerDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Sokal-Michener Distance + + Sokal-Michener Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. 
+ + D(x, y) = 2 (N_TF + N_FT) / (N + N_TF + N_FT) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + return (2.0 * n_neq) / (size + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return (2.0 * n_neq) / (size + n_neq) + +#------------------------------------------------------------ +# Sokal-Sneath Distance (boolean) +# D(x, y) = n_neq / (0.5 * n_tt + n_neq) +cdef class SokalSneathDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + r"""Sokal-Sneath Distance + + Sokal-Sneath Distance is a dissimilarity measure for boolean-valued + vectors. All nonzero entries will be treated as True, zero entries will + be treated as False. + + D(x, y) = (N_TF + N_FT) / (N_TT / 2 + N_FT + N_TF) + """ + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef int tf1, tf2, n_tt = 0, n_neq = 0 + cdef intp_t j + for j in range(size): + tf1 = x1[j] != 0 + tf2 = x2[j] != 0 + n_neq += (tf1 != tf2) + n_tt += (tf1 and tf2) + return n_neq / (0.5 * n_tt + n_neq) + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + intp_t tf1, tf2, n_tt = 0, n_neq = 0 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + tf1 = x1_data[i1] != 0 + tf2 = x2_data[i2] != 0 + + if ix1 == ix2: + n_tt += (tf1 and tf2) + n_neq += (tf1 != tf2) + i1 = i1 + 1 + i2 = i2 + 1 + elif ix1 < ix2: + n_neq += tf1 + i1 = i1 + 1 + else: + n_neq += tf2 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + tf2 = x2_data[i2] != 0 + n_neq += tf2 + i2 = i2 + 1 + else: + while i1 < x1_end: + tf1 = x1_data[i1] != 0 + n_neq += tf1 + i1 = i1 + 1 + + return n_neq / (0.5 * n_tt + n_neq) + + +#------------------------------------------------------------ +# Haversine Distance (2 dimensional) +# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) +# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} +cdef class HaversineDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """Haversine (Spherical) Distance + + The Haversine distance is the angular distance between two points on + the surface of a sphere. 
The first distance of each point is assumed + to be the latitude, the second is the longitude, given in radians. + The dimension of the points must be 2: + + D(x, y) = 2 arcsin[sqrt{sin^2((x1 - y1) / 2) + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] + + """ + + def _validate_data(self, X): + if X.shape[1] != 2: + raise ValueError("Haversine distance only valid " + "in 2 dimensions") + + cdef inline {{INPUT_DTYPE_t}} rdist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + cdef float64_t sin_0 = sin(0.5 * ((x1[0]) - (x2[0]))) + cdef float64_t sin_1 = sin(0.5 * ((x1[1]) - (x2[1]))) + return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) + + cdef inline {{INPUT_DTYPE_t}} dist(self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return 2 * asin(sqrt(self.rdist(x1, x2, size))) + + cdef inline {{INPUT_DTYPE_t}} _rdist_to_dist(self, {{INPUT_DTYPE_t}} rdist) except -1 nogil: + return 2 * asin(sqrt(rdist)) + + cdef inline {{INPUT_DTYPE_t}} _dist_to_rdist(self, {{INPUT_DTYPE_t}} dist) except -1 nogil: + cdef float64_t tmp = sin(0.5 * dist) + return tmp * tmp + + def rdist_to_dist(self, rdist): + return 2 * np.arcsin(np.sqrt(rdist)) + + def dist_to_rdist(self, dist): + tmp = np.sin(0.5 * dist) + return tmp * tmp + + cdef inline {{INPUT_DTYPE_t}} dist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + return 2 * asin(sqrt(self.rdist_csr( + x1_data, + x1_indices, + x2_data, + x2_indices, + x1_start, + x1_end, + x2_start, + x2_end, + size, + ))) + + cdef inline {{INPUT_DTYPE_t}} rdist_csr( + self, + const {{INPUT_DTYPE_t}}* x1_data, + const int32_t* x1_indices, + const {{INPUT_DTYPE_t}}* x2_data, + const int32_t* x2_indices, + const int32_t x1_start, + const int32_t x1_end, + const int32_t x2_start, + const int32_t x2_end, + const intp_t size, + ) except -1 nogil: + + cdef: + intp_t ix1, ix2 + intp_t i1 = x1_start + intp_t i2 = x2_start + + float64_t x1_0 = 0 + float64_t x1_1 = 0 + float64_t x2_0 = 0 + float64_t x2_1 = 0 + float64_t sin_0 + float64_t sin_1 + + while i1 < x1_end and i2 < x2_end: + ix1 = x1_indices[i1] + ix2 = x2_indices[i2] + + # Find the components in the 2D vectors to work with + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + + i1 = i1 + 1 + i2 = i2 + 1 + + if i1 == x1_end: + while i2 < x2_end: + ix2 = x2_indices[i2] + x2_component = ix2 if (x2_start == 0) else ix2 % x2_start + if x2_component == 0: + x2_0 = x2_data[i2] + else: + x2_1 = x2_data[i2] + i2 = i2 + 1 + else: + while i1 < x1_end: + ix1 = x1_indices[i1] + x1_component = ix1 if (x1_start == 0) else ix1 % x1_start + if x1_component == 0: + x1_0 = x1_data[i1] + else: + x1_1 = x1_data[i1] + i1 = i1 + 1 + + sin_0 = sin(0.5 * (x1_0 - x2_0)) + sin_1 = sin(0.5 * (x1_1 - x2_1)) + + return (sin_0 * sin_0 + cos(x1_0) * cos(x2_0) * sin_1 * sin_1) + +#------------------------------------------------------------ +# User-defined distance +# +cdef class PyFuncDistance{{name_suffix}}(DistanceMetric{{name_suffix}}): + """PyFunc Distance + + A user-defined distance + + Parameters + 
---------- + func : function + func should take two numpy arrays as input, and return a distance. + """ + def __init__(self, func, **kwargs): + self.func = func + self.kwargs = kwargs + + # in cython < 0.26, GIL was required to be acquired during definition of + # the function and inside the body of the function. This behaviour is not + # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The + # only way to be back compatible is to inherit `dist` from the base class + # without GIL and called an inline `_dist` which acquire GIL. + cdef inline {{INPUT_DTYPE_t}} dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 nogil: + return self._dist(x1, x2, size) + + cdef inline {{INPUT_DTYPE_t}} _dist( + self, + const {{INPUT_DTYPE_t}}* x1, + const {{INPUT_DTYPE_t}}* x2, + intp_t size, + ) except -1 with gil: + cdef: + object x1arr = _buffer_to_ndarray{{name_suffix}}(x1, size) + object x2arr = _buffer_to_ndarray{{name_suffix}}(x2, size) + d = self.func(x1arr, x2arr, **self.kwargs) + try: + # Cython generates code here that results in a TypeError + # if d is the wrong type. + return d + except TypeError: + raise TypeError("Custom distance function must accept two " + "vectors and return a float.") + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b532e0fa8ff07a27111f86d2ccc36b8d48879b5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__init__.py @@ -0,0 +1,112 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# +# Pairwise Distances Reductions +# ============================= +# +# Overview +# -------- +# +# This module provides routines to compute pairwise distances between a set +# of row vectors of X and another set of row vectors of Y and apply a +# reduction on top. The canonical example is the brute-force computation +# of the top k nearest neighbors by leveraging the arg-k-min reduction. +# +# The reduction takes a matrix of pairwise distances between rows of X and Y +# as input and outputs an aggregate data-structure for each row of X. The +# aggregate values are typically smaller than the number of rows in Y, hence +# the term reduction. +# +# For computational reasons, the reduction are performed on the fly on chunks +# of rows of X and Y so as to keep intermediate data-structures in CPU cache +# and avoid unnecessary round trips of large distance arrays with the RAM +# that would otherwise severely degrade the speed by making the overall +# processing memory-bound. +# +# Finally, the routines follow a generic parallelization template to process +# chunks of data with OpenMP loops (via Cython prange), either on rows of X +# or rows of Y depending on their respective sizes. +# +# +# Dispatching to specialized implementations +# ------------------------------------------ +# +# Dispatchers are meant to be used in the Python code. Under the hood, a +# dispatcher must only define the logic to choose at runtime to the correct +# dtype-specialized :class:`BaseDistancesReductionDispatcher` implementation based +# on the dtype of X and of Y. 
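+#
+# For illustration, a typical call site goes through the dispatcher's
+# `compute` classmethod (shapes and dtypes below are arbitrary examples):
+#
+#   import numpy as np
+#   from sklearn.metrics._pairwise_distances_reduction import ArgKmin
+#
+#   X = np.random.rand(100, 5)          # float64 -> ArgKmin64 backend
+#   Y = np.random.rand(1000, 5)
+#   indices = ArgKmin.compute(X, Y, k=3, metric="euclidean")
+#   # indices.shape == (100, 3); pass return_distance=True to also get
+#   # the sorted distances.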
+# +# +# High-level diagram +# ------------------ +# +# Legend: +# +# A ---⊳ B: A inherits from B +# A ---x B: A dispatches to B +# +# +# (base dispatcher) +# BaseDistancesReductionDispatcher +# ∆ +# | +# | +# +------------------+---------------+---------------+------------------+ +# | | | | +# | (dispatcher) (dispatcher) | +# | ArgKmin RadiusNeighbors | +# | | | | +# | | | | +# | | (float{32,64} implem.) | | +# | | BaseDistancesReduction{32,64} | | +# | | ∆ | | +# (dispatcher) | | | (dispatcher) +# ArgKminClassMode | | | RadiusNeighborsClassMode +# | | +----------+----------+ | | +# | | | | | | +# | | | | | | +# | x | | x | +# | +-------⊳ ArgKmin{32,64} RadiusNeighbors{32,64} ⊲---+ | +# x | | ∆ ∆ | | x +# ArgKminClassMode{32,64} | | | | RadiusNeighborsClassMode{32,64} +# ===================================== Specializations ============================================ +# | | | | +# | | | | +# x | | x +# EuclideanArgKmin{32,64} EuclideanRadiusNeighbors{32,64} +# +# +# For instance :class:`ArgKmin` dispatches to: +# - :class:`ArgKmin64` if X and Y are two `float64` array-likes +# - :class:`ArgKmin32` if X and Y are two `float32` array-likes +# +# In addition, if the metric parameter is set to "euclidean" or "sqeuclidean", +# then some direct subclass of `BaseDistancesReduction{32,64}` further dispatches +# to one of their subclass for euclidean-specialized implementation. For instance, +# :class:`ArgKmin64` dispatches to :class:`EuclideanArgKmin64`. +# +# Those Euclidean-specialized implementations relies on optimal implementations of +# a decomposition of the squared euclidean distance matrix into a sum of three terms +# (see :class:`MiddleTermComputer{32,64}`). +# + +from ._dispatcher import ( + ArgKmin, + ArgKminClassMode, + BaseDistancesReductionDispatcher, + RadiusNeighbors, + RadiusNeighborsClassMode, + sqeuclidean_row_norms, +) + +__all__ = [ + "ArgKmin", + "ArgKminClassMode", + "BaseDistancesReductionDispatcher", + "RadiusNeighbors", + "RadiusNeighborsClassMode", + "sqeuclidean_row_norms", +] + +# ruff: noqa: E501 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..515308c6957d286fa9d4c33b1b96631d7c6efe38 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/_dispatcher.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/_dispatcher.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d25737cf601efaf838210d2f5ae020490dcc4a5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/__pycache__/_dispatcher.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..f3a9ce96e64c00f2818b43d147baaa363d6895ee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pxd.tp @@ -0,0 +1,31 @@ +from 
...utils._typedefs cimport intp_t, float64_t + +{{for name_suffix in ['64', '32']}} + +from ._base cimport BaseDistancesReduction{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + +cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the ArgKmin.""" + + cdef: + intp_t k + + intp_t[:, ::1] argkmin_indices + float64_t[:, ::1] argkmin_distances + + # Used as array of pointers to private datastructures used in threads. + float64_t ** heaps_r_distances_chunks + intp_t ** heaps_indices_chunks + + +cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): + """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}.""" + cdef: + MiddleTermComputer{{name_suffix}} middle_term_computer + const float64_t[::1] X_norm_squared + const float64_t[::1] Y_norm_squared + + bint use_squared_distances + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..c21717554e94b22d48558811534ecbc08fe6dc52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin.pyx.tp @@ -0,0 +1,512 @@ +from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX +from cython cimport final +from cython.parallel cimport parallel, prange + +from ...utils._heap cimport heap_push +from ...utils._sorting cimport simultaneous_sort +from ...utils._typedefs cimport intp_t, float64_t + +import numpy as np +import warnings + +from numbers import Integral +from scipy.sparse import issparse +from ...utils import check_array, check_scalar +from ...utils.fixes import _in_unstable_openblas_configuration +from ...utils.parallel import _get_threadpool_controller + +{{for name_suffix in ['64', '32']}} + +from ._base cimport ( + BaseDistancesReduction{{name_suffix}}, + _sqeuclidean_row_norms{{name_suffix}}, +) + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + + +cdef class ArgKmin{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the ArgKmin.""" + + @classmethod + def compute( + cls, + X, + Y, + intp_t k, + metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + ): + """Compute the argkmin reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKmin{{name_suffix}}`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. + """ + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in DOT or GEMM for instance). + with _get_threadpool_controller().limit(limits=1, user_api='blas'): + if metric in ("euclidean", "sqeuclidean"): + # Specialized implementation of ArgKmin for the Euclidean distance + # for the dense-dense and sparse-sparse cases. 
+ # This implementation computes the distances by chunk using + # a decomposition of the Squared Euclidean distance. + # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic ArgKmin + # implementation. + # Note that squared norms of X and Y are precomputed in the + # constructor of this class by issuing BLAS calls that may use + # multithreading (depending on the BLAS implementation), hence calling + # the constructor needs to be protected under the threadpool_limits + # context, along with the main calls to _parallel_on_Y and + # _parallel_on_X. + # For more information see MiddleTermComputer. + use_squared_distances = metric == "sqeuclidean" + pda = EuclideanArgKmin{{name_suffix}}( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + metric_kwargs=metric_kwargs, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKmin{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + ) + + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + chunk_size=None, + strategy=None, + intp_t k=1, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + ) + self.k = check_scalar(k, "k", Integral, min_val=1) + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, the pointers of those heaps are referencing + # (with proper offsets) addresses of the two main heaps (see below) + # - when parallelizing on Y, the pointers of those heaps are referencing + # small heaps which are thread-wise-allocated and whose content will be + # merged with the main heaps'. + self.heaps_r_distances_chunks = malloc( + sizeof(float64_t *) * self.chunks_n_threads + ) + self.heaps_indices_chunks = malloc( + sizeof(intp_t *) * self.chunks_n_threads + ) + + # Main heaps which will be returned as results by `ArgKmin{{name_suffix}}.compute`. + self.argkmin_indices = np.full((self.n_samples_X, self.k), 0, dtype=np.intp) + self.argkmin_distances = np.full((self.n_samples_X, self.k), DBL_MAX, dtype=np.float64) + + def __dealloc__(self): + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + intp_t n_samples_X = X_end - X_start + intp_t n_samples_Y = Y_end - Y_start + float64_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + intp_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distances and their associated indices on a heap + # which by construction will keep track of the argkmin. 
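+        #
+        # Each row i of this X chunk owns a fixed-size heap of k candidates:
+        # heap_push only keeps a (distance, index) pair when the distance is
+        # smaller than the largest value currently stored, so after scanning
+        # all chunks of Y the heap contains the k smallest surrogate
+        # distances for that row. Each push costs at most O(log k), giving
+        # O(n_samples_X * n_samples_Y * log k) work for this pair of chunks.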
+ for i in range(n_samples_X): + for j in range(n_samples_Y): + heap_push( + values=heaps_r_distances + i * self.k, + indices=heaps_indices + i * self.k, + size=self.k, + val=self.datasets_pair.surrogate_dist(X_start + i, Y_start + j), + val_idx=Y_start + j, + ) + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + # As this strategy is embarrassingly parallel, we can set each + # thread's heaps pointer to the proper position on the main heaps. + self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] + self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx + + # Sorting the main heaps portion associated to `X[X_start:X_end]` + # in ascending order w.r.t the distances. + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + cdef: + # Maximum number of scalar elements (the last chunks can be smaller) + intp_t heaps_size = self.X_n_samples_chunk * self.k + intp_t thread_num + + # The allocation is done in parallel for data locality purposes: this way + # the heaps used in each threads are allocated in pages which are closer + # to the CPU core used by the thread. + # See comments about First Touch Placement Policy: + # https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf #noqa + for thread_num in prange(self.chunks_n_threads, schedule='static', nogil=True, + num_threads=self.chunks_n_threads): + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own heaps + # which are then synchronised back in the main ones. + self.heaps_r_distances_chunks[thread_num] = malloc( + heaps_size * sizeof(float64_t) + ) + self.heaps_indices_chunks[thread_num] = malloc( + heaps_size * sizeof(intp_t) + ) + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + # Initialising heaps (memset can't be used here) + for idx in range(self.X_n_samples_chunk * self.k): + self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX + self.heaps_indices_chunks[thread_num][idx] = -1 + + @final + cdef void _parallel_on_Y_synchronize( + self, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx, jdx, thread_num + with nogil, parallel(num_threads=self.effective_n_threads): + # Synchronising the thread heaps with the main heaps. + # This is done in parallel sample-wise (no need for locks). + # + # This might break each thread's data locality as each heap which + # was allocated in a thread is being now being used in several threads. + # + # Still, this parallel pattern has shown to be efficient in practice. 
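+            #
+            # Correctness note: for a given row of X, its global k smallest
+            # distances are necessarily among the k candidates retained by
+            # each thread-local heap, so pushing every thread-local candidate
+            # into the main heap is sufficient to recover the exact result.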
+ for idx in prange(X_end - X_start, schedule="static"): + for thread_num in range(self.chunks_n_threads): + for jdx in range(self.k): + heap_push( + values=&self.argkmin_distances[X_start + idx, 0], + indices=&self.argkmin_indices[X_start + idx, 0], + size=self.k, + val=self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + val_idx=self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + ) + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t idx, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sorting the main in ascending order w.r.t the distances. + # This is done in parallel sample-wise (no need for locks). + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return + + cdef void compute_exact_distances(self) noexcept nogil: + cdef: + intp_t i, j + float64_t[:, ::1] distances = self.argkmin_distances + for i in prange(self.n_samples_X, schedule='static', nogil=True, + num_threads=self.effective_n_threads): + for j in range(self.k): + distances[i, j] = self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against potential -0., causing nan production. + max(distances[i, j], 0.) + ) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + + # Values are returned identically to the way `KNeighborsMixin.kneighbors` + # returns values. This is counter-intuitive but this allows not using + # complex adaptations where `ArgKmin.compute` is called. 
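+            # Both arrays have shape (n_samples_X, k); row i holds the k
+            # nearest neighbors of X[i] in Y, with distances sorted in
+            # ascending order and indices aligned with them.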
+ return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + + return np.asarray(self.argkmin_indices) + + +cdef class EuclideanArgKmin{{name_suffix}}(ArgKmin{{name_suffix}}): + """EuclideanDistance-specialisation of ArgKmin{{name_suffix}}.""" + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (ArgKmin{{name_suffix}}.is_usable_for(X, Y, metric) and + not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + intp_t k, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + metric_kwargs=None, + ): + if ( + isinstance(metric_kwargs, dict) and + (metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"}) + ): + warnings.warn( + f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " + f"usable for this case (EuclideanArgKmin64) and will be ignored.", + UserWarning, + stacklevel=3, + ) + + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"), + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + cdef: + intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + + self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for( + X, + Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=X.shape[1], + chunk_size=self.chunk_size, + ) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) + else: + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( + Y, + self.effective_n_threads, + ) + + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) + else: + # Do not recompute norms if datasets are identical. 
+ self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms{{name_suffix}}( + X, + self.effective_n_threads, + ) + ) + + self.use_squared_distances = use_squared_distances + + @final + cdef void compute_exact_distances(self) noexcept nogil: + if not self.use_squared_distances: + ArgKmin{{name_suffix}}.compute_exact_distances(self) + + @final + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num) + self.middle_term_computer._parallel_on_X_parallel_init(thread_num) + + @final + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + @final + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_Y_init(self) + self.middle_term_computer._parallel_on_Y_init() + + @final + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + ArgKmin{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + float64_t sqeuclidean_dist_i_j + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + float64_t * dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms( + X_start, X_end, Y_start, Y_end, thread_num + ) + float64_t * heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + intp_t * heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. + for i in range(n_X): + for j in range(n_Y): + sqeuclidean_dist_i_j = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. 
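+                    # For instance, with x == y the three terms
+                    # ||x||^2 - 2 x.y + ||y||^2 may not cancel exactly in
+                    # floating point and can leave a tiny negative value.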
+ sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j) + + heap_push( + values=heaps_r_distances + i * self.k, + indices=heaps_indices + i * self.k, + size=self.k, + val=sqeuclidean_dist_i_j, + val_idx=j + Y_start, + ) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..51fb745dca78408b7829a8aeb324bb7f99631c6b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_argkmin_classmode.pyx.tp @@ -0,0 +1,182 @@ +from cython cimport floating, integral +from cython.parallel cimport parallel, prange +from libcpp.map cimport map as cpp_map, pair as cpp_pair +from libc.stdlib cimport free + +from ...utils._typedefs cimport intp_t, float64_t +from ...utils.parallel import _get_threadpool_controller + +import numpy as np +from scipy.sparse import issparse +from ._classmode cimport WeightingStrategy + +{{for name_suffix in ["32", "64"]}} +from ._argkmin cimport ArgKmin{{name_suffix}} +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +cdef class ArgKminClassMode{{name_suffix}}(ArgKmin{{name_suffix}}): + """ + {{name_suffix}}bit implementation of ArgKminClassMode. + """ + cdef: + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels + float64_t[:, :] class_scores + cpp_map[intp_t, intp_t] labels_to_index + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + intp_t k, + weights, + Y_labels, + unique_Y_labels, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + """Compute the argkmin reduction with Y_labels. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`ArgKminClassMode{{name_suffix}}`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance _must_ directly be created outside of this class method. + """ + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = ArgKminClassMode{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
+ with _get_threadpool_controller().limit(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + const intp_t[:] Y_labels, + const intp_t[:] unique_Y_labels, + chunk_size=None, + strategy=None, + intp_t k=1, + weights=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + k=k, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + self.Y_labels = Y_labels + + self.unique_Y_labels = unique_Y_labels + + cdef intp_t idx, neighbor_class_idx + # Map from set of unique labels to their indices in `class_scores` + # Buffer used in building a histogram for one-pass weighted mode + self.class_scores = np.zeros( + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, + ) + + def _finalize_results(self): + probabilities = np.asarray(self.class_scores) + probabilities /= probabilities.sum(axis=1, keepdims=True) + return probabilities + + cdef inline void weighted_histogram_mode( + self, + intp_t sample_index, + intp_t* indices, + float64_t* distances, + ) noexcept nogil: + cdef: + intp_t neighbor_idx, neighbor_class_idx, label_index, multi_output_index + float64_t score_incr = 1 + # TODO: Implement other WeightingStrategy values + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + # Iterate through the sample k-nearest neighbours + for neighbor_rank in range(self.k): + # Absolute indice of the neighbor_rank-th Nearest Neighbors + # in range [0, n_samples_Y) + # TODO: inspect if it worth permuting this condition + # and the for-loop above for improved branching. 
+ if use_distance_weighting: + score_incr = 1 / distances[neighbor_rank] + neighbor_idx = indices[neighbor_rank] + neighbor_class_idx = self.Y_labels[neighbor_idx] + self.class_scores[sample_index][neighbor_class_idx] += score_incr + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx, sample_index + for idx in range(X_end - X_start): + # One-pass top-one weighted mode + # Compute the absolute index in [0, n_samples_X) + sample_index = X_start + idx + self.weighted_histogram_mode( + sample_index, + &self.heaps_indices_chunks[thread_num][idx * self.k], + &self.heaps_r_distances_chunks[thread_num][idx * self.k], + ) + return + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t sample_index, thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + # Deallocating temporary datastructures + for thread_num in prange(self.chunks_n_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + for sample_index in prange(self.n_samples_X, schedule='static'): + self.weighted_histogram_mode( + sample_index, + &self.argkmin_indices[sample_index][0], + &self.argkmin_distances[sample_index][0], + ) + return + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..9578129993c37d392853f97ede19b5ce201a422f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pxd.tp @@ -0,0 +1,135 @@ +from cython cimport final + +from ...utils._typedefs cimport intp_t, float64_t + +{{for name_suffix in ['64', '32']}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + +cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}( + X, + intp_t num_threads, +) + +cdef class BaseDistancesReduction{{name_suffix}}: + """ + Base float{{name_suffix}} implementation template of the pairwise-distances + reduction backends. + + Implementations inherit from this template and may override the several + defined hooks as needed in order to easily extend functionality with + minimal redundant code. + """ + + cdef: + readonly DatasetsPair{{name_suffix}} datasets_pair + + # The number of threads that can be used is stored in effective_n_threads. + # + # The number of threads to use in the parallelization strategy + # (i.e. parallel_on_X or parallel_on_Y) can be smaller than effective_n_threads: + # for small datasets, fewer threads might be needed to loop over pair of chunks. + # + # Hence, the number of threads that _will_ be used for looping over chunks + # is stored in chunks_n_threads, allowing solely using what we need. 
+ # + # Thus, an invariant is: + # + # chunks_n_threads <= effective_n_threads + # + intp_t effective_n_threads + intp_t chunks_n_threads + + intp_t n_samples_chunk, chunk_size + + intp_t n_samples_X, X_n_samples_chunk, X_n_chunks, X_n_samples_last_chunk + intp_t n_samples_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_last_chunk + + bint execute_in_parallel_on_Y + + @final + cdef void _parallel_on_X(self) noexcept nogil + + @final + cdef void _parallel_on_Y(self) noexcept nogil + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) noexcept nogil + + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_X_parallel_finalize( + self, + intp_t thread_num + ) noexcept nogil + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_Y_synchronize( + self, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..2bbfd74e2c2c399297f7836ffb6b973f8318a9e4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -0,0 +1,504 @@ +from cython cimport final +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange +from libcpp.vector cimport vector + +from ...utils._cython_blas cimport _dot +from ...utils._openmp_helpers cimport omp_get_thread_num +from ...utils._typedefs cimport intp_t, float32_t, float64_t, int32_t + +import numpy as np + +from scipy.sparse import issparse +from numbers import Integral +from sklearn import get_config +from sklearn.utils import check_scalar +from ...utils._openmp_helpers import _openmp_effective_n_threads + +##################### + +cdef float64_t[::1] _sqeuclidean_row_norms64_dense( + const float64_t[:, ::1] X, + intp_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. 
+ # See: https://github.com/scipy/scipy/issues/14262 + float64_t * X_ptr = &X[0, 0] + intp_t idx = 0 + intp_t n = X.shape[0] + intp_t d = X.shape[1] + float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + squared_row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return squared_row_norms + + +cdef float64_t[::1] _sqeuclidean_row_norms32_dense( + const float32_t[:, ::1] X, + intp_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. + + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + float32_t * X_ptr = &X[0, 0] + intp_t i = 0, j = 0 + intp_t thread_num + intp_t n = X.shape[0] + intp_t d = X.shape[1] + float64_t[::1] squared_row_norms = np.empty(n, dtype=np.float64) + + # To upcast the i-th row of X from float32 to float64 + vector[vector[float64_t]] X_i_upcast = vector[vector[float64_t]]( + num_threads, vector[float64_t](d) + ) + + with nogil, parallel(num_threads=num_threads): + thread_num = omp_get_thread_num() + + for i in prange(n, schedule='static'): + # Upcasting the i-th row of X from float32 to float64 + for j in range(d): + X_i_upcast[thread_num][j] = deref(X_ptr + i * d + j) + + squared_row_norms[i] = _dot( + d, X_i_upcast[thread_num].data(), 1, + X_i_upcast[thread_num].data(), 1, + ) + + return squared_row_norms + + +cdef float64_t[::1] _sqeuclidean_row_norms64_sparse( + const float64_t[:] X_data, + const int32_t[:] X_indptr, + intp_t num_threads, +): + cdef: + intp_t n = X_indptr.shape[0] - 1 + int32_t X_i_ptr, idx = 0 + float64_t[::1] squared_row_norms = np.zeros(n, dtype=np.float64) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]): + squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr] + + return squared_row_norms + + +{{for name_suffix in ["64", "32"]}} + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + + +cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}( + X, + intp_t num_threads, +): + if issparse(X): + # TODO: remove this instruction which is a cast in the float32 case + # by moving squared row norms computations in MiddleTermComputer. + X_data = np.asarray(X.data, dtype=np.float64) + X_indptr = np.asarray(X.indptr, dtype=np.int32) + return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads) + else: + return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads) + + +cdef class BaseDistancesReduction{{name_suffix}}: + """ + Base float{{name_suffix}} implementation template of the pairwise-distances + reduction backends. + + Implementations inherit from this template and may override the several + defined hooks as needed in order to easily extend functionality with + minimal redundant code. 
+ """ + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + chunk_size=None, + strategy=None, + ): + cdef: + intp_t X_n_full_chunks, Y_n_full_chunks + + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", 256) + + self.chunk_size = check_scalar(chunk_size, "chunk_size", Integral, min_val=20) + + self.effective_n_threads = _openmp_effective_n_threads() + + self.datasets_pair = datasets_pair + + self.n_samples_X = datasets_pair.n_samples_X() + self.X_n_samples_chunk = min(self.n_samples_X, self.chunk_size) + X_n_full_chunks = self.n_samples_X // self.X_n_samples_chunk + X_n_samples_remainder = self.n_samples_X % self.X_n_samples_chunk + self.X_n_chunks = X_n_full_chunks + (X_n_samples_remainder != 0) + + if X_n_samples_remainder != 0: + self.X_n_samples_last_chunk = X_n_samples_remainder + else: + self.X_n_samples_last_chunk = self.X_n_samples_chunk + + self.n_samples_Y = datasets_pair.n_samples_Y() + self.Y_n_samples_chunk = min(self.n_samples_Y, self.chunk_size) + Y_n_full_chunks = self.n_samples_Y // self.Y_n_samples_chunk + Y_n_samples_remainder = self.n_samples_Y % self.Y_n_samples_chunk + self.Y_n_chunks = Y_n_full_chunks + (Y_n_samples_remainder != 0) + + if Y_n_samples_remainder != 0: + self.Y_n_samples_last_chunk = Y_n_samples_remainder + else: + self.Y_n_samples_last_chunk = self.Y_n_samples_chunk + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy not in ('parallel_on_X', 'parallel_on_Y', 'auto'): + raise RuntimeError(f"strategy must be 'parallel_on_X, 'parallel_on_Y', " + f"or 'auto', but currently strategy='{self.strategy}'.") + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + # parallel_on_X has less synchronization overhead than + # parallel_on_Y and should therefore be used whenever + # n_samples_X is large enough to not starve any of the + # available hardware threads. + if self.n_samples_Y < self.n_samples_X: + # No point to even consider parallelizing on Y in this case. This + # is in particular important to do this on machines with a large + # number of hardware threads. + strategy = 'parallel_on_X' + elif 4 * self.chunk_size * self.effective_n_threads < self.n_samples_X: + # If Y is larger than X, but X is still large enough to allow for + # parallelism, we might still want to favor parallelizing on X. + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + self.execute_in_parallel_on_Y = strategy == "parallel_on_Y" + + # Not using less, not using more. + self.chunks_n_threads = min( + self.Y_n_chunks if self.execute_in_parallel_on_Y else self.X_n_chunks, + self.effective_n_threads, + ) + + @final + cdef void _parallel_on_X(self) noexcept nogil: + """Perform computation and reduction in parallel on chunks of X. + + This strategy dispatches tasks statically on threads. Each task + processes exactly only one chunk of X, computing and reducing + distances matrices between vectors of this chunk and vectors of all + chunks of Y, one chunk of Y at a time. + + This strategy is embarrassingly parallel with no intermediate data + structures synchronization at all. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + intp_t thread_num + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = omp_get_thread_num() + + # Allocating thread datastructures + self._parallel_on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread datastructures for the new X chunk + self._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread datastructures on the full pass on Y + self._parallel_on_X_prange_iter_finalize(thread_num, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread datastructures + self._parallel_on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + @final + cdef void _parallel_on_Y(self) noexcept nogil: + """Perform computation and reduction in parallel on chunks of Y. + + This strategy is a sequence of embarrassingly parallel subtasks: + chunks of X are iterated over sequentially, and for each chunk of X, + tasks are dispatched statically on threads. Each task processes one + and only one chunk of Y, computing and reducing distances matrices + between vectors of the chunk of X and vectors of the Y. + + It comes with lock-free and parallelized intermediate data structures + that synchronize at each iteration of the sequential outer loop on X + chunks. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + intp_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + intp_t thread_num + + # Allocating datastructures shared by all threads + self._parallel_on_Y_init() + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1: + X_end = X_start + self.X_n_samples_last_chunk + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=self.chunks_n_threads): + thread_num = omp_get_thread_num() + + # Initializing datastructures used in this thread + self._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1: + Y_end = Y_start + self.Y_n_samples_last_chunk + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + + # end: with nogil, parallel + + # Synchronizing the thread datastructures with the main ones + self._parallel_on_Y_synchronize(X_start, X_end) + + # end: for X_chunk_idx + # Deallocating temporary datastructures and adjusting main datastructures + self._parallel_on_Y_finalize() + return + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is THE core computational method of BaseDistancesReduction{{name_suffix}}. + This must be implemented in subclasses agnostically from the parallelization + strategies. + """ + return + + def _finalize_results(self, bint return_distance): + """Callback adapting datastructures before returning results. + + This must be implemented in subclasses. + """ + return None + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) noexcept nogil: + """Convert rank-preserving distances to exact distances or recompute them.""" + return + + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil: + """Allocate datastructures used in a thread given its number.""" + return + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Initialize datastructures used in a thread given its number. + + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_X_init_chunk( + thread_num, X_start, X_end, + ) + + to ensure the proper upcast of X[X_start:X_end] to float64 prior + to the reduction with float64 accumulator buffers when X.dtype is + float32. + """ + return + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks. 
+ + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior + to the reduction with float64 accumulator buffers when Y.dtype is + float32. + """ + return + + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _parallel_on_X_parallel_finalize( + self, + intp_t thread_num + ) noexcept nogil: + """Interact with datastructures after executing all the reductions.""" + return + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + """Allocate datastructures used in all threads.""" + return + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Initialize datastructures used in a thread given its number. + + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_Y_parallel_init( + thread_num, X_start, X_end, + ) + + to ensure the proper upcast of X[X_start:X_end] to float64 prior + to the reduction with float64 accumulator buffers when X.dtype is + float32. + """ + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + """Initialize datastructures just before the _compute_and_reduce_distances_on_chunks. + + In this method, EuclideanDistance specialisations of subclass of + BaseDistancesReduction _must_ call: + + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + to ensure the proper upcast of Y[Y_start:Y_end] to float64 prior + to the reduction with float64 accumulator buffers when Y.dtype is + float32. 
+ """ + return + + cdef void _parallel_on_Y_synchronize( + self, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + """Update thread datastructures before leaving a parallel region.""" + return + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + """Update datastructures after executing all the reductions.""" + return + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd new file mode 100644 index 0000000000000000000000000000000000000000..65db044d668e89cc0a681a871663220d065dca41 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_classmode.pxd @@ -0,0 +1,5 @@ +cpdef enum WeightingStrategy: + uniform = 0 + # TODO: Implement the following options in weighted_histogram_mode + distance = 1 + callable = 2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..1e57b3291a8f4be47902a1c4c26c1a41d1f43297 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp @@ -0,0 +1,67 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'DistanceMetric64', 'float64_t'), + ('32', 'DistanceMetric32', 'float32_t') +] + +}} +from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t +from ...metrics._dist_metrics cimport DistanceMetric64, DistanceMetric32, DistanceMetric + +{{for name_suffix, DistanceMetric, INPUT_DTYPE_t in implementation_specific_values}} + + +cdef class DatasetsPair{{name_suffix}}: + cdef: + {{DistanceMetric}} distance_metric + intp_t n_features + + cdef intp_t n_samples_X(self) noexcept nogil + + cdef intp_t n_samples_Y(self) noexcept nogil + + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil + + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil + + +cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X + const {{INPUT_DTYPE_t}}[:, ::1] Y + + +cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr + + const {{INPUT_DTYPE_t}}[:] Y_data + const int32_t[::1] Y_indices + const int32_t[::1] Y_indptr + + +cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:] X_data + const int32_t[::1] X_indices + const int32_t[::1] X_indptr + + const {{INPUT_DTYPE_t}}[:] Y_data + const int32_t[::1] Y_indices + intp_t n_Y + + +cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + cdef: + # As distance metrics are commutative, we can simply rely + # on the implementation of SparseDenseDatasetsPair and + # swap arguments. 
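As an illustrative aside (not part of the vendored file), the delegation-by-swapping pattern described in the comment above relies on metric symmetry, dist(x, y) == dist(y, x); a minimal pure-Python sketch with hypothetical names:

class SwappedDatasetsPair:
    # Answers dense-sparse queries by delegating to a pair built with the
    # operands reversed (sparse-dense), then swapping the row indices.
    def __init__(self, inner_pair):
        self.inner = inner_pair  # e.g. a sparse-dense pair constructed with (Y, X)

    def n_samples_X(self):
        return self.inner.n_samples_Y()

    def dist(self, i, j):
        # For a symmetric metric, swapping indices is equivalent to swapping datasets.
        return self.inner.dist(j, i)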
+ DatasetsPair{{name_suffix}} datasets_pair + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..2c3ca44047145e98ca4b446a85db87b1c2ecd2c2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pyx.tp @@ -0,0 +1,406 @@ +import copy + +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + ('64', 'DistanceMetric64', 'float64_t', 'np.float64'), + ('32', 'DistanceMetric32', 'float32_t', 'np.float32') +] + +}} +import numpy as np + +from cython cimport final + +from ...utils._typedefs cimport float64_t, float32_t, intp_t + +from scipy.sparse import issparse, csr_matrix + +{{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef class DatasetsPair{{name_suffix}}: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between a single pair of rows of + of X and Y at a time given the pair of their indices (i, j). This class is + specialized for each metric thanks to the :func:`get_for` factory classmethod. + + The handling of parallelization over chunks to compute the distances + and aggregation for several rows at a time is done in dedicated + subclasses of :class:`BaseDistancesReductionDispatcher` that in-turn rely on + subclasses of :class:`DatasetsPair` for each pair of rows in the data. The + goal is to make it possible to decouple the generic parallelization and + aggregation logic from metric-specific computation as much as possible. + + X and Y can be stored as C-contiguous np.ndarrays or CSR matrices + in subclasses. + + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of dispatching method calls. + + Parameters + ---------- + distance_metric: {{DistanceMetric}} + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + @classmethod + def get_for( + cls, + X, + Y, + metric="euclidean", + dict metric_kwargs=None, + ) -> DatasetsPair{{name_suffix}}: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + metric : str or DistanceMetric object, default='euclidean' + The distance metric to compute between rows of X and Y. + The default metric is a fast implementation of the Euclidean + metric. For a list of available metrics, see the documentation + of :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair{{name_suffix}} + The suited DatasetsPair{{name_suffix}} implementation. 
+ """ + # X_norm_squared and Y_norm_squared might be propagated + # down to DatasetsPairs via metrics_kwargs when the Euclidean + # specialisations can't be used. + # To prevent X_norm_squared and Y_norm_squared to be passed + # down to DistanceMetrics (whose constructors would raise + # a RuntimeError), we pop them here. + if metric_kwargs is not None: + # Copying metric_kwargs not to pop "X_norm_squared" + # and "Y_norm_squared" where they are used + metric_kwargs = copy.copy(metric_kwargs) + metric_kwargs.pop("X_norm_squared", None) + metric_kwargs.pop("Y_norm_squared", None) + cdef: + {{DistanceMetric}} distance_metric = DistanceMetric.get_metric( + metric, + {{INPUT_DTYPE}}, + **(metric_kwargs or {}) + ) + + # Metric-specific checks that do not replace nor duplicate `check_array`. + distance_metric._validate_data(X) + distance_metric._validate_data(Y) + + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + if X_is_sparse and Y_is_sparse: + return SparseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + if X_is_sparse and not Y_is_sparse: + return SparseDenseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + return DenseSparseDatasetsPair{{name_suffix}}(X, Y, distance_metric) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with np.int32.""" + X_data = np.asarray(X.data, dtype={{INPUT_DTYPE}}) + X_indices = np.asarray(X.indices, dtype=np.int32) + X_indptr = np.asarray(X.indptr, dtype=np.int32) + return X_data, X_indices, X_indptr + + def __init__(self, {{DistanceMetric}} distance_metric, intp_t n_features): + self.distance_metric = distance_metric + self.n_features = n_features + + cdef intp_t n_samples_X(self) noexcept nogil: + """Number of samples in X.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -999 + + cdef intp_t n_samples_Y(self) noexcept nogil: + """Number of samples in Y.""" + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -999 + + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.dist(i, j) + + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + # This is a abstract method. + # This _must_ always be overwritten in subclasses. + # TODO: add "with gil: raise" here when supporting Cython 3.0 + return -1 + +@final +cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between row vectors of two arrays. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two row vectors of (X, Y). 
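As a side note (not part of the vendored file), `surrogate_dist` falls back to the true distance in the base class above, but concrete metrics typically return a cheaper rank-preserving proxy; for the Euclidean metric that proxy is the squared distance, as this small NumPy sketch illustrates:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal(8)
Y = rng.standard_normal((20, 8))

rdist = ((Y - x) ** 2).sum(axis=1)  # rank-preserving surrogate: no sqrt needed
dist = np.sqrt(rdist)               # exact Euclidean distance
# Both orderings agree, so reductions (argkmin, radius tests) can run on rdist
# and convert to exact distances only once, at the very end.
print(np.array_equal(np.argsort(rdist), np.argsort(dist)))  # True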
+ """ + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + {{DistanceMetric}} distance_metric, + ): + super().__init__(distance_metric, n_features=X.shape[1]) + # Arrays have already been checked + self.X = X + self.Y = Y + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.X.shape[0] + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.Y.shape[0] + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.n_features) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.n_features) + + +@final +cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between vectors of two CSR matrices. + + Parameters + ---------- + X: sparse matrix of shape (n_samples_X, n_features) + Rows represent vectors. Must be in CSR format. + + Y: sparse matrix of shape (n_samples_Y, n_features) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + def __init__(self, X, Y, {{DistanceMetric}} distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.Y_indptr.shape[0] - 1 + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.rdist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + x2_data=&self.Y_data[0], + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=self.Y_indptr[j], + x2_end=self.Y_indptr[j + 1], + size=self.n_features, + ) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.dist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + x2_data=&self.Y_data[0], + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=self.Y_indptr[j], + x2_end=self.Y_indptr[j + 1], + size=self.n_features, + ) + + +@final +cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between vectors of a CSR matrix and a dense array. + + Parameters + ---------- + X: sparse matrix of shape (n_samples_X, n_features) + Rows represent vectors. Must be in CSR format. + + Y: ndarray of shape (n_samples_Y, n_features) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + def __init__(self, X, Y, {{DistanceMetric}} distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + + # We support the sparse-dense case by using the sparse-sparse interfaces + # of `DistanceMetric` (namely `DistanceMetric.{dist_csr,rdist_csr}`) to + # avoid introducing a new complex set of interfaces. In this case, we + # need to convert `Y` (the dense array) into a CSR matrix. 
+ # + # Here we motive using another simpler CSR representation to use for `Y`. + # + # If we were to use the usual CSR representation for `Y`, storing all + # the columns indices in `indices` would have required allocating an + # array of n_samples × n_features elements with repeated contiguous + # integers from 0 to n_features - 1. This would have been very wasteful + # from a memory point of view. This alternative representation just uses + # the necessary amount of information needed and only necessitates + # shifting the address of `data` before calling the CSR × CSR routines. + # + # In this representation: + # + # - the `data` array is the original dense array, `Y`, whose first + # element's address is shifted before calling the CSR × CSR routine + # + # - the `indices` array is a single row of `n_features` elements: + # + # [0, 1, ..., n_features-1] + # + # - the `indptr` array is not materialised as the indices pointers' + # offset is constant (the offset equals `n_features`). Moreover, as + # `data` is shifted, constant `start` and `end` indices pointers + # respectively equalling 0 and n_features are used. + + # Y array already has been checked here + self.n_Y = Y.shape[0] + self.Y_data = np.ravel(Y) + self.Y_indices = np.arange(self.n_features, dtype=np.int32) + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + return self.n_Y + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + return self.distance_metric.rdist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + # Increment the data pointer such that x2_start=0 is aligned with the + # j-th row + x2_data=&self.Y_data[0] + j * self.n_features, + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=0, + x2_end=self.n_features, + size=self.n_features, + ) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + + return self.distance_metric.dist_csr( + x1_data=&self.X_data[0], + x1_indices=&self.X_indices[0], + # Increment the data pointer such that x2_start=0 is aligned with the + # j-th row + x2_data=&self.Y_data[0] + j * self.n_features, + x2_indices=&self.Y_indices[0], + x1_start=self.X_indptr[i], + x1_end=self.X_indptr[i + 1], + x2_start=0, + x2_end=self.n_features, + size=self.n_features, + ) + + +@final +cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}): + """Compute distances between vectors of a dense array and a CSR matrix. + + Parameters + ---------- + X: ndarray of shape (n_samples_X, n_features) + Rows represent vectors. Must be C-contiguous. + + Y: sparse matrix of shape (n_samples_Y, n_features) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
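As an illustrative aside (not part of the vendored file), the implicit CSR view of the dense operand described in the comments above can be sketched with NumPy; the helper name below is hypothetical:

import numpy as np

def implicit_csr_row(Y, j):
    # View row j of the dense C-contiguous array Y as if it were a CSR row:
    # the data is a shifted slice of Y.ravel() (a view, no copy), the column
    # indices are the single shared row [0, 1, ..., n_features - 1], and the
    # start/end pointers are the constants 0 and n_features.
    n_features = Y.shape[1]
    data = Y.ravel()[j * n_features:(j + 1) * n_features]   # pointer shift, no copy
    indices = np.arange(n_features, dtype=np.int32)         # shared by every row
    return data, indices, 0, n_features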
+ """ + + def __init__(self, X, Y, {{DistanceMetric}} distance_metric): + super().__init__(distance_metric, n_features=X.shape[1]) + # Swapping arguments on the constructor + self.datasets_pair = SparseDenseDatasetsPair{{name_suffix}}(Y, X, distance_metric) + + @final + cdef intp_t n_samples_X(self) noexcept nogil: + # Swapping interface + return self.datasets_pair.n_samples_Y() + + @final + cdef intp_t n_samples_Y(self) noexcept nogil: + # Swapping interface + return self.datasets_pair.n_samples_X() + + @final + cdef float64_t surrogate_dist(self, intp_t i, intp_t j) noexcept nogil: + # Swapping arguments on the same interface + return self.datasets_pair.surrogate_dist(j, i) + + @final + cdef float64_t dist(self, intp_t i, intp_t j) noexcept nogil: + # Swapping arguments on the same interface + return self.datasets_pair.dist(j, i) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py new file mode 100644 index 0000000000000000000000000000000000000000..d8307cbe84eaa904b50bdf11b59546aef397dbc3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -0,0 +1,767 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from typing import List + +import numpy as np +from scipy.sparse import issparse + +from ... import get_config +from .._dist_metrics import ( + BOOL_METRICS, + METRIC_MAPPING64, + DistanceMetric, +) +from ._argkmin import ( + ArgKmin32, + ArgKmin64, +) +from ._argkmin_classmode import ( + ArgKminClassMode32, + ArgKminClassMode64, +) +from ._base import _sqeuclidean_row_norms32, _sqeuclidean_row_norms64 +from ._radius_neighbors import ( + RadiusNeighbors32, + RadiusNeighbors64, +) +from ._radius_neighbors_classmode import ( + RadiusNeighborsClassMode32, + RadiusNeighborsClassMode64, +) + + +def sqeuclidean_row_norms(X, num_threads): + """Compute the squared euclidean norm of the rows of X in parallel. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples, n_features) + Input data. Must be c-contiguous. + + num_threads : int + The number of OpenMP threads to use. + + Returns + ------- + sqeuclidean_row_norms : ndarray of shape (n_samples,) + Arrays containing the squared euclidean norm of each row of X. + """ + if X.dtype == np.float64: + return np.asarray(_sqeuclidean_row_norms64(X, num_threads)) + if X.dtype == np.float32: + return np.asarray(_sqeuclidean_row_norms32(X, num_threads)) + + raise ValueError( + "Only float64 or float32 datasets are supported at this time, " + f"got: X.dtype={X.dtype}." + ) + + +class BaseDistancesReductionDispatcher: + """Abstract base dispatcher for pairwise distance computation & reduction. + + Each dispatcher extending the base :class:`BaseDistancesReductionDispatcher` + dispatcher must implement the :meth:`compute` classmethod. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + # PyFunc cannot be supported because it necessitates interacting with + # the CPython interpreter to call user defined functions. + "pyfunc", + "mahalanobis", # is numerically unstable + # In order to support discrete distance metrics, we need to have a + # stable simultaneous sort which preserves the order of the indices + # because there generally is a lot of occurrences for a given values + # of distances in this case. 
+ # TODO: implement a stable simultaneous_sort. + "hamming", + *BOOL_METRICS, + } + return sorted(({"sqeuclidean"} | set(METRIC_MAPPING64.keys())) - excluded) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + """Return True if the dispatcher can be used for the + given parameters. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) + Input data. + + Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the dispatcher can be used, else False. + """ + + # FIXME: the current Cython implementation is too slow for a large number of + # features. We temporarily disable it to fallback on SciPy's implementation. + # See: https://github.com/scikit-learn/scikit-learn/issues/28191 + if ( + issparse(X) + and issparse(Y) + and isinstance(metric, str) + and "euclidean" in metric + ): + return False + + def is_numpy_c_ordered(X): + return hasattr(X, "flags") and getattr(X.flags, "c_contiguous", False) + + def is_valid_sparse_matrix(X): + return ( + issparse(X) + and X.format == "csr" + and + # TODO: support CSR matrices without non-zeros elements + X.nnz > 0 + and + # TODO: support CSR matrices with int64 indices and indptr + # See: https://github.com/scikit-learn/scikit-learn/issues/23653 + X.indices.dtype == X.indptr.dtype == np.int32 + ) + + is_usable = ( + get_config().get("enable_cython_pairwise_dist", True) + and (is_numpy_c_ordered(X) or is_valid_sparse_matrix(X)) + and (is_numpy_c_ordered(Y) or is_valid_sparse_matrix(Y)) + and X.dtype == Y.dtype + and X.dtype in (np.float32, np.float64) + and (metric in cls.valid_metrics() or isinstance(metric, DistanceMetric)) + ) + + return is_usable + + @classmethod + @abstractmethod + def compute( + cls, + X, + Y, + **kwargs, + ): + """Compute the reduction. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + **kwargs : additional parameters for the reduction + + Notes + ----- + This method is an abstract class method: it has to be implemented + for all subclasses. + """ + + +class ArgKmin(BaseDistancesReductionDispatcher): + """Compute the argkmin of row vectors of X on the ones of Y. + + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. + + ArgKmin is typically used to perform + bruteforce k-nearest neighbors queries. + + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def compute( + cls, + X, + Y, + k, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + return_distance=False, + ): + """Compute the argkmin reduction. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + k : int + The k for the argkmin reduction. + + metric : str, default='euclidean' + The distance metric to use for argkmin. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. 
If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its + argkmin if set to True. + + Returns + ------- + If return_distance=False: + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + If return_distance=True: + - argkmin_distances : ndarray of shape (n_samples_X, k) + Distances to the argkmin for each vector in X. + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. + + Notes + ----- + This classmethod inspects the arguments values to dispatch to the + dtype-specialized implementation of :class:`ArgKmin`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + """ + if X.dtype == Y.dtype == np.float64: + return ArgKmin64.compute( + X=X, + Y=Y, + k=k, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + return_distance=return_distance, + ) + + if X.dtype == Y.dtype == np.float32: + return ArgKmin32.compute( + X=X, + Y=Y, + k=k, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + return_distance=return_distance, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + +class RadiusNeighbors(BaseDistancesReductionDispatcher): + """Compute radius-based neighbors for two sets of vectors. + + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) <= radius + + The distance function `dist` depends on the values of the `metric` + and `metric_kwargs` parameters. 
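As a usage illustration (not part of the vendored file), the `ArgKmin.compute` entry point documented above is normally reached through public estimators, but it can be exercised directly along these lines, importing from the dispatcher module defined in this diff:

import numpy as np
from sklearn.metrics._pairwise_distances_reduction._dispatcher import ArgKmin

rng = np.random.default_rng(0)
X = rng.standard_normal((1000, 50))   # queries: C-contiguous float64
Y = rng.standard_normal((5000, 50))   # index set: same dtype as X

if ArgKmin.is_usable_for(X, Y, metric="euclidean"):
    dist, ind = ArgKmin.compute(X, Y, k=5, metric="euclidean", return_distance=True)
    # dist and ind both have shape (1000, 5); row i lists the 5 rows of Y
    # closest to X[i], with distances returned first as documented above.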
+ + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def compute( + cls, + X, + Y, + radius, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + return_distance=False, + sort_results=False, + ): + """Return the results of the reduction for the given arguments. + + Parameters + ---------- + X : ndarray or CSR matrix of shape (n_samples_X, n_features) + Input data. + + Y : ndarray or CSR matrix of shape (n_samples_Y, n_features) + Input data. + + radius : float + The radius defining the neighborhood. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its neighbors if set to True. + + sort_results : boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + + Returns + ------- + If return_distance=False: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + + If return_distance=True: + - neighbors_indices : ndarray of n_samples_X ndarray + Indices of the neighbors for each vector in X. + - neighbors_distances : ndarray of n_samples_X ndarray + Distances to the neighbors for each vector in X. + + Notes + ----- + This classmethod inspects the arguments values to dispatch to the + dtype-specialized implementation of :class:`RadiusNeighbors`. 
+ + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + """ + if X.dtype == Y.dtype == np.float64: + return RadiusNeighbors64.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + sort_results=sort_results, + return_distance=return_distance, + ) + + if X.dtype == Y.dtype == np.float32: + return RadiusNeighbors32.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + sort_results=sort_results, + return_distance=return_distance, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + +class ArgKminClassMode(BaseDistancesReductionDispatcher): + """Compute the argkmin of row vectors of X on the ones of Y with labels. + + For each row vector of X, computes the indices of k first the rows + vectors of Y with the smallest distances. Computes weighted mode of labels. + + ArgKminClassMode is typically used to perform bruteforce k-nearest neighbors + queries when the weighted mode of the labels for the k-nearest neighbors + are required, such as in `predict` methods. + + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + # Euclidean is technically usable for ArgKminClassMode + # but its current implementation would not be competitive. + # TODO: implement Euclidean specialization using GEMM. + "euclidean", + "sqeuclidean", + } + return list(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded) + + @classmethod + def compute( + cls, + X, + Y, + k, + weights, + Y_labels, + unique_Y_labels, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + ): + """Compute the argkmin reduction. + + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + The input array to be labelled. + + Y : ndarray of shape (n_samples_Y, n_features) + The input array whose class membership are provided through the + `Y_labels` parameter. + + k : int + The number of nearest neighbors to consider. + + weights : ndarray + The weights applied over the `Y_labels` of `Y` when computing the + weighted mode of the labels. + + Y_labels : ndarray + An array containing the index of the class membership of the + associated samples in `Y`. This is used in labeling `X`. + + unique_Y_labels : ndarray + An array containing all unique indices contained in the + corresponding `Y_labels` array. + + metric : str, default='euclidean' + The distance metric to use. For a list of available metrics, see + the documentation of :class:`~sklearn.metrics.DistanceMetric`. + Currently does not support `'precomputed'`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. 
+ + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. + + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + Returns + ------- + probabilities : ndarray of shape (n_samples_X, n_classes) + An array containing the class probabilities for each sample. + + Notes + ----- + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`PairwiseDistancesArgKmin`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + """ + if weights not in {"uniform", "distance"}: + raise ValueError( + "Only the 'uniform' or 'distance' weights options are supported" + f" at this time. Got: {weights=}." + ) + if X.dtype == Y.dtype == np.float64: + return ArgKminClassMode64.compute( + X=X, + Y=Y, + k=k, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + if X.dtype == Y.dtype == np.float32: + return ArgKminClassMode32.compute( + X=X, + Y=Y, + k=k, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) + + +class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher): + """Compute radius-based class modes of row vectors of X using the + those of Y. + + For each row-vector X[i] of the queries X, find all the indices j of + row-vectors in Y such that: + + dist(X[i], Y[j]) <= radius + + RadiusNeighborsClassMode is typically used to perform bruteforce + radius neighbors queries when the weighted mode of the labels for + the nearest neighbors within the specified radius are required, + such as in `predict` methods. 
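As a usage illustration (not part of the vendored file), `ArgKminClassMode.compute` documented above returns per-class scores directly; since 'euclidean' and 'sqeuclidean' are excluded from its valid metrics, this sketch uses 'manhattan' instead:

import numpy as np
from sklearn.metrics._pairwise_distances_reduction._dispatcher import ArgKminClassMode

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 10))            # queries to be labelled
Y = rng.standard_normal((500, 10))            # reference samples with known classes
Y_labels = rng.integers(0, 3, size=500)       # class index of each row of Y
unique_Y_labels = np.unique(Y_labels)         # array([0, 1, 2])

proba = ArgKminClassMode.compute(
    X, Y, k=5, weights="uniform",
    Y_labels=Y_labels, unique_Y_labels=unique_Y_labels,
    metric="manhattan",                       # 'euclidean' is deliberately excluded above
)
# proba has shape (100, 3): one row of class probabilities per query, of the kind
# consumed by KNeighborsClassifier.predict_proba when this backend is selected.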
+ + This class is not meant to be instantiated, one should only use + its :meth:`compute` classmethod which handles allocation and + deallocation consistently. + """ + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + # Euclidean is technically usable for RadiusNeighborsClassMode + # but it would not be competitive. + # TODO: implement Euclidean specialization using GEMM. + "euclidean", + "sqeuclidean", + } + return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded) + + @classmethod + def compute( + cls, + X, + Y, + radius, + weights, + Y_labels, + unique_Y_labels, + outlier_label, + metric="euclidean", + chunk_size=None, + metric_kwargs=None, + strategy=None, + ): + """Return the results of the reduction for the given arguments. + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + The input array to be labelled. + Y : ndarray of shape (n_samples_Y, n_features) + The input array whose class membership is provided through + the `Y_labels` parameter. + radius : float + The radius defining the neighborhood. + weights : ndarray + The weights applied to the `Y_labels` when computing the + weighted mode of the labels. + Y_labels : ndarray + An array containing the index of the class membership of the + associated samples in `Y`. This is used in labeling `X`. + unique_Y_labels : ndarray + An array containing all unique class labels. + outlier_label : int, default=None + Label for outlier samples (samples with no neighbors in given + radius). In the default case when the value is None if any + outlier is detected, a ValueError will be raised. The outlier + label should be selected from among the unique 'Y' labels. If + it is specified with a different value a warning will be raised + and all class probabilities of outliers will be assigned to be 0. + metric : str, default='euclidean' + The distance metric to use. For a list of available metrics, see + the documentation of :class:`~sklearn.metrics.DistanceMetric`. + Currently does not support `'precomputed'`. + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + For both strategies the computations happens with two nested loops, + respectively on chunks of X and chunks of Y. + Strategies differs on which loop (outer or inner) is made to run + in parallel with the Cython `prange` construct: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures + synchronisation. + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread processes all the chunks of X in turn. This strategy is + a sequence of embarrassingly parallel subtasks (the inner loop on Y + chunks) with intermediate datastructures synchronisation at each + iteration of the sequential outer loop on X chunks. + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, + 'parallel_on_X' is usually the most efficient strategy. 
+ When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' + brings more opportunity for parallelism and is therefore more efficient + despite the synchronization step at each iteration of the outer loop + on chunks of `X`. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + Returns + ------- + probabilities : ndarray of shape (n_samples_X, n_classes) + An array containing the class probabilities for each sample. + """ + if weights not in {"uniform", "distance"}: + raise ValueError( + "Only the 'uniform' or 'distance' weights options are supported" + f" at this time. Got: {weights=}." + ) + if X.dtype == Y.dtype == np.float64: + return RadiusNeighborsClassMode64.compute( + X=X, + Y=Y, + radius=radius, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + outlier_label=outlier_label, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + if X.dtype == Y.dtype == np.float32: + return RadiusNeighborsClassMode32.compute( + X=X, + Y=Y, + radius=radius, + weights=weights, + Y_labels=np.array(Y_labels, dtype=np.intp), + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), + outlier_label=outlier_label, + metric=metric, + chunk_size=chunk_size, + metric_kwargs=metric_kwargs, + strategy=strategy, + ) + + raise ValueError( + "Only float64 or float32 datasets pairs are supported at this time, " + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..bdf007bd0514ab4b49ccdd55a3bd5dbe1b2c75ec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd.tp @@ -0,0 +1,228 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE + # + # We also use the float64 dtype and C-type names as defined in + # `sklearn.utils._typedefs` to maintain consistency. 
+ # + ('64', False, 'float64_t', 'np.float64'), + ('32', True, 'float32_t', 'np.float32') +] + +}} +from libcpp.vector cimport vector + +from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + + +cdef void _middle_term_sparse_sparse_64( + const float64_t[:] X_data, + const int32_t[:] X_indices, + const int32_t[:] X_indptr, + intp_t X_start, + intp_t X_end, + const float64_t[:] Y_data, + const int32_t[:] Y_indices, + const int32_t[:] Y_indptr, + intp_t Y_start, + intp_t Y_end, + float64_t * D, +) noexcept nogil + + +{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + + +cdef class MiddleTermComputer{{name_suffix}}: + cdef: + intp_t effective_n_threads + intp_t chunks_n_threads + intp_t dist_middle_terms_chunks_size + intp_t n_features + intp_t chunk_size + + # Buffers for the `-2 * X_c @ Y_c.T` term computed via GEMM + vector[vector[float64_t]] dist_middle_terms_chunks + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_init(self) noexcept nogil + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + +cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const {{INPUT_DTYPE_t}}[:, ::1] X + const {{INPUT_DTYPE_t}}[:, ::1] Y + + {{if upcast_to_float64}} + # Buffers for upcasting chunks of X and Y from 32bit to 64bit + vector[vector[float64_t]] X_c_upcast + vector[vector[float64_t]] Y_c_upcast + {{endif}} + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + +cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const float64_t[:] X_data + const int32_t[:] X_indices + const int32_t[:] X_indptr + + const float64_t[:] Y_data + const int32_t[:] Y_indices + const int32_t[:] Y_indptr + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef void 
_parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + + +cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + cdef: + const float64_t[:] X_data + const int32_t[:] X_indices + const int32_t[:] X_indptr + + const {{INPUT_DTYPE_t}}[:, ::1] Y + + # We treat the dense-sparse case with the sparse-dense case by simply + # treating the dist_middle_terms as F-ordered and by swapping arguments. + # This attribute is meant to encode the case and adapt the logic + # accordingly. + bint c_ordered_middle_term + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..1fca2d674720c40fa2df8f56fea4f3a7a6980ba8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx.tp @@ -0,0 +1,633 @@ +{{py: + +implementation_specific_values = [ + # Values are the following ones: + # + # name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE + # + # We also use the float64 dtype and C-type names as defined in + # `sklearn.utils._typedefs` to maintain consistency. + # + ('64', False, 'float64_t', 'np.float64'), + ('32', True, 'float32_t', 'np.float32') +] + +}} +from libcpp.vector cimport vector +from libcpp.algorithm cimport fill + +from ...utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + NoTrans, + RowMajor, + Trans, + _gemm, +) +from ...utils._typedefs cimport float64_t, float32_t, int32_t, intp_t + +import numpy as np +from scipy.sparse import issparse, csr_matrix + + +cdef void _middle_term_sparse_sparse_64( + const float64_t[:] X_data, + const int32_t[:] X_indices, + const int32_t[:] X_indptr, + intp_t X_start, + intp_t X_end, + const float64_t[:] Y_data, + const int32_t[:] Y_indices, + const int32_t[:] Y_indptr, + intp_t Y_start, + intp_t Y_end, + float64_t * D, +) noexcept nogil: + # This routine assumes that D points to the first element of a + # zeroed buffer of length at least equal to n_X × n_Y, conceptually + # representing a 2-d C-ordered array. 
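[Editor's note] As a reference for what the `_middle_term_sparse_sparse_64` routine described in the comment above produces, the same chunked `-2 * X_c @ Y_c.T` block can be obtained directly with SciPy; this is a sketch for checking intent, not the Cython routine itself.

import numpy as np
from scipy.sparse import random as sparse_random

X = sparse_random(10, 8, density=0.3, format="csr", random_state=0)
Y = sparse_random(12, 8, density=0.3, format="csr", random_state=1)

# Middle term for the chunk pair X[2:6] / Y[3:9], materialised as the dense,
# C-ordered (n_X, n_Y) block that the zeroed buffer D ends up holding.
X_start, X_end, Y_start, Y_end = 2, 6, 3, 9
D = -2.0 * (X[X_start:X_end] @ Y[Y_start:Y_end].T).toarray()
print(D.shape)   # (4, 6)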
+ cdef: + intp_t i, j, k + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + intp_t x_col, x_ptr, y_col, y_ptr + + for i in range(n_X): + for x_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + x_col = X_indices[x_ptr] + for j in range(n_Y): + k = i * n_Y + j + for y_ptr in range(Y_indptr[Y_start+j], Y_indptr[Y_start+j+1]): + y_col = Y_indices[y_ptr] + if x_col == y_col: + D[k] += -2 * X_data[x_ptr] * Y_data[y_ptr] + + +{{for name_suffix, upcast_to_float64, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef void _middle_term_sparse_dense_{{name_suffix}}( + const float64_t[:] X_data, + const int32_t[:] X_indices, + const int32_t[:] X_indptr, + intp_t X_start, + intp_t X_end, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + intp_t Y_start, + intp_t Y_end, + bint c_ordered_middle_term, + float64_t * dist_middle_terms, +) noexcept nogil: + # This routine assumes that dist_middle_terms is a pointer to the first element + # of a buffer filled with zeros of length at least equal to n_X × n_Y, conceptually + # representing a 2-d C-ordered of F-ordered array. + cdef: + intp_t i, j, k + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + intp_t X_i_col_idx, X_i_ptr, Y_j_col_idx, Y_j_ptr + + for i in range(n_X): + for j in range(n_Y): + k = i * n_Y + j if c_ordered_middle_term else j * n_X + i + for X_i_ptr in range(X_indptr[X_start+i], X_indptr[X_start+i+1]): + X_i_col_idx = X_indices[X_i_ptr] + dist_middle_terms[k] += -2 * X_data[X_i_ptr] * Y[Y_start + j, X_i_col_idx] + + +cdef class MiddleTermComputer{{name_suffix}}: + """Helper class to compute a Euclidean distance matrix in chunks. + + This is an abstract base class that is further specialized depending + on the type of data (dense or sparse). + + `EuclideanDistance` subclasses relies on the squared Euclidean + distances between chunks of vectors X_c and Y_c using the + following decomposition for the (i,j) pair : + + + ||X_c_i - Y_c_j||² = ||X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + + + This helper class is in charge of wrapping the common logic to compute + the middle term, i.e. `- 2 X_c_i.Y_c_j^T`. + """ + + @classmethod + def get_for( + cls, + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) -> MiddleTermComputer{{name_suffix}}: + """Return the MiddleTermComputer implementation for the given arguments. + + Parameters + ---------- + X : ndarray or CSR sparse matrix of shape (n_samples_X, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Y : ndarray or CSR sparse matrix of shape (n_samples_Y, n_features) + Input data. + If provided as a ndarray, it must be C-contiguous. + + Returns + ------- + middle_term_computer: MiddleTermComputer{{name_suffix}} + The suited MiddleTermComputer{{name_suffix}} implementation. 
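[Editor's note] A quick NumPy check of the squared Euclidean decomposition quoted in the `MiddleTermComputer` docstring above, under the assumption that only the middle term is computed chunk-wise via GEMM while the row norms are precomputed once; illustrative only.

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 6))
Y = rng.standard_normal((7, 6))

middle = -2.0 * X @ Y.T   # the only chunk-wise (GEMM) term
sq_dists = (X ** 2).sum(axis=1)[:, None] + middle + (Y ** 2).sum(axis=1)[None, :]

assert np.allclose(sq_dists, cdist(X, Y, "sqeuclidean"))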
+ """ + X_is_sparse = issparse(X) + Y_is_sparse = issparse(Y) + + if not X_is_sparse and not Y_is_sparse: + return DenseDenseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + if X_is_sparse and Y_is_sparse: + return SparseSparseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + if X_is_sparse and not Y_is_sparse: + return SparseDenseMiddleTermComputer{{name_suffix}}( + X, + Y, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + c_ordered_middle_term=True + ) + if not X_is_sparse and Y_is_sparse: + # NOTE: The Dense-Sparse case is implement via the Sparse-Dense case. + # + # To do so: + # - X (dense) and Y (sparse) are swapped + # - the distance middle term is seen as F-ordered for consistency + # (c_ordered_middle_term = False) + return SparseDenseMiddleTermComputer{{name_suffix}}( + # Mind that X and Y are swapped here. + Y, + X, + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + c_ordered_middle_term=False, + ) + raise NotImplementedError( + "X and Y must be CSR sparse matrices or numpy arrays." + ) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure that the CSR matrix is indexed with np.int32.""" + X_data = np.asarray(X.data, dtype=np.float64) + X_indices = np.asarray(X.indices, dtype=np.int32) + X_indptr = np.asarray(X.indptr, dtype=np.int32) + return X_data, X_indices, X_indptr + + def __init__( + self, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + ): + self.effective_n_threads = effective_n_threads + self.chunks_n_threads = chunks_n_threads + self.dist_middle_terms_chunks_size = dist_middle_terms_chunks_size + self.n_features = n_features + self.chunk_size = chunk_size + + self.dist_middle_terms_chunks = vector[vector[float64_t]](self.effective_n_threads) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + return + + cdef void _parallel_on_X_parallel_init(self, intp_t thread_num) noexcept nogil: + self.dist_middle_terms_chunks[thread_num].resize(self.dist_middle_terms_chunks_size) + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + return + + cdef void _parallel_on_Y_init(self) noexcept nogil: + for thread_num in range(self.chunks_n_threads): + self.dist_middle_terms_chunks[thread_num].resize( + self.dist_middle_terms_chunks_size + ) + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + return + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil: + return + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + return NULL + + +cdef class DenseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Computes the middle term of the Euclidean distance between two chunked dense matrices + X_c and Y_c. 
+ + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + This class use the BLAS gemm routine to perform the dot product of each chunks + of the distance matrix with improved arithmetic intensity and vector instruction (SIMD). + """ + + def __init__( + self, + const {{INPUT_DTYPE_t}}[:, ::1] X, + const {{INPUT_DTYPE_t}}[:, ::1] Y, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X = X + self.Y = Y + +{{if upcast_to_float64}} + # We populate the buffer for upcasting chunks of X and Y from float32 to float64. + self.X_c_upcast = vector[vector[float64_t]](self.effective_n_threads) + self.Y_c_upcast = vector[vector[float64_t]](self.effective_n_threads) + + upcast_buffer_n_elements = self.chunk_size * n_features + + for thread_num in range(self.effective_n_threads): + self.X_c_upcast[thread_num].resize(upcast_buffer_n_elements) + self.Y_c_upcast[thread_num].resize(upcast_buffer_n_elements) +{{endif}} + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] +{{else}} + return +{{endif}} + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] +{{else}} + return +{{endif}} + + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = X_end - X_start + + # Upcasting X_c=X[X_start:X_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.X_c_upcast[thread_num][i * self.n_features + j] = self.X[X_start + i, j] +{{else}} + return +{{endif}} + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num + ) noexcept nogil: +{{if upcast_to_float64}} + cdef: + intp_t i, j + intp_t n_chunk_samples = Y_end - Y_start + + # Upcasting Y_c=Y[Y_start:Y_end, :] from float32 to float64 + for i in range(n_chunk_samples): + for j in range(self.n_features): + self.Y_c_upcast[thread_num][i * self.n_features + j] = self.Y[Y_start + i, j] +{{else}} + return +{{endif}} + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + float64_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num].data() + + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # 
https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + intp_t m = X_end - X_start + intp_t n = Y_end - Y_start + intp_t K = self.n_features + float64_t alpha = - 2. +{{if upcast_to_float64}} + float64_t * A = self.X_c_upcast[thread_num].data() + float64_t * B = self.Y_c_upcast[thread_num].data() +{{else}} + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + # See: https://github.com/scipy/scipy/issues/14262 + float64_t * A = &self.X[X_start, 0] + float64_t * B = &self.Y[Y_start, 0] +{{endif}} + intp_t lda = self.n_features + intp_t ldb = self.n_features + float64_t beta = 0. + intp_t ldc = Y_end - Y_start + + # dist_middle_terms = `-2 * X[X_start:X_end] @ Y[Y_start:Y_end].T` + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, dist_middle_terms, ldc) + + return dist_middle_terms + + +cdef class SparseSparseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Middle term of the Euclidean distance between two chunked CSR matrices. + + The result is return as a contiguous array. + + dist_middle_terms = - 2 X_c_i.Y_c_j^T + + The logic of the computation is wrapped in the routine _middle_term_sparse_sparse_64. + This routine iterates over the data, indices and indptr arrays of the sparse matrices without + densifying them. + """ + + def __init__( + self, + X, + Y, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Flush the thread dist_middle_terms_chunks to 0.0 + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + float64_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + _middle_term_sparse_sparse_64( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y_data, + self.Y_indices, + self.Y_indptr, + Y_start, + Y_end, + dist_middle_terms, + ) + + return dist_middle_terms + +cdef class SparseDenseMiddleTermComputer{{name_suffix}}(MiddleTermComputer{{name_suffix}}): + """Middle term of the Euclidean distance between chunks of a CSR matrix and a np.ndarray. 
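[Editor's note] The `_gemm` call above maps onto standard BLAS GEMM; the same `-2 * X_c @ Y_c.T` block can be sketched with SciPy's BLAS wrapper to illustrate the alpha/transpose choices. This uses `scipy.linalg.blas.dgemm`, not the project's Cython `_gemm` binding.

import numpy as np
from scipy.linalg.blas import dgemm

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 3))   # X chunk (m x K), C-contiguous
B = rng.standard_normal((7, 3))   # Y chunk (n x K), C-contiguous

# alpha = -2 with B transposed: the middle term of the squared Euclidean distance.
middle = dgemm(alpha=-2.0, a=A, b=B, trans_b=True)
assert np.allclose(middle, -2.0 * A @ B.T)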
+ + The logic of the computation is wrapped in the routine _middle_term_sparse_dense_{{name_suffix}}. + This routine iterates over the data, indices and indptr arrays of the sparse matrices + without densifying them. + """ + + def __init__( + self, + X, + Y, + intp_t effective_n_threads, + intp_t chunks_n_threads, + intp_t dist_middle_terms_chunks_size, + intp_t n_features, + intp_t chunk_size, + bint c_ordered_middle_term, + ): + super().__init__( + effective_n_threads, + chunks_n_threads, + dist_middle_terms_chunks_size, + n_features, + chunk_size, + ) + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y = Y + self.c_ordered_middle_term = c_ordered_middle_term + + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Fill the thread's dist_middle_terms_chunks with 0.0 before + # computing its elements in _compute_dist_middle_terms. + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + # Fill the thread's dist_middle_terms_chunks with 0.0 before + # computing its elements in _compute_dist_middle_terms. + fill( + self.dist_middle_terms_chunks[thread_num].begin(), + self.dist_middle_terms_chunks[thread_num].end(), + 0.0, + ) + + cdef float64_t * _compute_dist_middle_terms( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + float64_t *dist_middle_terms = ( + self.dist_middle_terms_chunks[thread_num].data() + ) + + # For the dense-sparse case, we use the sparse-dense case + # with dist_middle_terms seen as F-ordered. + # Hence we swap indices pointers here. + if not self.c_ordered_middle_term: + X_start, Y_start = Y_start, X_start + X_end, Y_end = Y_end, X_end + + _middle_term_sparse_dense_{{name_suffix}}( + self.X_data, + self.X_indices, + self.X_indptr, + X_start, + X_end, + self.Y, + Y_start, + Y_end, + self.c_ordered_middle_term, + dist_middle_terms, + ) + + return dist_middle_terms + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp new file mode 100644 index 0000000000000000000000000000000000000000..809a80a68c5b0c6a513b8b9267fe211f109cdaee --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd.tp @@ -0,0 +1,90 @@ +cimport numpy as cnp + +from libcpp.memory cimport shared_ptr +from libcpp.vector cimport vector +from cython cimport final + +from ...utils._typedefs cimport intp_t, float64_t + +cnp.import_array() + +###################### +## std::vector to np.ndarray coercion +# As type covariance is not supported for C++ containers via Cython, +# we need to redefine fused types. 
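[Editor's note] Returning briefly to `SparseDenseMiddleTermComputer` defined earlier: a NumPy/SciPy sketch of its operand-swap trick for the dense-sparse case, under the assumption that reading the swapped C-ordered result as F-ordered is equivalent to transposing it.

import numpy as np
from scipy.sparse import random as sparse_random

rng = np.random.default_rng(0)
X_dense = rng.standard_normal((4, 8))                                    # dense X
Y_csr = sparse_random(6, 8, density=0.4, format="csr", random_state=0)   # sparse Y

# Compute the sparse-dense middle term with the operands swapped ...
swapped = -2.0 * (Y_csr @ X_dense.T)          # C-ordered (n_Y, n_X) block
# ... then read it back as the (n_X, n_Y) dense-sparse middle term.
middle = np.asarray(swapped).T
assert np.allclose(middle, -2.0 * (X_dense @ Y_csr.toarray().T))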
+ctypedef fused vector_double_intp_t: + vector[intp_t] + vector[float64_t] + + +ctypedef fused vector_vector_double_intp_t: + vector[vector[intp_t]] + vector[vector[float64_t]] + +cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + shared_ptr[vector_vector_double_intp_t] vecs +) + +##################### +{{for name_suffix in ['64', '32']}} + +from ._base cimport BaseDistancesReduction{{name_suffix}} +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + +cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the RadiusNeighbors.""" + + cdef: + float64_t radius + + # DistanceMetric{{name_suffix}} compute rank-preserving surrogate distance via rdist + # which are proxies necessitating less computations. + # We get the equivalent for the radius to be able to compare it against + # vectors' rank-preserving surrogate distances. + float64_t r_radius + + # Neighbors indices and distances are returned as np.ndarrays of np.ndarrays. + # + # For this implementation, we want resizable buffers which we will wrap + # into numpy arrays at the end. std::vector comes as a handy container + # for interacting efficiently with resizable buffers. + # + # Though it is possible to access their buffer address with + # std::vector::data, they can't be stolen: buffers lifetime + # is tied to their std::vector and are deallocated when + # std::vectors are. + # + # To solve this, we dynamically allocate std::vectors and then + # encapsulate them in a StdVectorSentinel responsible for + # freeing them when the associated np.ndarray is freed. + # + # Shared pointers (defined via shared_ptr) are use for safer memory management. + # Unique pointers (defined via unique_ptr) can't be used as datastructures + # are shared across threads for parallel_on_X; see _parallel_on_X_init_chunk. + shared_ptr[vector[vector[intp_t]]] neigh_indices + shared_ptr[vector[vector[float64_t]]] neigh_distances + + # Used as array of pointers to private datastructures used in threads. 
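[Editor's note] On the `r_radius` attribute declared above: for the Euclidean metric the rank-preserving surrogate distance is the squared distance, so comparing surrogate distances against a squared radius selects the same neighbours without any square roots. A small standalone numeric illustration, not sklearn code:

import numpy as np

radius = 2.0
r_radius = radius ** 2                 # surrogate radius for the Euclidean metric

x = np.array([0.0, 0.0])
y = np.array([1.0, 1.5])
surrogate = ((x - y) ** 2).sum()       # squared distance: cheaper, rank-preserving
assert (surrogate <= r_radius) == (np.sqrt(surrogate) <= radius)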
+ vector[shared_ptr[vector[vector[intp_t]]]] neigh_indices_chunks + vector[shared_ptr[vector[vector[float64_t]]]] neigh_distances_chunks + + bint sort_results + + @final + cdef void _merge_vectors( + self, + intp_t idx, + intp_t num_threads, + ) noexcept nogil + + +cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): + """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}.""" + cdef: + MiddleTermComputer{{name_suffix}} middle_term_computer + const float64_t[::1] X_norm_squared + const float64_t[::1] Y_norm_squared + + bint use_squared_distances + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..d0567f2ead804d122fc24424a5d502084b6565e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx.tp @@ -0,0 +1,514 @@ +cimport numpy as cnp +import numpy as np +import warnings + +from libcpp.memory cimport shared_ptr, make_shared +from libcpp.vector cimport vector +from libcpp.algorithm cimport move +from cython cimport final +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange + +from ...utils._sorting cimport simultaneous_sort +from ...utils._typedefs cimport intp_t, float64_t +from ...utils._vector_sentinel cimport vector_to_nd_array + +from numbers import Real +from scipy.sparse import issparse +from ...utils import check_array, check_scalar +from ...utils.fixes import _in_unstable_openblas_configuration +from ...utils.parallel import _get_threadpool_controller + +cnp.import_array() + +###################### + +cdef cnp.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + shared_ptr[vector_vector_double_intp_t] vecs +): + """Coerce a std::vector of std::vector to a ndarray of ndarray.""" + cdef: + intp_t n = deref(vecs).size() + cnp.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, dtype=np.ndarray) + + for i in range(n): + nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) + + return nd_arrays_of_nd_arrays + +##################### +{{for name_suffix in ['64', '32']}} + +from ._base cimport ( + BaseDistancesReduction{{name_suffix}}, + _sqeuclidean_row_norms{{name_suffix}} +) + +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +from ._middle_term_computer cimport MiddleTermComputer{{name_suffix}} + + +cdef class RadiusNeighbors{{name_suffix}}(BaseDistancesReduction{{name_suffix}}): + """float{{name_suffix}} implementation of the RadiusNeighbors.""" + + @classmethod + def compute( + cls, + X, + Y, + float64_t radius, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + bint return_distance=False, + bint sort_results=False, + ): + """Compute the radius-neighbors reduction. + + This classmethod is responsible for introspecting the arguments + values to dispatch to the most appropriate implementation of + :class:`RadiusNeighbors{{name_suffix}}`. + + This allows decoupling the API entirely from the implementation details + whilst maintaining RAII: all temporarily allocated datastructures necessary + for the concrete implementation are therefore freed when this classmethod + returns. + + No instance should directly be created outside of this class method. 
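[Editor's note] A hedged usage sketch of the dispatcher of the same name that wraps the `compute` classmethod documented above. The import path is the private `sklearn.metrics._pairwise_distances_reduction` package, so it may change between releases; data values are illustrative.

import numpy as np
from sklearn.metrics._pairwise_distances_reduction import RadiusNeighbors

rng = np.random.default_rng(0)
X = rng.standard_normal((3, 5))
Y = rng.standard_normal((50, 5))

distances, indices = RadiusNeighbors.compute(
    X, Y, radius=3.0, metric="euclidean",
    return_distance=True, sort_results=True,
)
# Ragged result: one variable-length ndarray per query row of X.
print([d.shape for d in distances])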
+ """ + if metric in ("euclidean", "sqeuclidean"): + # Specialized implementation of RadiusNeighbors for the Euclidean + # distance for the dense-dense and sparse-sparse cases. + # This implementation computes the distances by chunk using + # a decomposition of the Squared Euclidean distance. + # This specialisation has an improved arithmetic intensity for both + # the dense and sparse settings, allowing in most case speed-ups of + # several orders of magnitude compared to the generic RadiusNeighbors + # implementation. + # For more information see MiddleTermComputer. + use_squared_distances = metric == "sqeuclidean" + pda = EuclideanRadiusNeighbors{{name_suffix}}( + X=X, Y=Y, radius=radius, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + metric_kwargs=metric_kwargs, + ) + else: + # Fall back on a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = RadiusNeighbors{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with _get_threadpool_controller().limit(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results(return_distance) + + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + float64_t radius, + chunk_size=None, + strategy=None, + sort_results=False, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + ) + + self.radius = check_scalar(radius, "radius", Real, min_val=0) + self.r_radius = self.datasets_pair.distance_metric._dist_to_rdist(radius) + self.sort_results = sort_results + + # Allocating pointers to datastructures but not the datastructures themselves. + # There are as many pointers as effective threads. + # + # For the sake of explicitness: + # - when parallelizing on X, the pointers of those heaps are referencing + # self.neigh_distances and self.neigh_indices + # - when parallelizing on Y, the pointers of those heaps are referencing + # std::vectors of std::vectors which are thread-wise-allocated and whose + # content will be merged into self.neigh_distances and self.neigh_indices. + self.neigh_distances_chunks = vector[shared_ptr[vector[vector[float64_t]]]]( + self.chunks_n_threads + ) + self.neigh_indices_chunks = vector[shared_ptr[vector[vector[intp_t]]]]( + self.chunks_n_threads + ) + + # Temporary datastructures which will be coerced to numpy arrays on before + # RadiusNeighbors.compute "return" and will be then freed. 
+ self.neigh_distances = make_shared[vector[vector[float64_t]]](self.n_samples_X) + self.neigh_indices = make_shared[vector[vector[intp_t]]](self.n_samples_X) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + float64_t r_dist_i_j + + for i in range(X_start, X_end): + for j in range(Y_start, Y_end): + r_dist_i_j = self.datasets_pair.surrogate_dist(i, j) + if r_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We need to recompute distances because we relied on + # surrogate distances for the reduction. + self.compute_exact_distances() + return ( + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), + ) + + return coerce_vectors_to_nd_arrays(self.neigh_indices) + + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + + # As this strategy is embarrassingly parallel, we can set the + # thread vectors' pointers to the main vectors'. + self.neigh_distances_chunks[thread_num] = self.neigh_distances + self.neigh_indices_chunks[thread_num] = self.neigh_indices + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx + + # Sorting neighbors for each query vector of X + if self.sort_results: + for idx in range(X_start, X_end): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + cdef: + intp_t thread_num + # As chunks of X are shared across threads, so must datastructures to avoid race + # conditions: each thread has its own vectors of n_samples_X vectors which are + # then merged back in the main n_samples_X vectors. + for thread_num in range(self.chunks_n_threads): + self.neigh_distances_chunks[thread_num] = make_shared[vector[vector[float64_t]]](self.n_samples_X) + self.neigh_indices_chunks[thread_num] = make_shared[vector[vector[intp_t]]](self.n_samples_X) + + @final + cdef void _merge_vectors( + self, + intp_t idx, + intp_t num_threads, + ) noexcept nogil: + cdef: + intp_t thread_num + intp_t idx_n_elements = 0 + intp_t last_element_idx = deref(self.neigh_indices)[idx].size() + + # Resizing buffers only once for the given number of elements. + for thread_num in range(num_threads): + idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) + deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) + + # Moving the elements by range using the range first element + # as the reference for the insertion. 
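[Editor's note] A plain-Python analogue of what `_merge_vectors` does for a single query index (resize the main buffer once, then append each thread's chunk after the existing elements), offered only to make the pointer/std::move logic above easier to follow; the numbers are made up.

main = [0.2]                              # elements already in the main vector
per_thread = [[0.7, 0.1], [], [0.4]]      # thread-local vectors for this index

last = len(main)                          # insertion offset, as in last_element_idx
main.extend(x for chunk in per_thread for x in chunk)
assert main == [0.2, 0.7, 0.1, 0.4] and last == 1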
+ for thread_num in range(num_threads): + move( + deref(self.neigh_distances_chunks[thread_num])[idx].begin(), + deref(self.neigh_distances_chunks[thread_num])[idx].end(), + deref(self.neigh_distances)[idx].begin() + last_element_idx + ) + move( + deref(self.neigh_indices_chunks[thread_num])[idx].begin(), + deref(self.neigh_indices_chunks[thread_num])[idx].end(), + deref(self.neigh_indices)[idx].begin() + last_element_idx + ) + last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t idx + + with nogil, parallel(num_threads=self.effective_n_threads): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks). + for idx in prange(self.n_samples_X, schedule='static'): + self._merge_vectors(idx, self.chunks_n_threads) + + # The content of the vector have been std::moved. + # Hence they can't be used anymore and can be deleted. + # Their deletion is carried out automatically as the + # implementation relies on shared pointers. + + # Sort in parallel in ascending order w.r.t the distances if requested. + if self.sort_results: + for idx in prange(self.n_samples_X, schedule='static'): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + return + + cdef void compute_exact_distances(self) noexcept nogil: + """Convert rank-preserving distances to pairwise distances in parallel.""" + cdef: + intp_t i + vector[intp_t].size_type j + + for i in prange(self.n_samples_X, nogil=True, schedule='static', + num_threads=self.effective_n_threads): + for j in range(deref(self.neigh_indices)[i].size()): + deref(self.neigh_distances)[i][j] = ( + self.datasets_pair.distance_metric._rdist_to_dist( + # Guard against potential -0., causing nan production. + max(deref(self.neigh_distances)[i][j], 0.) 
+ ) + ) + + +cdef class EuclideanRadiusNeighbors{{name_suffix}}(RadiusNeighbors{{name_suffix}}): + """EuclideanDistance-specialisation of RadiusNeighbors{{name_suffix}}.""" + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (RadiusNeighbors{{name_suffix}}.is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + float64_t radius, + bint use_squared_distances=False, + chunk_size=None, + strategy=None, + sort_results=False, + metric_kwargs=None, + ): + if ( + isinstance(metric_kwargs, dict) and + (metric_kwargs.keys() - {"X_norm_squared", "Y_norm_squared"}) + ): + warnings.warn( + f"Some metric_kwargs have been passed ({metric_kwargs}) but aren't " + f"usable for this case (EuclideanRadiusNeighbors64) and will be ignored.", + UserWarning, + stacklevel=3, + ) + + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + sort_results=sort_results, + ) + cdef: + intp_t dist_middle_terms_chunks_size = self.Y_n_samples_chunk * self.X_n_samples_chunk + + self.middle_term_computer = MiddleTermComputer{{name_suffix}}.get_for( + X, + Y, + self.effective_n_threads, + self.chunks_n_threads, + dist_middle_terms_chunks_size, + n_features=X.shape[1], + chunk_size=self.chunk_size, + ) + + if metric_kwargs is not None and "Y_norm_squared" in metric_kwargs: + self.Y_norm_squared = check_array( + metric_kwargs.pop("Y_norm_squared"), + ensure_2d=False, + input_name="Y_norm_squared", + dtype=np.float64, + ) + else: + self.Y_norm_squared = _sqeuclidean_row_norms{{name_suffix}}( + Y, + self.effective_n_threads, + ) + + if metric_kwargs is not None and "X_norm_squared" in metric_kwargs: + self.X_norm_squared = check_array( + metric_kwargs.pop("X_norm_squared"), + ensure_2d=False, + input_name="X_norm_squared", + dtype=np.float64, + ) + else: + # Do not recompute norms if datasets are identical. + self.X_norm_squared = ( + self.Y_norm_squared if X is Y else + _sqeuclidean_row_norms{{name_suffix}}( + X, + self.effective_n_threads, + ) + ) + + self.use_squared_distances = use_squared_distances + + if use_squared_distances: + # In this specialisation and this setup, the value passed to the radius is + # already considered to be the adapted radius, so we overwrite it. 
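[Editor's note] Following the comment above, with `metric="sqeuclidean"` the radius is interpreted as an already-squared radius, so the two calls below should select the same neighbour sets. This uses the private dispatcher under the same import-path assumption as earlier.

import numpy as np
from sklearn.metrics._pairwise_distances_reduction import RadiusNeighbors

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 3))
Y = rng.standard_normal((30, 3))

ind_euc = RadiusNeighbors.compute(X, Y, radius=2.0, metric="euclidean")
ind_sq = RadiusNeighbors.compute(X, Y, radius=4.0, metric="sqeuclidean")  # 4.0 == 2.0 ** 2
assert all(np.array_equal(np.sort(a), np.sort(b)) for a, b in zip(ind_euc, ind_sq))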
+ self.r_radius = radius + + @final + cdef void _parallel_on_X_parallel_init( + self, + intp_t thread_num, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_X_parallel_init(self, thread_num) + self.middle_term_computer._parallel_on_X_parallel_init(thread_num) + + @final + cdef void _parallel_on_X_init_chunk( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_X_init_chunk(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_X_init_chunk(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_X_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num, + ) + + @final + cdef void _parallel_on_Y_init( + self, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_Y_init(self) + self.middle_term_computer._parallel_on_Y_init() + + @final + cdef void _parallel_on_Y_parallel_init( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_Y_parallel_init(self, thread_num, X_start, X_end) + self.middle_term_computer._parallel_on_Y_parallel_init(thread_num, X_start, X_end) + + @final + cdef void _parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + RadiusNeighbors{{name_suffix}}._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + self, + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + self.middle_term_computer._parallel_on_Y_pre_compute_and_reduce_distances_on_chunks( + X_start, X_end, Y_start, Y_end, thread_num + ) + + @final + cdef void compute_exact_distances(self) noexcept nogil: + if not self.use_squared_distances: + RadiusNeighbors{{name_suffix}}.compute_exact_distances(self) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + intp_t X_start, + intp_t X_end, + intp_t Y_start, + intp_t Y_end, + intp_t thread_num, + ) noexcept nogil: + cdef: + intp_t i, j + float64_t sqeuclidean_dist_i_j + intp_t n_X = X_end - X_start + intp_t n_Y = Y_end - Y_start + float64_t *dist_middle_terms = self.middle_term_computer._compute_dist_middle_terms( + X_start, X_end, Y_start, Y_end, thread_num + ) + + # Pushing the distance and their associated indices in vectors. + for i in range(n_X): + for j in range(n_Y): + sqeuclidean_dist_i_j = ( + self.X_norm_squared[i + X_start] + + dist_middle_terms[i * n_Y + j] + + self.Y_norm_squared[j + Y_start] + ) + + # Catastrophic cancellation might cause -0. to be present, + # e.g. when computing d(x_i, y_i) when X is Y. 
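[Editor's note] On the `-0.`/cancellation guard above: a slightly negative squared distance would turn into nan once exact distances are recovered with a square root, which is why the value is clamped. A tiny standalone illustration:

import numpy as np

sq = -1e-17                       # the kind of residue cancellation can leave behind
print(np.sqrt(sq))                # nan (and a RuntimeWarning)
print(np.sqrt(max(sq, 0.0)))      # 0.0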
+ sqeuclidean_dist_i_j = max(0., sqeuclidean_dist_i_j) + + if sqeuclidean_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(sqeuclidean_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..0a9b22251843e60e52fa6d29248f3a745b37e414 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp @@ -0,0 +1,217 @@ +import warnings + +from cython cimport floating, final, integral +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange +from ._classmode cimport WeightingStrategy +from ...utils._typedefs cimport intp_t, float64_t, uint8_t + +import numpy as np +from scipy.sparse import issparse +from ...utils.parallel import _get_threadpool_controller + + +{{for name_suffix in ["32", "64"]}} +from ._radius_neighbors cimport RadiusNeighbors{{name_suffix}} +from ._datasets_pair cimport DatasetsPair{{name_suffix}} + +cdef class RadiusNeighborsClassMode{{name_suffix}}(RadiusNeighbors{{name_suffix}}): + """ + {{name_suffix}}bit implementation of RadiusNeighborsClassMode. + """ + cdef: + const intp_t[::1] Y_labels + const intp_t[::1] unique_Y_labels + intp_t outlier_label_index + bint outlier_label_exists + bint outliers_exist + uint8_t[::1] outliers + object outlier_label + float64_t[:, ::1] class_scores + WeightingStrategy weight_type + + @classmethod + def compute( + cls, + X, + Y, + float64_t radius, + weights, + Y_labels, + unique_Y_labels, + outlier_label=None, + str metric="euclidean", + chunk_size=None, + dict metric_kwargs=None, + str strategy=None, + ): + # Use a generic implementation that handles most scipy + # metrics by computing the distances between 2 vectors at a time. + pda = RadiusNeighborsClassMode{{name_suffix}}( + datasets_pair=DatasetsPair{{name_suffix}}.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + strategy=strategy, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + ) + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). 
+ with _get_threadpool_controller().limit(limits=1, user_api="blas"): + if pda.execute_in_parallel_on_Y: + pda._parallel_on_Y() + else: + pda._parallel_on_X() + + return pda._finalize_results() + + def __init__( + self, + DatasetsPair{{name_suffix}} datasets_pair, + const intp_t[::1] Y_labels, + const intp_t[::1] unique_Y_labels, + float64_t radius, + chunk_size=None, + strategy=None, + weights=None, + outlier_label=None, + ): + super().__init__( + datasets_pair=datasets_pair, + chunk_size=chunk_size, + strategy=strategy, + radius=radius, + ) + + if weights == "uniform": + self.weight_type = WeightingStrategy.uniform + elif weights == "distance": + self.weight_type = WeightingStrategy.distance + else: + self.weight_type = WeightingStrategy.callable + + self.Y_labels = Y_labels + self.unique_Y_labels = unique_Y_labels + self.outlier_label_index = -1 + self.outliers_exist = False + self.outlier_label = outlier_label + self.outliers = np.zeros(self.n_samples_X, dtype=np.bool_) + + cdef intp_t idx + if self.outlier_label is not None: + for idx in range(self.unique_Y_labels.shape[0]): + if self.unique_Y_labels[idx] == outlier_label: + self.outlier_label_index = idx + + # Map from set of unique labels to their indices in `class_scores` + # Buffer used in building a histogram for one-pass weighted mode + self.class_scores = np.zeros( + (self.n_samples_X, unique_Y_labels.shape[0]), dtype=np.float64, + ) + + + cdef inline void weighted_histogram_mode( + self, + intp_t sample_index, + intp_t sample_n_neighbors, + intp_t* indices, + float64_t* distances, + ) noexcept nogil: + cdef: + intp_t neighbor_idx, neighbor_class_idx, label_index + float64_t score_incr = 1 + bint use_distance_weighting = ( + self.weight_type == WeightingStrategy.distance + ) + + if sample_n_neighbors == 0: + self.outliers_exist = True + self.outliers[sample_index] = True + if self.outlier_label_index >= 0: + self.class_scores[sample_index][self.outlier_label_index] = score_incr + + return + + # Iterate over the neighbors. This can be different for + # each of the samples as they are based on the radius. + for neighbor_rank in range(sample_n_neighbors): + if use_distance_weighting: + score_incr = 1 / distances[neighbor_rank] + + neighbor_idx = indices[neighbor_rank] + neighbor_class_idx = self.Y_labels[neighbor_idx] + self.class_scores[sample_index][neighbor_class_idx] += score_incr + + return + + @final + cdef void _parallel_on_X_prange_iter_finalize( + self, + intp_t thread_num, + intp_t X_start, + intp_t X_end, + ) noexcept nogil: + cdef: + intp_t idx + + for idx in range(X_start, X_end): + self.weighted_histogram_mode( + sample_index=idx, + sample_n_neighbors=deref(self.neigh_indices)[idx].size(), + indices=deref(self.neigh_indices)[idx].data(), + distances=deref(self.neigh_distances)[idx].data(), + ) + + return + + @final + cdef void _parallel_on_Y_finalize( + self, + ) noexcept nogil: + cdef: + intp_t idx + + with nogil, parallel(num_threads=self.effective_n_threads): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks). 
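[Editor's note] A NumPy analogue of `weighted_histogram_mode` for a single query point, accumulating one score per class with inverse-distance weights; the neighbour labels and distances below are toy values, purely illustrative.

import numpy as np

neighbor_labels = np.array([0, 2, 2, 1])          # class index of each neighbour
neighbor_dists = np.array([2.0, 0.5, 1.0, 4.0])   # distances within the radius
n_classes = 3

scores = np.zeros(n_classes)
np.add.at(scores, neighbor_labels, 1.0 / neighbor_dists)   # weights="distance"
probabilities = scores / scores.sum()
print(probabilities)   # class 2 dominates thanks to its two close neighbours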
+ for idx in prange(self.n_samples_X, schedule='static'): + self._merge_vectors(idx, self.chunks_n_threads) + + for idx in prange(self.n_samples_X, schedule='static'): + self.weighted_histogram_mode( + sample_index=idx, + sample_n_neighbors=deref(self.neigh_indices)[idx].size(), + indices=deref(self.neigh_indices)[idx].data(), + distances=deref(self.neigh_distances)[idx].data(), + ) + + return + + def _finalize_results(self): + if self.outliers_exist and self.outlier_label is None: + raise ValueError( + "No neighbors found for test samples %r, " + "you can try using larger radius, " + "giving a label for outliers, " + "or considering removing them from your dataset." + % np.where(self.outliers)[0] + ) + + if self.outliers_exist and self.outlier_label_index < 0: + warnings.warn( + "Outlier label %s is not in training " + "classes. All class probabilities of " + "outliers will be assigned with 0." + % self.outlier_label + ) + + probabilities = np.asarray(self.class_scores) + normalizer = probabilities.sum(axis=1, keepdims=True) + normalizer[normalizer == 0.0] = 1.0 + probabilities /= normalizer + return probabilities + +{{endfor}} diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/meson.build b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..0f7eaa286399c5319d16d2c413d7be1957a10d74 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_distances_reduction/meson.build @@ -0,0 +1,193 @@ +# Note: the dependencies between different Cython files in +# _pairwise_distances_reduction is probably one of the most involved in +# scikit-learn. If you change this file make sure you build from scratch: +# rm -rf build; make dev-meson +# run a command like this: +# ninja -C build/cp312 -t missingdeps +# and make sure that the output is something like: +# No missing dependencies on generated files found. + +# _pairwise_distances_reduction is cimported from other subpackages so this is +# needed for the cimport to work +_pairwise_distances_reduction_cython_tree = [ + fs.copyfile('__init__.py'), + # We are in a sub-module of metrics, so we always need to have + # sklearn/metrics/__init__.py copied to the build directory to avoid the + # error: + # relative cimport beyond main package is not allowed + metrics_cython_tree +] + +_classmode_pxd = fs.copyfile('_classmode.pxd') + +_datasets_pair_pxd = custom_target( + '_datasets_pair_pxd', + output: '_datasets_pair.pxd', + input: '_datasets_pair.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] +) +_datasets_pair_pyx = custom_target( + '_datasets_pair_pyx', + output: '_datasets_pair.pyx', + input: '_datasets_pair.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. 
For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_datasets_pair_pxd, _pairwise_distances_reduction_cython_tree, utils_cython_tree], +) +_datasets_pair = py.extension_module( + '_datasets_pair', + cython_gen_cpp.process(_datasets_pair_pyx), + dependencies: [np_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_base_pxd = custom_target( + '_base_pxd', + output: '_base.pxd', + input: '_base.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] +) +_base_pyx = custom_target( + '_base_pyx', + output: '_base.pyx', + input: '_base.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_base_pxd, _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, utils_cython_tree], +) +_base = py.extension_module( + '_base', + cython_gen_cpp.process(_base_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_middle_term_computer_pxd = custom_target( + '_middle_term_computer_pxd', + output: '_middle_term_computer.pxd', + input: '_middle_term_computer.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] +) +_middle_term_computer_pyx = custom_target( + '_middle_term_computer_pyx', + output: '_middle_term_computer.pyx', + input: '_middle_term_computer.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_middle_term_computer_pxd, + _pairwise_distances_reduction_cython_tree, + utils_cython_tree], +) +_middle_term_computer = py.extension_module( + '_middle_term_computer', + cython_gen_cpp.process(_middle_term_computer_pyx), + dependencies: [np_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_argkmin_pxd = custom_target( + '_argkmin_pxd', + output: '_argkmin.pxd', + input: '_argkmin.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] + ) +_argkmin_pyx = custom_target( + '_argkmin_pyx', + output: '_argkmin.pyx', + input: '_argkmin.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_argkmin_pxd, + _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd], + ) +_argkmin = py.extension_module( + '_argkmin', + cython_gen_cpp.process(_argkmin_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_radius_neighbors_pxd = custom_target( + '_radius_neighbors_pxd', + output: '_radius_neighbors.pxd', + input: '_radius_neighbors.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'] + ) +_radius_neighbors_pyx = custom_target( + '_radius_neighbors_pyx', + output: '_radius_neighbors.pyx', + input: '_radius_neighbors.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. 
This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_radius_neighbors_pxd, + _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd, + _pairwise_distances_reduction_cython_tree, utils_cython_tree], +) +_radius_neighbors = py.extension_module( + '_radius_neighbors', + cython_gen_cpp.process(_radius_neighbors_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_argkmin_classmode_pyx = custom_target( + '_argkmin_classmode_pyx', + output: '_argkmin_classmode.pyx', + input: '_argkmin_classmode.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_classmode_pxd, + _argkmin_pxd, _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, _base_pxd, _middle_term_computer_pxd, utils_cython_tree], +) +_argkmin_classmode = py.extension_module( + '_argkmin_classmode', + cython_gen_cpp.process(_argkmin_classmode_pyx), + dependencies: [np_dep, openmp_dep], + # XXX: for some reason -fno-sized-deallocation is needed otherwise there is + # an error with undefined symbol _ZdlPv at import time in manylinux wheels. + # See https://github.com/scikit-learn/scikit-learn/issues/28596 for more details. + cpp_args: ['-fno-sized-deallocation'], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) + +_radius_neighbors_classmode_pyx = custom_target( + '_radius_neighbors_classmode_pyx', + output: '_radius_neighbors_classmode.pyx', + input: '_radius_neighbors_classmode.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. 
For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [_classmode_pxd, + _middle_term_computer_pxd, _radius_neighbors_pxd, + _pairwise_distances_reduction_cython_tree, + _datasets_pair_pxd, _base_pxd, utils_cython_tree], +) +_radius_neighbors_classmode = py.extension_module( + '_radius_neighbors_classmode', + cython_gen_cpp.process(_radius_neighbors_classmode_pyx), + dependencies: [np_dep, openmp_dep], + subdir: 'sklearn/metrics/_pairwise_distances_reduction', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..bf4ded09b2610eef7949cd56e5270b77cb2ce4db --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_pairwise_fast.pyx @@ -0,0 +1,107 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from cython cimport floating +from cython.parallel cimport prange +from libc.math cimport fabs + +from ..utils._typedefs cimport intp_t + +from ..utils._openmp_helpers import _openmp_effective_n_threads + + +def _chi2_kernel_fast(floating[:, :] X, + floating[:, :] Y, + floating[:, :] result): + cdef intp_t i, j, k + cdef intp_t n_samples_X = X.shape[0] + cdef intp_t n_samples_Y = Y.shape[0] + cdef intp_t n_features = X.shape[1] + cdef double res, nom, denom + + with nogil: + for i in range(n_samples_X): + for j in range(n_samples_Y): + res = 0 + for k in range(n_features): + denom = (X[i, k] - Y[j, k]) + nom = (X[i, k] + Y[j, k]) + if nom != 0: + res += denom * denom / nom + result[i, j] = -res + + +def _sparse_manhattan( + const floating[::1] X_data, + const int[:] X_indices, + const int[:] X_indptr, + const floating[::1] Y_data, + const int[:] Y_indices, + const int[:] Y_indptr, + double[:, ::1] D, +): + """Pairwise L1 distances for CSR matrices. + + Usage: + >>> D = np.zeros(X.shape[0], Y.shape[0]) + >>> _sparse_manhattan(X.data, X.indices, X.indptr, + ... Y.data, Y.indices, Y.indptr, + ... D) + """ + cdef intp_t px, py, i, j, ix, iy + cdef double d = 0.0 + + cdef int m = D.shape[0] + cdef int n = D.shape[1] + + cdef int X_indptr_end = 0 + cdef int Y_indptr_end = 0 + + cdef int num_threads = _openmp_effective_n_threads() + + # We scan the matrices row by row. + # Given row px in X and row py in Y, we find the positions (i and j + # respectively), in .indices where the indices for the two rows start. + # If the indices (ix and iy) are the same, the corresponding data values + # are processed and the cursors i and j are advanced. + # If not, the lowest index is considered. Its associated data value is + # processed and its cursor is advanced. + # We proceed like this until one of the cursors hits the end for its row. + # Then we process all remaining data values in the other row. + + # Below the avoidance of inplace operators is intentional. + # When prange is used, the inplace operator has a special meaning, i.e. 
it + # signals a "reduction" + + for px in prange(m, nogil=True, num_threads=num_threads): + X_indptr_end = X_indptr[px + 1] + for py in range(n): + Y_indptr_end = Y_indptr[py + 1] + i = X_indptr[px] + j = Y_indptr[py] + d = 0.0 + while i < X_indptr_end and j < Y_indptr_end: + ix = X_indices[i] + iy = Y_indices[j] + + if ix == iy: + d = d + fabs(X_data[i] - Y_data[j]) + i = i + 1 + j = j + 1 + elif ix < iy: + d = d + fabs(X_data[i]) + i = i + 1 + else: + d = d + fabs(Y_data[j]) + j = j + 1 + + if i == X_indptr_end: + while j < Y_indptr_end: + d = d + fabs(Y_data[j]) + j = j + 1 + else: + while i < X_indptr_end: + d = d + fabs(X_data[i]) + i = i + 1 + + D[px, py] = d diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67dd18fb94b593f0a3125c1f5833f3b9597614ba --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__init__.py @@ -0,0 +1,2 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cff9ae99268b96a9d6d332ac2377c325ba7a0ddd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/confusion_matrix.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/confusion_matrix.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34f825229da0c7a280e0d47b5d99f64c0273e3f9 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/confusion_matrix.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/det_curve.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/det_curve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad7fc06bba7a8a7059de53ca08328114ec2e0459 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/det_curve.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/precision_recall_curve.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/precision_recall_curve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b9648fc4ff03a95433f835c6cbf564d4dcbba24 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/precision_recall_curve.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/regression.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/regression.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc63164253445f0e6d8e3d9cfcbbb801f23d9a95 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/regression.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/roc_curve.cpython-312.pyc 
b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/roc_curve.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..119e8cc687ac38eee648c599315604ad0fa82b0c Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/__pycache__/roc_curve.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/confusion_matrix.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/confusion_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..cee515bebe08e859268c27c5441ce3450434d817 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/confusion_matrix.py @@ -0,0 +1,499 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from itertools import product + +import numpy as np + +from ...base import is_classifier +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._plotting import _validate_style_kwargs +from ...utils.multiclass import unique_labels +from .. import confusion_matrix + + +class ConfusionMatrixDisplay: + """Confusion Matrix visualization. + + It is recommended to use + :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or + :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to + create a :class:`ConfusionMatrixDisplay`. All parameters are stored as + attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + Parameters + ---------- + confusion_matrix : ndarray of shape (n_classes, n_classes) + Confusion matrix. + + display_labels : ndarray of shape (n_classes,), default=None + Display labels for plot. If None, display labels are set from 0 to + `n_classes - 1`. + + Attributes + ---------- + im_ : matplotlib AxesImage + Image representing the confusion matrix. + + text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, \ + or None + Array of matplotlib axes. `None` if `include_values` is false. + + ax_ : matplotlib Axes + Axes with confusion matrix. + + figure_ : matplotlib Figure + Figure containing the confusion matrix. + + See Also + -------- + confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a + classification. + ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix + given an estimator, the data, and the label. + ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix + given the true and predicted labels. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> predictions = clf.predict(X_test) + >>> cm = confusion_matrix(y_test, predictions, labels=clf.classes_) + >>> disp = ConfusionMatrixDisplay(confusion_matrix=cm, + ... 
display_labels=clf.classes_) + >>> disp.plot() + <...> + >>> plt.show() + """ + + def __init__(self, confusion_matrix, *, display_labels=None): + self.confusion_matrix = confusion_matrix + self.display_labels = display_labels + + def plot( + self, + *, + include_values=True, + cmap="viridis", + xticks_rotation="horizontal", + values_format=None, + ax=None, + colorbar=True, + im_kw=None, + text_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + include_values : bool, default=True + Includes values in confusion matrix. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='horizontal' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, + the format specification is 'd' or '.2g' whichever is shorter. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + colorbar : bool, default=True + Whether or not to add a colorbar to the plot. + + im_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.imshow` call. + + text_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.text` call. + + .. versionadded:: 1.2 + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + Returns a :class:`~sklearn.metrics.ConfusionMatrixDisplay` instance + that contains all the information to plot the confusion matrix. + """ + check_matplotlib_support("ConfusionMatrixDisplay.plot") + import matplotlib.pyplot as plt + + if ax is None: + fig, ax = plt.subplots() + else: + fig = ax.figure + + cm = self.confusion_matrix + n_classes = cm.shape[0] + + default_im_kw = dict(interpolation="nearest", cmap=cmap) + im_kw = im_kw or {} + im_kw = _validate_style_kwargs(default_im_kw, im_kw) + text_kw = text_kw or {} + + self.im_ = ax.imshow(cm, **im_kw) + self.text_ = None + cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(1.0) + + if include_values: + self.text_ = np.empty_like(cm, dtype=object) + + # print text with appropriate color depending on background + thresh = (cm.max() + cm.min()) / 2.0 + + for i, j in product(range(n_classes), range(n_classes)): + color = cmap_max if cm[i, j] < thresh else cmap_min + + if values_format is None: + text_cm = format(cm[i, j], ".2g") + if cm.dtype.kind != "f": + text_d = format(cm[i, j], "d") + if len(text_d) < len(text_cm): + text_cm = text_d + else: + text_cm = format(cm[i, j], values_format) + + default_text_kwargs = dict(ha="center", va="center", color=color) + text_kwargs = _validate_style_kwargs(default_text_kwargs, text_kw) + + self.text_[i, j] = ax.text(j, i, text_cm, **text_kwargs) + + if self.display_labels is None: + display_labels = np.arange(n_classes) + else: + display_labels = self.display_labels + if colorbar: + fig.colorbar(self.im_, ax=ax) + ax.set( + xticks=np.arange(n_classes), + yticks=np.arange(n_classes), + xticklabels=display_labels, + yticklabels=display_labels, + ylabel="True label", + xlabel="Predicted label", + ) + + ax.set_ylim((n_classes - 0.5, -0.5)) + plt.setp(ax.get_xticklabels(), rotation=xticks_rotation) + + self.figure_ = fig + self.ax_ = ax + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + labels=None, + sample_weight=None, + normalize=None, + display_labels=None, + include_values=True, + xticks_rotation="horizontal", + values_format=None, + cmap="viridis", + ax=None, + 
colorbar=True, + im_kw=None, + text_kw=None, + ): + """Plot Confusion Matrix given an estimator and some data. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + labels : array-like of shape (n_classes,), default=None + List of labels to index the confusion matrix. This may be used to + reorder or select a subset of labels. If `None` is given, those + that appear at least once in `y_true` or `y_pred` are used in + sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + normalize : {'true', 'pred', 'all'}, default=None + Either to normalize the counts display in the matrix: + + - if `'true'`, the confusion matrix is normalized over the true + conditions (e.g. rows); + - if `'pred'`, the confusion matrix is normalized over the + predicted conditions (e.g. columns); + - if `'all'`, the confusion matrix is normalized by the total + number of samples; + - if `None` (default), the confusion matrix will not be normalized. + + display_labels : array-like of shape (n_classes,), default=None + Target names used for plotting. By default, `labels` will be used + if it is defined, otherwise the unique labels of `y_true` and + `y_pred` will be used. + + include_values : bool, default=True + Includes values in confusion matrix. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='horizontal' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, the + format specification is 'd' or '.2g' whichever is shorter. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + colorbar : bool, default=True + Whether or not to add a colorbar to the plot. + + im_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.imshow` call. + + text_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.text` call. + + .. versionadded:: 1.2 + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + + See Also + -------- + ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix + given the true and predicted labels. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import ConfusionMatrixDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> ConfusionMatrixDisplay.from_estimator( + ... 
clf, X_test, y_test) + <...> + >>> plt.show() + + For a detailed example of using a confusion matrix to evaluate a + Support Vector Classifier, please see + :ref:`sphx_glr_auto_examples_model_selection_plot_confusion_matrix.py` + """ + method_name = f"{cls.__name__}.from_estimator" + check_matplotlib_support(method_name) + if not is_classifier(estimator): + raise ValueError(f"{method_name} only supports classifiers") + y_pred = estimator.predict(X) + + return cls.from_predictions( + y, + y_pred, + sample_weight=sample_weight, + labels=labels, + normalize=normalize, + display_labels=display_labels, + include_values=include_values, + cmap=cmap, + ax=ax, + xticks_rotation=xticks_rotation, + values_format=values_format, + colorbar=colorbar, + im_kw=im_kw, + text_kw=text_kw, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + labels=None, + sample_weight=None, + normalize=None, + display_labels=None, + include_values=True, + xticks_rotation="horizontal", + values_format=None, + cmap="viridis", + ax=None, + colorbar=True, + im_kw=None, + text_kw=None, + ): + """Plot Confusion Matrix given true and predicted labels. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_pred : array-like of shape (n_samples,) + The predicted labels given by the method `predict` of a + classifier. + + labels : array-like of shape (n_classes,), default=None + List of labels to index the confusion matrix. This may be used to + reorder or select a subset of labels. If `None` is given, those + that appear at least once in `y_true` or `y_pred` are used in + sorted order. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + normalize : {'true', 'pred', 'all'}, default=None + Either to normalize the counts display in the matrix: + + - if `'true'`, the confusion matrix is normalized over the true + conditions (e.g. rows); + - if `'pred'`, the confusion matrix is normalized over the + predicted conditions (e.g. columns); + - if `'all'`, the confusion matrix is normalized by the total + number of samples; + - if `None` (default), the confusion matrix will not be normalized. + + display_labels : array-like of shape (n_classes,), default=None + Target names used for plotting. By default, `labels` will be used + if it is defined, otherwise the unique labels of `y_true` and + `y_pred` will be used. + + include_values : bool, default=True + Includes values in confusion matrix. + + xticks_rotation : {'vertical', 'horizontal'} or float, \ + default='horizontal' + Rotation of xtick labels. + + values_format : str, default=None + Format specification for values in confusion matrix. If `None`, the + format specification is 'd' or '.2g' whichever is shorter. + + cmap : str or matplotlib Colormap, default='viridis' + Colormap recognized by matplotlib. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + colorbar : bool, default=True + Whether or not to add a colorbar to the plot. + + im_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.imshow` call. + + text_kw : dict, default=None + Dict with keywords passed to `matplotlib.pyplot.text` call. + + ..
versionadded:: 1.2 + + Returns + ------- + display : :class:`~sklearn.metrics.ConfusionMatrixDisplay` + + See Also + -------- + ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix + given an estimator, the data, and the label. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import ConfusionMatrixDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> y_pred = clf.predict(X_test) + >>> ConfusionMatrixDisplay.from_predictions( + ... y_test, y_pred) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_predictions") + + if display_labels is None: + if labels is None: + display_labels = unique_labels(y_true, y_pred) + else: + display_labels = labels + + cm = confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + normalize=normalize, + ) + + disp = cls(confusion_matrix=cm, display_labels=display_labels) + + return disp.plot( + include_values=include_values, + cmap=cmap, + ax=ax, + xticks_rotation=xticks_rotation, + values_format=values_format, + colorbar=colorbar, + im_kw=im_kw, + text_kw=text_kw, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/det_curve.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/det_curve.py new file mode 100644 index 0000000000000000000000000000000000000000..590b908d917232d9e43b0f8492710ee978ce989c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/det_curve.py @@ -0,0 +1,371 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import scipy as sp + +from ...utils._plotting import _BinaryClassifierCurveDisplayMixin +from .._ranking import det_curve + + +class DetCurveDisplay(_BinaryClassifierCurveDisplayMixin): + """Detection Error Tradeoff (DET) curve visualization. + + It is recommended to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator` + or :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a + visualizer. All parameters are stored as attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 0.24 + + Parameters + ---------- + fpr : ndarray + False positive rate. + + fnr : ndarray + False negative rate. + + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + + Attributes + ---------- + line_ : matplotlib Artist + DET Curve. + + ax_ : matplotlib Axes + Axes with DET Curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + det_curve : Compute error rates for different probability thresholds. + DetCurveDisplay.from_estimator : Plot DET curve given an estimator and + some data. + DetCurveDisplay.from_predictions : Plot DET curve given the true and + predicted labels. 
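+ + Notes + ----- + Several DET curves can share one set of axes by passing the same ``ax`` + to successive calls; a minimal sketch, assuming ``y_test`` and ``y_pred`` + as in the example below plus a hypothetical second score array + ``other_scores``: + + >>> fig, ax = plt.subplots() + >>> DetCurveDisplay.from_predictions(y_test, y_pred, ax=ax, name="SVC") + <...> + >>> DetCurveDisplay.from_predictions( + ... y_test, other_scores, ax=ax, name="other model") + <...>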
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import det_curve, DetCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.4, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> y_pred = clf.decision_function(X_test) + >>> fpr, fnr, _ = det_curve(y_test, y_pred) + >>> display = DetCurveDisplay( + ... fpr=fpr, fnr=fnr, estimator_name="SVC" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, fpr, fnr, estimator_name=None, pos_label=None): + self.fpr = fpr + self.fnr = fnr + self.estimator_name = estimator_name + self.pos_label = pos_label + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + sample_weight=None, + drop_intermediate=True, + response_method="auto", + pos_label=None, + name=None, + ax=None, + **kwargs, + ): + """Plot DET curve given an estimator and data. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where true positives (tp) do not change + from the previous or subsequent threshold. All points with the same + tp value have the same `fnr` and thus same y coordinate. + + .. versionadded:: 1.7 + + response_method : {'predict_proba', 'decision_function', 'auto'} \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the predicted target response. If set + to 'auto', :term:`predict_proba` is tried first and if it does not + exist :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The label of the positive class. When `pos_label=None`, if `y_true` + is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an + error will be raised. + + name : str, default=None + Name of DET curve for labeling. If `None`, use the name of the + estimator. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + Returns + ------- + display : :class:`~sklearn.metrics.DetCurveDisplay` + Object that stores computed values. + + See Also + -------- + det_curve : Compute error rates for different probability thresholds. + DetCurveDisplay.from_predictions : Plot DET curve given the true and + predicted labels. 
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import DetCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.4, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> DetCurveDisplay.from_estimator( + ... clf, X_test, y_test) + <...> + >>> plt.show() + """ + y_pred, pos_label, name = cls._validate_and_get_response_values( + estimator, + X, + y, + response_method=response_method, + pos_label=pos_label, + name=name, + ) + + return cls.from_predictions( + y_true=y, + y_pred=y_pred, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + name=name, + ax=ax, + pos_label=pos_label, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + sample_weight=None, + drop_intermediate=True, + pos_label=None, + name=None, + ax=None, + **kwargs, + ): + """Plot the DET curve given the true and predicted labels. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_pred : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by `decision_function` on some classifiers). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where true positives (tp) do not change + from the previous or subsequent threshold. All points with the same + tp value have the same `fnr` and thus same y coordinate. + + .. versionadded:: 1.7 + + pos_label : int, float, bool or str, default=None + The label of the positive class. When `pos_label=None`, if `y_true` + is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an + error will be raised. + + name : str, default=None + Name of DET curve for labeling. If `None`, name will be set to + `"Classifier"`. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + Returns + ------- + display : :class:`~sklearn.metrics.DetCurveDisplay` + Object that stores computed values. + + See Also + -------- + det_curve : Compute error rates for different probability thresholds. + DetCurveDisplay.from_estimator : Plot DET curve given an estimator and + some data. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import DetCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(n_samples=1000, random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.4, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> y_pred = clf.decision_function(X_test) + >>> DetCurveDisplay.from_predictions( + ... 
y_test, y_pred) + <...> + >>> plt.show() + """ + pos_label_validated, name = cls._validate_from_predictions_params( + y_true, y_pred, sample_weight=sample_weight, pos_label=pos_label, name=name + ) + + fpr, fnr, _ = det_curve( + y_true, + y_pred, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + + viz = cls( + fpr=fpr, + fnr=fnr, + estimator_name=name, + pos_label=pos_label_validated, + ) + + return viz.plot(ax=ax, name=name, **kwargs) + + def plot(self, ax=None, *, name=None, **kwargs): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str, default=None + Name of DET curve for labeling. If `None`, use `estimator_name` if + it is not `None`, otherwise no labeling is shown. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + Returns + ------- + display : :class:`~sklearn.metrics.DetCurveDisplay` + Object that stores computed values. + """ + self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name) + + line_kwargs = {} if name is None else {"label": name} + line_kwargs.update(**kwargs) + + # We have the following bounds: + # sp.stats.norm.ppf(0.0) = -np.inf + # sp.stats.norm.ppf(1.0) = np.inf + # We therefore clip to eps and 1 - eps to not provide infinity to matplotlib. + eps = np.finfo(self.fpr.dtype).eps + self.fpr = self.fpr.clip(eps, 1 - eps) + self.fnr = self.fnr.clip(eps, 1 - eps) + + (self.line_,) = self.ax_.plot( + sp.stats.norm.ppf(self.fpr), + sp.stats.norm.ppf(self.fnr), + **line_kwargs, + ) + info_pos_label = ( + f" (Positive label: {self.pos_label})" if self.pos_label is not None else "" + ) + + xlabel = "False Positive Rate" + info_pos_label + ylabel = "False Negative Rate" + info_pos_label + self.ax_.set(xlabel=xlabel, ylabel=ylabel) + + if "label" in line_kwargs: + self.ax_.legend(loc="lower right") + + ticks = [0.001, 0.01, 0.05, 0.20, 0.5, 0.80, 0.95, 0.99, 0.999] + tick_locations = sp.stats.norm.ppf(ticks) + tick_labels = [ + "{:.0%}".format(s) if (100 * s).is_integer() else "{:.1%}".format(s) + for s in ticks + ] + self.ax_.set_xticks(tick_locations) + self.ax_.set_xticklabels(tick_labels) + self.ax_.set_xlim(-3, 3) + self.ax_.set_yticks(tick_locations) + self.ax_.set_yticklabels(tick_labels) + self.ax_.set_ylim(-3, 3) + + return self diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/precision_recall_curve.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/precision_recall_curve.py new file mode 100644 index 0000000000000000000000000000000000000000..30dd1fba08761f12d74d75743b4985aca3442d59 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/precision_recall_curve.py @@ -0,0 +1,555 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections import Counter + +from ...utils._plotting import ( + _BinaryClassifierCurveDisplayMixin, + _despine, + _validate_style_kwargs, +) +from .._ranking import average_precision_score, precision_recall_curve + + +class PrecisionRecallDisplay(_BinaryClassifierCurveDisplayMixin): + """Precision Recall visualization. + + It is recommended to use + :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or + :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create + a :class:`~sklearn.metrics.PrecisionRecallDisplay`. All parameters are + stored as attributes. 
+ + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + precision : ndarray + Precision values. + + recall : ndarray + Recall values. + + average_precision : float, default=None + Average precision. If None, the average precision is not shown. + + estimator_name : str, default=None + Name of estimator. If None, then the estimator name is not shown. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class. If None, the class will not + be shown in the legend. + + .. versionadded:: 0.24 + + prevalence_pos_label : float, default=None + The prevalence of the positive label. It is used for plotting the + chance level line. If None, the chance level line will not be plotted + even if `plot_chance_level` is set to True when plotting. + + .. versionadded:: 1.3 + + Attributes + ---------- + line_ : matplotlib Artist + Precision recall curve. + + chance_level_ : matplotlib Artist or None + The chance level line. It is `None` if the chance level is not plotted. + + .. versionadded:: 1.3 + + ax_ : matplotlib Axes + Axes with precision recall curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + precision_recall_curve : Compute precision-recall pairs for different + probability thresholds. + PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given + a binary classifier. + PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve + using predictions from a binary classifier. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) in + scikit-learn is computed without any interpolation. To be consistent with + this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"` in :meth:`plot`, :meth:`from_estimator`, or + :meth:`from_predictions`. However, the curve will not be strictly + consistent with the reported average precision. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import (precision_recall_curve, + ... PrecisionRecallDisplay) + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=0) + >>> clf = SVC(random_state=0) + >>> clf.fit(X_train, y_train) + SVC(random_state=0) + >>> predictions = clf.predict(X_test) + >>> precision, recall, _ = precision_recall_curve(y_test, predictions) + >>> disp = PrecisionRecallDisplay(precision=precision, recall=recall) + >>> disp.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, + precision, + recall, + *, + average_precision=None, + estimator_name=None, + pos_label=None, + prevalence_pos_label=None, + ): + self.estimator_name = estimator_name + self.precision = precision + self.recall = recall + self.average_precision = average_precision + self.pos_label = pos_label + self.prevalence_pos_label = prevalence_pos_label + + def plot( + self, + ax=None, + *, + name=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot visualization. 
+ + Extra keyword arguments will be passed to matplotlib's `plot`. + + Parameters + ---------- + ax : Matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str, default=None + Name of precision recall curve for labeling. If `None`, use + `estimator_name` if not `None`, otherwise no labeling is shown. + + plot_chance_level : bool, default=False + Whether to plot the chance level. The chance level is the prevalence + of the positive label computed from the data passed during + :meth:`from_estimator` or :meth:`from_predictions` call. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + Returns + ------- + display : :class:`~sklearn.metrics.PrecisionRecallDisplay` + Object that stores computed values. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) + in scikit-learn is computed without any interpolation. To be consistent + with this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"`. However, the curve will not be strictly + consistent with the reported average precision. + """ + self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name) + + default_line_kwargs = {"drawstyle": "steps-post"} + if self.average_precision is not None and name is not None: + default_line_kwargs["label"] = ( + f"{name} (AP = {self.average_precision:0.2f})" + ) + elif self.average_precision is not None: + default_line_kwargs["label"] = f"AP = {self.average_precision:0.2f}" + elif name is not None: + default_line_kwargs["label"] = name + + line_kwargs = _validate_style_kwargs(default_line_kwargs, kwargs) + + (self.line_,) = self.ax_.plot(self.recall, self.precision, **line_kwargs) + + info_pos_label = ( + f" (Positive label: {self.pos_label})" if self.pos_label is not None else "" + ) + + xlabel = "Recall" + info_pos_label + ylabel = "Precision" + info_pos_label + self.ax_.set( + xlabel=xlabel, + xlim=(-0.01, 1.01), + ylabel=ylabel, + ylim=(-0.01, 1.01), + aspect="equal", + ) + + if plot_chance_level: + if self.prevalence_pos_label is None: + raise ValueError( + "You must provide prevalence_pos_label when constructing the " + "PrecisionRecallDisplay object in order to plot the chance " + "level line. 
Alternatively, you may use " + "PrecisionRecallDisplay.from_estimator or " + "PrecisionRecallDisplay.from_predictions " + "to automatically set prevalence_pos_label" + ) + + default_chance_level_line_kw = { + "label": f"Chance level (AP = {self.prevalence_pos_label:0.2f})", + "color": "k", + "linestyle": "--", + } + + if chance_level_kw is None: + chance_level_kw = {} + + chance_level_line_kw = _validate_style_kwargs( + default_chance_level_line_kw, chance_level_kw + ) + + (self.chance_level_,) = self.ax_.plot( + (0, 1), + (self.prevalence_pos_label, self.prevalence_pos_label), + **chance_level_line_kw, + ) + else: + self.chance_level_ = None + + if despine: + _despine(self.ax_) + + if "label" in line_kwargs or plot_chance_level: + self.ax_.legend(loc="lower left") + + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + sample_weight=None, + drop_intermediate=False, + response_method="auto", + pos_label=None, + name=None, + ax=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot precision-recall curve given an estimator and some data. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop some suboptimal thresholds which would not appear + on a plotted precision-recall curve. This is useful in order to + create lighter precision-recall curves. + + .. versionadded:: 1.3 + + response_method : {'predict_proba', 'decision_function', 'auto'}, \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. If set to 'auto', + :term:`predict_proba` is tried first and if it does not exist + :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the + precision and recall metrics. By default, `estimators.classes_[1]` + is considered as the positive class. + + name : str, default=None + Name for labeling curve. If `None`, no name is used. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is created. + + plot_chance_level : bool, default=False + Whether to plot the chance level. The chance level is the prevalence + of the positive label computed from the data passed during + :meth:`from_estimator` or :meth:`from_predictions` call. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. 
+ + Returns + ------- + display : :class:`~sklearn.metrics.PrecisionRecallDisplay` + + See Also + -------- + PrecisionRecallDisplay.from_predictions : Plot precision-recall curve + using estimated probabilities or output of decision function. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) + in scikit-learn is computed without any interpolation. To be consistent + with this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"`. However, the curve will not be strictly + consistent with the reported average precision. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import PrecisionRecallDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression() + >>> clf.fit(X_train, y_train) + LogisticRegression() + >>> PrecisionRecallDisplay.from_estimator( + ... clf, X_test, y_test) + <...> + >>> plt.show() + """ + y_pred, pos_label, name = cls._validate_and_get_response_values( + estimator, + X, + y, + response_method=response_method, + pos_label=pos_label, + name=name, + ) + + return cls.from_predictions( + y, + y_pred, + sample_weight=sample_weight, + name=name, + pos_label=pos_label, + drop_intermediate=drop_intermediate, + ax=ax, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + sample_weight=None, + drop_intermediate=False, + pos_label=None, + name=None, + ax=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot precision-recall curve given binary class predictions. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True binary labels. + + y_pred : array-like of shape (n_samples,) + Estimated probabilities or output of decision function. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop some suboptimal thresholds which would not appear + on a plotted precision-recall curve. This is useful in order to + create lighter precision-recall curves. + + .. versionadded:: 1.3 + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the + precision and recall metrics. + + name : str, default=None + Name for labeling curve. If `None`, name will be set to + `"Classifier"`. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is created. + + plot_chance_level : bool, default=False + Whether to plot the chance level. The chance level is the prevalence + of the positive label computed from the data passed during + :meth:`from_estimator` or :meth:`from_predictions` call. + + .. 
versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + Returns + ------- + display : :class:`~sklearn.metrics.PrecisionRecallDisplay` + + See Also + -------- + PrecisionRecallDisplay.from_estimator : Plot precision-recall curve + using an estimator. + + Notes + ----- + The average precision (cf. :func:`~sklearn.metrics.average_precision_score`) + in scikit-learn is computed without any interpolation. To be consistent + with this metric, the precision-recall curve is plotted without any + interpolation as well (step-wise style). + + You can change this style by passing the keyword argument + `drawstyle="default"`. However, the curve will not be strictly + consistent with the reported average precision. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import PrecisionRecallDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = LogisticRegression() + >>> clf.fit(X_train, y_train) + LogisticRegression() + >>> y_pred = clf.predict_proba(X_test)[:, 1] + >>> PrecisionRecallDisplay.from_predictions( + ... y_test, y_pred) + <...> + >>> plt.show() + """ + pos_label, name = cls._validate_from_predictions_params( + y_true, y_pred, sample_weight=sample_weight, pos_label=pos_label, name=name + ) + + precision, recall, _ = precision_recall_curve( + y_true, + y_pred, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + average_precision = average_precision_score( + y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight + ) + + class_count = Counter(y_true) + prevalence_pos_label = class_count[pos_label] / sum(class_count.values()) + + viz = cls( + precision=precision, + recall=recall, + average_precision=average_precision, + estimator_name=name, + pos_label=pos_label, + prevalence_pos_label=prevalence_pos_label, + ) + + return viz.plot( + ax=ax, + name=name, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/regression.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..1b56859cabefd181eafec383a1aa48c2a28807a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/regression.py @@ -0,0 +1,413 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers + +import numpy as np + +from ...utils import _safe_indexing, check_random_state +from ...utils._optional_dependencies import check_matplotlib_support +from ...utils._plotting import _validate_style_kwargs + + +class PredictionErrorDisplay: + """Visualization of the prediction error of a regression model. + + This tool can display "residuals vs predicted" or "actual vs predicted" + using scatter plots to qualitatively assess the behavior of a regressor, + preferably on held-out data points. 
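+ + For instance, assuming one-dimensional arrays ``y`` and ``y_pred`` of the + same length (as in the example below), the "actual vs. predicted" view can + be drawn with: + + >>> display = PredictionErrorDisplay(y_true=y, y_pred=y_pred) + >>> display.plot(kind="actual_vs_predicted") + <...>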
+ + See the details in the docstrings of + :func:`~sklearn.metrics.PredictionErrorDisplay.from_estimator` or + :func:`~sklearn.metrics.PredictionErrorDisplay.from_predictions` to + create a visualizer. All parameters are stored as attributes. + + For general information regarding `scikit-learn` visualization tools, read + more in the :ref:`Visualization Guide `. + For details regarding interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.2 + + Parameters + ---------- + y_true : ndarray of shape (n_samples,) + True values. + + y_pred : ndarray of shape (n_samples,) + Prediction values. + + Attributes + ---------- + line_ : matplotlib Artist + Optimal line representing `y_true == y_pred`. Therefore, it is a + diagonal line for `kind="predictions"` and a horizontal line for + `kind="residuals"`. + + errors_lines_ : matplotlib Artist or None + Residual lines. If `with_errors=False`, then it is set to `None`. + + scatter_ : matplotlib Artist + Scatter data points. + + ax_ : matplotlib Axes + Axes with the different matplotlib axis. + + figure_ : matplotlib Figure + Figure containing the scatter and lines. + + See Also + -------- + PredictionErrorDisplay.from_estimator : Prediction error visualization + given an estimator and some data. + PredictionErrorDisplay.from_predictions : Prediction error visualization + given the true and predicted targets. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import PredictionErrorDisplay + >>> X, y = load_diabetes(return_X_y=True) + >>> ridge = Ridge().fit(X, y) + >>> y_pred = ridge.predict(X) + >>> display = PredictionErrorDisplay(y_true=y, y_pred=y_pred) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, y_true, y_pred): + self.y_true = y_true + self.y_pred = y_pred + + def plot( + self, + ax=None, + *, + kind="residual_vs_predicted", + scatter_kwargs=None, + line_kwargs=None, + ): + """Plot visualization. + + Extra keyword arguments will be passed to matplotlib's ``plot``. + + Parameters + ---------- + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + kind : {"actual_vs_predicted", "residual_vs_predicted"}, \ + default="residual_vs_predicted" + The type of plot to draw: + + - "actual_vs_predicted" draws the observed values (y-axis) vs. + the predicted values (x-axis). + - "residual_vs_predicted" draws the residuals, i.e. difference + between observed and predicted values, (y-axis) vs. the predicted + values (x-axis). + + scatter_kwargs : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.scatter` + call. + + line_kwargs : dict, default=None + Dictionary with keyword passed to the `matplotlib.pyplot.plot` + call to draw the optimal line. + + Returns + ------- + display : :class:`~sklearn.metrics.PredictionErrorDisplay` + + Object that stores computed values. + """ + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + expected_kind = ("actual_vs_predicted", "residual_vs_predicted") + if kind not in expected_kind: + raise ValueError( + f"`kind` must be one of {', '.join(expected_kind)}. " + f"Got {kind!r} instead." 
+ ) + + import matplotlib.pyplot as plt + + if scatter_kwargs is None: + scatter_kwargs = {} + if line_kwargs is None: + line_kwargs = {} + + default_scatter_kwargs = {"color": "tab:blue", "alpha": 0.8} + default_line_kwargs = {"color": "black", "alpha": 0.7, "linestyle": "--"} + + scatter_kwargs = _validate_style_kwargs(default_scatter_kwargs, scatter_kwargs) + line_kwargs = _validate_style_kwargs(default_line_kwargs, line_kwargs) + + scatter_kwargs = {**default_scatter_kwargs, **scatter_kwargs} + line_kwargs = {**default_line_kwargs, **line_kwargs} + + if ax is None: + _, ax = plt.subplots() + + if kind == "actual_vs_predicted": + max_value = max(np.max(self.y_true), np.max(self.y_pred)) + min_value = min(np.min(self.y_true), np.min(self.y_pred)) + self.line_ = ax.plot( + [min_value, max_value], [min_value, max_value], **line_kwargs + )[0] + + x_data, y_data = self.y_pred, self.y_true + xlabel, ylabel = "Predicted values", "Actual values" + + self.scatter_ = ax.scatter(x_data, y_data, **scatter_kwargs) + + # force to have a squared axis + ax.set_aspect("equal", adjustable="datalim") + ax.set_xticks(np.linspace(min_value, max_value, num=5)) + ax.set_yticks(np.linspace(min_value, max_value, num=5)) + else: # kind == "residual_vs_predicted" + self.line_ = ax.plot( + [np.min(self.y_pred), np.max(self.y_pred)], + [0, 0], + **line_kwargs, + )[0] + self.scatter_ = ax.scatter( + self.y_pred, self.y_true - self.y_pred, **scatter_kwargs + ) + xlabel, ylabel = "Predicted values", "Residuals (actual - predicted)" + + ax.set(xlabel=xlabel, ylabel=ylabel) + + self.ax_ = ax + self.figure_ = ax.figure + + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + kind="residual_vs_predicted", + subsample=1_000, + random_state=None, + ax=None, + scatter_kwargs=None, + line_kwargs=None, + ): + """Plot the prediction error given a regressor and some data. + + For general information regarding `scikit-learn` visualization tools, + read more in the :ref:`Visualization Guide `. + For details regarding interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.2 + + Parameters + ---------- + estimator : estimator instance + Fitted regressor or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a regressor. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + kind : {"actual_vs_predicted", "residual_vs_predicted"}, \ + default="residual_vs_predicted" + The type of plot to draw: + + - "actual_vs_predicted" draws the observed values (y-axis) vs. + the predicted values (x-axis). + - "residual_vs_predicted" draws the residuals, i.e. difference + between observed and predicted values, (y-axis) vs. the predicted + values (x-axis). + + subsample : float, int or None, default=1_000 + Sampling the samples to be shown on the scatter plot. If `float`, + it should be between 0 and 1 and represents the proportion of the + original dataset. If `int`, it represents the number of samples + display on the scatter plot. If `None`, no subsampling will be + applied. by default, 1000 samples or less will be displayed. + + random_state : int or RandomState, default=None + Controls the randomness when `subsample` is not `None`. + See :term:`Glossary ` for details. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. 
+ + scatter_kwargs : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.scatter` + call. + + line_kwargs : dict, default=None + Dictionary with keyword passed to the `matplotlib.pyplot.plot` + call to draw the optimal line. + + Returns + ------- + display : :class:`~sklearn.metrics.PredictionErrorDisplay` + Object that stores the computed values. + + See Also + -------- + PredictionErrorDisplay : Prediction error visualization for regression. + PredictionErrorDisplay.from_predictions : Prediction error visualization + given the true and predicted targets. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import PredictionErrorDisplay + >>> X, y = load_diabetes(return_X_y=True) + >>> ridge = Ridge().fit(X, y) + >>> disp = PredictionErrorDisplay.from_estimator(ridge, X, y) + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + y_pred = estimator.predict(X) + + return cls.from_predictions( + y_true=y, + y_pred=y_pred, + kind=kind, + subsample=subsample, + random_state=random_state, + ax=ax, + scatter_kwargs=scatter_kwargs, + line_kwargs=line_kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_pred, + *, + kind="residual_vs_predicted", + subsample=1_000, + random_state=None, + ax=None, + scatter_kwargs=None, + line_kwargs=None, + ): + """Plot the prediction error given the true and predicted targets. + + For general information regarding `scikit-learn` visualization tools, + read more in the :ref:`Visualization Guide `. + For details regarding interpreting these plots, refer to the + :ref:`Model Evaluation Guide `. + + .. versionadded:: 1.2 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True target values. + + y_pred : array-like of shape (n_samples,) + Predicted target values. + + kind : {"actual_vs_predicted", "residual_vs_predicted"}, \ + default="residual_vs_predicted" + The type of plot to draw: + + - "actual_vs_predicted" draws the observed values (y-axis) vs. + the predicted values (x-axis). + - "residual_vs_predicted" draws the residuals, i.e. difference + between observed and predicted values, (y-axis) vs. the predicted + values (x-axis). + + subsample : float, int or None, default=1_000 + Sampling the samples to be shown on the scatter plot. If `float`, + it should be between 0 and 1 and represents the proportion of the + original dataset. If `int`, it represents the number of samples + display on the scatter plot. If `None`, no subsampling will be + applied. by default, 1000 samples or less will be displayed. + + random_state : int or RandomState, default=None + Controls the randomness when `subsample` is not `None`. + See :term:`Glossary ` for details. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + scatter_kwargs : dict, default=None + Dictionary with keywords passed to the `matplotlib.pyplot.scatter` + call. + + line_kwargs : dict, default=None + Dictionary with keyword passed to the `matplotlib.pyplot.plot` + call to draw the optimal line. + + Returns + ------- + display : :class:`~sklearn.metrics.PredictionErrorDisplay` + Object that stores the computed values. + + See Also + -------- + PredictionErrorDisplay : Prediction error visualization for regression. + PredictionErrorDisplay.from_estimator : Prediction error visualization + given an estimator and some data. 
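+ + Notes + ----- + For large datasets only a random subset of the points needs to be shown; + a minimal sketch, assuming ``y`` and ``y_pred`` as in the example below: + + >>> PredictionErrorDisplay.from_predictions( + ... y_true=y, y_pred=y_pred, subsample=0.25, random_state=0) + <...>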
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.linear_model import Ridge + >>> from sklearn.metrics import PredictionErrorDisplay + >>> X, y = load_diabetes(return_X_y=True) + >>> ridge = Ridge().fit(X, y) + >>> y_pred = ridge.predict(X) + >>> disp = PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred) + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_predictions") + + random_state = check_random_state(random_state) + + n_samples = len(y_true) + if isinstance(subsample, numbers.Integral): + if subsample <= 0: + raise ValueError( + f"When an integer, subsample={subsample} should be positive." + ) + elif isinstance(subsample, numbers.Real): + if subsample <= 0 or subsample >= 1: + raise ValueError( + f"When a floating-point, subsample={subsample} should" + " be in the (0, 1) range." + ) + subsample = int(n_samples * subsample) + + if subsample is not None and subsample < n_samples: + indices = random_state.choice(np.arange(n_samples), size=subsample) + y_true = _safe_indexing(y_true, indices, axis=0) + y_pred = _safe_indexing(y_pred, indices, axis=0) + + viz = cls( + y_true=y_true, + y_pred=y_pred, + ) + + return viz.plot( + ax=ax, + kind=kind, + scatter_kwargs=scatter_kwargs, + line_kwargs=line_kwargs, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/roc_curve.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/roc_curve.py new file mode 100644 index 0000000000000000000000000000000000000000..383f14e688859afe537ccf89da68fe2751bcb5a4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/roc_curve.py @@ -0,0 +1,795 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings + +import numpy as np + +from ...utils import _safe_indexing +from ...utils._plotting import ( + _BinaryClassifierCurveDisplayMixin, + _check_param_lengths, + _convert_to_list_leaving_none, + _deprecate_estimator_name, + _despine, + _validate_style_kwargs, +) +from ...utils._response import _get_response_values_binary +from .._ranking import auc, roc_curve + + +class RocCurveDisplay(_BinaryClassifierCurveDisplayMixin): + """ROC Curve visualization. + + It is recommended to use + :func:`~sklearn.metrics.RocCurveDisplay.from_estimator` or + :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` or + :func:`~sklearn.metrics.RocCurveDisplay.from_cv_results` to create + a :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are + stored as attributes. + + For general information regarding `scikit-learn` visualization tools, see + the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + fpr : ndarray or list of ndarrays + False positive rates. Each ndarray should contain values for a single curve. + If plotting multiple curves, list should be of same length as `tpr`. + + .. versionchanged:: 1.7 + Now accepts a list for plotting multiple curves. + + tpr : ndarray or list of ndarrays + True positive rates. Each ndarray should contain values for a single curve. + If plotting multiple curves, list should be of same length as `fpr`. + + .. versionchanged:: 1.7 + Now accepts a list for plotting multiple curves. + + roc_auc : float or list of floats, default=None + Area under ROC curve, used for labeling each curve in the legend. + If plotting multiple curves, should be a list of the same length as `fpr` + and `tpr`. 
If `None`, ROC AUC scores are not shown in the legend. + + .. versionchanged:: 1.7 + Now accepts a list for plotting multiple curves. + + name : str or list of str, default=None + Name for labeling legend entries. The number of legend entries is determined + by the `curve_kwargs` passed to `plot`, and is not affected by `name`. + To label each curve, provide a list of strings. To avoid labeling + individual curves that have the same appearance, this cannot be used in + conjunction with `curve_kwargs` being a dictionary or None. If a + string is provided, it will be used to either label the single legend entry + or if there are multiple legend entries, label each individual curve with + the same name. If still `None`, no name is shown in the legend. + + .. versionadded:: 1.7 + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the roc auc + metrics. By default, `estimators.classes_[1]` is considered + as the positive class. + + .. versionadded:: 0.24 + + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. + + .. deprecated:: 1.7 + `estimator_name` is deprecated and will be removed in 1.9. Use `name` + instead. + + Attributes + ---------- + line_ : matplotlib Artist or list of matplotlib Artists + ROC Curves. + + .. versionchanged:: 1.7 + This attribute can now be a list of Artists, for when multiple curves + are plotted. + + chance_level_ : matplotlib Artist or None + The chance level line. It is `None` if the chance level is not plotted. + + .. versionadded:: 1.3 + + ax_ : matplotlib Axes + Axes with ROC Curve. + + figure_ : matplotlib Figure + Figure containing the curve. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> from sklearn import metrics + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) + >>> roc_auc = metrics.auc(fpr, tpr) + >>> display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, + ... 
name='example estimator') + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, + *, + fpr, + tpr, + roc_auc=None, + name=None, + pos_label=None, + estimator_name="deprecated", + ): + self.fpr = fpr + self.tpr = tpr + self.roc_auc = roc_auc + self.name = _deprecate_estimator_name(estimator_name, name, "1.7") + self.pos_label = pos_label + + def _validate_plot_params(self, *, ax, name): + self.ax_, self.figure_, name = super()._validate_plot_params(ax=ax, name=name) + + fpr = _convert_to_list_leaving_none(self.fpr) + tpr = _convert_to_list_leaving_none(self.tpr) + roc_auc = _convert_to_list_leaving_none(self.roc_auc) + name = _convert_to_list_leaving_none(name) + + optional = {"self.roc_auc": roc_auc} + if isinstance(name, list) and len(name) != 1: + optional.update({"'name' (or self.name)": name}) + _check_param_lengths( + required={"self.fpr": fpr, "self.tpr": tpr}, + optional=optional, + class_name="RocCurveDisplay", + ) + return fpr, tpr, roc_auc, name + + def plot( + self, + ax=None, + *, + name=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str or list of str, default=None + Name for labeling legend entries. The number of legend entries + is determined by `curve_kwargs`, and is not affected by `name`. + To label each curve, provide a list of strings. To avoid labeling + individual curves that have the same appearance, this cannot be used in + conjunction with `curve_kwargs` being a dictionary or None. If a + string is provided, it will be used to either label the single legend entry + or if there are multiple legend entries, label each individual curve with + the same name. If `None`, set to `name` provided at `RocCurveDisplay` + initialization. If still `None`, no name is shown in the legend. + + .. versionadded:: 1.7 + + curve_kwargs : dict or list of dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function + to draw individual ROC curves. For single curve plotting, should be + a dictionary. For multi-curve plotting, if a list is provided the + parameters are applied to the ROC curves of each CV fold + sequentially and a legend entry is added for each curve. + If a single dictionary is provided, the same parameters are applied + to all ROC curves and a single legend entry for all curves is added, + labeled with the mean ROC AUC score. + + .. versionadded:: 1.7 + + plot_chance_level : bool, default=False + Whether to plot the chance level. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + .. deprecated:: 1.7 + kwargs is deprecated and will be removed in 1.9. Pass matplotlib + arguments to `curve_kwargs` as a dictionary instead. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + Object that stores computed values. 
+ """ + fpr, tpr, roc_auc, name = self._validate_plot_params(ax=ax, name=name) + n_curves = len(fpr) + if not isinstance(curve_kwargs, list) and n_curves > 1: + if roc_auc: + legend_metric = {"mean": np.mean(roc_auc), "std": np.std(roc_auc)} + else: + legend_metric = {"mean": None, "std": None} + else: + roc_auc = roc_auc if roc_auc is not None else [None] * n_curves + legend_metric = {"metric": roc_auc} + + curve_kwargs = self._validate_curve_kwargs( + n_curves, + name, + legend_metric, + "AUC", + curve_kwargs=curve_kwargs, + **kwargs, + ) + + default_chance_level_line_kw = { + "label": "Chance level (AUC = 0.5)", + "color": "k", + "linestyle": "--", + } + + if chance_level_kw is None: + chance_level_kw = {} + + chance_level_kw = _validate_style_kwargs( + default_chance_level_line_kw, chance_level_kw + ) + + self.line_ = [] + for fpr, tpr, line_kw in zip(fpr, tpr, curve_kwargs): + self.line_.extend(self.ax_.plot(fpr, tpr, **line_kw)) + # Return single artist if only one curve is plotted + if len(self.line_) == 1: + self.line_ = self.line_[0] + + info_pos_label = ( + f" (Positive label: {self.pos_label})" if self.pos_label is not None else "" + ) + + xlabel = "False Positive Rate" + info_pos_label + ylabel = "True Positive Rate" + info_pos_label + self.ax_.set( + xlabel=xlabel, + xlim=(-0.01, 1.01), + ylabel=ylabel, + ylim=(-0.01, 1.01), + aspect="equal", + ) + + if plot_chance_level: + (self.chance_level_,) = self.ax_.plot((0, 1), (0, 1), **chance_level_kw) + else: + self.chance_level_ = None + + if despine: + _despine(self.ax_) + + if curve_kwargs[0].get("label") is not None or ( + plot_chance_level and chance_level_kw.get("label") is not None + ): + self.ax_.legend(loc="lower right") + + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + sample_weight=None, + drop_intermediate=True, + response_method="auto", + pos_label=None, + name=None, + ax=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + **kwargs, + ): + """Create a ROC Curve display from an estimator. + + For general information regarding `scikit-learn` visualization tools, + see the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + Parameters + ---------- + estimator : estimator instance + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where the resulting point is collinear + with its neighbors in ROC space. This has no effect on the ROC AUC + or visual shape of the curve, but reduces the number of plotted + points. + + response_method : {'predict_proba', 'decision_function', 'auto'} \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. If set to 'auto', + :term:`predict_proba` is tried first and if it does not exist + :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the ROC AUC. + By default, `estimators.classes_[1]` is considered + as the positive class. + + name : str, default=None + Name of ROC Curve for labeling. 
If `None`, use the name of the + estimator. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is created. + + curve_kwargs : dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function. + + .. versionadded:: 1.7 + + plot_chance_level : bool, default=False + Whether to plot the chance level. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + **kwargs : dict + Keyword arguments to be passed to matplotlib's `plot`. + + .. deprecated:: 1.7 + kwargs is deprecated and will be removed in 1.9. Pass matplotlib + arguments to `curve_kwargs` as a dictionary instead. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + The ROC Curve display. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_predictions : ROC Curve visualization given the + probabilities of scores of a classifier. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import RocCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> RocCurveDisplay.from_estimator( + ... clf, X_test, y_test) + <...> + >>> plt.show() + """ + y_score, pos_label, name = cls._validate_and_get_response_values( + estimator, + X, + y, + response_method=response_method, + pos_label=pos_label, + name=name, + ) + + return cls.from_predictions( + y_true=y, + y_score=y_score, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + name=name, + ax=ax, + curve_kwargs=curve_kwargs, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) + + @classmethod + def from_predictions( + cls, + y_true, + y_score=None, + *, + sample_weight=None, + drop_intermediate=True, + pos_label=None, + name=None, + ax=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kw=None, + despine=False, + y_pred="deprecated", + **kwargs, + ): + """Plot ROC curve given the true and predicted values. + + For general information regarding `scikit-learn` visualization tools, + see the :ref:`Visualization Guide `. + For guidance on interpreting these plots, refer to the :ref:`Model + Evaluation Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_score : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by “decision_function” on some classifiers). + + .. versionadded:: 1.7 + `y_pred` has been renamed to `y_score`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where the resulting point is collinear + with its neighbors in ROC space. 
This has no effect on the ROC AUC + or visual shape of the curve, but reduces the number of plotted + points. + + pos_label : int, float, bool or str, default=None + The label of the positive class when computing the ROC AUC. + When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` + is set to 1, otherwise an error will be raised. + + name : str, default=None + Name of ROC curve for legend labeling. If `None`, name will be set to + `"Classifier"`. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + curve_kwargs : dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function. + + .. versionadded:: 1.7 + + plot_chance_level : bool, default=False + Whether to plot the chance level. + + .. versionadded:: 1.3 + + chance_level_kw : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + .. versionadded:: 1.3 + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + .. versionadded:: 1.6 + + y_pred : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by “decision_function” on some classifiers). + + .. deprecated:: 1.7 + `y_pred` is deprecated and will be removed in 1.9. Use + `y_score` instead. + + **kwargs : dict + Additional keywords arguments passed to matplotlib `plot` function. + + .. deprecated:: 1.7 + kwargs is deprecated and will be removed in 1.9. Pass matplotlib + arguments to `curve_kwargs` as a dictionary instead. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + Object that stores computed values. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : ROC Curve visualization given an + estimator and some data. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import RocCurveDisplay + >>> from sklearn.model_selection import train_test_split + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, random_state=0) + >>> clf = SVC(random_state=0).fit(X_train, y_train) + >>> y_score = clf.decision_function(X_test) + >>> RocCurveDisplay.from_predictions(y_test, y_score) + <...> + >>> plt.show() + """ + # TODO(1.9): remove after the end of the deprecation period of `y_pred` + if y_score is not None and not ( + isinstance(y_pred, str) and y_pred == "deprecated" + ): + raise ValueError( + "`y_pred` and `y_score` cannot be both specified. Please use `y_score`" + " only as `y_pred` is deprecated in 1.7 and will be removed in 1.9." + ) + if not (isinstance(y_pred, str) and y_pred == "deprecated"): + warnings.warn( + ( + "y_pred is deprecated in 1.7 and will be removed in 1.9. " + "Please use `y_score` instead." 
+ ), + FutureWarning, + ) + y_score = y_pred + + pos_label_validated, name = cls._validate_from_predictions_params( + y_true, y_score, sample_weight=sample_weight, pos_label=pos_label, name=name + ) + + fpr, tpr, _ = roc_curve( + y_true, + y_score, + pos_label=pos_label, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + ) + roc_auc = auc(fpr, tpr) + + viz = cls( + fpr=fpr, + tpr=tpr, + roc_auc=roc_auc, + name=name, + pos_label=pos_label_validated, + ) + + return viz.plot( + ax=ax, + curve_kwargs=curve_kwargs, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + despine=despine, + **kwargs, + ) + + @classmethod + def from_cv_results( + cls, + cv_results, + X, + y, + *, + sample_weight=None, + drop_intermediate=True, + response_method="auto", + pos_label=None, + ax=None, + name=None, + curve_kwargs=None, + plot_chance_level=False, + chance_level_kwargs=None, + despine=False, + ): + """Create a multi-fold ROC curve display given cross-validation results. + + .. versionadded:: 1.7 + + Parameters + ---------- + cv_results : dict + Dictionary as returned by :func:`~sklearn.model_selection.cross_validate` + using `return_estimator=True` and `return_indices=True` (i.e., dictionary + should contain the keys "estimator" and "indices"). + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input values. + + y : array-like of shape (n_samples,) + Target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop some suboptimal thresholds which would not appear + on a plotted ROC curve. This is useful in order to create lighter + ROC curves. + + response_method : {'predict_proba', 'decision_function', 'auto'} \ + default='auto' + Specifies whether to use :term:`predict_proba` or + :term:`decision_function` as the target response. If set to 'auto', + :term:`predict_proba` is tried first and if it does not exist + :term:`decision_function` is tried next. + + pos_label : int, float, bool or str, default=None + The class considered as the positive class when computing the ROC AUC + metrics. By default, `estimators.classes_[1]` is considered + as the positive class. + + ax : matplotlib axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + name : str or list of str, default=None + Name for labeling legend entries. The number of legend entries + is determined by `curve_kwargs`, and is not affected by `name`. + To label each curve, provide a list of strings. To avoid labeling + individual curves that have the same appearance, this cannot be used in + conjunction with `curve_kwargs` being a dictionary or None. If a + string is provided, it will be used to either label the single legend entry + or if there are multiple legend entries, label each individual curve with + the same name. If `None`, no name is shown in the legend. + + curve_kwargs : dict or list of dict, default=None + Keywords arguments to be passed to matplotlib's `plot` function + to draw individual ROC curves. If a list is provided the + parameters are applied to the ROC curves of each CV fold + sequentially and a legend entry is added for each curve. + If a single dictionary is provided, the same parameters are applied + to all ROC curves and a single legend entry for all curves is added, + labeled with the mean ROC AUC score. + + plot_chance_level : bool, default=False + Whether to plot the chance level. 
+ + chance_level_kwargs : dict, default=None + Keyword arguments to be passed to matplotlib's `plot` for rendering + the chance level line. + + despine : bool, default=False + Whether to remove the top and right spines from the plot. + + Returns + ------- + display : :class:`~sklearn.metrics.RocCurveDisplay` + The multi-fold ROC curve display. + + See Also + -------- + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : ROC Curve visualization given an + estimator and some data. + RocCurveDisplay.from_predictions : ROC Curve visualization given the + probabilities of scores of a classifier. + roc_auc_score : Compute the area under the ROC curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.metrics import RocCurveDisplay + >>> from sklearn.model_selection import cross_validate + >>> from sklearn.svm import SVC + >>> X, y = make_classification(random_state=0) + >>> clf = SVC(random_state=0) + >>> cv_results = cross_validate( + ... clf, X, y, cv=3, return_estimator=True, return_indices=True) + >>> RocCurveDisplay.from_cv_results(cv_results, X, y) + <...> + >>> plt.show() + """ + pos_label_ = cls._validate_from_cv_results_params( + cv_results, + X, + y, + sample_weight=sample_weight, + pos_label=pos_label, + ) + + fpr_folds, tpr_folds, auc_folds = [], [], [] + for estimator, test_indices in zip( + cv_results["estimator"], cv_results["indices"]["test"] + ): + y_true = _safe_indexing(y, test_indices) + y_pred, _ = _get_response_values_binary( + estimator, + _safe_indexing(X, test_indices), + response_method=response_method, + pos_label=pos_label_, + ) + sample_weight_fold = ( + None + if sample_weight is None + else _safe_indexing(sample_weight, test_indices) + ) + fpr, tpr, _ = roc_curve( + y_true, + y_pred, + pos_label=pos_label_, + sample_weight=sample_weight_fold, + drop_intermediate=drop_intermediate, + ) + roc_auc = auc(fpr, tpr) + + fpr_folds.append(fpr) + tpr_folds.append(tpr) + auc_folds.append(roc_auc) + + viz = cls( + fpr=fpr_folds, + tpr=tpr_folds, + roc_auc=auc_folds, + name=name, + pos_label=pos_label_, + ) + return viz.plot( + ax=ax, + curve_kwargs=curve_kwargs, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kwargs, + despine=despine, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_common_curve_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_common_curve_display.py new file mode 100644 index 0000000000000000000000000000000000000000..753f2a1e7319d51b2ff7c299a25a7146801e5fd3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_common_curve_display.py @@ -0,0 +1,292 @@ +import numpy as np +import pytest + +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.calibration import CalibrationDisplay +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_iris +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + ConfusionMatrixDisplay, + DetCurveDisplay, + PrecisionRecallDisplay, + PredictionErrorDisplay, + 
RocCurveDisplay, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor + + +@pytest.fixture(scope="module") +def data(): + return load_iris(return_X_y=True) + + +@pytest.fixture(scope="module") +def data_binary(data): + X, y = data + return X[y < 2], y[y < 2] + + +@pytest.mark.parametrize( + "Display", + [CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay], +) +def test_display_curve_error_classifier(pyplot, data, data_binary, Display): + """Check that a proper error is raised when only binary classification is + supported.""" + X, y = data + X_binary, y_binary = data_binary + clf = DecisionTreeClassifier().fit(X, y) + + # Case 1: multiclass classifier with multiclass target + msg = "Expected 'estimator' to be a binary classifier. Got 3 classes instead." + with pytest.raises(ValueError, match=msg): + Display.from_estimator(clf, X, y) + + # Case 2: multiclass classifier with binary target + with pytest.raises(ValueError, match=msg): + Display.from_estimator(clf, X_binary, y_binary) + + # Case 3: binary classifier with multiclass target + clf = DecisionTreeClassifier().fit(X_binary, y_binary) + msg = "The target y is not binary. Got multiclass type of target." + with pytest.raises(ValueError, match=msg): + Display.from_estimator(clf, X, y) + + +@pytest.mark.parametrize( + "Display", + [CalibrationDisplay, DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay], +) +def test_display_curve_error_regression(pyplot, data_binary, Display): + """Check that we raise an error with regressor.""" + + # Case 1: regressor + X, y = data_binary + regressor = DecisionTreeRegressor().fit(X, y) + + msg = "Expected 'estimator' to be a binary classifier. Got DecisionTreeRegressor" + with pytest.raises(ValueError, match=msg): + Display.from_estimator(regressor, X, y) + + # Case 2: regression target + classifier = DecisionTreeClassifier().fit(X, y) + # Force `y_true` to be seen as a regression problem + y = y + 0.5 + msg = "The target y is not binary. Got continuous type of target." + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X, y) + with pytest.raises(ValueError, match=msg): + Display.from_predictions(y, regressor.fit(X, y).predict(X)) + + +@pytest.mark.parametrize( + "response_method, msg", + [ + ( + "predict_proba", + "MyClassifier has none of the following attributes: predict_proba.", + ), + ( + "decision_function", + "MyClassifier has none of the following attributes: decision_function.", + ), + ( + "auto", + ( + "MyClassifier has none of the following attributes: predict_proba," + " decision_function." 
+ ), + ), + ( + "bad_method", + "MyClassifier has none of the following attributes: bad_method.", + ), + ], +) +@pytest.mark.parametrize( + "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay] +) +def test_display_curve_error_no_response( + pyplot, + data_binary, + response_method, + msg, + Display, +): + """Check that a proper error is raised when the response method requested + is not defined for the given trained classifier.""" + X, y = data_binary + + class MyClassifier(ClassifierMixin, BaseEstimator): + def fit(self, X, y): + self.classes_ = [0, 1] + return self + + clf = MyClassifier().fit(X, y) + + with pytest.raises(AttributeError, match=msg): + Display.from_estimator(clf, X, y, response_method=response_method) + + +@pytest.mark.parametrize("Display", [DetCurveDisplay, PrecisionRecallDisplay]) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_display_curve_estimator_name_multiple_calls( + pyplot, + data_binary, + Display, + constructor_name, +): + """Check that passing `name` when calling `plot` will overwrite the original name + in the legend.""" + X, y = data_binary + clf_name = "my hand-crafted name" + clf = LogisticRegression().fit(X, y) + y_pred = clf.predict_proba(X)[:, 1] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + disp = Display.from_estimator(clf, X, y, name=clf_name) + else: + disp = Display.from_predictions(y, y_pred, name=clf_name) + assert disp.estimator_name == clf_name + pyplot.close("all") + disp.plot() + assert clf_name in disp.line_.get_label() + pyplot.close("all") + clf_name = "another_name" + disp.plot(name=clf_name) + assert clf_name in disp.line_.get_label() + + +# TODO: remove this test once classes moved to using `name` instead of +# `estimator_name` +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +@pytest.mark.parametrize("Display", [DetCurveDisplay, PrecisionRecallDisplay]) +def test_display_curve_not_fitted_errors_old_name(pyplot, data_binary, clf, Display): + """Check that a proper error is raised when the classifier is not + fitted.""" + X, y = data_binary + # clone since we parametrize the test and the classifier will be fitted + # when testing the second and subsequent plotting function + model = clone(clf) + with pytest.raises(NotFittedError): + Display.from_estimator(model, X, y) + model.fit(X, y) + disp = Display.from_estimator(model, X, y) + assert model.__class__.__name__ in disp.line_.get_label() + assert disp.estimator_name == model.__class__.__name__ + + +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +@pytest.mark.parametrize("Display", [RocCurveDisplay]) +def test_display_curve_not_fitted_errors(pyplot, data_binary, clf, Display): + """Check that a proper error is raised when the classifier is not fitted.""" + X, y = data_binary + # clone since we parametrize the test and the classifier will be fitted + # when testing the second and subsequent plotting function + model = clone(clf) + with pytest.raises(NotFittedError): + Display.from_estimator(model, X, y) + model.fit(X, y) + disp = 
Display.from_estimator(model, X, y) + assert model.__class__.__name__ in disp.line_.get_label() + assert disp.name == model.__class__.__name__ + + +@pytest.mark.parametrize( + "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay] +) +def test_display_curve_n_samples_consistency(pyplot, data_binary, Display): + """Check the error raised when `y_pred` or `sample_weight` have inconsistent + length.""" + X, y = data_binary + classifier = DecisionTreeClassifier().fit(X, y) + + msg = "Found input variables with inconsistent numbers of samples" + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X[:-2], y) + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X, y[:-2]) + with pytest.raises(ValueError, match=msg): + Display.from_estimator(classifier, X, y, sample_weight=np.ones(X.shape[0] - 2)) + + +@pytest.mark.parametrize( + "Display", [DetCurveDisplay, PrecisionRecallDisplay, RocCurveDisplay] +) +def test_display_curve_error_pos_label(pyplot, data_binary, Display): + """Check consistence of error message when `pos_label` should be specified.""" + X, y = data_binary + y = y + 10 + + classifier = DecisionTreeClassifier().fit(X, y) + y_pred = classifier.predict_proba(X)[:, -1] + msg = r"y_true takes value in {10, 11} and pos_label is not specified" + with pytest.raises(ValueError, match=msg): + Display.from_predictions(y, y_pred) + + +@pytest.mark.parametrize( + "Display", + [ + CalibrationDisplay, + DetCurveDisplay, + PrecisionRecallDisplay, + RocCurveDisplay, + PredictionErrorDisplay, + ConfusionMatrixDisplay, + ], +) +@pytest.mark.parametrize( + "constructor", + ["from_predictions", "from_estimator"], +) +def test_classifier_display_curve_named_constructor_return_type( + pyplot, data_binary, Display, constructor +): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + X, y = data_binary + + # This can be anything - we just need to check the named constructor return + # type so the only requirement here is instantiating the class without error + y_pred = y + + classifier = LogisticRegression().fit(X, y) + + class SubclassOfDisplay(Display): + pass + + if constructor == "from_predictions": + curve = SubclassOfDisplay.from_predictions(y, y_pred) + else: # constructor == "from_estimator" + curve = SubclassOfDisplay.from_estimator(classifier, X, y) + + assert isinstance(curve, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py new file mode 100644 index 0000000000000000000000000000000000000000..6e93bf4993a93f0f5c12d295aa9c0c3b6136218d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_confusion_matrix_display.py @@ -0,0 +1,374 @@ +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) + +from sklearn.compose import make_column_transformer +from sklearn.datasets import make_classification +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR + + +def test_confusion_matrix_display_validation(pyplot): + """Check that we raise the proper error when validating parameters.""" + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=5, random_state=0 + ) + + with pytest.raises(NotFittedError): + ConfusionMatrixDisplay.from_estimator(SVC(), X, y) + + regressor = SVR().fit(X, y) + y_pred_regressor = regressor.predict(X) + y_pred_classifier = SVC().fit(X, y).predict(X) + + err_msg = "ConfusionMatrixDisplay.from_estimator only supports classifiers" + with pytest.raises(ValueError, match=err_msg): + ConfusionMatrixDisplay.from_estimator(regressor, X, y) + + err_msg = "Mix type of y not allowed, got types" + with pytest.raises(ValueError, match=err_msg): + # Force `y_true` to be seen as a regression problem + ConfusionMatrixDisplay.from_predictions(y + 0.5, y_pred_classifier) + with pytest.raises(ValueError, match=err_msg): + ConfusionMatrixDisplay.from_predictions(y, y_pred_regressor) + + err_msg = "Found input variables with inconsistent numbers of samples" + with pytest.raises(ValueError, match=err_msg): + ConfusionMatrixDisplay.from_predictions(y, y_pred_classifier[::2]) + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("with_labels", [True, False]) +@pytest.mark.parametrize("with_display_labels", [True, False]) +def test_confusion_matrix_display_custom_labels( + pyplot, constructor_name, with_labels, with_display_labels +): + """Check the resulting plot when labels are given.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + ax = pyplot.gca() + labels = [2, 1, 0, 3, 4] if with_labels else None + display_labels = ["b", "d", "a", "e", 
"f"] if with_display_labels else None + + cm = confusion_matrix(y, y_pred, labels=labels) + common_kwargs = { + "ax": ax, + "display_labels": display_labels, + "labels": labels, + } + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + assert_allclose(disp.confusion_matrix, cm) + + if with_display_labels: + expected_display_labels = display_labels + elif with_labels: + expected_display_labels = labels + else: + expected_display_labels = list(range(n_classes)) + + expected_display_labels_str = [str(name) for name in expected_display_labels] + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + assert_array_equal(disp.display_labels, expected_display_labels) + assert_array_equal(x_ticks, expected_display_labels_str) + assert_array_equal(y_ticks, expected_display_labels_str) + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("normalize", ["true", "pred", "all", None]) +@pytest.mark.parametrize("include_values", [True, False]) +def test_confusion_matrix_display_plotting( + pyplot, + constructor_name, + normalize, + include_values, +): + """Check the overall plotting rendering.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + ax = pyplot.gca() + cmap = "plasma" + + cm = confusion_matrix(y, y_pred) + common_kwargs = { + "normalize": normalize, + "cmap": cmap, + "ax": ax, + "include_values": include_values, + } + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + + assert disp.ax_ == ax + + if normalize == "true": + cm = cm / cm.sum(axis=1, keepdims=True) + elif normalize == "pred": + cm = cm / cm.sum(axis=0, keepdims=True) + elif normalize == "all": + cm = cm / cm.sum() + + assert_allclose(disp.confusion_matrix, cm) + import matplotlib as mpl + + assert isinstance(disp.im_, mpl.image.AxesImage) + assert disp.im_.get_cmap().name == cmap + assert isinstance(disp.ax_, pyplot.Axes) + assert isinstance(disp.figure_, pyplot.Figure) + + assert disp.ax_.get_ylabel() == "True label" + assert disp.ax_.get_xlabel() == "Predicted label" + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + expected_display_labels = list(range(n_classes)) + + expected_display_labels_str = [str(name) for name in expected_display_labels] + + assert_array_equal(disp.display_labels, expected_display_labels) + assert_array_equal(x_ticks, expected_display_labels_str) + assert_array_equal(y_ticks, expected_display_labels_str) + + image_data = disp.im_.get_array().data + assert_allclose(image_data, cm) + + if include_values: + assert disp.text_.shape == (n_classes, n_classes) + fmt = ".2g" + expected_text = np.array([format(v, fmt) for v in cm.ravel(order="C")]) + text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")]) + assert_array_equal(expected_text, text_text) + else: + assert disp.text_ is None + + 
+@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_confusion_matrix_display(pyplot, constructor_name): + """Check the behaviour of the default constructor without using the class + methods.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + cm = confusion_matrix(y, y_pred) + common_kwargs = { + "normalize": None, + "include_values": True, + "cmap": "viridis", + "xticks_rotation": 45.0, + } + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + + assert_allclose(disp.confusion_matrix, cm) + assert disp.text_.shape == (n_classes, n_classes) + + rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()] + assert_allclose(rotations, 45.0) + + image_data = disp.im_.get_array().data + assert_allclose(image_data, cm) + + disp.plot(cmap="plasma") + assert disp.im_.get_cmap().name == "plasma" + + disp.plot(include_values=False) + assert disp.text_ is None + + disp.plot(xticks_rotation=90.0) + rotations = [tick.get_rotation() for tick in disp.ax_.get_xticklabels()] + assert_allclose(rotations, 90.0) + + disp.plot(values_format="e") + expected_text = np.array([format(v, "e") for v in cm.ravel(order="C")]) + text_text = np.array([t.get_text() for t in disp.text_.ravel(order="C")]) + assert_array_equal(expected_text, text_text) + + +def test_confusion_matrix_contrast(pyplot): + """Check that the text color is appropriate depending on background.""" + + cm = np.eye(2) / 2 + disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) + + disp.plot(cmap=pyplot.cm.gray) + # diagonal text is black + assert_allclose(disp.text_[0, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) + assert_allclose(disp.text_[1, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) + + # off-diagonal text is white + assert_allclose(disp.text_[0, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + assert_allclose(disp.text_[1, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) + + disp.plot(cmap=pyplot.cm.gray_r) + # diagonal text is white + assert_allclose(disp.text_[0, 1].get_color(), [0.0, 0.0, 0.0, 1.0]) + assert_allclose(disp.text_[1, 0].get_color(), [0.0, 0.0, 0.0, 1.0]) + + # off-diagonal text is black + assert_allclose(disp.text_[0, 0].get_color(), [1.0, 1.0, 1.0, 1.0]) + assert_allclose(disp.text_[1, 1].get_color(), [1.0, 1.0, 1.0, 1.0]) + + # Regression test for #15920 + cm = np.array([[19, 34], [32, 58]]) + disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1]) + + disp.plot(cmap=pyplot.cm.Blues) + min_color = pyplot.cm.Blues(0) + max_color = pyplot.cm.Blues(255) + assert_allclose(disp.text_[0, 0].get_color(), max_color) + assert_allclose(disp.text_[0, 1].get_color(), max_color) + assert_allclose(disp.text_[1, 0].get_color(), max_color) + assert_allclose(disp.text_[1, 1].get_color(), min_color) + + +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), + LogisticRegression(), + ), + ], + ids=["clf", "pipeline-clf", "pipeline-column_transformer-clf"], +) +def test_confusion_matrix_pipeline(pyplot, clf): + """Check the behaviour of the plotting with more complex 
pipeline.""" + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + with pytest.raises(NotFittedError): + ConfusionMatrixDisplay.from_estimator(clf, X, y) + clf.fit(X, y) + y_pred = clf.predict(X) + + disp = ConfusionMatrixDisplay.from_estimator(clf, X, y) + cm = confusion_matrix(y, y_pred) + + assert_allclose(disp.confusion_matrix, cm) + assert disp.text_.shape == (n_classes, n_classes) + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_confusion_matrix_with_unknown_labels(pyplot, constructor_name): + """Check that when labels=None, the unique values in `y_pred` and `y_true` + will be used. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/18405 + """ + n_classes = 5 + X, y = make_classification( + n_samples=100, n_informative=5, n_classes=n_classes, random_state=0 + ) + classifier = SVC().fit(X, y) + y_pred = classifier.predict(X) + # create unseen labels in `y_true` not seen during fitting and not present + # in 'classifier.classes_' + y = y + 1 + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + common_kwargs = {"labels": None} + if constructor_name == "from_estimator": + disp = ConfusionMatrixDisplay.from_estimator(classifier, X, y, **common_kwargs) + else: + disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, **common_kwargs) + + display_labels = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + expected_labels = [str(i) for i in range(n_classes + 1)] + assert_array_equal(expected_labels, display_labels) + + +def test_colormap_max(pyplot): + """Check that the max color is used for the color of the text.""" + gray = pyplot.get_cmap("gray", 1024) + confusion_matrix = np.array([[1.0, 0.0], [0.0, 1.0]]) + + disp = ConfusionMatrixDisplay(confusion_matrix) + disp.plot(cmap=gray) + + color = disp.text_[1, 0].get_color() + assert_allclose(color, [1.0, 1.0, 1.0, 1.0]) + + +def test_im_kw_adjust_vmin_vmax(pyplot): + """Check that im_kw passes kwargs to imshow""" + + confusion_matrix = np.array([[0.48, 0.04], [0.08, 0.4]]) + disp = ConfusionMatrixDisplay(confusion_matrix) + disp.plot(im_kw=dict(vmin=0.0, vmax=0.8)) + + clim = disp.im_.get_clim() + assert clim[0] == pytest.approx(0.0) + assert clim[1] == pytest.approx(0.8) + + +def test_confusion_matrix_text_kw(pyplot): + """Check that text_kw is passed to the text call.""" + font_size = 15.0 + X, y = make_classification(random_state=0) + classifier = SVC().fit(X, y) + + # from_estimator passes the font size + disp = ConfusionMatrixDisplay.from_estimator( + classifier, X, y, text_kw={"fontsize": font_size} + ) + for text in disp.text_.reshape(-1): + assert text.get_fontsize() == font_size + + # plot adjusts plot to new font size + new_font_size = 20.0 + disp.plot(text_kw={"fontsize": new_font_size}) + for text in disp.text_.reshape(-1): + assert text.get_fontsize() == new_font_size + + # from_predictions passes the font size + y_pred = classifier.predict(X) + disp = ConfusionMatrixDisplay.from_predictions( + y, y_pred, text_kw={"fontsize": font_size} + ) + for text in disp.text_.reshape(-1): + assert text.get_fontsize() == font_size diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_det_curve_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_det_curve_display.py new file mode 100644 index 
0000000000000000000000000000000000000000..105778c63103040255278dfd4410dab5a2abd792 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_det_curve_display.py @@ -0,0 +1,114 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn.datasets import load_iris +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import DetCurveDisplay, det_curve + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +@pytest.mark.parametrize("with_strings", [True, False]) +def test_det_curve_display( + pyplot, + constructor_name, + response_method, + with_sample_weight, + drop_intermediate, + with_strings, +): + X, y = load_iris(return_X_y=True) + # Binarize the data with only the two first classes + X, y = X[y < 2], y[y < 2] + + pos_label = None + if with_strings: + y = np.array(["c", "b"])[y] + pos_label = "c" + + if with_sample_weight: + rng = np.random.RandomState(42) + sample_weight = rng.randint(1, 4, size=(X.shape[0])) + else: + sample_weight = None + + lr = LogisticRegression() + lr.fit(X, y) + y_pred = getattr(lr, response_method)(X) + if y_pred.ndim == 2: + y_pred = y_pred[:, 1] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + common_kwargs = { + "name": lr.__class__.__name__, + "alpha": 0.8, + "sample_weight": sample_weight, + "drop_intermediate": drop_intermediate, + "pos_label": pos_label, + } + if constructor_name == "from_estimator": + disp = DetCurveDisplay.from_estimator(lr, X, y, **common_kwargs) + else: + disp = DetCurveDisplay.from_predictions(y, y_pred, **common_kwargs) + + fpr, fnr, _ = det_curve( + y, + y_pred, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + ) + + assert_allclose(disp.fpr, fpr, atol=1e-7) + assert_allclose(disp.fnr, fnr, atol=1e-7) + + assert disp.estimator_name == "LogisticRegression" + + # cannot fail thanks to pyplot fixture + import matplotlib as mpl + + assert isinstance(disp.line_, mpl.lines.Line2D) + assert disp.line_.get_alpha() == 0.8 + assert isinstance(disp.ax_, mpl.axes.Axes) + assert isinstance(disp.figure_, mpl.figure.Figure) + assert disp.line_.get_label() == "LogisticRegression" + + expected_pos_label = 1 if pos_label is None else pos_label + expected_ylabel = f"False Negative Rate (Positive label: {expected_pos_label})" + expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})" + assert disp.ax_.get_ylabel() == expected_ylabel + assert disp.ax_.get_xlabel() == expected_xlabel + + +@pytest.mark.parametrize( + "constructor_name, expected_clf_name", + [ + ("from_estimator", "LogisticRegression"), + ("from_predictions", "Classifier"), + ], +) +def test_det_curve_display_default_name( + pyplot, + constructor_name, + expected_clf_name, +): + # Check the default name display in the figure when `name` is not provided + X, y = load_iris(return_X_y=True) + # Binarize the data with only the two first classes + X, y = X[y < 2], y[y < 2] + + lr = LogisticRegression().fit(X, y) + y_pred = lr.predict_proba(X)[:, 1] + + if constructor_name == "from_estimator": + disp = DetCurveDisplay.from_estimator(lr, X, y) + else: + disp = DetCurveDisplay.from_predictions(y, y_pred) + + assert 
disp.estimator_name == expected_clf_name + assert disp.line_.get_label() == expected_clf_name diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_precision_recall_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_precision_recall_display.py new file mode 100644 index 0000000000000000000000000000000000000000..022a5fbf28a914e4e27b6679b0d572d5a356ca82 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_precision_recall_display.py @@ -0,0 +1,382 @@ +from collections import Counter + +import numpy as np +import pytest +from scipy.integrate import trapezoid + +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_breast_cancer, make_classification +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + PrecisionRecallDisplay, + average_precision_score, + precision_recall_curve, +) +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils import shuffle + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +def test_precision_recall_display_plotting( + pyplot, constructor_name, response_method, drop_intermediate +): + """Check the overall plotting rendering.""" + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + pos_label = 1 + + classifier = LogisticRegression().fit(X, y) + classifier.fit(X, y) + + y_pred = getattr(classifier, response_method)(X) + y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, pos_label] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + classifier, + X, + y, + response_method=response_method, + drop_intermediate=drop_intermediate, + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, pos_label=pos_label, drop_intermediate=drop_intermediate + ) + + precision, recall, _ = precision_recall_curve( + y, y_pred, pos_label=pos_label, drop_intermediate=drop_intermediate + ) + average_precision = average_precision_score(y, y_pred, pos_label=pos_label) + + np.testing.assert_allclose(display.precision, precision) + np.testing.assert_allclose(display.recall, recall) + assert display.average_precision == pytest.approx(average_precision) + + import matplotlib as mpl + + assert isinstance(display.line_, mpl.lines.Line2D) + assert isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + + assert display.ax_.get_xlabel() == "Recall (Positive label: 1)" + assert display.ax_.get_ylabel() == "Precision (Positive label: 1)" + assert display.ax_.get_adjustable() == "box" + assert display.ax_.get_aspect() in ("equal", 1.0) + assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01) + + # plotting passing some new parameters + display.plot(alpha=0.8, name="MySpecialEstimator") + expected_label = f"MySpecialEstimator (AP = {average_precision:0.2f})" + assert display.line_.get_label() == expected_label + assert display.line_.get_alpha() == pytest.approx(0.8) + + # Check that the chance level line is not plotted by default + assert 
display.chance_level_ is None + + +@pytest.mark.parametrize("chance_level_kw", [None, {"color": "r"}, {"c": "r"}]) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_precision_recall_chance_level_line( + pyplot, + chance_level_kw, + constructor_name, +): + """Check the chance level line plotting behavior.""" + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + pos_prevalence = Counter(y)[1] / len(y) + + lr = LogisticRegression() + y_pred = lr.fit(X, y).predict_proba(X)[:, 1] + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + lr, + X, + y, + plot_chance_level=True, + chance_level_kw=chance_level_kw, + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y, + y_pred, + plot_chance_level=True, + chance_level_kw=chance_level_kw, + ) + + import matplotlib as mpl + + assert isinstance(display.chance_level_, mpl.lines.Line2D) + assert tuple(display.chance_level_.get_xdata()) == (0, 1) + assert tuple(display.chance_level_.get_ydata()) == (pos_prevalence, pos_prevalence) + + # Checking for chance level line styles + if chance_level_kw is None: + assert display.chance_level_.get_color() == "k" + else: + assert display.chance_level_.get_color() == "r" + + +@pytest.mark.parametrize( + "constructor_name, default_label", + [ + ("from_estimator", "LogisticRegression (AP = {:.2f})"), + ("from_predictions", "Classifier (AP = {:.2f})"), + ], +) +def test_precision_recall_display_name(pyplot, constructor_name, default_label): + """Check the behaviour of the name parameters""" + X, y = make_classification(n_classes=2, n_samples=100, random_state=0) + pos_label = 1 + + classifier = LogisticRegression().fit(X, y) + classifier.fit(X, y) + + y_pred = classifier.predict_proba(X)[:, pos_label] + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator(classifier, X, y) + else: + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, pos_label=pos_label + ) + + average_precision = average_precision_score(y, y_pred, pos_label=pos_label) + + # check that the default name is used + assert display.line_.get_label() == default_label.format(average_precision) + + # check that the name can be set + display.plot(name="MySpecialEstimator") + assert ( + display.line_.get_label() + == f"MySpecialEstimator (AP = {average_precision:.2f})" + ) + + +@pytest.mark.parametrize( + "clf", + [ + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +def test_precision_recall_display_pipeline(pyplot, clf): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + with pytest.raises(NotFittedError): + PrecisionRecallDisplay.from_estimator(clf, X, y) + clf.fit(X, y) + display = PrecisionRecallDisplay.from_estimator(clf, X, y) + assert display.estimator_name == clf.__class__.__name__ + + +def test_precision_recall_display_string_labels(pyplot): + # regression test #15738 + cancer = load_breast_cancer() + X, y = cancer.data, cancer.target_names[cancer.target] + + lr = make_pipeline(StandardScaler(), LogisticRegression()) + lr.fit(X, y) + for klass in cancer.target_names: + assert klass in lr.classes_ + display = PrecisionRecallDisplay.from_estimator(lr, X, y) + + y_pred = lr.predict_proba(X)[:, 1] + avg_prec = 
average_precision_score(y, y_pred, pos_label=lr.classes_[1]) + + assert display.average_precision == pytest.approx(avg_prec) + assert display.estimator_name == lr.__class__.__name__ + + err_msg = r"y_true takes value in {'benign', 'malignant'}" + with pytest.raises(ValueError, match=err_msg): + PrecisionRecallDisplay.from_predictions(y, y_pred) + + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, pos_label=lr.classes_[1] + ) + assert display.average_precision == pytest.approx(avg_prec) + + +@pytest.mark.parametrize( + "average_precision, estimator_name, expected_label", + [ + (0.9, None, "AP = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AP = 0.80)"), + ], +) +def test_default_labels(pyplot, average_precision, estimator_name, expected_label): + """Check the default labels used in the display.""" + precision = np.array([1, 0.5, 0]) + recall = np.array([0, 0.5, 1]) + display = PrecisionRecallDisplay( + precision, + recall, + average_precision=average_precision, + estimator_name=estimator_name, + ) + display.plot() + assert display.line_.get_label() == expected_label + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_method): + # check that we can provide the positive label and display the proper + # statistics + X, y = load_breast_cancer(return_X_y=True) + # create an highly imbalanced version of the breast cancer dataset + idx_positive = np.flatnonzero(y == 1) + idx_negative = np.flatnonzero(y == 0) + idx_selected = np.hstack([idx_negative, idx_positive[:25]]) + X, y = X[idx_selected], y[idx_selected] + X, y = shuffle(X, y, random_state=42) + # only use 2 features to make the problem even harder + X = X[:, :2] + y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + stratify=y, + random_state=0, + ) + + classifier = LogisticRegression() + classifier.fit(X_train, y_train) + + # sanity check to be sure the positive class is classes_[0] and that we + # are betrayed by the class imbalance + assert classifier.classes_.tolist() == ["cancer", "not cancer"] + + y_pred = getattr(classifier, response_method)(X_test) + # we select the corresponding probability columns or reverse the decision + # function otherwise + y_pred_cancer = -1 * y_pred if y_pred.ndim == 1 else y_pred[:, 0] + y_pred_not_cancer = y_pred if y_pred.ndim == 1 else y_pred[:, 1] + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + classifier, + X_test, + y_test, + pos_label="cancer", + response_method=response_method, + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y_test, + y_pred_cancer, + pos_label="cancer", + ) + # we should obtain the statistics of the "cancer" class + avg_prec_limit = 0.65 + assert display.average_precision < avg_prec_limit + assert -trapezoid(display.precision, display.recall) < avg_prec_limit + + # otherwise we should obtain the statistics of the "not cancer" class + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + classifier, + X_test, + y_test, + response_method=response_method, + pos_label="not cancer", + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y_test, + y_pred_not_cancer, + pos_label="not cancer", + ) + avg_prec_limit = 0.95 + assert 
display.average_precision > avg_prec_limit + assert -trapezoid(display.precision, display.recall) > avg_prec_limit + + +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_precision_recall_prevalence_pos_label_reusable(pyplot, constructor_name): + # Check that even if one passes plot_chance_level=False the first time + # one can still call disp.plot with plot_chance_level=True and get the + # chance level line + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + lr = LogisticRegression() + y_pred = lr.fit(X, y).predict_proba(X)[:, 1] + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator( + lr, X, y, plot_chance_level=False + ) + else: + display = PrecisionRecallDisplay.from_predictions( + y, y_pred, plot_chance_level=False + ) + assert display.chance_level_ is None + + import matplotlib as mpl + + # When calling from_estimator or from_predictions, + # prevalence_pos_label should have been set, so that directly + # calling plot_chance_level=True should plot the chance level line + display.plot(plot_chance_level=True) + assert isinstance(display.chance_level_, mpl.lines.Line2D) + + +def test_precision_recall_raise_no_prevalence(pyplot): + # Check that an error is raised when plotting the chance level with + # no prevalence_pos_label provided + precision = np.array([1, 0.5, 0]) + recall = np.array([0, 0.5, 1]) + display = PrecisionRecallDisplay(precision, recall) + + msg = ( + "You must provide prevalence_pos_label when constructing the " + "PrecisionRecallDisplay object in order to plot the chance " + "level line. Alternatively, you may use " + "PrecisionRecallDisplay.from_estimator or " + "PrecisionRecallDisplay.from_predictions " + "to automatically set prevalence_pos_label" + ) + + with pytest.raises(ValueError, match=msg): + display.plot(plot_chance_level=True) + + +@pytest.mark.parametrize("despine", [True, False]) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_plot_precision_recall_despine(pyplot, despine, constructor_name): + # Check that the despine keyword is working correctly + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + clf = LogisticRegression().fit(X, y) + clf.fit(X, y) + + y_pred = clf.decision_function(X) + + # safe guard for the binary if/else construction + assert constructor_name in ("from_estimator", "from_predictions") + + if constructor_name == "from_estimator": + display = PrecisionRecallDisplay.from_estimator(clf, X, y, despine=despine) + else: + display = PrecisionRecallDisplay.from_predictions(y, y_pred, despine=despine) + + for s in ["top", "right"]: + assert display.ax_.spines[s].get_visible() is not despine + + if despine: + for s in ["bottom", "left"]: + assert display.ax_.spines[s].get_bounds() == (0, 1) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_predict_error_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_predict_error_display.py new file mode 100644 index 0000000000000000000000000000000000000000..b2cb888e8884958f55d665879429f224fc9b787d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_predict_error_display.py @@ -0,0 +1,169 @@ +import pytest +from numpy.testing import assert_allclose + +from sklearn.datasets import load_diabetes +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import Ridge +from sklearn.metrics import PredictionErrorDisplay + +X, y
= load_diabetes(return_X_y=True) + + +@pytest.fixture +def regressor_fitted(): + return Ridge().fit(X, y) + + +@pytest.mark.parametrize( + "regressor, params, err_type, err_msg", + [ + ( + Ridge().fit(X, y), + {"subsample": -1}, + ValueError, + "When an integer, subsample=-1 should be", + ), + ( + Ridge().fit(X, y), + {"subsample": 20.0}, + ValueError, + "When a floating-point, subsample=20.0 should be", + ), + ( + Ridge().fit(X, y), + {"subsample": -20.0}, + ValueError, + "When a floating-point, subsample=-20.0 should be", + ), + ( + Ridge().fit(X, y), + {"kind": "xxx"}, + ValueError, + "`kind` must be one of", + ), + ], +) +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +def test_prediction_error_display_raise_error( + pyplot, class_method, regressor, params, err_type, err_msg +): + """Check that we raise the proper error when validating the parameters.""" + with pytest.raises(err_type, match=err_msg): + if class_method == "from_estimator": + PredictionErrorDisplay.from_estimator(regressor, X, y, **params) + else: + y_pred = regressor.predict(X) + PredictionErrorDisplay.from_predictions(y_true=y, y_pred=y_pred, **params) + + +def test_from_estimator_not_fitted(pyplot): + """Check that we raise a `NotFittedError` when the passed regressor is not + fit.""" + regressor = Ridge() + with pytest.raises(NotFittedError, match="is not fitted yet."): + PredictionErrorDisplay.from_estimator(regressor, X, y) + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize("kind", ["actual_vs_predicted", "residual_vs_predicted"]) +def test_prediction_error_display(pyplot, regressor_fitted, class_method, kind): + """Check the default behaviour of the display.""" + if class_method == "from_estimator": + display = PredictionErrorDisplay.from_estimator( + regressor_fitted, X, y, kind=kind + ) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, kind=kind + ) + + if kind == "actual_vs_predicted": + assert_allclose(display.line_.get_xdata(), display.line_.get_ydata()) + assert display.ax_.get_xlabel() == "Predicted values" + assert display.ax_.get_ylabel() == "Actual values" + assert display.line_ is not None + else: + assert display.ax_.get_xlabel() == "Predicted values" + assert display.ax_.get_ylabel() == "Residuals (actual - predicted)" + assert display.line_ is not None + + assert display.ax_.get_legend() is None + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize( + "subsample, expected_size", + [(5, 5), (0.1, int(X.shape[0] * 0.1)), (None, X.shape[0])], +) +def test_plot_prediction_error_subsample( + pyplot, regressor_fitted, class_method, subsample, expected_size +): + """Check the behaviour of `subsample`.""" + if class_method == "from_estimator": + display = PredictionErrorDisplay.from_estimator( + regressor_fitted, X, y, subsample=subsample + ) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, subsample=subsample + ) + assert len(display.scatter_.get_offsets()) == expected_size + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +def test_plot_prediction_error_ax(pyplot, regressor_fitted, class_method): + """Check that we can pass an axis to the display.""" + _, ax = pyplot.subplots() + if class_method == "from_estimator": + display =
PredictionErrorDisplay.from_estimator(regressor_fitted, X, y, ax=ax) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, ax=ax + ) + assert display.ax_ is ax + + +@pytest.mark.parametrize("class_method", ["from_estimator", "from_predictions"]) +@pytest.mark.parametrize( + "scatter_kwargs", + [None, {"color": "blue", "alpha": 0.9}, {"c": "blue", "alpha": 0.9}], +) +@pytest.mark.parametrize( + "line_kwargs", [None, {"color": "red", "linestyle": "-"}, {"c": "red", "ls": "-"}] +) +def test_prediction_error_custom_artist( + pyplot, regressor_fitted, class_method, scatter_kwargs, line_kwargs +): + """Check that we can tune the style of the line and the scatter.""" + extra_params = { + "kind": "actual_vs_predicted", + "scatter_kwargs": scatter_kwargs, + "line_kwargs": line_kwargs, + } + if class_method == "from_estimator": + display = PredictionErrorDisplay.from_estimator( + regressor_fitted, X, y, **extra_params + ) + else: + y_pred = regressor_fitted.predict(X) + display = PredictionErrorDisplay.from_predictions( + y_true=y, y_pred=y_pred, **extra_params + ) + + if line_kwargs is not None: + assert display.line_.get_linestyle() == "-" + assert display.line_.get_color() == "red" + else: + assert display.line_.get_linestyle() == "--" + assert display.line_.get_color() == "black" + assert display.line_.get_alpha() == 0.7 + + if scatter_kwargs is not None: + assert_allclose(display.scatter_.get_facecolor(), [[0.0, 0.0, 1.0, 0.9]]) + assert_allclose(display.scatter_.get_edgecolor(), [[0.0, 0.0, 1.0, 0.9]]) + else: + assert display.scatter_.get_alpha() == 0.8 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_roc_curve_display.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_roc_curve_display.py new file mode 100644 index 0000000000000000000000000000000000000000..23fa2f2e3a5e6a7f0e8b918ec4b75e404887af8b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_plot/tests/test_roc_curve_display.py @@ -0,0 +1,987 @@ +from collections.abc import Mapping + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy.integrate import trapezoid + +from sklearn import clone +from sklearn.compose import make_column_transformer +from sklearn.datasets import load_breast_cancer, make_classification +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import RocCurveDisplay, auc, roc_curve +from sklearn.model_selection import cross_validate, train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils import _safe_indexing, shuffle +from sklearn.utils._response import _get_response_values_binary + + +@pytest.fixture(scope="module") +def data_binary(): + X, y = make_classification( + n_samples=200, + n_features=20, + n_informative=5, + n_redundant=2, + flip_y=0.1, + class_sep=0.8, + random_state=42, + ) + return X, y + + +def _check_figure_axes_and_labels(display, pos_label): + """Check mpl axes and figure defaults are correct.""" + import matplotlib as mpl + + assert isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + assert display.ax_.get_adjustable() == "box" + assert display.ax_.get_aspect() in ("equal", 1.0) + assert display.ax_.get_xlim() == display.ax_.get_ylim() == (-0.01, 1.01) + + expected_pos_label = 1 if pos_label is None else pos_label + 
expected_ylabel = f"True Positive Rate (Positive label: {expected_pos_label})" + expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})" + + assert display.ax_.get_ylabel() == expected_ylabel + assert display.ax_.get_xlabel() == expected_xlabel + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +@pytest.mark.parametrize("with_strings", [True, False]) +@pytest.mark.parametrize( + "constructor_name, default_name", + [ + ("from_estimator", "LogisticRegression"), + ("from_predictions", "Classifier"), + ], +) +def test_roc_curve_display_plotting( + pyplot, + response_method, + data_binary, + with_sample_weight, + drop_intermediate, + with_strings, + constructor_name, + default_name, +): + """Check the overall plotting behaviour for single curve.""" + X, y = data_binary + + pos_label = None + if with_strings: + y = np.array(["c", "b"])[y] + pos_label = "c" + + if with_sample_weight: + rng = np.random.RandomState(42) + sample_weight = rng.randint(1, 4, size=(X.shape[0])) + else: + sample_weight = None + + lr = LogisticRegression() + lr.fit(X, y) + + y_score = getattr(lr, response_method)(X) + y_score = y_score if y_score.ndim == 1 else y_score[:, 1] + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + lr, + X, + y, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + curve_kwargs={"alpha": 0.8}, + ) + else: + display = RocCurveDisplay.from_predictions( + y, + y_score, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + curve_kwargs={"alpha": 0.8}, + ) + + fpr, tpr, _ = roc_curve( + y, + y_score, + sample_weight=sample_weight, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + ) + + assert_allclose(display.roc_auc, auc(fpr, tpr)) + assert_allclose(display.fpr, fpr) + assert_allclose(display.tpr, tpr) + + assert display.name == default_name + + import matplotlib as mpl + + _check_figure_axes_and_labels(display, pos_label) + assert isinstance(display.line_, mpl.lines.Line2D) + assert display.line_.get_alpha() == 0.8 + + expected_label = f"{default_name} (AUC = {display.roc_auc:.2f})" + assert display.line_.get_label() == expected_label + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1])], + "roc_auc": None, + "name": None, + }, + "self.fpr and self.tpr from `RocCurveDisplay` initialization,", + ), + ( + { + "fpr": [np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8, 0.9], + "name": None, + }, + "self.fpr, self.tpr and self.roc_auc from `RocCurveDisplay`", + ), + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8], + "name": None, + }, + "Got: self.fpr: 2, self.tpr: 2, self.roc_auc: 1", + ), + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8, 0.9], + "name": ["curve1", "curve2", "curve3"], + }, + r"self.fpr, self.tpr, self.roc_auc and 'name' \(or self.name\)", + ), + ( + { + "fpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "tpr": [np.array([0, 0.5, 1]), np.array([0, 0.5, 1])], + "roc_auc": [0.8, 0.9], + # List of length 1 is always allowed + "name": 
["curve1"], + }, + None, + ), + ], +) +def test_roc_curve_plot_parameter_length_validation(pyplot, params, err_msg): + """Check `plot` parameter length validation performed correctly.""" + display = RocCurveDisplay(**params) + if err_msg: + with pytest.raises(ValueError, match=err_msg): + display.plot() + else: + # No error should be raised + display.plot() + + +def test_validate_plot_params(pyplot): + """Check `_validate_plot_params` returns the correct variables.""" + fpr = np.array([0, 0.5, 1]) + tpr = [np.array([0, 0.5, 1])] + roc_auc = None + name = "test_curve" + + # Initialize display with test inputs + display = RocCurveDisplay( + fpr=fpr, + tpr=tpr, + roc_auc=roc_auc, + name=name, + pos_label=None, + ) + fpr_out, tpr_out, roc_auc_out, name_out = display._validate_plot_params( + ax=None, name=None + ) + + assert isinstance(fpr_out, list) + assert isinstance(tpr_out, list) + assert len(fpr_out) == 1 + assert len(tpr_out) == 1 + assert roc_auc_out is None + assert name_out == ["test_curve"] + + +def test_roc_curve_from_cv_results_param_validation(pyplot, data_binary): + """Check parameter validation is correct.""" + X, y = data_binary + + # `cv_results` missing key + cv_results_no_est = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=False + ) + cv_results_no_indices = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=False + ) + for cv_results in (cv_results_no_est, cv_results_no_indices): + with pytest.raises( + ValueError, + match="`cv_results` does not contain one of the following required", + ): + RocCurveDisplay.from_cv_results(cv_results, X, y) + + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + + # `X` wrong length + with pytest.raises(ValueError, match="`X` does not contain the correct"): + RocCurveDisplay.from_cv_results(cv_results, X[:10, :], y) + + # `y` not binary + y_multi = y.copy() + y_multi[0] = 2 + with pytest.raises(ValueError, match="The target `y` is not binary."): + RocCurveDisplay.from_cv_results(cv_results, X, y_multi) + + # input inconsistent length + with pytest.raises(ValueError, match="Found input variables with inconsistent"): + RocCurveDisplay.from_cv_results(cv_results, X, y[:10]) + with pytest.raises(ValueError, match="Found input variables with inconsistent"): + RocCurveDisplay.from_cv_results(cv_results, X, y, sample_weight=[1, 2]) + + # `pos_label` inconsistency + y_multi[y_multi == 1] = 2 + with pytest.raises(ValueError, match=r"y takes value in \{0, 2\}"): + RocCurveDisplay.from_cv_results(cv_results, X, y_multi) + + # `name` is list while `curve_kwargs` is None or dict + for curve_kwargs in (None, {"alpha": 0.2}): + with pytest.raises(ValueError, match="To avoid labeling individual curves"): + RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + name=["one", "two", "three"], + curve_kwargs=curve_kwargs, + ) + + # `curve_kwargs` incorrect length + with pytest.raises(ValueError, match="`curve_kwargs` must be None, a dictionary"): + RocCurveDisplay.from_cv_results(cv_results, X, y, curve_kwargs=[{"alpha": 1}]) + + # `curve_kwargs` both alias provided + with pytest.raises(TypeError, match="Got both c and"): + RocCurveDisplay.from_cv_results( + cv_results, X, y, curve_kwargs={"c": "blue", "color": "red"} + ) + + +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"alpha": 0.2}, [{"alpha": 0.2}, {"alpha": 0.3}, {"alpha": 0.4}]], +) +def test_roc_curve_display_from_cv_results_curve_kwargs( + pyplot, 
data_binary, curve_kwargs +): + """Check `curve_kwargs` correctly passed.""" + X, y = data_binary + n_cv = 3 + cv_results = cross_validate( + LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True + ) + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + curve_kwargs=curve_kwargs, + ) + if curve_kwargs is None: + # Default `alpha` used + assert all(line.get_alpha() == 0.5 for line in display.line_) + elif isinstance(curve_kwargs, Mapping): + # `alpha` from dict used for all curves + assert all(line.get_alpha() == 0.2 for line in display.line_) + else: + # Different `alpha` used for each curve + assert all( + line.get_alpha() == curve_kwargs[i]["alpha"] + for i, line in enumerate(display.line_) + ) + + +# TODO(1.9): Remove in 1.9 +def test_roc_curve_display_estimator_name_deprecation(pyplot): + """Check deprecation of `estimator_name`.""" + fpr = np.array([0, 0.5, 1]) + tpr = np.array([0, 0.5, 1]) + with pytest.warns(FutureWarning, match="`estimator_name` is deprecated in"): + RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name="test") + + +# TODO(1.9): Remove in 1.9 +@pytest.mark.parametrize( + "constructor_name", ["from_estimator", "from_predictions", "plot"] +) +def test_roc_curve_display_kwargs_deprecation(pyplot, data_binary, constructor_name): + """Check **kwargs deprecated correctly in favour of `curve_kwargs`.""" + X, y = data_binary + lr = LogisticRegression() + lr.fit(X, y) + fpr = np.array([0, 0.5, 1]) + tpr = np.array([0, 0.5, 1]) + + # Error when both `curve_kwargs` and `**kwargs` provided + with pytest.raises(ValueError, match="Cannot provide both `curve_kwargs`"): + if constructor_name == "from_estimator": + RocCurveDisplay.from_estimator( + lr, X, y, curve_kwargs={"alpha": 1}, label="test" + ) + elif constructor_name == "from_predictions": + RocCurveDisplay.from_predictions( + y, y, curve_kwargs={"alpha": 1}, label="test" + ) + else: + RocCurveDisplay(fpr=fpr, tpr=tpr).plot( + curve_kwargs={"alpha": 1}, label="test" + ) + + # Warning when `**kwargs`` provided + with pytest.warns(FutureWarning, match=r"`\*\*kwargs` is deprecated and will be"): + if constructor_name == "from_estimator": + RocCurveDisplay.from_estimator(lr, X, y, label="test") + elif constructor_name == "from_predictions": + RocCurveDisplay.from_predictions(y, y, label="test") + else: + RocCurveDisplay(fpr=fpr, tpr=tpr).plot(label="test") + + +@pytest.mark.parametrize( + "curve_kwargs", + [ + None, + {"color": "blue"}, + [{"color": "blue"}, {"color": "green"}, {"color": "red"}], + ], +) +@pytest.mark.parametrize("drop_intermediate", [True, False]) +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@pytest.mark.parametrize("with_strings", [True, False]) +def test_roc_curve_display_plotting_from_cv_results( + pyplot, + data_binary, + with_strings, + with_sample_weight, + response_method, + drop_intermediate, + curve_kwargs, +): + """Check overall plotting of `from_cv_results`.""" + X, y = data_binary + + pos_label = None + if with_strings: + y = np.array(["c", "b"])[y] + pos_label = "c" + + if with_sample_weight: + rng = np.random.RandomState(42) + sample_weight = rng.randint(1, 4, size=(X.shape[0])) + else: + sample_weight = None + + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + sample_weight=sample_weight, + 
drop_intermediate=drop_intermediate, + response_method=response_method, + pos_label=pos_label, + curve_kwargs=curve_kwargs, + ) + + for idx, (estimator, test_indices) in enumerate( + zip(cv_results["estimator"], cv_results["indices"]["test"]) + ): + y_true = _safe_indexing(y, test_indices) + y_pred = _get_response_values_binary( + estimator, + _safe_indexing(X, test_indices), + response_method=response_method, + pos_label=pos_label, + )[0] + sample_weight_fold = ( + None + if sample_weight is None + else _safe_indexing(sample_weight, test_indices) + ) + fpr, tpr, _ = roc_curve( + y_true, + y_pred, + sample_weight=sample_weight_fold, + drop_intermediate=drop_intermediate, + pos_label=pos_label, + ) + assert_allclose(display.roc_auc[idx], auc(fpr, tpr)) + assert_allclose(display.fpr[idx], fpr) + assert_allclose(display.tpr[idx], tpr) + + assert display.name is None + + import matplotlib as mpl + + _check_figure_axes_and_labels(display, pos_label) + if with_sample_weight: + aggregate_expected_labels = ["AUC = 0.64 +/- 0.04", "_child1", "_child2"] + else: + aggregate_expected_labels = ["AUC = 0.61 +/- 0.05", "_child1", "_child2"] + for idx, line in enumerate(display.line_): + assert isinstance(line, mpl.lines.Line2D) + # Default alpha for `from_cv_results` + assert line.get_alpha() == 0.5 + if isinstance(curve_kwargs, list): + # Each individual curve labelled + assert line.get_label() == f"AUC = {display.roc_auc[idx]:.2f}" + else: + # Single aggregate label + assert line.get_label() == aggregate_expected_labels[idx] + + +@pytest.mark.parametrize("roc_auc", [[1.0, 1.0, 1.0], None]) +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]], +) +@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]]) +def test_roc_curve_plot_legend_label(pyplot, data_binary, name, curve_kwargs, roc_auc): + """Check legend label correct with all `curve_kwargs`, `name` combinations.""" + fpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])] + tpr = [np.array([0, 0.5, 1]), np.array([0, 0.5, 1]), np.array([0, 0.5, 1])] + if not isinstance(curve_kwargs, list) and isinstance(name, list): + with pytest.raises(ValueError, match="To avoid labeling individual curves"): + RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot( + name=name, curve_kwargs=curve_kwargs + ) + + else: + display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot( + name=name, curve_kwargs=curve_kwargs + ) + legend = display.ax_.get_legend() + if legend is None: + # No legend is created, exit test early + assert name is None + assert roc_auc is None + return + else: + legend_labels = [text.get_text() for text in legend.get_texts()] + + if isinstance(curve_kwargs, list): + # Multiple labels in legend + assert len(legend_labels) == 3 + for idx, label in enumerate(legend_labels): + if name is None: + expected_label = "AUC = 1.00" if roc_auc else None + assert label == expected_label + elif isinstance(name, str): + expected_label = "single (AUC = 1.00)" if roc_auc else "single" + assert label == expected_label + else: + # `name` is a list of different strings + expected_label = ( + f"{name[idx]} (AUC = 1.00)" if roc_auc else f"{name[idx]}" + ) + assert label == expected_label + else: + # Single label in legend + assert len(legend_labels) == 1 + if name is None: + expected_label = "AUC = 1.00 +/- 0.00" if roc_auc else None + assert legend_labels[0] == expected_label + else: + # name is single string + expected_label = "single (AUC = 1.00 +/- 0.00)"
if roc_auc else "single" + assert legend_labels[0] == expected_label + + +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]], +) +@pytest.mark.parametrize("name", [None, "single", ["one", "two", "three"]]) +def test_roc_curve_from_cv_results_legend_label( + pyplot, data_binary, name, curve_kwargs +): + """Check legend label correct with all `curve_kwargs`, `name` combinations.""" + X, y = data_binary + n_cv = 3 + cv_results = cross_validate( + LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True + ) + + if not isinstance(curve_kwargs, list) and isinstance(name, list): + with pytest.raises(ValueError, match="To avoid labeling individual curves"): + RocCurveDisplay.from_cv_results( + cv_results, X, y, name=name, curve_kwargs=curve_kwargs + ) + else: + display = RocCurveDisplay.from_cv_results( + cv_results, X, y, name=name, curve_kwargs=curve_kwargs + ) + + legend = display.ax_.get_legend() + legend_labels = [text.get_text() for text in legend.get_texts()] + if isinstance(curve_kwargs, list): + # Multiple labels in legend + assert len(legend_labels) == 3 + auc = ["0.62", "0.66", "0.55"] + for idx, label in enumerate(legend_labels): + if name is None: + assert label == f"AUC = {auc[idx]}" + elif isinstance(name, str): + assert label == f"single (AUC = {auc[idx]})" + else: + # `name` is a list of different strings + assert label == f"{name[idx]} (AUC = {auc[idx]})" + else: + # Single label in legend + assert len(legend_labels) == 1 + if name is None: + assert legend_labels[0] == "AUC = 0.61 +/- 0.05" + else: + # name is single string + assert legend_labels[0] == "single (AUC = 0.61 +/- 0.05)" + + +@pytest.mark.parametrize( + "curve_kwargs", + [None, {"color": "red"}, [{"c": "red"}, {"c": "green"}, {"c": "yellow"}]], +) +def test_roc_curve_from_cv_results_curve_kwargs(pyplot, data_binary, curve_kwargs): + """Check line kwargs passed correctly in `from_cv_results`.""" + + X, y = data_binary + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + display = RocCurveDisplay.from_cv_results( + cv_results, X, y, curve_kwargs=curve_kwargs + ) + + for idx, line in enumerate(display.line_): + color = line.get_color() + if curve_kwargs is None: + # Default color + assert color == "blue" + elif isinstance(curve_kwargs, Mapping): + # All curves "red" + assert color == "red" + else: + assert color == curve_kwargs[idx]["c"] + + +def _check_chance_level(plot_chance_level, chance_level_kw, display): + """Check chance level line and line styles correct.""" + import matplotlib as mpl + + if plot_chance_level: + assert isinstance(display.chance_level_, mpl.lines.Line2D) + assert tuple(display.chance_level_.get_xdata()) == (0, 1) + assert tuple(display.chance_level_.get_ydata()) == (0, 1) + else: + assert display.chance_level_ is None + + # Checking for chance level line styles + if plot_chance_level and chance_level_kw is None: + assert display.chance_level_.get_color() == "k" + assert display.chance_level_.get_linestyle() == "--" + assert display.chance_level_.get_label() == "Chance level (AUC = 0.5)" + elif plot_chance_level: + if "c" in chance_level_kw: + assert display.chance_level_.get_color() == chance_level_kw["c"] + else: + assert display.chance_level_.get_color() == chance_level_kw["color"] + if "lw" in chance_level_kw: + assert display.chance_level_.get_linewidth() == chance_level_kw["lw"] + else: + assert display.chance_level_.get_linewidth() == 
chance_level_kw["linewidth"] + if "ls" in chance_level_kw: + assert display.chance_level_.get_linestyle() == chance_level_kw["ls"] + else: + assert display.chance_level_.get_linestyle() == chance_level_kw["linestyle"] + + +@pytest.mark.parametrize("plot_chance_level", [True, False]) +@pytest.mark.parametrize("label", [None, "Test Label"]) +@pytest.mark.parametrize( + "chance_level_kw", + [ + None, + {"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"}, + {"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"}, + {"lw": 1, "color": "blue", "ls": "-", "label": None}, + ], +) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_roc_curve_chance_level_line( + pyplot, + data_binary, + plot_chance_level, + chance_level_kw, + label, + constructor_name, +): + """Check chance level plotting behavior of `from_predictions`, `from_estimator`.""" + X, y = data_binary + + lr = LogisticRegression() + lr.fit(X, y) + + y_score = getattr(lr, "predict_proba")(X) + y_score = y_score if y_score.ndim == 1 else y_score[:, 1] + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + lr, + X, + y, + curve_kwargs={"alpha": 0.8, "label": label}, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + ) + else: + display = RocCurveDisplay.from_predictions( + y, + y_score, + curve_kwargs={"alpha": 0.8, "label": label}, + plot_chance_level=plot_chance_level, + chance_level_kw=chance_level_kw, + ) + + import matplotlib as mpl + + assert isinstance(display.line_, mpl.lines.Line2D) + assert display.line_.get_alpha() == 0.8 + assert isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + + _check_chance_level(plot_chance_level, chance_level_kw, display) + + # Checking for legend behaviour + if plot_chance_level and chance_level_kw is not None: + if label is not None or chance_level_kw.get("label") is not None: + legend = display.ax_.get_legend() + assert legend is not None # Legend should be present if any label is set + legend_labels = [text.get_text() for text in legend.get_texts()] + if label is not None: + assert label in legend_labels + if chance_level_kw.get("label") is not None: + assert chance_level_kw["label"] in legend_labels + else: + assert display.ax_.get_legend() is None + + +@pytest.mark.parametrize("plot_chance_level", [True, False]) +@pytest.mark.parametrize( + "chance_level_kw", + [ + None, + {"linewidth": 1, "color": "red", "linestyle": "-", "label": "DummyEstimator"}, + {"lw": 1, "c": "red", "ls": "-", "label": "DummyEstimator"}, + {"lw": 1, "color": "blue", "ls": "-", "label": None}, + ], +) +@pytest.mark.parametrize("curve_kwargs", [None, {"alpha": 0.8}]) +def test_roc_curve_chance_level_line_from_cv_results( + pyplot, + data_binary, + plot_chance_level, + chance_level_kw, + curve_kwargs, +): + """Check chance level plotting behavior with `from_cv_results`.""" + X, y = data_binary + n_cv = 3 + cv_results = cross_validate( + LogisticRegression(), X, y, cv=n_cv, return_estimator=True, return_indices=True + ) + + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + plot_chance_level=plot_chance_level, + chance_level_kwargs=chance_level_kw, + curve_kwargs=curve_kwargs, + ) + + import matplotlib as mpl + + assert all(isinstance(line, mpl.lines.Line2D) for line in display.line_) + # Ensure both curve line kwargs passed correctly as well + if curve_kwargs: + assert all(line.get_alpha() == 0.8 for line in display.line_) + assert 
isinstance(display.ax_, mpl.axes.Axes) + assert isinstance(display.figure_, mpl.figure.Figure) + + _check_chance_level(plot_chance_level, chance_level_kw, display) + + legend = display.ax_.get_legend() + # There is always a legend, to indicate each 'Fold' curve + assert legend is not None + legend_labels = [text.get_text() for text in legend.get_texts()] + if plot_chance_level and chance_level_kw is not None: + if chance_level_kw.get("label") is not None: + assert chance_level_kw["label"] in legend_labels + else: + assert len(legend_labels) == 1 + + +@pytest.mark.parametrize( + "clf", + [ + LogisticRegression(), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline( + make_column_transformer((StandardScaler(), [0, 1])), LogisticRegression() + ), + ], +) +@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"]) +def test_roc_curve_display_complex_pipeline(pyplot, data_binary, clf, constructor_name): + """Check the behaviour with complex pipeline.""" + X, y = data_binary + + clf = clone(clf) + + if constructor_name == "from_estimator": + with pytest.raises(NotFittedError): + RocCurveDisplay.from_estimator(clf, X, y) + + clf.fit(X, y) + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator(clf, X, y) + name = clf.__class__.__name__ + else: + display = RocCurveDisplay.from_predictions(y, y) + name = "Classifier" + + assert name in display.line_.get_label() + assert display.name == name + + +@pytest.mark.parametrize( + "roc_auc, name, curve_kwargs, expected_labels", + [ + ([0.9, 0.8], None, None, ["AUC = 0.85 +/- 0.05", "_child1"]), + ([0.9, 0.8], "Est name", None, ["Est name (AUC = 0.85 +/- 0.05)", "_child1"]), + ( + [0.8, 0.7], + ["fold1", "fold2"], + [{"c": "blue"}, {"c": "red"}], + ["fold1 (AUC = 0.80)", "fold2 (AUC = 0.70)"], + ), + (None, ["fold1", "fold2"], [{"c": "blue"}, {"c": "red"}], ["fold1", "fold2"]), + ], +) +def test_roc_curve_display_default_labels( + pyplot, roc_auc, name, curve_kwargs, expected_labels +): + """Check the default labels used in the display.""" + fpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])] + tpr = [np.array([0, 0.5, 1]), np.array([0, 0.3, 1])] + disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, name=name).plot( + curve_kwargs=curve_kwargs + ) + for idx, expected_label in enumerate(expected_labels): + assert disp.line_[idx].get_label() == expected_label + + +def _check_auc(display, constructor_name): + roc_auc_limit = 0.95679 + roc_auc_limit_multi = [0.97007, 0.985915, 0.980952] + + if constructor_name == "from_cv_results": + for idx, roc_auc in enumerate(display.roc_auc): + assert roc_auc == pytest.approx(roc_auc_limit_multi[idx]) + else: + assert display.roc_auc == pytest.approx(roc_auc_limit) + assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit) + + +@pytest.mark.parametrize("response_method", ["predict_proba", "decision_function"]) +@pytest.mark.parametrize( + "constructor_name", ["from_estimator", "from_predictions", "from_cv_results"] +) +def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name): + # check that we can provide the positive label and display the proper + # statistics + X, y = load_breast_cancer(return_X_y=True) + # create an highly imbalanced + idx_positive = np.flatnonzero(y == 1) + idx_negative = np.flatnonzero(y == 0) + idx_selected = np.hstack([idx_negative, idx_positive[:25]]) + X, y = X[idx_selected], y[idx_selected] + X, y = shuffle(X, y, random_state=42) + # only use 2 features to make the problem 
even harder + X = X[:, :2] + y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + stratify=y, + random_state=0, + ) + + classifier = LogisticRegression() + classifier.fit(X_train, y_train) + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + + # Sanity check to be sure the positive class is `classes_[0]` + # Class imbalance ensures a large difference in prediction values between classes, + # allowing us to catch errors when we switch `pos_label` + assert classifier.classes_.tolist() == ["cancer", "not cancer"] + + y_score = getattr(classifier, response_method)(X_test) + # we select the corresponding probability columns or reverse the decision + # function otherwise + y_score_cancer = -1 * y_score if y_score.ndim == 1 else y_score[:, 0] + y_score_not_cancer = y_score if y_score.ndim == 1 else y_score[:, 1] + + pos_label = "cancer" + y_score = y_score_cancer + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + classifier, + X_test, + y_test, + pos_label=pos_label, + response_method=response_method, + ) + elif constructor_name == "from_predictions": + display = RocCurveDisplay.from_predictions( + y_test, + y_score, + pos_label=pos_label, + ) + else: + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + response_method=response_method, + pos_label=pos_label, + ) + + _check_auc(display, constructor_name) + + pos_label = "not cancer" + y_score = y_score_not_cancer + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator( + classifier, + X_test, + y_test, + response_method=response_method, + pos_label=pos_label, + ) + elif constructor_name == "from_predictions": + display = RocCurveDisplay.from_predictions( + y_test, + y_score, + pos_label=pos_label, + ) + else: + display = RocCurveDisplay.from_cv_results( + cv_results, + X, + y, + response_method=response_method, + pos_label=pos_label, + ) + + _check_auc(display, constructor_name) + + +# TODO(1.9): remove +def test_y_score_and_y_pred_specified_error(): + """Check that an error is raised when both y_score and y_pred are specified.""" + y_true = np.array([0, 1, 1, 0]) + y_score = np.array([0.1, 0.4, 0.35, 0.8]) + y_pred = np.array([0.2, 0.3, 0.5, 0.1]) + + with pytest.raises( + ValueError, match="`y_pred` and `y_score` cannot be both specified" + ): + RocCurveDisplay.from_predictions(y_true, y_score=y_score, y_pred=y_pred) + + +# TODO(1.9): remove +def test_y_pred_deprecation_warning(pyplot): + """Check that a warning is raised when y_pred is specified.""" + y_true = np.array([0, 1, 1, 0]) + y_score = np.array([0.1, 0.4, 0.35, 0.8]) + + with pytest.warns(FutureWarning, match="y_pred is deprecated in 1.7"): + display_y_pred = RocCurveDisplay.from_predictions(y_true, y_pred=y_score) + + assert_allclose(display_y_pred.fpr, [0, 0.5, 0.5, 1]) + assert_allclose(display_y_pred.tpr, [0, 0, 1, 1]) + + display_y_score = RocCurveDisplay.from_predictions(y_true, y_score) + assert_allclose(display_y_score.fpr, [0, 0.5, 0.5, 1]) + assert_allclose(display_y_score.tpr, [0, 0, 1, 1]) + + +@pytest.mark.parametrize("despine", [True, False]) +@pytest.mark.parametrize( + "constructor_name", ["from_estimator", "from_predictions", "from_cv_results"] +) +def test_plot_roc_curve_despine(pyplot, data_binary, despine, constructor_name): + # Check that the despine keyword is working correctly + X, y = data_binary + + lr = 
LogisticRegression().fit(X, y) + lr.fit(X, y) + cv_results = cross_validate( + LogisticRegression(), X, y, cv=3, return_estimator=True, return_indices=True + ) + + y_pred = lr.decision_function(X) + + # safe guard for the if/else construction + assert constructor_name in ("from_estimator", "from_predictions", "from_cv_results") + + if constructor_name == "from_estimator": + display = RocCurveDisplay.from_estimator(lr, X, y, despine=despine) + elif constructor_name == "from_predictions": + display = RocCurveDisplay.from_predictions(y, y_pred, despine=despine) + else: + display = RocCurveDisplay.from_cv_results(cv_results, X, y, despine=despine) + + for s in ["top", "right"]: + assert display.ax_.spines[s].get_visible() is not despine + + if despine: + for s in ["bottom", "left"]: + assert display.ax_.spines[s].get_bounds() == (0, 1) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_ranking.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_ranking.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0e5211c236c703676923a65bfe5df75affef96 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_ranking.py @@ -0,0 +1,2077 @@ +"""Metrics to assess performance on classification task given scores. + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better. + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from functools import partial +from numbers import Integral, Real + +import numpy as np +from scipy.integrate import trapezoid +from scipy.sparse import csr_matrix, issparse +from scipy.stats import rankdata + +from ..exceptions import UndefinedMetricWarning +from ..preprocessing import label_binarize +from ..utils import ( + assert_all_finite, + check_array, + check_consistent_length, + column_or_1d, +) +from ..utils._encode import _encode, _unique +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import stable_cumsum +from ..utils.multiclass import type_of_target +from ..utils.sparsefuncs import count_nonzero +from ..utils.validation import _check_pos_label_consistency, _check_sample_weight +from ._base import _average_binary_score, _average_multiclass_ovo_score + + +@validate_params( + {"x": ["array-like"], "y": ["array-like"]}, + prefer_skip_nested_validation=True, +) +def auc(x, y): + """Compute Area Under the Curve (AUC) using the trapezoidal rule. + + This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. For an alternative + way to summarize a precision-recall curve, see + :func:`average_precision_score`. + + Parameters + ---------- + x : array-like of shape (n,) + X coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : array-like of shape (n,) + Y coordinates. + + Returns + ------- + auc : float + Area Under the Curve. + + See Also + -------- + roc_auc_score : Compute the area under the ROC curve. + average_precision_score : Compute average precision from prediction scores. + precision_recall_curve : Compute precision-recall pairs for different + probability thresholds. 
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y_true = np.array([1, 1, 2, 2]) + >>> y_score = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score, pos_label=2) + >>> metrics.auc(fpr, tpr) + 0.75 + """ + check_consistent_length(x, y) + x = column_or_1d(x) + y = column_or_1d(y) + + if x.shape[0] < 2: + raise ValueError( + "At least 2 points are needed to compute area under curve, but x.shape = %s" + % x.shape + ) + + direction = 1 + dx = np.diff(x) + if np.any(dx < 0): + if np.all(dx <= 0): + direction = -1 + else: + raise ValueError("x is neither increasing nor decreasing : {}.".format(x)) + + area = direction * trapezoid(y, x) + if isinstance(area, np.memmap): + # Reductions such as .sum used internally in trapezoid do not return a + # scalar by default for numpy.memmap instances contrary to + # regular numpy.ndarray instances. + area = area.dtype.type(area) + return float(area) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "average": [StrOptions({"micro", "samples", "weighted", "macro"}), None], + "pos_label": [Real, str, "boolean"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def average_precision_score( + y_true, y_score, *, average="macro", pos_label=1, sample_weight=None +): + """Compute average precision (AP) from prediction scores. + + AP summarizes a precision-recall curve as the weighted mean of precisions + achieved at each threshold, with the increase in recall from the previous + threshold used as the weight: + + .. math:: + \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n + + where :math:`P_n` and :math:`R_n` are the precision and recall at the nth + threshold [1]_. This implementation is not interpolated and is different + from computing the area under the precision-recall curve with the + trapezoidal rule, which uses linear interpolation and can be too + optimistic. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True binary labels or binary label indicators. + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by :term:`decision_function` on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + average : {'micro', 'samples', 'weighted', 'macro'} or None, \ + default='macro' + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + pos_label : int, float, bool or str, default=1 + The label of the positive class. Only applied to binary ``y_true``. + For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1. 
+ + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + average_precision : float + Average precision score. + + See Also + -------- + roc_auc_score : Compute the area under the ROC curve. + precision_recall_curve : Compute precision-recall pairs for different + probability thresholds. + PrecisionRecallDisplay.from_estimator : Plot the precision recall curve + using an estimator and data. + PrecisionRecallDisplay.from_predictions : Plot the precision recall curve + using true and predicted labels. + + Notes + ----- + .. versionchanged:: 0.19 + Instead of linearly interpolating between operating points, precisions + are weighted by the change in recall since the last operating point. + + References + ---------- + .. [1] `Wikipedia entry for the Average precision + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import average_precision_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> average_precision_score(y_true, y_scores) + 0.83 + >>> y_true = np.array([0, 0, 1, 1, 2, 2]) + >>> y_scores = np.array([ + ... [0.7, 0.2, 0.1], + ... [0.4, 0.3, 0.3], + ... [0.1, 0.8, 0.1], + ... [0.2, 0.3, 0.5], + ... [0.4, 0.4, 0.2], + ... [0.1, 0.2, 0.7], + ... ]) + >>> average_precision_score(y_true, y_scores) + 0.77 + """ + + def _binary_uninterpolated_average_precision( + y_true, y_score, pos_label=1, sample_weight=None + ): + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + # Return the step function integral + # The following works because the last entry of precision is + # guaranteed to be 1, as returned by precision_recall_curve. + # Due to numerical error, we can get `-0.0` and we therefore clip it. + return float(max(0.0, -np.sum(np.diff(recall) * np.array(precision)[:-1]))) + + y_type = type_of_target(y_true, input_name="y_true") + + # Convert to Python primitive type to avoid NumPy type / Python str + # comparison. See https://github.com/numpy/numpy/issues/6784 + present_labels = np.unique(y_true).tolist() + + if y_type == "binary": + if len(present_labels) == 2 and pos_label not in present_labels: + raise ValueError( + f"pos_label={pos_label} is not a valid label. It should be " + f"one of {present_labels}" + ) + + elif y_type == "multilabel-indicator" and pos_label != 1: + raise ValueError( + "Parameter pos_label is fixed to 1 for multilabel-indicator y_true. " + "Do not set pos_label or set pos_label to 1." + ) + + elif y_type == "multiclass": + if pos_label != 1: + raise ValueError( + "Parameter pos_label is fixed to 1 for multiclass y_true. " + "Do not set pos_label or set pos_label to 1." + ) + y_true = label_binarize(y_true, classes=present_labels) + + average_precision = partial( + _binary_uninterpolated_average_precision, pos_label=pos_label + ) + return _average_binary_score( + average_precision, y_true, y_score, average, sample_weight=sample_weight + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "sample_weight": ["array-like", None], + "drop_intermediate": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def det_curve( + y_true, y_score, pos_label=None, sample_weight=None, drop_intermediate=False +): + """Compute Detection Error Tradeoff (DET) for different probability thresholds. + + .. 
note:: + This metric is used for evaluation of ranking and error tradeoffs of + a binary classification task. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + .. versionchanged:: 1.7 + An arbitrary threshold at infinity is added to represent a classifier + that always predicts the negative class, i.e. `fpr=0` and `fnr=1`, unless + `fpr=0` is already reached at a finite threshold. + + Parameters + ---------- + y_true : ndarray of shape (n_samples,) + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : ndarray of shape of (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop thresholds where true positives (tp) do not change from + the previous or subsequent threshold. All points with the same tp value + have the same `fnr` and thus same y coordinate. + + .. versionadded:: 1.7 + + Returns + ------- + fpr : ndarray of shape (n_thresholds,) + False positive rate (FPR) such that element i is the false positive + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false acceptance probability or fall-out. + + fnr : ndarray of shape (n_thresholds,) + False negative rate (FNR) such that element i is the false negative + rate of predictions with score >= thresholds[i]. This is occasionally + referred to as false rejection or miss rate. + + thresholds : ndarray of shape (n_thresholds,) + Decreasing thresholds on the decision function (either `predict_proba` + or `decision_function`) used to compute FPR and FNR. + + .. versionchanged:: 1.7 + An arbitrary threshold at infinity is added for the case `fpr=0` + and `fnr=1`. + + See Also + -------- + DetCurveDisplay.from_estimator : Plot DET curve given an estimator and + some data. + DetCurveDisplay.from_predictions : Plot DET curve given the true and + predicted labels. + DetCurveDisplay : DET curve visualization. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + precision_recall_curve : Compute precision-recall curve. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import det_curve + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, fnr, thresholds = det_curve(y_true, y_scores) + >>> fpr + array([0.5, 0.5, 0. ]) + >>> fnr + array([0. , 0.5, 0.5]) + >>> thresholds + array([0.35, 0.4 , 0.8 ]) + """ + fps, tps, thresholds = _binary_clf_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + + # add a threshold at inf where the clf always predicts the negative class + # i.e. tps = fps = 0 + tps = np.concatenate(([0], tps)) + fps = np.concatenate(([0], fps)) + thresholds = np.concatenate(([np.inf], thresholds)) + + if drop_intermediate and len(fps) > 2: + # Drop thresholds where true positives (tp) do not change from the + # previous or subsequent threshold. 
As tp + fn, is fixed for a dataset, + # this means the false negative rate (fnr) remains constant while the + # false positive rate (fpr) changes, producing horizontal line segments + # in the transformed (normal deviate) scale. These intermediate points + # can be dropped to create lighter DET curve plots. + optimal_idxs = np.where( + np.concatenate( + [[True], np.logical_or(np.diff(tps[:-1]), np.diff(tps[1:])), [True]] + ) + )[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + thresholds = thresholds[optimal_idxs] + + if len(np.unique(y_true)) != 2: + raise ValueError( + "Only one class is present in y_true. Detection error " + "tradeoff curve is not defined in that case." + ) + + fns = tps[-1] - tps + p_count = tps[-1] + n_count = fps[-1] + + # start with false positives zero, which may be at a finite threshold + first_ind = ( + fps.searchsorted(fps[0], side="right") - 1 + if fps.searchsorted(fps[0], side="right") > 0 + else None + ) + # stop with false negatives zero + last_ind = tps.searchsorted(tps[-1]) + 1 + sl = slice(first_ind, last_ind) + + # reverse the output such that list of false positives is decreasing + return (fps[sl][::-1] / n_count, fns[sl][::-1] / p_count, thresholds[sl][::-1]) + + +def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): + """Binary roc auc score.""" + if len(np.unique(y_true)) != 2: + warnings.warn( + ( + "Only one class is present in y_true. ROC AUC score " + "is not defined in that case." + ), + UndefinedMetricWarning, + ) + return np.nan + + fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) + if max_fpr is None or max_fpr == 1: + return auc(fpr, tpr) + if max_fpr <= 0 or max_fpr > 1: + raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr) + + # Add a single point at max_fpr by linear interpolation + stop = np.searchsorted(fpr, max_fpr, "right") + x_interp = [fpr[stop - 1], fpr[stop]] + y_interp = [tpr[stop - 1], tpr[stop]] + tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp)) + fpr = np.append(fpr[:stop], max_fpr) + partial_auc = auc(fpr, tpr) + + # McClish correction: standardize result to be 0.5 if non-discriminant + # and 1 if maximal + min_area = 0.5 * max_fpr**2 + max_area = max_fpr + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "average": [StrOptions({"micro", "macro", "samples", "weighted"}), None], + "sample_weight": ["array-like", None], + "max_fpr": [Interval(Real, 0.0, 1, closed="right"), None], + "multi_class": [StrOptions({"raise", "ovr", "ovo"})], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def roc_auc_score( + y_true, + y_score, + *, + average="macro", + sample_weight=None, + max_fpr=None, + multi_class="raise", + labels=None, +): + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) \ + from prediction scores. + + Note: this implementation can be used with binary, multiclass and + multilabel classification, but some restrictions apply (see Parameters). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_classes) + True labels or binary label indicators. The binary and multiclass cases + expect labels with shape (n_samples,) while the multilabel case expects + binary label indicators with shape (n_samples, n_classes). + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. 
+ + * In the binary case, it corresponds to an array of shape + `(n_samples,)`. Both probability estimates and non-thresholded + decision values can be provided. The probability estimates correspond + to the **probability of the class with the greater label**, + i.e. `estimator.classes_[1]` and thus + `estimator.predict_proba(X, y)[:, 1]`. The decision values + corresponds to the output of `estimator.decision_function(X, y)`. + See more information in the :ref:`User guide `; + * In the multiclass case, it corresponds to an array of shape + `(n_samples, n_classes)` of probability estimates provided by the + `predict_proba` method. The probability estimates **must** + sum to 1 across the possible classes. In addition, the order of the + class scores must correspond to the order of ``labels``, + if provided, or else to the numerical or lexicographical order of + the labels in ``y_true``. See more information in the + :ref:`User guide `; + * In the multilabel case, it corresponds to an array of shape + `(n_samples, n_classes)`. Probability estimates are provided by the + `predict_proba` method and the non-thresholded decision values by + the `decision_function` method. The probability estimates correspond + to the **probability of the class with the greater label for each + output** of the classifier. See more information in the + :ref:`User guide `. + + average : {'micro', 'macro', 'samples', 'weighted'} or None, \ + default='macro' + If ``None``, the scores for each class are returned. + Otherwise, this determines the type of averaging performed on the data. + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. For multiclass targets, `average=None` is only + implemented for `multi_class='ovr'` and `average='micro'` is only + implemented for `multi_class='ovr'`. + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + max_fpr : float > 0 and <= 1, default=None + If not ``None``, the standardized partial AUC [2]_ over the range + [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, + should be either equal to ``None`` or ``1.0`` as AUC ROC partial + computation currently is not supported for multiclass. + + multi_class : {'raise', 'ovr', 'ovo'}, default='raise' + Only used for multiclass targets. Determines the type of configuration + to use. The default value raises an error, so either + ``'ovr'`` or ``'ovo'`` must be passed explicitly. + + ``'ovr'``: + Stands for One-vs-rest. Computes the AUC of each class + against the rest [3]_ [4]_. This + treats the multiclass case in the same way as the multilabel case. + Sensitive to class imbalance even when ``average == 'macro'``, + because class imbalance affects the composition of each of the + 'rest' groupings. + ``'ovo'``: + Stands for One-vs-one. Computes the average AUC of all + possible pairwise combinations of classes [5]_. + Insensitive to class imbalance when + ``average == 'macro'``. 
+ + labels : array-like of shape (n_classes,), default=None + Only used for multiclass targets. List of labels that index the + classes in ``y_score``. If ``None``, the numerical or lexicographical + order of the labels in ``y_true`` is used. + + Returns + ------- + auc : float + Area Under the Curve score. + + See Also + -------- + average_precision_score : Area under the precision-recall curve. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. + + Notes + ----- + The Gini Coefficient is a summary measure of the ranking ability of binary + classifiers. It is expressed using the area under of the ROC as follows: + + G = 2 * AUC - 1 + + Where G is the Gini coefficient and AUC is the ROC-AUC score. This normalisation + will ensure that random guessing will yield a score of 0 in expectation, and it is + upper bounded by 1. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] `Analyzing a portion of the ROC curve. McClish, 1989 + `_ + + .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving + probability estimation trees (Section 6.2), CeDER Working Paper + #IS-00-04, Stern School of Business, New York University. + + .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern + Recognition Letters, 27(8), 861-874. + `_ + + .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area + Under the ROC Curve for Multiple Class Classification Problems. + Machine Learning, 45(2), 171-186. + `_ + .. [6] `Wikipedia entry for the Gini coefficient + `_ + + Examples + -------- + Binary case: + + >>> from sklearn.datasets import load_breast_cancer + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.metrics import roc_auc_score + >>> X, y = load_breast_cancer(return_X_y=True) + >>> clf = LogisticRegression(solver="newton-cholesky", random_state=0).fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X)[:, 1]) + 0.99 + >>> roc_auc_score(y, clf.decision_function(X)) + 0.99 + + Multiclass case: + + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> clf = LogisticRegression(solver="newton-cholesky").fit(X, y) + >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr') + 0.99 + + Multilabel case: + + >>> import numpy as np + >>> from sklearn.datasets import make_multilabel_classification + >>> from sklearn.multioutput import MultiOutputClassifier + >>> X, y = make_multilabel_classification(random_state=0) + >>> clf = MultiOutputClassifier(clf).fit(X, y) + >>> # get a list of n_output containing probability arrays of shape + >>> # (n_samples, n_classes) + >>> y_score = clf.predict_proba(X) + >>> # extract the positive columns for each output + >>> y_score = np.transpose([score[:, 1] for score in y_score]) + >>> roc_auc_score(y, y_score, average=None) + array([0.828, 0.852, 0.94, 0.869, 0.95]) + >>> from sklearn.linear_model import RidgeClassifierCV + >>> clf = RidgeClassifierCV().fit(X, y) + >>> roc_auc_score(y, clf.decision_function(X), average=None) + array([0.82, 0.847, 0.93, 0.872, 0.944]) + """ + + y_type = type_of_target(y_true, input_name="y_true") + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_score = check_array(y_score, ensure_2d=False) + + if y_type == 
"multiclass" or ( + y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 + ): + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.0: + raise ValueError( + "Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr) + ) + if multi_class == "raise": + raise ValueError("multi_class must be in ('ovo', 'ovr')") + return _multiclass_roc_auc_score( + y_true, y_score, labels, multi_class, average, sample_weight + ) + elif y_type == "binary": + labels = np.unique(y_true) + y_true = label_binarize(y_true, classes=labels)[:, 0] + return _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + else: # multilabel-indicator + return _average_binary_score( + partial(_binary_roc_auc_score, max_fpr=max_fpr), + y_true, + y_score, + average, + sample_weight=sample_weight, + ) + + +def _multiclass_roc_auc_score( + y_true, y_score, labels, multi_class, average, sample_weight +): + """Multiclass roc auc score. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True multiclass labels. + + y_score : array-like of shape (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + labels : array-like of shape (n_classes,) or None + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexical order of ``y_true`` is used to index ``y_score``. + + multi_class : {'ovr', 'ovo'} + Determines the type of multiclass configuration to use. + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : {'micro', 'macro', 'weighted'} + Determines the type of averaging performed on the pairwise binary + metric scores + ``'micro'``: + Calculate metrics for the binarized-raveled classes. Only supported + for `multi_class='ovr'`. + + .. versionadded:: 1.2 + + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. + + """ + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. they should sum up to 1.0 over classes" + ) + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted", None) + if multi_class == "ovr": + average_options = ("micro",) + average_options + if average not in average_options: + raise ValueError( + "average must be one of {0} for multiclass problems".format(average_options) + ) + + multiclass_options = ("ovo", "ovr") + if multi_class not in multiclass_options: + raise ValueError( + "multi_class='{0}' is not supported " + "for multiclass ROC AUC, multi_class must be " + "in {1}".format(multi_class, multiclass_options) + ) + + if average is None and multi_class == "ovo": + raise NotImplementedError( + "average=None is not implemented for multi_class='ovo'." 
+ ) + + if labels is not None: + labels = column_or_1d(labels) + classes = _unique(labels) + if len(classes) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered") + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format(len(classes), y_score.shape[1]) + ) + if len(np.setdiff1d(y_true, classes)): + raise ValueError("'y_true' contains labels not in parameter 'labels'") + else: + classes = _unique(y_true) + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of classes in y_true not equal to the number of " + "columns in 'y_score'" + ) + + if multi_class == "ovo": + if sample_weight is not None: + raise ValueError( + "sample_weight is not supported " + "for multiclass one-vs-one ROC AUC, " + "'sample_weight' must be None in this case." + ) + y_true_encoded = _encode(y_true, uniques=classes) + # Hand & Till (2001) implementation (ovo) + return _average_multiclass_ovo_score( + _binary_roc_auc_score, y_true_encoded, y_score, average=average + ) + else: + # ovr is same as multi-label + y_true_multilabel = label_binarize(y_true, classes=classes) + return _average_binary_score( + _binary_roc_auc_score, + y_true_multilabel, + y_score, + average, + sample_weight=sample_weight, + ) + + +def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : ndarray of shape (n_samples,) + True targets of binary classification. + + y_score : ndarray of shape (n_samples,) + Estimated probabilities or output of a decision function. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + fps : ndarray of shape (n_thresholds,) + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : ndarray of shape (n_thresholds,) + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : ndarray of shape (n_thresholds,) + Decreasing score values. 
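+
+    As a small hand-worked illustration: for ``y_true = [0, 0, 1, 1]`` and
+    ``y_score = [0.1, 0.4, 0.35, 0.8]`` (with unit weights), the distinct
+    scores in decreasing order give ``thresholds = [0.8, 0.4, 0.35, 0.1]``,
+    ``tps = [1, 1, 2, 2]`` and ``fps = [0, 1, 1, 2]``.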
+ """ + # Check to make sure y_true is valid + y_type = type_of_target(y_true, input_name="y_true") + if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)): + raise ValueError("{0} format is not supported".format(y_type)) + + check_consistent_length(y_true, y_score, sample_weight) + y_true = column_or_1d(y_true) + y_score = column_or_1d(y_score) + assert_all_finite(y_true) + assert_all_finite(y_score) + + # Filter out zero-weighted samples, as they should not impact the result + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + sample_weight = _check_sample_weight(sample_weight, y_true) + nonzero_weight_mask = sample_weight != 0 + y_true = y_true[nonzero_weight_mask] + y_score = y_score[nonzero_weight_mask] + sample_weight = sample_weight[nonzero_weight_mask] + + pos_label = _check_pos_label_consistency(pos_label, y_true) + + # make y_true a boolean vector + y_true = y_true == pos_label + + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + if sample_weight is not None: + weight = sample_weight[desc_score_indices] + else: + weight = 1.0 + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = np.where(np.diff(y_score))[0] + threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + tps = stable_cumsum(y_true * weight)[threshold_idxs] + if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presence of floating point errors + fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] + else: + fps = 1 + threshold_idxs - tps + return fps, tps, y_score[threshold_idxs] + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "sample_weight": ["array-like", None], + "drop_intermediate": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def precision_recall_curve( + y_true, + y_score, + *, + pos_label=None, + sample_weight=None, + drop_intermediate=False, +): + """Compute precision-recall pairs for different probability thresholds. + + Note: this implementation is restricted to the binary classification task. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The last precision and recall values are 1. and 0. respectively and do not + have a corresponding threshold. This ensures that the graph starts on the + y axis. + + The first precision and recall values are precision=class balance and recall=1.0 + which corresponds to a classifier that always predicts the positive class. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. 
+ + y_score : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, or non-thresholded measure of decisions (as returned by + `decision_function` on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=False + Whether to drop some suboptimal thresholds which would not appear + on a plotted precision-recall curve. This is useful in order to create + lighter precision-recall curves. + + .. versionadded:: 1.3 + + Returns + ------- + precision : ndarray of shape (n_thresholds + 1,) + Precision values such that element i is the precision of + predictions with score >= thresholds[i] and the last element is 1. + + recall : ndarray of shape (n_thresholds + 1,) + Decreasing recall values such that element i is the recall of + predictions with score >= thresholds[i] and the last element is 0. + + thresholds : ndarray of shape (n_thresholds,) + Increasing thresholds on the decision function used to compute + precision and recall where `n_thresholds = len(np.unique(y_score))`. + + See Also + -------- + PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given + a binary classifier. + PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve + using predictions from a binary classifier. + average_precision_score : Compute average precision from prediction scores. + det_curve: Compute error rates for different probability thresholds. + roc_curve : Compute Receiver operating characteristic (ROC) curve. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_recall_curve + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> precision, recall, thresholds = precision_recall_curve( + ... y_true, y_scores) + >>> precision + array([0.5 , 0.66666667, 0.5 , 1. , 1. ]) + >>> recall + array([1. , 1. , 0.5, 0.5, 0. ]) + >>> thresholds + array([0.1 , 0.35, 0.4 , 0.8 ]) + """ + fps, tps, thresholds = _binary_clf_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + + if drop_intermediate and len(fps) > 2: + # Drop thresholds corresponding to points where true positives (tps) + # do not change from the previous or subsequent point. This will keep + # only the first and last point for each tps value. All points + # with the same tps value have the same recall and thus x coordinate. + # They appear as a vertical line on the plot. + optimal_idxs = np.where( + np.concatenate( + [[True], np.logical_or(np.diff(tps[:-1]), np.diff(tps[1:])), [True]] + ) + )[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + thresholds = thresholds[optimal_idxs] + + ps = tps + fps + # Initialize the result array with zeros to make sure that precision[ps == 0] + # does not contain uninitialized values. + precision = np.zeros_like(tps) + np.divide(tps, ps, out=precision, where=(ps != 0)) + + # When no positive label in y_true, recall is set to 1 for all thresholds + # tps[-1] == 0 <=> y_true == all negative labels + if tps[-1] == 0: + warnings.warn( + "No positive class found in y_true, " + "recall is set to one for all thresholds." 
+ ) + recall = np.ones_like(tps) + else: + recall = tps / tps[-1] + + # reverse the outputs so recall is decreasing + sl = slice(None, None, -1) + return np.hstack((precision[sl], 1)), np.hstack((recall[sl], 0)), thresholds[sl] + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "sample_weight": ["array-like", None], + "drop_intermediate": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def roc_curve( + y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True +): + """Compute Receiver operating characteristic (ROC). + + Note: this implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : array-like of shape (n_samples,) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + pos_label : int, float, bool or str, default=None + The label of the positive class. + When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + drop_intermediate : bool, default=True + Whether to drop thresholds where the resulting point is collinear with + its neighbors in ROC space. This has no effect on the ROC AUC or visual + shape of the curve, but reduces the number of plotted points. + + .. versionadded:: 0.17 + parameter *drop_intermediate*. + + Returns + ------- + fpr : ndarray of shape (>2,) + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= `thresholds[i]`. + + tpr : ndarray of shape (>2,) + Increasing true positive rates such that element `i` is the true + positive rate of predictions with score >= `thresholds[i]`. + + thresholds : ndarray of shape (n_thresholds,) + Decreasing thresholds on the decision function used to compute + fpr and tpr. The first threshold is set to `np.inf`. + + .. versionchanged:: 1.3 + An arbitrary threshold at infinity (stored in `thresholds[0]`) is + added to represent a classifier that always predicts the negative + class, i.e. `fpr=0` and `tpr=0`. + + See Also + -------- + RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic + (ROC) curve given an estimator and some data. + RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic + (ROC) curve given the true and predicted values. + det_curve: Compute error rates for different probability thresholds. + roc_auc_score : Compute the area under the ROC curve. + + Notes + ----- + Since the thresholds are sorted from low to high values, they + are reversed upon returning them to ensure they correspond to both ``fpr`` + and ``tpr``, which are sorted in reversed order during their calculation. + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. 
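+
+    As a short hand-worked reading of the example below: the leading ``inf``
+    threshold corresponds to predicting no sample as positive (``fpr=0``,
+    ``tpr=0``); at threshold ``0.8`` only the top-scored sample (a true
+    positive) is predicted positive, giving ``tpr=0.5`` and ``fpr=0``;
+    lowering the threshold further adds one false positive and one true
+    positive at a time until ``fpr=1`` and ``tpr=1``.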
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y = np.array([1, 1, 2, 2]) + >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) + >>> fpr + array([0. , 0. , 0.5, 0.5, 1. ]) + >>> tpr + array([0. , 0.5, 0.5, 1. , 1. ]) + >>> thresholds + array([ inf, 0.8 , 0.4 , 0.35, 0.1 ]) + """ + fps, tps, thresholds = _binary_clf_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight + ) + + # Attempt to drop thresholds corresponding to points in between and + # collinear with other points. These are always suboptimal and do not + # appear on a plotted ROC curve (and thus do not affect the AUC). + # Here np.diff(_, 2) is used as a "second derivative" to tell if there + # is a corner at the point. Both fps and tps must be tested to handle + # thresholds with multiple data points (which are combined in + # _binary_clf_curve). This keeps all cases where the point should be kept, + # but does not drop more complicated cases like fps = [1, 3, 7], + # tps = [1, 2, 4]; there is no harm in keeping too many thresholds. + if drop_intermediate and len(fps) > 2: + optimal_idxs = np.where( + np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True] + )[0] + fps = fps[optimal_idxs] + tps = tps[optimal_idxs] + thresholds = thresholds[optimal_idxs] + + # Add an extra threshold position + # to make sure that the curve starts at (0, 0) + tps = np.r_[0, tps] + fps = np.r_[0, fps] + # get dtype of `y_score` even if it is an array-like + thresholds = np.r_[np.inf, thresholds] + + if fps[-1] <= 0: + warnings.warn( + "No negative samples in y_true, false positive value should be meaningless", + UndefinedMetricWarning, + ) + fpr = np.repeat(np.nan, fps.shape) + else: + fpr = fps / fps[-1] + + if tps[-1] <= 0: + warnings.warn( + "No positive samples in y_true, true positive value should be meaningless", + UndefinedMetricWarning, + ) + tpr = np.repeat(np.nan, tps.shape) + else: + tpr = tps / tps[-1] + + return fpr, tpr, thresholds + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_score": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None): + """Compute ranking-based average precision. + + Label ranking average precision (LRAP) is the average over each ground + truth label assigned to each sample, of the ratio of true vs. total + labels with lower score. + + This metric is used in multilabel ranking problem, where the goal + is to give better rank to the labels associated to each sample. + + The obtained score is always strictly greater than 0 and + the best value is 1. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_labels) + True binary labels in binary indicator format. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.20 + + Returns + ------- + score : float + Ranking-based average precision score. 
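+
+    As a small hand-worked check of the example below: for the first sample
+    the only relevant label (score 0.75) is out-ranked by one other label
+    (score 1), so its precision is 1/2; for the second sample the relevant
+    label (score 0.1) is ranked last of the three labels, giving 1/3. The
+    score is the average (1/2 + 1/3) / 2 = 5/12, i.e. roughly 0.416.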
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import label_ranking_average_precision_score + >>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) + >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) + >>> label_ranking_average_precision_score(y_true, y_score) + 0.416 + """ + check_consistent_length(y_true, y_score, sample_weight) + y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr") + y_score = check_array(y_score, ensure_2d=False) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + # Handle badly formatted array and the degenerate case with one label + y_type = type_of_target(y_true, input_name="y_true") + if y_type != "multilabel-indicator" and not ( + y_type == "binary" and y_true.ndim == 2 + ): + raise ValueError("{0} format is not supported".format(y_type)) + + if not issparse(y_true): + y_true = csr_matrix(y_true) + + y_score = -y_score + + n_samples, n_labels = y_true.shape + + out = 0.0 + for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): + relevant = y_true.indices[start:stop] + + if relevant.size == 0 or relevant.size == n_labels: + # If all labels are relevant or unrelevant, the score is also + # equal to 1. The label ranking has no meaning. + aux = 1.0 + else: + scores_i = y_score[i] + rank = rankdata(scores_i, "max")[relevant] + L = rankdata(scores_i[relevant], "max") + aux = (L / rank).mean() + + if sample_weight is not None: + aux = aux * sample_weight[i] + out += aux + + if sample_weight is None: + out /= n_samples + else: + out /= np.sum(sample_weight) + + return float(out) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def coverage_error(y_true, y_score, *, sample_weight=None): + """Coverage error measure. + + Compute how far we need to go through the ranked scores to cover all + true labels. The best value is equal to the average number + of labels in ``y_true`` per sample. + + Ties in ``y_scores`` are broken by giving maximal rank that would have + been assigned to all tied values. + + Note: Our implementation's score is 1 greater than the one given in + Tsoumakas et al., 2010. This extends it to handle the degenerate case + in which an instance has 0 true labels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples, n_labels) + True binary labels in binary indicator format. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + coverage_error : float + The coverage error. + + References + ---------- + .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). + Mining multi-label data. In Data mining and knowledge discovery + handbook (pp. 667-685). Springer US. 
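+
+    As a small hand-worked check of the example below: the first sample needs
+    only its top-scored label to cover its single true label (coverage 1),
+    while the second sample needs its top two labels (coverage 2), so the
+    average coverage error is (1 + 2) / 2 = 1.5.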
+ + Examples + -------- + >>> from sklearn.metrics import coverage_error + >>> y_true = [[1, 0, 0], [0, 1, 1]] + >>> y_score = [[1, 0, 0], [0, 1, 1]] + >>> coverage_error(y_true, y_score) + 1.5 + """ + y_true = check_array(y_true, ensure_2d=True) + y_score = check_array(y_score, ensure_2d=True) + check_consistent_length(y_true, y_score, sample_weight) + + y_type = type_of_target(y_true, input_name="y_true") + if y_type != "multilabel-indicator": + raise ValueError("{0} format is not supported".format(y_type)) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true)) + y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1)) + coverage = (y_score >= y_min_relevant).sum(axis=1) + coverage = coverage.filled(0) + + return float(np.average(coverage, weights=sample_weight)) + + +@validate_params( + { + "y_true": ["array-like", "sparse matrix"], + "y_score": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def label_ranking_loss(y_true, y_score, *, sample_weight=None): + """Compute Ranking loss measure. + + Compute the average number of label pairs that are incorrectly ordered + given y_score weighted by the size of the label set and the number of + labels not in the label set. + + This is similar to the error set size, but weighted by the number of + relevant and irrelevant labels. The best performance is achieved with + a ranking loss of zero. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + A function *label_ranking_loss* + + Parameters + ---------- + y_true : {array-like, sparse matrix} of shape (n_samples, n_labels) + True binary labels in binary indicator format. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + For :term:`decision_function` scores, values greater than or equal to + zero should indicate the positive class. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + Average number of label pairs that are incorrectly ordered given + y_score weighted by the size of the label set and the number of labels not + in the label set. + + References + ---------- + .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). + Mining multi-label data. In Data mining and knowledge discovery + handbook (pp. 667-685). Springer US. 
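+
+    As a small hand-worked check of the example below: in the first sample
+    one of the two irrelevant labels (score 1) is ranked above the relevant
+    label (score 0.75), so 1 of 2 pairs is mis-ordered (loss 0.5); in the
+    second sample both irrelevant labels out-rank the relevant one (loss 1.0).
+    The average is (0.5 + 1.0) / 2 = 0.75.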
+ + Examples + -------- + >>> from sklearn.metrics import label_ranking_loss + >>> y_true = [[1, 0, 0], [0, 0, 1]] + >>> y_score = [[0.75, 0.5, 1], [1, 0.2, 0.1]] + >>> label_ranking_loss(y_true, y_score) + 0.75 + """ + y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr") + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + + y_type = type_of_target(y_true, input_name="y_true") + if y_type not in ("multilabel-indicator",): + raise ValueError("{0} format is not supported".format(y_type)) + + if y_true.shape != y_score.shape: + raise ValueError("y_true and y_score have different shape") + + n_samples, n_labels = y_true.shape + + y_true = csr_matrix(y_true) + + loss = np.zeros(n_samples) + for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): + # Sort and bin the label scores + unique_scores, unique_inverse = np.unique(y_score[i], return_inverse=True) + true_at_reversed_rank = np.bincount( + unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores) + ) + all_at_reversed_rank = np.bincount(unique_inverse, minlength=len(unique_scores)) + false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank + + # if the scores are ordered, it's possible to count the number of + # incorrectly ordered paires in linear time by cumulatively counting + # how many false labels of a given score have a score higher than the + # accumulated true labels with lower score. + loss[i] = np.dot(true_at_reversed_rank.cumsum(), false_at_reversed_rank) + + n_positives = count_nonzero(y_true, axis=1) + with np.errstate(divide="ignore", invalid="ignore"): + loss /= (n_labels - n_positives) * n_positives + + # When there is no positive or no negative labels, those values should + # be consider as correct, i.e. the ranking doesn't matter. + loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.0 + + return float(np.average(loss, weights=sample_weight)) + + +def _dcg_sample_scores(y_true, y_score, k=None, log_base=2, ignore_ties=False): + """Compute Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : ndarray of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If `None`, use all + outputs. + + log_base : float, default=2 + Base of the logarithm used for the discount. A low value means a + sharper discount (top results are more important). + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + discounted_cumulative_gain : ndarray of shape (n_samples,) + The DCG score for each sample. + + See Also + -------- + ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted + Cumulative Gain (the DCG obtained for a perfect ranking), in order to + have a score between 0 and 1. 
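+
+    As a small worked illustration of the discount: with ``log_base=2`` the
+    item ranked first is weighted by ``1 / log2(2) = 1.0``, the second by
+    ``1 / log2(3)`` (roughly 0.63), the third by ``1 / log2(4) = 0.5``, and
+    so on, so gains placed lower in the ranking contribute less.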
+ """ + discount = 1 / (np.log(np.arange(y_true.shape[1]) + 2) / np.log(log_base)) + if k is not None: + discount[k:] = 0 + if ignore_ties: + ranking = np.argsort(y_score)[:, ::-1] + ranked = y_true[np.arange(ranking.shape[0])[:, np.newaxis], ranking] + cumulative_gains = discount.dot(ranked.T) + else: + discount_cumsum = np.cumsum(discount) + cumulative_gains = [ + _tie_averaged_dcg(y_t, y_s, discount_cumsum) + for y_t, y_s in zip(y_true, y_score) + ] + cumulative_gains = np.asarray(cumulative_gains) + return cumulative_gains + + +def _tie_averaged_dcg(y_true, y_score, discount_cumsum): + """ + Compute DCG by averaging over possible permutations of ties. + + The gain (`y_true`) of an index falling inside a tied group (in the order + induced by `y_score`) is replaced by the average gain within this group. + The discounted gain for a tied group is then the average `y_true` within + this group times the sum of discounts of the corresponding ranks. + + This amounts to averaging scores for all possible orderings of the tied + groups. + + (note in the case of dcg@k the discount is 0 after index k) + + Parameters + ---------- + y_true : ndarray + The true relevance scores. + + y_score : ndarray + Predicted scores. + + discount_cumsum : ndarray + Precomputed cumulative sum of the discounts. + + Returns + ------- + discounted_cumulative_gain : float + The discounted cumulative gain. + + References + ---------- + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + """ + _, inv, counts = np.unique(-y_score, return_inverse=True, return_counts=True) + ranked = np.zeros(len(counts)) + np.add.at(ranked, inv, y_true) + ranked /= counts + groups = np.cumsum(counts) - 1 + discount_sums = np.empty(len(counts)) + discount_sums[0] = discount_cumsum[groups[0]] + discount_sums[1:] = np.diff(discount_cumsum[groups]) + return (ranked * discount_sums).sum() + + +def _check_dcg_target_type(y_true): + y_type = type_of_target(y_true, input_name="y_true") + supported_fmt = ( + "multilabel-indicator", + "continuous-multioutput", + "multiclass-multioutput", + ) + if y_type not in supported_fmt: + raise ValueError( + "Only {} formats are supported. Got {} instead".format( + supported_fmt, y_type + ) + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "k": [Interval(Integral, 1, None, closed="left"), None], + "log_base": [Interval(Real, 0.0, None, closed="neither")], + "sample_weight": ["array-like", None], + "ignore_ties": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def dcg_score( + y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False +): + """Compute Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Usually the Normalized Discounted Cumulative Gain (NDCG, computed by + ndcg_score) is preferred. + + Parameters + ---------- + y_true : array-like of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. 
+ + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If None, use all + outputs. + + log_base : float, default=2 + Base of the logarithm used for the discount. A low value means a + sharper discount (top results are more important). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, all samples are given the same weight. + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + discounted_cumulative_gain : float + The averaged sample DCG scores. + + See Also + -------- + ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted + Cumulative Gain (the DCG obtained for a perfect ranking), in order to + have a score between 0 and 1. + + References + ---------- + `Wikipedia entry for Discounted Cumulative Gain + `_. + + Jarvelin, K., & Kekalainen, J. (2002). + Cumulated gain-based evaluation of IR techniques. ACM Transactions on + Information Systems (TOIS), 20(4), 422-446. + + Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). + A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th + Annual Conference on Learning Theory (COLT 2013). + + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import dcg_score + >>> # we have ground-truth relevance of some answers to a query: + >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) + >>> # we predict scores for the answers + >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) + >>> dcg_score(true_relevance, scores) + 9.49 + >>> # we can set k to truncate the sum; only top k answers contribute + >>> dcg_score(true_relevance, scores, k=2) + 5.63 + >>> # now we have some ties in our prediction + >>> scores = np.asarray([[1, 0, 0, 0, 1]]) + >>> # by default ties are averaged, so here we get the average true + >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5 + >>> dcg_score(true_relevance, scores, k=1) + 7.5 + >>> # we can choose to ignore ties for faster results, but only + >>> # if we know there aren't ties in our scores, otherwise we get + >>> # wrong results: + >>> dcg_score(true_relevance, + ... scores, k=1, ignore_ties=True) + 5.0 + """ + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + _check_dcg_target_type(y_true) + return float( + np.average( + _dcg_sample_scores( + y_true, y_score, k=k, log_base=log_base, ignore_ties=ignore_ties + ), + weights=sample_weight, + ) + ) + + +def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): + """Compute Normalized Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. Then divide by the best possible + score (Ideal DCG, obtained for a perfect ranking) to obtain a score between + 0 and 1. 
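+
+    For instance, a predicted ordering whose DCG is 7.5 against an ideal DCG
+    of 10 yields an NDCG of 7.5 / 10 = 0.75.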
+ + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : ndarray of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If None, use all + outputs. + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : ndarray of shape (n_samples,) + The NDCG score for each sample (float in [0., 1.]). + + See Also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + """ + gain = _dcg_sample_scores(y_true, y_score, k, ignore_ties=ignore_ties) + # Here we use the order induced by y_true so we can ignore ties since + # the gain associated to tied indices is the same (permuting ties doesn't + # change the value of the re-ordered y_true) + normalizing_gain = _dcg_sample_scores(y_true, y_true, k, ignore_ties=True) + all_irrelevant = normalizing_gain == 0 + gain[all_irrelevant] = 0 + gain[~all_irrelevant] /= normalizing_gain[~all_irrelevant] + return gain + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "k": [Interval(Integral, 1, None, closed="left"), None], + "sample_weight": ["array-like", None], + "ignore_ties": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, ignore_ties=False): + """Compute Normalized Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. Then divide by the best possible + score (Ideal DCG, obtained for a perfect ranking) to obtain a score between + 0 and 1. + + This ranking metric returns a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : array-like of shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. Negative values in `y_true` may result in an output + that is not between 0 and 1. + + y_score : array-like of shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, default=None + Only consider the highest k scores in the ranking. If `None`, use all + outputs. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, all samples are given the same weight. + + ignore_ties : bool, default=False + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : float in [0., 1.] + The averaged NDCG scores for all samples. + + See Also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + References + ---------- + `Wikipedia entry for Discounted Cumulative Gain + `_ + + Jarvelin, K., & Kekalainen, J. (2002). + Cumulated gain-based evaluation of IR techniques. 
ACM Transactions on + Information Systems (TOIS), 20(4), 422-446. + + Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). + A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th + Annual Conference on Learning Theory (COLT 2013) + + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import ndcg_score + >>> # we have ground-truth relevance of some answers to a query: + >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) + >>> # we predict some scores (relevance) for the answers + >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) + >>> ndcg_score(true_relevance, scores) + 0.69 + >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]]) + >>> ndcg_score(true_relevance, scores) + 0.49 + >>> # we can set k to truncate the sum; only top k answers contribute. + >>> ndcg_score(true_relevance, scores, k=4) + 0.35 + >>> # the normalization takes k into account so a perfect answer + >>> # would still get 1.0 + >>> ndcg_score(true_relevance, true_relevance, k=4) + 1.0... + >>> # now we have some ties in our prediction + >>> scores = np.asarray([[1, 0, 0, 0, 1]]) + >>> # by default ties are averaged, so here we get the average (normalized) + >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75 + >>> ndcg_score(true_relevance, scores, k=1) + 0.75 + >>> # we can choose to ignore ties for faster results, but only + >>> # if we know there aren't ties in our scores, otherwise we get + >>> # wrong results: + >>> ndcg_score(true_relevance, + ... scores, k=1, ignore_ties=True) + 0.5... + """ + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) + check_consistent_length(y_true, y_score, sample_weight) + + if y_true.min() < 0: + raise ValueError("ndcg_score should not be used on negative y_true values.") + if y_true.ndim > 1 and y_true.shape[1] <= 1: + raise ValueError( + "Computing NDCG is only meaningful when there is more than 1 document. " + f"Got {y_true.shape[1]} instead." + ) + _check_dcg_target_type(y_true) + gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties) + return float(np.average(gain, weights=sample_weight)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_score": ["array-like"], + "k": [Interval(Integral, 1, None, closed="left")], + "normalize": ["boolean"], + "sample_weight": ["array-like", None], + "labels": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def top_k_accuracy_score( + y_true, y_score, *, k=2, normalize=True, sample_weight=None, labels=None +): + """Top-k Accuracy classification score. + + This metric computes the number of times where the correct label is among + the top `k` labels predicted (ranked by predicted scores). Note that the + multilabel case isn't covered here. + + Read more in the :ref:`User Guide ` + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True labels. + + y_score : array-like of shape (n_samples,) or (n_samples, n_classes) + Target scores. These can be either probability estimates or + non-thresholded decision values (as returned by + :term:`decision_function` on some classifiers). + The binary case expects scores with shape (n_samples,) while the + multiclass case expects scores with shape (n_samples, n_classes). 
+ In the multiclass case, the order of the class scores must + correspond to the order of ``labels``, if provided, or else to + the numerical or lexicographical order of the labels in ``y_true``. + If ``y_true`` does not contain all the labels, ``labels`` must be + provided. + + k : int, default=2 + Number of most likely outcomes considered to find the correct label. + + normalize : bool, default=True + If `True`, return the fraction of correctly classified samples. + Otherwise, return the number of correctly classified samples. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If `None`, all samples are given the same weight. + + labels : array-like of shape (n_classes,), default=None + Multiclass only. List of labels that index the classes in ``y_score``. + If ``None``, the numerical or lexicographical order of the labels in + ``y_true`` is used. If ``y_true`` does not contain all the labels, + ``labels`` must be provided. + + Returns + ------- + score : float + The top-k accuracy score. The best performance is 1 with + `normalize == True` and the number of samples with + `normalize == False`. + + See Also + -------- + accuracy_score : Compute the accuracy score. By default, the function will + return the fraction of correct predictions divided by the total number + of predictions. + + Notes + ----- + In cases where two or more labels are assigned equal predicted scores, + the labels with the highest indices will be chosen first. This might + impact the result if the correct label falls after the threshold because + of that. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import top_k_accuracy_score + >>> y_true = np.array([0, 1, 2, 2]) + >>> y_score = np.array([[0.5, 0.2, 0.2], # 0 is in top 2 + ... [0.3, 0.4, 0.2], # 1 is in top 2 + ... [0.2, 0.4, 0.3], # 2 is in top 2 + ... [0.7, 0.2, 0.1]]) # 2 isn't in top 2 + >>> top_k_accuracy_score(y_true, y_score, k=2) + 0.75 + >>> # Not normalizing gives the number of "correctly" classified samples + >>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False) + 3.0 + """ + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_true = column_or_1d(y_true) + y_type = type_of_target(y_true, input_name="y_true") + if y_type == "binary" and labels is not None and len(labels) > 2: + y_type = "multiclass" + if y_type not in {"binary", "multiclass"}: + raise ValueError( + f"y type must be 'binary' or 'multiclass', got '{y_type}' instead." + ) + y_score = check_array(y_score, ensure_2d=False) + if y_type == "binary": + if y_score.ndim == 2 and y_score.shape[1] != 1: + raise ValueError( + "`y_true` is binary while y_score is 2d with" + f" {y_score.shape[1]} classes. If `y_true` does not contain all the" + " labels, `labels` must be provided." + ) + y_score = column_or_1d(y_score) + + check_consistent_length(y_true, y_score, sample_weight) + y_score_n_classes = y_score.shape[1] if y_score.ndim == 2 else 2 + + if labels is None: + classes = _unique(y_true) + n_classes = len(classes) + + if n_classes != y_score_n_classes: + raise ValueError( + f"Number of classes in 'y_true' ({n_classes}) not equal " + f"to the number of classes in 'y_score' ({y_score_n_classes})." + "You can provide a list of all known classes by assigning it " + "to the `labels` parameter." 
+ ) + else: + labels = column_or_1d(labels) + classes = _unique(labels) + n_labels = len(labels) + n_classes = len(classes) + + if n_classes != n_labels: + raise ValueError("Parameter 'labels' must be unique.") + + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered.") + + if n_classes != y_score_n_classes: + raise ValueError( + f"Number of given labels ({n_classes}) not equal to the " + f"number of classes in 'y_score' ({y_score_n_classes})." + ) + + if len(np.setdiff1d(y_true, classes)): + raise ValueError("'y_true' contains labels not in parameter 'labels'.") + + if k >= n_classes: + warnings.warn( + ( + f"'k' ({k}) greater than or equal to 'n_classes' ({n_classes}) " + "will result in a perfect score and is therefore meaningless." + ), + UndefinedMetricWarning, + ) + + y_true_encoded = _encode(y_true, uniques=classes) + + if y_type == "binary": + if k == 1: + threshold = 0.5 if y_score.min() >= 0 and y_score.max() <= 1 else 0 + y_pred = (y_score > threshold).astype(np.int64) + hits = y_pred == y_true_encoded + else: + hits = np.ones_like(y_score, dtype=np.bool_) + elif y_type == "multiclass": + sorted_pred = np.argsort(y_score, axis=1, kind="mergesort")[:, ::-1] + hits = (y_true_encoded == sorted_pred[:, :k].T).any(axis=0) + + if normalize: + return float(np.average(hits, weights=sample_weight)) + elif sample_weight is None: + return float(np.sum(hits)) + else: + return float(np.dot(hits, sample_weight)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_regression.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..0731e00ce3a1ab24adb2e33ed17ac948455586e8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_regression.py @@ -0,0 +1,1930 @@ +"""Metrics to assess performance on regression task. + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better. + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np + +from ..exceptions import UndefinedMetricWarning +from ..utils._array_api import ( + _average, + _find_matching_floating_dtype, + get_namespace, + get_namespace_and_device, + size, +) +from ..utils._array_api import ( + _xlogy as xlogy, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + _check_sample_weight, + _num_samples, + check_array, + check_consistent_length, + column_or_1d, +) + +__ALL__ = [ + "max_error", + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "mean_absolute_percentage_error", + "mean_pinball_loss", + "r2_score", + "root_mean_squared_log_error", + "root_mean_squared_error", + "explained_variance_score", + "mean_tweedie_deviance", + "mean_poisson_deviance", + "mean_gamma_deviance", + "d2_tweedie_score", + "d2_pinball_score", + "d2_absolute_error_score", +] + + +def _check_reg_targets( + y_true, y_pred, sample_weight, multioutput, dtype="numeric", xp=None +): + """Check that y_true, y_pred and sample_weight belong to the same regression task. + + To reduce redundancy when calling `_find_matching_floating_dtype`, + please use `_check_reg_targets_with_floating_dtype` instead. 
+ + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. + + multioutput : array-like or string in ['raw_values', uniform_average', + 'variance_weighted'] or None + None is accepted due to backward compatibility of r2_score(). + + dtype : str or list, default="numeric" + the dtype argument passed to check_array. + + xp : module, default=None + Precomputed array namespace module. When passed, typically from a caller + that has already performed inspection of its own inputs, skips array + namespace inspection. + + Returns + ------- + type_true : one of {'continuous', continuous-multioutput'} + The type of the true target data, as output by + 'utils.multiclass.type_of_target'. + + y_true : array-like of shape (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,) or None + Sample weights. + + multioutput : array-like of shape (n_outputs) or string in ['raw_values', + uniform_average', 'variance_weighted'] or None + Custom output weights if ``multioutput`` is array-like or + just the corresponding argument if ``multioutput`` is a + correct keyword. + """ + xp, _ = get_namespace(y_true, y_pred, multioutput, xp=xp) + + check_consistent_length(y_true, y_pred, sample_weight) + y_true = check_array(y_true, ensure_2d=False, dtype=dtype) + y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, y_true, dtype=dtype) + + if y_true.ndim == 1: + y_true = xp.reshape(y_true, (-1, 1)) + + if y_pred.ndim == 1: + y_pred = xp.reshape(y_pred, (-1, 1)) + + if y_true.shape[1] != y_pred.shape[1]: + raise ValueError( + "y_true and y_pred have different number of output ({0}!={1})".format( + y_true.shape[1], y_pred.shape[1] + ) + ) + + n_outputs = y_true.shape[1] + allowed_multioutput_str = ("raw_values", "uniform_average", "variance_weighted") + if isinstance(multioutput, str): + if multioutput not in allowed_multioutput_str: + raise ValueError( + "Allowed 'multioutput' string values are {}. " + "You provided multioutput={!r}".format( + allowed_multioutput_str, multioutput + ) + ) + elif multioutput is not None: + multioutput = check_array(multioutput, ensure_2d=False) + if n_outputs == 1: + raise ValueError("Custom weights are useful only in multi-output cases.") + elif n_outputs != multioutput.shape[0]: + raise ValueError( + "There must be equally many custom weights " + f"({multioutput.shape[0]}) as outputs ({n_outputs})." + ) + y_type = "continuous" if n_outputs == 1 else "continuous-multioutput" + + return y_type, y_true, y_pred, sample_weight, multioutput + + +def _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=None +): + """Ensures y_true, y_pred, and sample_weight correspond to same regression task. + + Extends `_check_reg_targets` by automatically selecting a suitable floating-point + data type for inputs using `_find_matching_floating_dtype`. + + Use this private method only when converting inputs to array API-compatibles. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. 
+ + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,) + + multioutput : array-like or string in ['raw_values', 'uniform_average', \ + 'variance_weighted'] or None + None is accepted due to backward compatibility of r2_score(). + + xp : module, default=None + Precomputed array namespace module. When passed, typically from a caller + that has already performed inspection of its own inputs, skips array + namespace inspection. + + Returns + ------- + type_true : one of {'continuous', 'continuous-multioutput'} + The type of the true target data, as output by + 'utils.multiclass.type_of_target'. + + y_true : array-like of shape (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : array-like of shape (n_outputs) or string in ['raw_values', \ + 'uniform_average', 'variance_weighted'] or None + Custom output weights if ``multioutput`` is array-like or + just the corresponding argument if ``multioutput`` is a + correct keyword. + """ + dtype_name = _find_matching_floating_dtype(y_true, y_pred, sample_weight, xp=xp) + + y_type, y_true, y_pred, sample_weight, multioutput = _check_reg_targets( + y_true, y_pred, sample_weight, multioutput, dtype=dtype_name, xp=xp + ) + + return y_type, y_true, y_pred, sample_weight, multioutput + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_absolute_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Mean absolute error regression loss. + + The mean absolute error is a non-negative floating point value, where best value + is 0.0. Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or array of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + MAE output is non-negative floating point. The best value is 0.0. + + Examples + -------- + >>> from sklearn.metrics import mean_absolute_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_absolute_error(y_true, y_pred) + 0.5 + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> mean_absolute_error(y_true, y_pred) + 0.75 + >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values') + array([0.5, 1. 
]) + >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.85... + """ + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + output_errors = _average( + xp.abs(y_pred - y_true), weights=sample_weight, axis=0, xp=xp + ) + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + mean_absolute_error = _average(output_errors, weights=multioutput) + + return float(mean_absolute_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "alpha": [Interval(Real, 0, 1, closed="both")], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_pinball_loss( + y_true, y_pred, *, sample_weight=None, alpha=0.5, multioutput="uniform_average" +): + """Pinball loss for quantile regression. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + alpha : float, slope of the pinball loss, default=0.5, + This loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`, + `alpha=0.95` is minimized by estimators of the 95th percentile. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + The pinball loss output is a non-negative floating point. The best + value is 0.0. + + Examples + -------- + >>> from sklearn.metrics import mean_pinball_loss + >>> y_true = [1, 2, 3] + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1) + 0.03... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1) + 0.3... + >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9) + 0.3... + >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9) + 0.03... 
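As a side note on the pinball loss defined further down: the loss is `alpha * max(d, 0) + (1 - alpha) * max(-d, 0)` with `d = y_true - y_pred`, so at `alpha=0.5` it collapses to half the absolute error. A small sketch of that relationship (toy data):

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_pinball_loss

y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
# At alpha=0.5 the pinball loss is exactly half the mean absolute error.
assert np.isclose(
    mean_pinball_loss(y_true, y_pred, alpha=0.5),
    0.5 * mean_absolute_error(y_true, y_pred),
)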
+ >>> mean_pinball_loss(y_true, y_true, alpha=0.1) + 0.0 + >>> mean_pinball_loss(y_true, y_true, alpha=0.9) + 0.0 + """ + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + diff = y_true - y_pred + sign = xp.astype(diff >= 0, diff.dtype) + loss = alpha * sign * diff - (1 - alpha) * (1 - sign) * diff + output_errors = _average(loss, weights=sample_weight, axis=0) + + if isinstance(multioutput, str) and multioutput == "raw_values": + return output_errors + + if isinstance(multioutput, str) and multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + return float(_average(output_errors, weights=multioutput)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_absolute_percentage_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Mean absolute percentage error (MAPE) regression loss. + + Note that we are not using the common "percentage" definition: the percentage + in the range [0, 100] is converted to a relative value in the range [0, 1] + by dividing by 100. Thus, an error of 200% corresponds to a relative error of 2. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.24 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + If input is list then the shape must be (n_outputs,). + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute percentage error + is returned for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + MAPE output is non-negative floating point. The best value is 0.0. + But note that bad predictions can lead to arbitrarily large + MAPE values, especially if some `y_true` values are very close to zero. + Note that we return a large value instead of `inf` when `y_true` is zero. + + Examples + -------- + >>> from sklearn.metrics import mean_absolute_percentage_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_absolute_percentage_error(y_true, y_pred) + 0.3273... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> mean_absolute_percentage_error(y_true, y_pred) + 0.5515... 
+ >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.6198... + >>> # the value when some element of the y_true is zero is arbitrarily high because + >>> # of the division by epsilon + >>> y_true = [1., 0., 2.4, 7.] + >>> y_pred = [1.2, 0.1, 2.4, 8.] + >>> mean_absolute_percentage_error(y_true, y_pred) + 112589990684262.48 + """ + xp, _, device_ = get_namespace_and_device( + y_true, y_pred, sample_weight, multioutput + ) + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + epsilon = xp.asarray(xp.finfo(xp.float64).eps, dtype=y_true.dtype, device=device_) + y_true_abs = xp.abs(y_true) + mape = xp.abs(y_pred - y_true) / xp.maximum(y_true_abs, epsilon) + output_errors = _average(mape, weights=sample_weight, axis=0) + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + mean_absolute_percentage_error = _average(output_errors, weights=multioutput) + + return float(mean_absolute_percentage_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_squared_error( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", +): + """Mean squared error regression loss. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or array of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import mean_squared_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> mean_squared_error(y_true, y_pred) + 0.375 + >>> y_true = [[0.5, 1],[-1, 1],[7, -6]] + >>> y_pred = [[0, 2],[-1, 2],[8, -5]] + >>> mean_squared_error(y_true, y_pred) + 0.708... + >>> mean_squared_error(y_true, y_pred, multioutput='raw_values') + array([0.41666667, 1. ]) + >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.825... 
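The `multioutput` aggregation used in the examples above is, per the function body that follows, just a weighted average over the per-output errors. A quick sanity sketch of that reading (toy data, public API only):

import numpy as np
from sklearn.metrics import mean_squared_error

y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
raw = mean_squared_error(y_true, y_pred, multioutput="raw_values")
weighted = mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
# 'uniform_average' is the same computation with equal weights.
assert np.isclose(weighted, np.average(raw, weights=[0.3, 0.7]))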
+ """ + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + output_errors = _average((y_true - y_pred) ** 2, axis=0, weights=sample_weight) + + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. + mean_squared_error = _average(output_errors, weights=multioutput) + + return float(mean_squared_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def root_mean_squared_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Root mean squared error regression loss. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import root_mean_squared_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> root_mean_squared_error(y_true, y_pred) + 0.612... + >>> y_true = [[0.5, 1],[-1, 1],[7, -6]] + >>> y_pred = [[0, 2],[-1, 2],[8, -5]] + >>> root_mean_squared_error(y_true, y_pred) + 0.822... + """ + + xp, _ = get_namespace(y_true, y_pred, sample_weight, multioutput) + + output_errors = xp.sqrt( + mean_squared_error( + y_true, y_pred, sample_weight=sample_weight, multioutput="raw_values" + ) + ) + + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + multioutput = None + + # Average across the outputs (if needed). + # The second call to `_average` should always return + # a scalar array that we convert to a Python float to + # consistently return the same eager evaluated value. + # Therefore, `axis=None`. 
+ root_mean_squared_error = _average(output_errors, weights=multioutput) + + return float(root_mean_squared_error) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def mean_squared_log_error( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", +): + """Mean squared logarithmic error regression loss. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + + Defines aggregating of multiple output values. + Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors when the input is of multioutput + format. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import mean_squared_log_error + >>> y_true = [3, 5, 2.5, 7] + >>> y_pred = [2.5, 5, 4, 8] + >>> mean_squared_log_error(y_true, y_pred) + 0.039... + >>> y_true = [[0.5, 1], [1, 2], [7, 6]] + >>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]] + >>> mean_squared_log_error(y_true, y_pred) + 0.044... + >>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values') + array([0.00462428, 0.08377444]) + >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.060... + """ + xp, _ = get_namespace(y_true, y_pred) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + if xp.any(y_true <= -1) or xp.any(y_pred <= -1): + raise ValueError( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + + return mean_squared_error( + xp.log1p(y_true), + xp.log1p(y_pred), + sample_weight=sample_weight, + multioutput=multioutput, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + }, + prefer_skip_nested_validation=True, +) +def root_mean_squared_log_error( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """Root mean squared logarithmic error regression loss. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.4 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + + Defines aggregating of multiple output values. 
+ Array-like value defines weights used to average errors. + + 'raw_values' : + Returns a full set of errors when the input is of multioutput + format. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + Returns + ------- + loss : float or ndarray of floats + A non-negative floating point value (the best value is 0.0), or an + array of floating point values, one for each individual target. + + Examples + -------- + >>> from sklearn.metrics import root_mean_squared_log_error + >>> y_true = [3, 5, 2.5, 7] + >>> y_pred = [2.5, 5, 4, 8] + >>> root_mean_squared_log_error(y_true, y_pred) + 0.199... + """ + xp, _ = get_namespace(y_true, y_pred) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + if xp.any(y_true <= -1) or xp.any(y_pred <= -1): + raise ValueError( + "Root Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + + return root_mean_squared_error( + xp.log1p(y_true), + xp.log1p(y_pred), + sample_weight=sample_weight, + multioutput=multioutput, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "multioutput": [StrOptions({"raw_values", "uniform_average"}), "array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def median_absolute_error( + y_true, y_pred, *, multioutput="uniform_average", sample_weight=None +): + """Median absolute error regression loss. + + Median absolute error output is non-negative floating point. The best value + is 0.0. Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. Array-like value defines + weights used to average errors. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Errors of all outputs are averaged with uniform weight. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 0.24 + + Returns + ------- + loss : float or ndarray of floats + If multioutput is 'raw_values', then mean absolute error is returned + for each output separately. + If multioutput is 'uniform_average' or an ndarray of weights, then the + weighted average of all output errors is returned. + + Examples + -------- + >>> from sklearn.metrics import median_absolute_error + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> median_absolute_error(y_true, y_pred) + 0.5 + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> median_absolute_error(y_true, y_pred) + 0.75 + >>> median_absolute_error(y_true, y_pred, multioutput='raw_values') + array([0.5, 1. 
]) + >>> median_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7]) + 0.85 + """ + _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets( + y_true, y_pred, sample_weight, multioutput + ) + if sample_weight is None: + output_errors = np.median(np.abs(y_pred - y_true), axis=0) + else: + output_errors = _weighted_percentile( + np.abs(y_pred - y_true), sample_weight=sample_weight + ) + if isinstance(multioutput, str): + if multioutput == "raw_values": + return output_errors + elif multioutput == "uniform_average": + # pass None as weights to np.average: uniform mean + multioutput = None + + return float(np.average(output_errors, weights=multioutput)) + + +def _assemble_r2_explained_variance( + numerator, denominator, n_outputs, multioutput, force_finite, xp, device +): + """Common part used by explained variance score and :math:`R^2` score.""" + dtype = numerator.dtype + + nonzero_denominator = denominator != 0 + + if not force_finite: + # Standard formula, that may lead to NaN or -Inf + output_scores = 1 - (numerator / denominator) + else: + nonzero_numerator = numerator != 0 + # Default = Zero Numerator = perfect predictions. Set to 1.0 + # (note: even if denominator is zero, thus avoiding NaN scores) + output_scores = xp.ones([n_outputs], device=device, dtype=dtype) + # Non-zero Numerator and Non-zero Denominator: use the formula + valid_score = nonzero_denominator & nonzero_numerator + + output_scores[valid_score] = 1 - ( + numerator[valid_score] / denominator[valid_score] + ) + + # Non-zero Numerator and Zero Denominator: + # arbitrary set to 0.0 to avoid -inf scores + output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0 + + if isinstance(multioutput, str): + if multioutput == "raw_values": + # return scores individually + return output_scores + elif multioutput == "uniform_average": + # pass None as weights to _average: uniform mean + avg_weights = None + elif multioutput == "variance_weighted": + avg_weights = denominator + if not xp.any(nonzero_denominator): + # All weights are zero, _average would raise a ZeroDiv error. + # This only happens when all y are constant (or 1-element long) + # Since weights are all equal, fall back to uniform weights. + avg_weights = None + else: + avg_weights = multioutput + + result = _average(output_scores, weights=avg_weights) + if size(result) == 1: + return float(result) + return result + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [ + StrOptions({"raw_values", "uniform_average", "variance_weighted"}), + "array-like", + ], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def explained_variance_score( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", + force_finite=True, +): + """Explained variance regression score function. + + Best possible score is 1.0, lower values are worse. + + In the particular case when ``y_true`` is constant, the explained variance + score is not finite: it is either ``NaN`` (perfect predictions) or + ``-Inf`` (imperfect predictions). To prevent such non-finite numbers to + pollute higher-level experiments such as a grid search cross-validation, + by default these cases are replaced with 1.0 (perfect predictions) or 0.0 + (imperfect predictions) respectively. If ``force_finite`` + is set to ``False``, this score falls back on the original :math:`R^2` + definition. + + .. 
note:: + The Explained Variance score is similar to the + :func:`R^2 score `, with the notable difference that it + does not account for systematic offsets in the prediction. Most often + the :func:`R^2 score ` should be preferred. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average', 'variance_weighted'} or \ + array-like of shape (n_outputs,), default='uniform_average' + Defines aggregating of multiple output scores. + Array-like value defines weights used to average scores. + + 'raw_values' : + Returns a full set of scores in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + 'variance_weighted' : + Scores of all outputs are averaged, weighted by the variances + of each individual output. + + force_finite : bool, default=True + Flag indicating if ``NaN`` and ``-Inf`` scores resulting from constant + data should be replaced with real numbers (``1.0`` if prediction is + perfect, ``0.0`` otherwise). Default is ``True``, a convenient setting + for hyperparameters' search procedures (e.g. grid search + cross-validation). + + .. versionadded:: 1.1 + + Returns + ------- + score : float or ndarray of floats + The explained variance or ndarray if 'multioutput' is 'raw_values'. + + See Also + -------- + r2_score : + Similar metric, but accounting for systematic offsets in + prediction. + + Notes + ----- + This is not a symmetric function. + + Examples + -------- + >>> from sklearn.metrics import explained_variance_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> explained_variance_score(y_true, y_pred) + 0.957... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> explained_variance_score(y_true, y_pred, multioutput='uniform_average') + 0.983... 
+ >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2] + >>> explained_variance_score(y_true, y_pred) + 1.0 + >>> explained_variance_score(y_true, y_pred, force_finite=False) + nan + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2 + 1e-8] + >>> explained_variance_score(y_true, y_pred) + 0.0 + >>> explained_variance_score(y_true, y_pred, force_finite=False) + -inf + """ + xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight, multioutput) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + y_diff_avg = _average(y_true - y_pred, weights=sample_weight, axis=0) + numerator = _average( + (y_true - y_pred - y_diff_avg) ** 2, weights=sample_weight, axis=0 + ) + + y_true_avg = _average(y_true, weights=sample_weight, axis=0) + denominator = _average((y_true - y_true_avg) ** 2, weights=sample_weight, axis=0) + + return _assemble_r2_explained_variance( + numerator=numerator, + denominator=denominator, + n_outputs=y_true.shape[1], + multioutput=multioutput, + force_finite=force_finite, + xp=xp, + device=device, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [ + StrOptions({"raw_values", "uniform_average", "variance_weighted"}), + "array-like", + None, + ], + "force_finite": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def r2_score( + y_true, + y_pred, + *, + sample_weight=None, + multioutput="uniform_average", + force_finite=True, +): + """:math:`R^2` (coefficient of determination) regression score function. + + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). In the general case when the true y is + non-constant, a constant model that always predicts the average y + disregarding the input features would get a :math:`R^2` score of 0.0. + + In the particular case when ``y_true`` is constant, the :math:`R^2` score + is not finite: it is either ``NaN`` (perfect predictions) or ``-Inf`` + (imperfect predictions). To prevent such non-finite numbers to pollute + higher-level experiments such as a grid search cross-validation, by default + these cases are replaced with 1.0 (perfect predictions) or 0.0 (imperfect + predictions) respectively. You can set ``force_finite`` to ``False`` to + prevent this fix from happening. + + Note: when the prediction residuals have zero mean, the :math:`R^2` score + is identical to the + :func:`Explained Variance score `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, \ + array-like of shape (n_outputs,) or None, default='uniform_average' + + Defines aggregating of multiple output scores. + Array-like value defines weights used to average scores. + Default is "uniform_average". + + 'raw_values' : + Returns a full set of scores in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + 'variance_weighted' : + Scores of all outputs are averaged, weighted by the variances + of each individual output. + + .. 
versionchanged:: 0.19 + Default value of multioutput is 'uniform_average'. + + force_finite : bool, default=True + Flag indicating if ``NaN`` and ``-Inf`` scores resulting from constant + data should be replaced with real numbers (``1.0`` if prediction is + perfect, ``0.0`` otherwise). Default is ``True``, a convenient setting + for hyperparameters' search procedures (e.g. grid search + cross-validation). + + .. versionadded:: 1.1 + + Returns + ------- + z : float or ndarray of floats + The :math:`R^2` score or ndarray of scores if 'multioutput' is + 'raw_values'. + + Notes + ----- + This is not a symmetric function. + + Unlike most other scores, :math:`R^2` score may be negative (it need not + actually be the square of a quantity R). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] `Wikipedia entry on the Coefficient of determination + `_ + + Examples + -------- + >>> from sklearn.metrics import r2_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> r2_score(y_true, y_pred) + 0.948... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred, + ... multioutput='variance_weighted') + 0.938... + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> r2_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> r2_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [3, 2, 1] + >>> r2_score(y_true, y_pred) + -3.0 + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2] + >>> r2_score(y_true, y_pred) + 1.0 + >>> r2_score(y_true, y_pred, force_finite=False) + nan + >>> y_true = [-2, -2, -2] + >>> y_pred = [-2, -2, -2 + 1e-8] + >>> r2_score(y_true, y_pred) + 0.0 + >>> r2_score(y_true, y_pred, force_finite=False) + -inf + """ + xp, _, device_ = get_namespace_and_device( + y_true, y_pred, sample_weight, multioutput + ) + + _, y_true, y_pred, sample_weight, multioutput = ( + _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput, xp=xp + ) + ) + + if _num_samples(y_pred) < 2: + msg = "R^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + weight = sample_weight[:, None] + else: + weight = 1.0 + + numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0) + denominator = xp.sum( + weight * (y_true - _average(y_true, axis=0, weights=sample_weight, xp=xp)) ** 2, + axis=0, + ) + + return _assemble_r2_explained_variance( + numerator=numerator, + denominator=denominator, + n_outputs=y_true.shape[1], + multioutput=multioutput, + force_finite=force_finite, + xp=xp, + device=device_, + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def max_error(y_true, y_pred): + """ + The max_error metric calculates the maximum residual error. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated target values. + + Returns + ------- + max_error : float + A positive floating point value (the best value is 0.0). 
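Picking up the note in the `r2_score` docstring above that the score coincides with `explained_variance_score` when the prediction residuals have zero mean, a small check with toy data chosen so the residuals sum to zero:

import numpy as np
from sklearn.metrics import explained_variance_score, r2_score

y_true = [1, 2, 3, 4]
y_pred = [1.5, 1.5, 3.5, 3.5]   # residuals -0.5, 0.5, -0.5, 0.5 sum to zero
# Both scores evaluate to 0.8 on this data.
assert np.isclose(r2_score(y_true, y_pred),
                  explained_variance_score(y_true, y_pred))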
+ + Examples + -------- + >>> from sklearn.metrics import max_error + >>> y_true = [3, 2, 7, 1] + >>> y_pred = [4, 2, 7, 1] + >>> max_error(y_true, y_pred) + 1.0 + """ + xp, _ = get_namespace(y_true, y_pred) + y_type, y_true, y_pred, _, _ = _check_reg_targets( + y_true, y_pred, sample_weight=None, multioutput=None, xp=xp + ) + if y_type == "continuous-multioutput": + raise ValueError("Multioutput not supported in max_error") + return float(xp.max(xp.abs(y_true - y_pred))) + + +def _mean_tweedie_deviance(y_true, y_pred, sample_weight, power): + """Mean Tweedie deviance regression loss.""" + xp, _, device_ = get_namespace_and_device(y_true, y_pred) + p = power + if p < 0: + # 'Extreme stable', y any real number, y_pred > 0 + dev = 2 * ( + xp.pow( + xp.where(y_true > 0, y_true, 0.0), + 2 - p, + ) + / ((1 - p) * (2 - p)) + - y_true * xp.pow(y_pred, 1 - p) / (1 - p) + + xp.pow(y_pred, 2 - p) / (2 - p) + ) + elif p == 0: + # Normal distribution, y and y_pred any real number + dev = (y_true - y_pred) ** 2 + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y_true, y_true / y_pred) - y_true + y_pred) + elif p == 2: + # Gamma distribution + dev = 2 * (xp.log(y_pred / y_true) + y_true / y_pred - 1) + else: + dev = 2 * ( + xp.pow(y_true, 2 - p) / ((1 - p) * (2 - p)) + - y_true * xp.pow(y_pred, 1 - p) / (1 - p) + + xp.pow(y_pred, 2 - p) / (2 - p) + ) + return float(_average(dev, weights=sample_weight)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "power": [ + Interval(Real, None, 0, closed="right"), + Interval(Real, 1, None, closed="left"), + ], + }, + prefer_skip_nested_validation=True, +) +def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): + """Mean Tweedie deviance regression loss. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. + + The higher `p` the less weight is given to extreme + deviations between true and predicted targets. + + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to + mean_squared_error. y_true and y_pred can be any real numbers. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. + - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 + and y_pred > 0. + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + and y_pred > 0. + - otherwise : Positive stable distribution. Requires: y_true > 0 + and y_pred > 0. + + Returns + ------- + loss : float + A non-negative floating point value (the best value is 0.0). + + Examples + -------- + >>> from sklearn.metrics import mean_tweedie_deviance + >>> y_true = [2, 0, 1, 4] + >>> y_pred = [0.5, 0.5, 2., 2.] + >>> mean_tweedie_deviance(y_true, y_pred, power=1) + 1.4260... 
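One of the `power` bullet points above states that `power=0` corresponds to the squared error. A minimal sketch confirming that reading (toy, strictly positive data so other powers would also be admissible):

import numpy as np
from sklearn.metrics import mean_squared_error, mean_tweedie_deviance

y_true = [2.0, 1.0, 1.5, 4.0]
y_pred = [0.5, 0.5, 2.0, 2.0]
# With power=0 the Tweedie deviance reduces to the ordinary MSE.
assert np.isclose(mean_tweedie_deviance(y_true, y_pred, power=0),
                  mean_squared_error(y_true, y_pred))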
+ """ + xp, _ = get_namespace(y_true, y_pred) + y_type, y_true, y_pred, sample_weight, _ = _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput=None, xp=xp + ) + if y_type == "continuous-multioutput": + raise ValueError("Multioutput not supported in mean_tweedie_deviance") + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + sample_weight = sample_weight[:, np.newaxis] + + message = f"Mean Tweedie deviance error with power={power} can only be used on " + if power < 0: + # 'Extreme stable', y any real number, y_pred > 0 + if xp.any(y_pred <= 0): + raise ValueError(message + "strictly positive y_pred.") + elif power == 0: + # Normal, y and y_pred can be any real number + pass + elif 1 <= power < 2: + # Poisson and compound Poisson distribution, y >= 0, y_pred > 0 + if xp.any(y_true < 0) or xp.any(y_pred <= 0): + raise ValueError(message + "non-negative y and strictly positive y_pred.") + elif power >= 2: + # Gamma and Extreme stable distribution, y and y_pred > 0 + if xp.any(y_true <= 0) or xp.any(y_pred <= 0): + raise ValueError(message + "strictly positive y and y_pred.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + + return _mean_tweedie_deviance( + y_true, y_pred, sample_weight=sample_weight, power=power + ) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): + """Mean Poisson deviance regression loss. + + Poisson deviance is equivalent to the Tweedie deviance with + the power parameter `power=1`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. Requires y_true >= 0. + + y_pred : array-like of shape (n_samples,) + Estimated target values. Requires y_pred > 0. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + A non-negative floating point value (the best value is 0.0). + + Examples + -------- + >>> from sklearn.metrics import mean_poisson_deviance + >>> y_true = [2, 0, 1, 4] + >>> y_pred = [0.5, 0.5, 2., 2.] + >>> mean_poisson_deviance(y_true, y_pred) + 1.4260... + """ + return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=1) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): + """Mean Gamma deviance regression loss. + + Gamma deviance is equivalent to the Tweedie deviance with + the power parameter `power=2`. It is invariant to scaling of + the target variable, and measures relative errors. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. Requires y_true > 0. + + y_pred : array-like of shape (n_samples,) + Estimated target values. Requires y_pred > 0. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + A non-negative floating point value (the best value is 0.0). + + Examples + -------- + >>> from sklearn.metrics import mean_gamma_deviance + >>> y_true = [2, 0.5, 1, 4] + >>> y_pred = [0.5, 0.5, 2., 2.] + >>> mean_gamma_deviance(y_true, y_pred) + 1.0568... 
+ """ + return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=2) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "power": [ + Interval(Real, None, 0, closed="right"), + Interval(Real, 1, None, closed="left"), + ], + }, + prefer_skip_nested_validation=True, +) +def d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0): + """ + :math:`D^2` regression score function, fraction of Tweedie deviance explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical mean of `y_true` as + constant prediction, disregarding the input features, gets a D^2 score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. + + The higher `p` the less weight is given to extreme + deviations between true and predicted targets. + + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to r2_score. + y_true and y_pred can be any real numbers. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. + - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 + and y_pred > 0. + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + and y_pred > 0. + - otherwise : Positive stable distribution. Requires: y_true > 0 + and y_pred > 0. + + Returns + ------- + z : float + The D^2 score. + + Notes + ----- + This is not a symmetric function. + + Like R^2, D^2 score may be negative (it need not actually be the square of + a quantity D). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J. + Wainwright. "Statistical Learning with Sparsity: The Lasso and + Generalizations." (2015). https://hastie.su.domains/StatLearnSparsity/ + + Examples + -------- + >>> from sklearn.metrics import d2_tweedie_score + >>> y_true = [0.5, 1, 2.5, 7] + >>> y_pred = [1, 1, 5, 3.5] + >>> d2_tweedie_score(y_true, y_pred) + 0.285... + >>> d2_tweedie_score(y_true, y_pred, power=1) + 0.487... + >>> d2_tweedie_score(y_true, y_pred, power=2) + 0.630... + >>> d2_tweedie_score(y_true, y_true, power=2) + 1.0 + """ + xp, _ = get_namespace(y_true, y_pred) + + y_type, y_true, y_pred, sample_weight, _ = _check_reg_targets_with_floating_dtype( + y_true, y_pred, sample_weight, multioutput=None, xp=xp + ) + if y_type == "continuous-multioutput": + raise ValueError("Multioutput not supported in d2_tweedie_score") + + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." 
+ warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + y_true, y_pred = xp.squeeze(y_true, axis=1), xp.squeeze(y_pred, axis=1) + numerator = mean_tweedie_deviance( + y_true, y_pred, sample_weight=sample_weight, power=power + ) + + y_avg = _average(y_true, weights=sample_weight, xp=xp) + denominator = _mean_tweedie_deviance( + y_true, y_avg, sample_weight=sample_weight, power=power + ) + + return 1 - numerator / denominator + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "alpha": [Interval(Real, 0, 1, closed="both")], + "multioutput": [ + StrOptions({"raw_values", "uniform_average"}), + "array-like", + ], + }, + prefer_skip_nested_validation=True, +) +def d2_pinball_score( + y_true, y_pred, *, sample_weight=None, alpha=0.5, multioutput="uniform_average" +): + """ + :math:`D^2` regression score function, fraction of pinball loss explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical alpha-quantile of + `y_true` as constant prediction, disregarding the input features, + gets a :math:`D^2` score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.1 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + alpha : float, default=0.5 + Slope of the pinball deviance. It determines the quantile level alpha + for which the pinball deviance and also D2 are optimal. + The default `alpha=0.5` is equivalent to `d2_absolute_error_score`. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average scores. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + Returns + ------- + score : float or ndarray of floats + The :math:`D^2` score with a pinball deviance + or ndarray of scores if `multioutput='raw_values'`. + + Notes + ----- + Like :math:`R^2`, :math:`D^2` score may be negative + (it need not actually be the square of a quantity D). + + This metric is not well-defined for a single point and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] Eq. (7) of `Koenker, Roger; Machado, José A. F. (1999). + "Goodness of Fit and Related Inference Processes for Quantile Regression" + `_ + .. [2] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J. + Wainwright. "Statistical Learning with Sparsity: The Lasso and + Generalizations." (2015). https://hastie.su.domains/StatLearnSparsity/ + + Examples + -------- + >>> from sklearn.metrics import d2_pinball_score + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 3, 3] + >>> d2_pinball_score(y_true, y_pred) + 0.5 + >>> d2_pinball_score(y_true, y_pred, alpha=0.9) + 0.772... + >>> d2_pinball_score(y_true, y_pred, alpha=0.1) + -1.045... 
+ >>> d2_pinball_score(y_true, y_true, alpha=0.1) + 1.0 + """ + _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets( + y_true, y_pred, sample_weight, multioutput + ) + + if _num_samples(y_pred) < 2: + msg = "D^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float("nan") + + numerator = mean_pinball_loss( + y_true, + y_pred, + sample_weight=sample_weight, + alpha=alpha, + multioutput="raw_values", + ) + + if sample_weight is None: + y_quantile = np.tile( + np.percentile(y_true, q=alpha * 100, axis=0), (len(y_true), 1) + ) + else: + y_quantile = np.tile( + _weighted_percentile( + y_true, sample_weight=sample_weight, percentile_rank=alpha * 100 + ), + (len(y_true), 1), + ) + + denominator = mean_pinball_loss( + y_true, + y_quantile, + sample_weight=sample_weight, + alpha=alpha, + multioutput="raw_values", + ) + + nonzero_numerator = numerator != 0 + nonzero_denominator = denominator != 0 + valid_score = nonzero_numerator & nonzero_denominator + output_scores = np.ones(y_true.shape[1]) + + output_scores[valid_score] = 1 - (numerator[valid_score] / denominator[valid_score]) + output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0 + + if isinstance(multioutput, str): + if multioutput == "raw_values": + # return scores individually + return output_scores + else: # multioutput == "uniform_average" + # passing None as weights to np.average results in uniform mean + avg_weights = None + else: + avg_weights = multioutput + + return float(np.average(output_scores, weights=avg_weights)) + + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + "multioutput": [ + StrOptions({"raw_values", "uniform_average"}), + "array-like", + ], + }, + prefer_skip_nested_validation=True, +) +def d2_absolute_error_score( + y_true, y_pred, *, sample_weight=None, multioutput="uniform_average" +): + """ + :math:`D^2` regression score function, fraction of absolute error explained. + + Best possible score is 1.0 and it can be negative (because the model can be + arbitrarily worse). A model that always uses the empirical median of `y_true` + as constant prediction, disregarding the input features, + gets a :math:`D^2` score of 0.0. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.1 + + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + multioutput : {'raw_values', 'uniform_average'} or array-like of shape \ + (n_outputs,), default='uniform_average' + Defines aggregating of multiple output values. + Array-like value defines weights used to average scores. + + 'raw_values' : + Returns a full set of errors in case of multioutput input. + + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + + Returns + ------- + score : float or ndarray of floats + The :math:`D^2` score with an absolute error deviance + or ndarray of scores if 'multioutput' is 'raw_values'. + + Notes + ----- + Like :math:`R^2`, :math:`D^2` score may be negative + (it need not actually be the square of a quantity D). + + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + + References + ---------- + .. [1] Eq. 
(3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J. + Wainwright. "Statistical Learning with Sparsity: The Lasso and + Generalizations." (2015). https://hastie.su.domains/StatLearnSparsity/ + + Examples + -------- + >>> from sklearn.metrics import d2_absolute_error_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> d2_absolute_error_score(y_true, y_pred) + 0.764... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> d2_absolute_error_score(y_true, y_pred, multioutput='uniform_average') + 0.691... + >>> d2_absolute_error_score(y_true, y_pred, multioutput='raw_values') + array([0.8125 , 0.57142857]) + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> d2_absolute_error_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> d2_absolute_error_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [3, 2, 1] + >>> d2_absolute_error_score(y_true, y_pred) + -1.0 + """ + return d2_pinball_score( + y_true, y_pred, sample_weight=sample_weight, alpha=0.5, multioutput=multioutput + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py new file mode 100644 index 0000000000000000000000000000000000000000..08e5a20187de7f5c15985ed337603f442bda9fec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py @@ -0,0 +1,1166 @@ +""" +The :mod:`sklearn.metrics.scorer` submodule implements a flexible +interface for model selection and evaluation using +arbitrary score functions. + +A scorer object is a callable that can be passed to +:class:`~sklearn.model_selection.GridSearchCV` or +:func:`sklearn.model_selection.cross_val_score` as the ``scoring`` +parameter, to specify how a model should be evaluated. + +The signature of the call is ``(estimator, X, y)`` where ``estimator`` +is the model to be evaluated, ``X`` is the test data and ``y`` is the +ground truth labeling (or ``None`` in the case of unsupervised models). +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy +import warnings +from collections import Counter +from functools import partial +from inspect import signature +from numbers import Integral +from traceback import format_exc + +import numpy as np + +from ..base import is_regressor +from ..utils import Bunch +from ..utils._param_validation import HasMethods, Hidden, StrOptions, validate_params +from ..utils._response import _get_response_values +from ..utils.metadata_routing import ( + MetadataRequest, + MetadataRouter, + MethodMapping, + _MetadataRequester, + _raise_for_params, + _routing_enabled, + get_routing_for_object, + process_routing, +) +from ..utils.validation import _check_response_method +from . 
import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + class_likelihood_ratios, + d2_absolute_error_score, + explained_variance_score, + f1_score, + jaccard_score, + log_loss, + matthews_corrcoef, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_gamma_deviance, + mean_poisson_deviance, + mean_squared_error, + mean_squared_log_error, + median_absolute_error, + precision_score, + r2_score, + recall_score, + roc_auc_score, + root_mean_squared_error, + root_mean_squared_log_error, + top_k_accuracy_score, +) +from .cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + completeness_score, + fowlkes_mallows_score, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + rand_score, + v_measure_score, +) + + +def _cached_call(cache, estimator, response_method, *args, **kwargs): + """Call estimator with method and args and kwargs.""" + if cache is not None and response_method in cache: + return cache[response_method] + + result, _ = _get_response_values( + estimator, *args, response_method=response_method, **kwargs + ) + + if cache is not None: + cache[response_method] = result + + return result + + +class _MultimetricScorer: + """Callable for multimetric scoring used to avoid repeated calls + to `predict_proba`, `predict`, and `decision_function`. + + `_MultimetricScorer` will return a dictionary of scores corresponding to + the scorers in the dictionary. Note that `_MultimetricScorer` can be + created with a dictionary with one key (i.e. only one actual scorer). + + Parameters + ---------- + scorers : dict + Dictionary mapping names to callable scorers. + + raise_exc : bool, default=True + Whether to raise the exception in `__call__` or not. If set to `False` + a formatted string of the exception details is passed as result of + the failing scorer. + """ + + def __init__(self, *, scorers, raise_exc=True): + self._scorers = scorers + self._raise_exc = raise_exc + + def __call__(self, estimator, *args, **kwargs): + """Evaluate predicted target values.""" + scores = {} + cache = {} if self._use_cache(estimator) else None + cached_call = partial(_cached_call, cache) + + if _routing_enabled(): + routed_params = process_routing(self, "score", **kwargs) + else: + # Scorers all get the same args, and get all of them except sample_weight. + # Only the ones having `sample_weight` in their signature will receive it. + # This does not work for metadata other than sample_weight, and for those + # users have to enable metadata routing. 
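The `_MultimetricScorer` path above is what users reach indirectly when they pass several metrics to a model-selection helper. A minimal sketch of that public route (the dataset, estimator and metric names here are illustrative, not taken from this diff)::

    from sklearn.datasets import load_iris
    from sklearn.model_selection import cross_validate
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    results = cross_validate(
        DecisionTreeClassifier(random_state=0),
        X,
        y,
        scoring={"acc": "accuracy", "bal_acc": "balanced_accuracy"},
        cv=3,
    )
    # result keys include fit_time, score_time, test_acc, test_bal_acc
    print(sorted(results))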
+ common_kwargs = { + arg: value for arg, value in kwargs.items() if arg != "sample_weight" + } + routed_params = Bunch( + **{name: Bunch(score=common_kwargs.copy()) for name in self._scorers} + ) + if "sample_weight" in kwargs: + for name, scorer in self._scorers.items(): + if scorer._accept_sample_weight(): + routed_params[name].score["sample_weight"] = kwargs[ + "sample_weight" + ] + + for name, scorer in self._scorers.items(): + try: + if isinstance(scorer, _BaseScorer): + score = scorer._score( + cached_call, estimator, *args, **routed_params.get(name).score + ) + else: + score = scorer(estimator, *args, **routed_params.get(name).score) + scores[name] = score + except Exception as e: + if self._raise_exc: + raise e + else: + scores[name] = format_exc() + return scores + + def __repr__(self): + scorers = ", ".join([f'"{s}"' for s in self._scorers]) + return f"MultiMetricScorer({scorers})" + + def _accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + return any(scorer._accept_sample_weight() for scorer in self._scorers.values()) + + def _use_cache(self, estimator): + """Return True if using a cache is beneficial, thus when a response method will + be called several time. + """ + if len(self._scorers) == 1: # Only one scorer + return False + + counter = Counter( + [ + _check_response_method(estimator, scorer._response_method).__name__ + for scorer in self._scorers.values() + if isinstance(scorer, _BaseScorer) + ] + ) + if any(val > 1 for val in counter.values()): + # The exact same response method or iterable of response methods + # will be called more than once. + return True + + return False + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.3 + + Returns + ------- + routing : MetadataRouter + A :class:`~utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + return MetadataRouter(owner=self.__class__.__name__).add( + **self._scorers, + method_mapping=MethodMapping().add(caller="score", callee="score"), + ) + + +class _BaseScorer(_MetadataRequester): + """Base scorer that is used as `scorer(estimator, X, y_true)`. + + Parameters + ---------- + score_func : callable + The score function to use. It will be called as + `score_func(y_true, y_pred, **kwargs)`. + + sign : int + Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. + Thus, `sign` defined if higher scores are better or worse. + + kwargs : dict + Additional parameters to pass to the score function. + + response_method : str + The method to call on the estimator to get the response values. 
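For context on how a `_BaseScorer`-style object behaves once built, a small sketch using the public `make_scorer` entry point; the classifier and data are placeholders chosen only for illustration::

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import fbeta_score, make_scorer

    X, y = make_classification(random_state=0)
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    ftwo_scorer = make_scorer(fbeta_score, beta=2)
    # the scorer is called as scorer(estimator, X, y); it calls clf.predict(X)
    # internally and applies fbeta_score to the predictions
    print(ftwo_scorer(clf, X, y))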
+ """ + + def __init__(self, score_func, sign, kwargs, response_method="predict"): + self._score_func = score_func + self._sign = sign + self._kwargs = kwargs + self._response_method = response_method + # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) + self._deprecation_msg = None + + def _get_pos_label(self): + if "pos_label" in self._kwargs: + return self._kwargs["pos_label"] + score_func_params = signature(self._score_func).parameters + if "pos_label" in score_func_params: + return score_func_params["pos_label"].default + return None + + def _accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + return "sample_weight" in signature(self._score_func).parameters + + def __repr__(self): + sign_string = "" if self._sign > 0 else ", greater_is_better=False" + response_method_string = f", response_method={self._response_method!r}" + kwargs_string = "".join([f", {k}={v}" for k, v in self._kwargs.items()]) + + return ( + f"make_scorer({self._score_func.__name__}{sign_string}" + f"{response_method_string}{kwargs_string})" + ) + + def __call__(self, estimator, X, y_true, sample_weight=None, **kwargs): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + estimator : object + Trained estimator to use for scoring. Must have a predict_proba + method; the output of that is used to compute the score. + + X : {array-like, sparse matrix} + Test data that will be fed to estimator.predict. + + y_true : array-like + Gold standard target values for X. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Only available if `enable_metadata_routing=True`. See the + :ref:`User Guide `. + + .. versionadded:: 1.3 + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + # TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) + if self._deprecation_msg is not None: + warnings.warn( + self._deprecation_msg, category=DeprecationWarning, stacklevel=2 + ) + + _raise_for_params(kwargs, self, None) + + _kwargs = copy.deepcopy(kwargs) + if sample_weight is not None: + _kwargs["sample_weight"] = sample_weight + + return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs) + + def _warn_overlap(self, message, kwargs): + """Warn if there is any overlap between ``self._kwargs`` and ``kwargs``. + + This method is intended to be used to check for overlap between + ``self._kwargs`` and ``kwargs`` passed as metadata. + """ + _kwargs = set() if self._kwargs is None else set(self._kwargs.keys()) + overlap = _kwargs.intersection(kwargs.keys()) + if overlap: + warnings.warn( + f"{message} Overlapping parameters are: {overlap}", UserWarning + ) + + def set_score_request(self, **kwargs): + """Set requested parameters by the scorer. + + Please see :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.3 + + Parameters + ---------- + kwargs : dict + Arguments should be of the form ``param_name=alias``, and `alias` + can be one of ``{True, False, None, str}``. + """ + if not _routing_enabled(): + raise RuntimeError( + "This method is only available when metadata routing is enabled." + " You can enable it using" + " sklearn.set_config(enable_metadata_routing=True)." 
+ ) + + self._warn_overlap( + message=( + "You are setting metadata request for parameters which are " + "already set as kwargs for this metric. These set values will be " + "overridden by passed metadata if provided. Please pass them either " + "as metadata or kwargs to `make_scorer`." + ), + kwargs=kwargs, + ) + self._metadata_request = MetadataRequest(owner=self.__class__.__name__) + for param, alias in kwargs.items(): + self._metadata_request.score.add_request(param=param, alias=alias) + return self + + +class _Scorer(_BaseScorer): + def _score(self, method_caller, estimator, X, y_true, **kwargs): + """Evaluate the response method of `estimator` on `X` and `y_true`. + + Parameters + ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + + estimator : object + Trained estimator to use for scoring. + + X : {array-like, sparse matrix} + Test data that will be fed to clf.decision_function or + clf.predict_proba. + + y_true : array-like + Gold standard target values for X. These must be class labels, + not decision function values. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Returns + ------- + score : float + Score function applied to prediction of estimator on X. + """ + self._warn_overlap( + message=( + "There is an overlap between set kwargs of this scorer instance and" + " passed metadata. Please pass them either as kwargs to `make_scorer`" + " or metadata, but not both." + ), + kwargs=kwargs, + ) + + pos_label = None if is_regressor(estimator) else self._get_pos_label() + response_method = _check_response_method(estimator, self._response_method) + y_pred = method_caller( + estimator, + _get_response_method_name(response_method), + X, + pos_label=pos_label, + ) + + scoring_kwargs = {**self._kwargs, **kwargs} + return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs) + + +@validate_params( + { + "scoring": [str, callable, None], + }, + prefer_skip_nested_validation=True, +) +def get_scorer(scoring): + """Get a scorer from string. + + Read more in the :ref:`User Guide `. + :func:`~sklearn.metrics.get_scorer_names` can be used to retrieve the names + of all available scorers. + + Parameters + ---------- + scoring : str, callable or None + Scoring method as string. If callable it is returned as is. + If None, returns None. + + Returns + ------- + scorer : callable + The scorer. + + Notes + ----- + When passed a string, this function always returns a copy of the scorer + object. Calling `get_scorer` twice for the same scorer results in two + separate scorer objects. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.dummy import DummyClassifier + >>> from sklearn.metrics import get_scorer + >>> X = np.reshape([0, 1, -1, -0.5, 2], (-1, 1)) + >>> y = np.array([0, 1, 1, 0, 1]) + >>> classifier = DummyClassifier(strategy="constant", constant=0).fit(X, y) + >>> accuracy = get_scorer("accuracy") + >>> accuracy(classifier, X, y) + 0.4 + """ + if isinstance(scoring, str): + try: + if scoring == "max_error": + # TODO (1.8): scoring="max_error" has been deprecated in 1.6, + # remove in 1.8 + scorer = max_error_scorer + else: + scorer = copy.deepcopy(_SCORERS[scoring]) + except KeyError: + raise ValueError( + "%r is not a valid scoring value. " + "Use sklearn.metrics.get_scorer_names() " + "to get valid options." 
% scoring + ) + else: + scorer = scoring + return scorer + + +class _PassthroughScorer(_MetadataRequester): + # Passes scoring of estimator's `score` method back to estimator if scoring + # is `None`. + + def __init__(self, estimator): + self._estimator = estimator + + requests = MetadataRequest(owner=self.__class__.__name__) + try: + requests.score = copy.deepcopy(estimator._metadata_request.score) + except AttributeError: + try: + requests.score = copy.deepcopy(estimator._get_default_requests().score) + except AttributeError: + pass + + self._metadata_request = requests + + def __call__(self, estimator, *args, **kwargs): + """Method that wraps estimator.score""" + return estimator.score(*args, **kwargs) + + def __repr__(self): + return f"{self._estimator.__class__}.score" + + def _accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + return "sample_weight" in signature(self._estimator.score).parameters + + def get_metadata_routing(self): + """Get requested data properties. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.3 + + Returns + ------- + routing : MetadataRouter + A :class:`~utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + return get_routing_for_object(self._metadata_request) + + def set_score_request(self, **kwargs): + """Set requested parameters by the scorer. + + Please see :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.5 + + Parameters + ---------- + kwargs : dict + Arguments should be of the form ``param_name=alias``, and `alias` + can be one of ``{True, False, None, str}``. + """ + if not _routing_enabled(): + raise RuntimeError( + "This method is only available when metadata routing is enabled." + " You can enable it using" + " sklearn.set_config(enable_metadata_routing=True)." + ) + + for param, alias in kwargs.items(): + self._metadata_request.score.add_request(param=param, alias=alias) + return self + + +def _check_multimetric_scoring(estimator, scoring): + """Check the scoring parameter in cases when multiple metrics are allowed. + + In addition, multimetric scoring leverages a caching mechanism to not call the same + estimator response method multiple times. Hence, the scorer is modified to only use + a single response method given a list of response methods and the estimator. + + Parameters + ---------- + estimator : sklearn estimator instance + The estimator for which the scoring will be applied. + + scoring : list, tuple or dict + Strategy to evaluate the performance of the cross-validated model on + the test set. + + The possibilities are: + + - a list or tuple of unique strings; + - a callable returning a dictionary where they keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables a values. + + See :ref:`multimetric_grid_search` for an example. + + Returns + ------- + scorers_dict : dict + A dict mapping each scorer name to its validated scorer. + """ + err_msg_generic = ( + f"scoring is invalid (got {scoring!r}). Refer to the " + "scoring glossary for details: " + "https://scikit-learn.org/stable/glossary.html#term-scoring" + ) + + if isinstance(scoring, (list, tuple, set)): + err_msg = ( + "The list/tuple elements must be unique strings of predefined scorers. 
" + ) + try: + keys = set(scoring) + except TypeError as e: + raise ValueError(err_msg) from e + + if len(keys) != len(scoring): + raise ValueError( + f"{err_msg} Duplicate elements were found in" + f" the given list. {scoring!r}" + ) + elif len(keys) > 0: + if not all(isinstance(k, str) for k in keys): + if any(callable(k) for k in keys): + raise ValueError( + f"{err_msg} One or more of the elements " + "were callables. Use a dict of score " + "name mapped to the scorer callable. " + f"Got {scoring!r}" + ) + else: + raise ValueError( + f"{err_msg} Non-string types were found " + f"in the given list. Got {scoring!r}" + ) + scorers = { + scorer: check_scoring(estimator, scoring=scorer) for scorer in scoring + } + else: + raise ValueError(f"{err_msg} Empty list was given. {scoring!r}") + + elif isinstance(scoring, dict): + keys = set(scoring) + if not all(isinstance(k, str) for k in keys): + raise ValueError( + "Non-string types were found in the keys of " + f"the given dict. scoring={scoring!r}" + ) + if len(keys) == 0: + raise ValueError(f"An empty dict was passed. {scoring!r}") + scorers = { + key: check_scoring(estimator, scoring=scorer) + for key, scorer in scoring.items() + } + else: + raise ValueError(err_msg_generic) + + return scorers + + +def _get_response_method_name(response_method): + try: + return response_method.__name__ + except AttributeError: + return _get_response_method_name(response_method.func) + + +@validate_params( + { + "score_func": [callable], + "response_method": [ + None, + list, + tuple, + StrOptions({"predict", "predict_proba", "decision_function"}), + Hidden(StrOptions({"default"})), + ], + "greater_is_better": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def make_scorer( + score_func, *, response_method="default", greater_is_better=True, **kwargs +): + """Make a scorer from a performance metric or loss function. + + A scorer is a wrapper around an arbitrary metric or loss function that is called + with the signature `scorer(estimator, X, y_true, **kwargs)`. + + It is accepted in all scikit-learn estimators or functions allowing a `scoring` + parameter. + + The parameter `response_method` allows to specify which method of the estimator + should be used to feed the scoring/loss function. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + score_func : callable + Score function (or loss function) with signature + ``score_func(y, y_pred, **kwargs)``. + + response_method : {"predict_proba", "decision_function", "predict"} or \ + list/tuple of such str, default=None + + Specifies the response method to use get prediction from an estimator + (i.e. :term:`predict_proba`, :term:`decision_function` or + :term:`predict`). Possible choices are: + + - if `str`, it corresponds to the name to the method to return; + - if a list or tuple of `str`, it provides the method names in order of + preference. The method returned corresponds to the first method in + the list and which is implemented by `estimator`. + - if `None`, it is equivalent to `"predict"`. + + .. versionadded:: 1.4 + + .. deprecated:: 1.6 + None is equivalent to 'predict' and is deprecated. It will be removed in + version 1.8. + + greater_is_better : bool, default=True + Whether `score_func` is a score function (default), meaning high is + good, or a loss function, meaning low is good. In the latter case, the + scorer object will sign-flip the outcome of the `score_func`. + + **kwargs : additional arguments + Additional parameters to be passed to `score_func`. 
+ + Returns + ------- + scorer : callable + Callable object that returns a scalar score; greater is better. + + Examples + -------- + >>> from sklearn.metrics import fbeta_score, make_scorer + >>> ftwo_scorer = make_scorer(fbeta_score, beta=2) + >>> ftwo_scorer + make_scorer(fbeta_score, response_method='predict', beta=2) + >>> from sklearn.model_selection import GridSearchCV + >>> from sklearn.svm import LinearSVC + >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]}, + ... scoring=ftwo_scorer) + """ + sign = 1 if greater_is_better else -1 + + if response_method is None: + warnings.warn( + "response_method=None is deprecated in version 1.6 and will be removed " + "in version 1.8. Leave it to its default value to avoid this warning.", + FutureWarning, + ) + response_method = "predict" + elif response_method == "default": + response_method = "predict" + + return _Scorer(score_func, sign, kwargs, response_method) + + +# Standard regression scores +explained_variance_scorer = make_scorer(explained_variance_score) +r2_scorer = make_scorer(r2_score) +neg_max_error_scorer = make_scorer(max_error, greater_is_better=False) +max_error_scorer = make_scorer(max_error, greater_is_better=False) +# TODO (1.8): remove in 1.8 (scoring="max_error" has been deprecated in 1.6) +deprecation_msg = ( + "Scoring method max_error was renamed to " + "neg_max_error in version 1.6 and will " + "be removed in 1.8." +) +max_error_scorer._deprecation_msg = deprecation_msg +neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False) +neg_mean_squared_log_error_scorer = make_scorer( + mean_squared_log_error, greater_is_better=False +) +neg_mean_absolute_error_scorer = make_scorer( + mean_absolute_error, greater_is_better=False +) +neg_mean_absolute_percentage_error_scorer = make_scorer( + mean_absolute_percentage_error, greater_is_better=False +) +neg_median_absolute_error_scorer = make_scorer( + median_absolute_error, greater_is_better=False +) +neg_root_mean_squared_error_scorer = make_scorer( + root_mean_squared_error, greater_is_better=False +) +neg_root_mean_squared_log_error_scorer = make_scorer( + root_mean_squared_log_error, greater_is_better=False +) +neg_mean_poisson_deviance_scorer = make_scorer( + mean_poisson_deviance, greater_is_better=False +) + +neg_mean_gamma_deviance_scorer = make_scorer( + mean_gamma_deviance, greater_is_better=False +) +d2_absolute_error_scorer = make_scorer(d2_absolute_error_score) + +# Standard Classification Scores +accuracy_scorer = make_scorer(accuracy_score) +balanced_accuracy_scorer = make_scorer(balanced_accuracy_score) +matthews_corrcoef_scorer = make_scorer(matthews_corrcoef) + + +def positive_likelihood_ratio(y_true, y_pred): + return class_likelihood_ratios(y_true, y_pred, replace_undefined_by=1.0)[0] + + +def negative_likelihood_ratio(y_true, y_pred): + return class_likelihood_ratios(y_true, y_pred, replace_undefined_by=1.0)[1] + + +positive_likelihood_ratio_scorer = make_scorer(positive_likelihood_ratio) +neg_negative_likelihood_ratio_scorer = make_scorer( + negative_likelihood_ratio, greater_is_better=False +) + +# Score functions that need decision values +top_k_accuracy_scorer = make_scorer( + top_k_accuracy_score, + greater_is_better=True, + response_method=("decision_function", "predict_proba"), +) +roc_auc_scorer = make_scorer( + roc_auc_score, + greater_is_better=True, + response_method=("decision_function", "predict_proba"), +) +average_precision_scorer = make_scorer( + average_precision_score, + 
response_method=("decision_function", "predict_proba"), +) +roc_auc_ovo_scorer = make_scorer( + roc_auc_score, response_method="predict_proba", multi_class="ovo" +) +roc_auc_ovo_weighted_scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovo", + average="weighted", +) +roc_auc_ovr_scorer = make_scorer( + roc_auc_score, response_method="predict_proba", multi_class="ovr" +) +roc_auc_ovr_weighted_scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovr", + average="weighted", +) + +# Score function for probabilistic classification +neg_log_loss_scorer = make_scorer( + log_loss, greater_is_better=False, response_method="predict_proba" +) +neg_brier_score_scorer = make_scorer( + brier_score_loss, greater_is_better=False, response_method="predict_proba" +) +brier_score_loss_scorer = make_scorer( + brier_score_loss, greater_is_better=False, response_method="predict_proba" +) + + +# Clustering scores +adjusted_rand_scorer = make_scorer(adjusted_rand_score) +rand_scorer = make_scorer(rand_score) +homogeneity_scorer = make_scorer(homogeneity_score) +completeness_scorer = make_scorer(completeness_score) +v_measure_scorer = make_scorer(v_measure_score) +mutual_info_scorer = make_scorer(mutual_info_score) +adjusted_mutual_info_scorer = make_scorer(adjusted_mutual_info_score) +normalized_mutual_info_scorer = make_scorer(normalized_mutual_info_score) +fowlkes_mallows_scorer = make_scorer(fowlkes_mallows_score) + + +_SCORERS = dict( + explained_variance=explained_variance_scorer, + r2=r2_scorer, + neg_max_error=neg_max_error_scorer, + matthews_corrcoef=matthews_corrcoef_scorer, + neg_median_absolute_error=neg_median_absolute_error_scorer, + neg_mean_absolute_error=neg_mean_absolute_error_scorer, + neg_mean_absolute_percentage_error=neg_mean_absolute_percentage_error_scorer, + neg_mean_squared_error=neg_mean_squared_error_scorer, + neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, + neg_root_mean_squared_error=neg_root_mean_squared_error_scorer, + neg_root_mean_squared_log_error=neg_root_mean_squared_log_error_scorer, + neg_mean_poisson_deviance=neg_mean_poisson_deviance_scorer, + neg_mean_gamma_deviance=neg_mean_gamma_deviance_scorer, + d2_absolute_error_score=d2_absolute_error_scorer, + accuracy=accuracy_scorer, + top_k_accuracy=top_k_accuracy_scorer, + roc_auc=roc_auc_scorer, + roc_auc_ovr=roc_auc_ovr_scorer, + roc_auc_ovo=roc_auc_ovo_scorer, + roc_auc_ovr_weighted=roc_auc_ovr_weighted_scorer, + roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, + balanced_accuracy=balanced_accuracy_scorer, + average_precision=average_precision_scorer, + neg_log_loss=neg_log_loss_scorer, + neg_brier_score=neg_brier_score_scorer, + positive_likelihood_ratio=positive_likelihood_ratio_scorer, + neg_negative_likelihood_ratio=neg_negative_likelihood_ratio_scorer, + # Cluster metrics that use supervised evaluation + adjusted_rand_score=adjusted_rand_scorer, + rand_score=rand_scorer, + homogeneity_score=homogeneity_scorer, + completeness_score=completeness_scorer, + v_measure_score=v_measure_scorer, + mutual_info_score=mutual_info_scorer, + adjusted_mutual_info_score=adjusted_mutual_info_scorer, + normalized_mutual_info_score=normalized_mutual_info_scorer, + fowlkes_mallows_score=fowlkes_mallows_scorer, +) + + +def get_scorer_names(): + """Get the names of all available scorers. + + These names can be passed to :func:`~sklearn.metrics.get_scorer` to + retrieve the scorer object. 
+ + Returns + ------- + list of str + Names of all available scorers. + + Examples + -------- + >>> from sklearn.metrics import get_scorer_names + >>> all_scorers = get_scorer_names() + >>> type(all_scorers) + + >>> all_scorers[:3] + ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score'] + >>> "roc_auc" in all_scorers + True + """ + return sorted(_SCORERS.keys()) + + +for name, metric in [ + ("precision", precision_score), + ("recall", recall_score), + ("f1", f1_score), + ("jaccard", jaccard_score), +]: + _SCORERS[name] = make_scorer(metric, average="binary") + for average in ["macro", "micro", "samples", "weighted"]: + qualified_name = "{0}_{1}".format(name, average) + _SCORERS[qualified_name] = make_scorer(metric, pos_label=None, average=average) + + +@validate_params( + { + "estimator": [HasMethods("fit"), None], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + set, + tuple, + dict, + None, + ], + "allow_none": ["boolean"], + "raise_exc": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def check_scoring(estimator=None, scoring=None, *, allow_none=False, raise_exc=True): + """Determine scorer from user options. + + A TypeError will be thrown if the estimator cannot be scored. + + Parameters + ---------- + estimator : estimator object implementing 'fit' or None, default=None + The object to use to fit the data. If `None`, then this function may error + depending on `allow_none`. + + scoring : str, callable, list, tuple, set, or dict, default=None + Scorer to use. If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value; + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list, tuple or set of unique strings; + - a callable returning a dictionary where the keys are the metric names and the + values are the metric scorers; + - a dictionary with metric names as keys and callables a values. The callables + need to have the signature `callable(estimator, X, y)`. + + allow_none : bool, default=False + Whether to return None or raise an error if no `scoring` is specified and the + estimator has no `score` method. + + raise_exc : bool, default=True + Whether to raise an exception (if a subset of the scorers in multimetric scoring + fails) or to return an error code. + + - If set to `True`, raises the failing scorer's exception. + - If set to `False`, a formatted string of the exception details is passed as + result of the failing scorer(s). + + This applies if `scoring` is list, tuple, set, or dict. Ignored if `scoring` is + a str or a callable. + + .. versionadded:: 1.6 + + Returns + ------- + scoring : callable + A scorer callable object / function with signature ``scorer(estimator, X, y)``. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.metrics import check_scoring + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> classifier = DecisionTreeClassifier(max_depth=2).fit(X, y) + >>> scorer = check_scoring(classifier, scoring='accuracy') + >>> scorer(classifier, X, y) + 0.96... + + >>> from sklearn.metrics import make_scorer, accuracy_score, mean_squared_log_error + >>> X, y = load_iris(return_X_y=True) + >>> y *= -1 + >>> clf = DecisionTreeClassifier().fit(X, y) + >>> scoring = { + ... "accuracy": make_scorer(accuracy_score), + ... 
"mean_squared_log_error": make_scorer(mean_squared_log_error), + ... } + >>> scoring_call = check_scoring(estimator=clf, scoring=scoring, raise_exc=False) + >>> scores = scoring_call(clf, X, y) + >>> scores + {'accuracy': 1.0, 'mean_squared_log_error': 'Traceback ...'} + """ + if isinstance(scoring, str): + return get_scorer(scoring) + if callable(scoring): + # Heuristic to ensure user has not passed a metric + module = getattr(scoring, "__module__", None) + if ( + hasattr(module, "startswith") + and module.startswith("sklearn.metrics.") + and not module.startswith("sklearn.metrics._scorer") + and not module.startswith("sklearn.metrics.tests.") + ): + raise ValueError( + "scoring value %r looks like it is a metric " + "function rather than a scorer. A scorer should " + "require an estimator as its first parameter. " + "Please use `make_scorer` to convert a metric " + "to a scorer." % scoring + ) + return get_scorer(scoring) + if isinstance(scoring, (list, tuple, set, dict)): + scorers = _check_multimetric_scoring(estimator, scoring=scoring) + return _MultimetricScorer(scorers=scorers, raise_exc=raise_exc) + if scoring is None: + if hasattr(estimator, "score"): + return _PassthroughScorer(estimator) + elif allow_none: + return None + else: + raise TypeError( + "If no scoring is specified, the estimator passed should " + "have a 'score' method. The estimator %r does not." % estimator + ) + + +def _threshold_scores_to_class_labels(y_score, threshold, classes, pos_label): + """Threshold `y_score` and return the associated class labels.""" + if pos_label is None: + map_thresholded_score_to_label = np.array([0, 1]) + else: + pos_label_idx = np.flatnonzero(classes == pos_label)[0] + neg_label_idx = np.flatnonzero(classes != pos_label)[0] + map_thresholded_score_to_label = np.array([neg_label_idx, pos_label_idx]) + + return classes[map_thresholded_score_to_label[(y_score >= threshold).astype(int)]] + + +class _CurveScorer(_BaseScorer): + """Scorer taking a continuous response and output a score for each threshold. + + Parameters + ---------- + score_func : callable + The score function to use. It will be called as + `score_func(y_true, y_pred, **kwargs)`. + + sign : int + Either 1 or -1 to returns the score with `sign * score_func(estimator, X, y)`. + Thus, `sign` defined if higher scores are better or worse. + + kwargs : dict + Additional parameters to pass to the score function. + + thresholds : int or array-like + Related to the number of decision thresholds for which we want to compute the + score. If an integer, it will be used to generate `thresholds` thresholds + uniformly distributed between the minimum and maximum predicted scores. If an + array-like, it will be used as the thresholds. + + response_method : str + The method to call on the estimator to get the response values. 
+ """ + + def __init__(self, score_func, sign, kwargs, thresholds, response_method): + super().__init__( + score_func=score_func, + sign=sign, + kwargs=kwargs, + response_method=response_method, + ) + self._thresholds = thresholds + + @classmethod + def from_scorer(cls, scorer, response_method, thresholds): + """Create a continuous scorer from a normal scorer.""" + instance = cls( + score_func=scorer._score_func, + sign=scorer._sign, + response_method=response_method, + thresholds=thresholds, + kwargs=scorer._kwargs, + ) + # transfer the metadata request + instance._metadata_request = scorer._get_metadata_request() + return instance + + def _score(self, method_caller, estimator, X, y_true, **kwargs): + """Evaluate predicted target values for X relative to y_true. + + Parameters + ---------- + method_caller : callable + Returns predictions given an estimator, method name, and other + arguments, potentially caching results. + + estimator : object + Trained estimator to use for scoring. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Test data that will be fed to estimator.predict. + + y_true : array-like of shape (n_samples,) + Gold standard target values for X. + + **kwargs : dict + Other parameters passed to the scorer. Refer to + :func:`set_score_request` for more details. + + Returns + ------- + scores : ndarray of shape (thresholds,) + The scores associated to each threshold. + + potential_thresholds : ndarray of shape (thresholds,) + The potential thresholds used to compute the scores. + """ + pos_label = self._get_pos_label() + y_score = method_caller( + estimator, self._response_method, X, pos_label=pos_label + ) + + scoring_kwargs = {**self._kwargs, **kwargs} + if isinstance(self._thresholds, Integral): + potential_thresholds = np.linspace( + np.min(y_score), np.max(y_score), self._thresholds + ) + else: + potential_thresholds = np.asarray(self._thresholds) + score_thresholds = [ + self._sign + * self._score_func( + y_true, + _threshold_scores_to_class_labels( + y_score, th, estimator.classes_, pos_label + ), + **scoring_kwargs, + ) + for th in potential_thresholds + ] + return np.array(score_thresholds), potential_thresholds diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..76020d80f8eb02a4647dada4415e5286a0bebe59 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__init__.py @@ -0,0 +1,55 @@ +"""Evaluation metrics for cluster analysis results. + +- Supervised evaluation uses a ground truth class values for each sample. +- Unsupervised evaluation does not use ground truths and measures the "quality" of the + model itself. 
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bicluster import consensus_score +from ._supervised import ( + adjusted_mutual_info_score, + adjusted_rand_score, + completeness_score, + contingency_matrix, + entropy, + expected_mutual_information, + fowlkes_mallows_score, + homogeneity_completeness_v_measure, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + pair_confusion_matrix, + rand_score, + v_measure_score, +) +from ._unsupervised import ( + calinski_harabasz_score, + davies_bouldin_score, + silhouette_samples, + silhouette_score, +) + +__all__ = [ + "adjusted_mutual_info_score", + "adjusted_rand_score", + "calinski_harabasz_score", + "completeness_score", + "consensus_score", + "contingency_matrix", + "davies_bouldin_score", + "entropy", + "expected_mutual_information", + "fowlkes_mallows_score", + "homogeneity_completeness_v_measure", + "homogeneity_score", + "mutual_info_score", + "normalized_mutual_info_score", + "pair_confusion_matrix", + "rand_score", + "silhouette_samples", + "silhouette_score", + "v_measure_score", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c69a2d027da84ba1b8addf91ea51b3cc6a80268 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_bicluster.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_bicluster.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f14a46c9f9f89d38b13f343e03698f2b94a604f Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_bicluster.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_supervised.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_supervised.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..447798eca5602bc47102cf654fff33295f94aa54 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_supervised.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_unsupervised.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_unsupervised.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..729e240f5a483c5c76692a99a0476d02a5fd68c5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/__pycache__/_unsupervised.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..bb306c025b69466817de26661eaf286ea59024bc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_bicluster.py @@ -0,0 +1,114 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy.optimize import linear_sum_assignment + +from ...utils._param_validation import StrOptions, validate_params 
+from ...utils.validation import check_array, check_consistent_length + +__all__ = ["consensus_score"] + + +def _check_rows_and_columns(a, b): + """Unpacks the row and column arrays and checks their shape.""" + check_consistent_length(*a) + check_consistent_length(*b) + checks = lambda x: check_array(x, ensure_2d=False) + a_rows, a_cols = map(checks, a) + b_rows, b_cols = map(checks, b) + return a_rows, a_cols, b_rows, b_cols + + +def _jaccard(a_rows, a_cols, b_rows, b_cols): + """Jaccard coefficient on the elements of the two biclusters.""" + intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum() + + a_size = a_rows.sum() * a_cols.sum() + b_size = b_rows.sum() * b_cols.sum() + + return intersection / (a_size + b_size - intersection) + + +def _pairwise_similarity(a, b, similarity): + """Computes pairwise similarity matrix. + + result[i, j] is the Jaccard coefficient of a's bicluster i and b's + bicluster j. + + """ + a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b) + n_a = a_rows.shape[0] + n_b = b_rows.shape[0] + result = np.array( + [ + [similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)] + for i in range(n_a) + ] + ) + return result + + +@validate_params( + { + "a": [tuple], + "b": [tuple], + "similarity": [callable, StrOptions({"jaccard"})], + }, + prefer_skip_nested_validation=True, +) +def consensus_score(a, b, *, similarity="jaccard"): + """The similarity of two sets of biclusters. + + Similarity between individual biclusters is computed. Then the best + matching between sets is found by solving a linear sum assignment problem, + using a modified Jonker-Volgenant algorithm. + The final score is the sum of similarities divided by the size of + the larger set. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + a : tuple (rows, columns) + Tuple of row and column indicators for a set of biclusters. + + b : tuple (rows, columns) + Another set of biclusters like ``a``. + + similarity : 'jaccard' or callable, default='jaccard' + May be the string "jaccard" to use the Jaccard coefficient, or + any function that takes four arguments, each of which is a 1d + indicator vector: (a_rows, a_columns, b_rows, b_columns). + + Returns + ------- + consensus_score : float + Consensus score, a non-negative value, sum of similarities + divided by size of larger set. + + See Also + -------- + scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem. + + References + ---------- + * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis + for bicluster acquisition + `__. 
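In practice `consensus_score` is usually fed the `biclusters_` of a fitted biclustering estimator; a sketch under that assumption (synthetic data and `SpectralCoclustering` chosen only for illustration)::

    from sklearn.cluster import SpectralCoclustering
    from sklearn.datasets import make_biclusters
    from sklearn.metrics import consensus_score

    data, rows, cols = make_biclusters(shape=(100, 100), n_clusters=3, random_state=0)
    model = SpectralCoclustering(n_clusters=3, random_state=0).fit(data)
    # compare the recovered biclusters against the generated ones
    print(consensus_score(model.biclusters_, (rows, cols)))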
+ + Examples + -------- + >>> from sklearn.metrics import consensus_score + >>> a = ([[True, False], [False, True]], [[False, True], [True, False]]) + >>> b = ([[False, True], [True, False]], [[True, False], [False, True]]) + >>> consensus_score(a, b, similarity='jaccard') + 1.0 + """ + if similarity == "jaccard": + similarity = _jaccard + matrix = _pairwise_similarity(a, b, similarity) + row_indices, col_indices = linear_sum_assignment(1.0 - matrix) + n_a = len(a[0]) + n_b = len(b[0]) + return float(matrix[row_indices, col_indices].sum() / max(n_a, n_b)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..3d51def36c255b7479fea1ae516fdc47c0c4faeb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_expected_mutual_info_fast.pyx @@ -0,0 +1,69 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from libc.math cimport exp, lgamma + +from ...utils._typedefs cimport float64_t, int64_t + +import numpy as np +from scipy.special import gammaln + + +def expected_mutual_information(contingency, int64_t n_samples): + """Calculate the expected mutual information for two labelings.""" + cdef: + float64_t emi = 0 + int64_t n_rows, n_cols + float64_t term2, term3, gln + int64_t[::1] a_view, b_view + float64_t[::1] term1 + float64_t[::1] gln_a, gln_b, gln_Na, gln_Nb, gln_Nnij, log_Nnij + float64_t[::1] log_a, log_b + Py_ssize_t i, j, nij + int64_t start, end + + n_rows, n_cols = contingency.shape + a = np.ravel(contingency.sum(axis=1).astype(np.int64, copy=False)) + b = np.ravel(contingency.sum(axis=0).astype(np.int64, copy=False)) + a_view = a + b_view = b + + # any labelling with zero entropy implies EMI = 0 + if a.size == 1 or b.size == 1: + return 0.0 + + # There are three major terms to the EMI equation, which are multiplied to + # and then summed over varying nij values. + # While nijs[0] will never be used, having it simplifies the indexing. + nijs = np.arange(0, max(np.max(a), np.max(b)) + 1, dtype='float') + nijs[0] = 1 # Stops divide by zero warnings. As its not used, no issue. + # term1 is nij / N + term1 = nijs / n_samples + # term2 is log((N*nij) / (a * b)) == log(N * nij) - log(a * b) + log_a = np.log(a) + log_b = np.log(b) + # term2 uses log(N * nij) = log(N) + log(nij) + log_Nnij = np.log(n_samples) + np.log(nijs) + # term3 is large, and involved many factorials. Calculate these in log + # space to stop overflows. + gln_a = gammaln(a + 1) + gln_b = gammaln(b + 1) + gln_Na = gammaln(n_samples - a + 1) + gln_Nb = gammaln(n_samples - b + 1) + gln_Nnij = gammaln(nijs + 1) + gammaln(n_samples + 1) + + # emi itself is a summation over the various values. + for i in range(n_rows): + for j in range(n_cols): + start = max(1, a_view[i] - n_samples + b_view[j]) + end = min(a_view[i], b_view[j]) + 1 + for nij in range(start, end): + term2 = log_Nnij[nij] - log_a[i] - log_b[j] + # Numerators are positive, denominators are negative. 
+ gln = (gln_a[i] + gln_b[j] + gln_Na[i] + gln_Nb[j] + - gln_Nnij[nij] - lgamma(a_view[i] - nij + 1) + - lgamma(b_view[j] - nij + 1) + - lgamma(n_samples - a_view[i] - b_view[j] + nij + 1)) + term3 = exp(gln) + emi += (term1[nij] * term2 * term3) + return emi diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_supervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_supervised.py new file mode 100644 index 0000000000000000000000000000000000000000..ccc11d752adbacd4960592e154a2c886276bc3f9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_supervised.py @@ -0,0 +1,1314 @@ +"""Utilities to evaluate the clustering performance of models. + +Functions named as *_score return a scalar value to maximize: the higher the +better. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from math import log +from numbers import Real + +import numpy as np +from scipy import sparse as sp + +from ...utils._array_api import _max_precision_float_dtype, get_namespace_and_device +from ...utils._param_validation import Hidden, Interval, StrOptions, validate_params +from ...utils.multiclass import type_of_target +from ...utils.validation import check_array, check_consistent_length +from ._expected_mutual_info_fast import expected_mutual_information + + +def check_clusterings(labels_true, labels_pred): + """Check that the labels arrays are 1D and of same dimension. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + The true labels. + + labels_pred : array-like of shape (n_samples,) + The predicted labels. + """ + labels_true = check_array( + labels_true, + ensure_2d=False, + ensure_min_samples=0, + dtype=None, + ) + + labels_pred = check_array( + labels_pred, + ensure_2d=False, + ensure_min_samples=0, + dtype=None, + ) + + type_label = type_of_target(labels_true) + type_pred = type_of_target(labels_pred) + + if "continuous" in (type_pred, type_label): + msg = ( + "Clustering metrics expects discrete values but received" + f" {type_label} values for label, and {type_pred} values " + "for target" + ) + warnings.warn(msg, UserWarning) + + # input checks + if labels_true.ndim != 1: + raise ValueError("labels_true must be 1D: shape is %r" % (labels_true.shape,)) + if labels_pred.ndim != 1: + raise ValueError("labels_pred must be 1D: shape is %r" % (labels_pred.shape,)) + check_consistent_length(labels_true, labels_pred) + + return labels_true, labels_pred + + +def _generalized_average(U, V, average_method): + """Return a particular mean of two numbers.""" + if average_method == "min": + return min(U, V) + elif average_method == "geometric": + return np.sqrt(U * V) + elif average_method == "arithmetic": + return np.mean([U, V]) + elif average_method == "max": + return max(U, V) + else: + raise ValueError( + "'average_method' must be 'min', 'geometric', 'arithmetic', or 'max'" + ) + + +@validate_params( + { + "labels_true": ["array-like", None], + "labels_pred": ["array-like", None], + "eps": [Interval(Real, 0, None, closed="left"), None], + "sparse": ["boolean"], + "dtype": "no_validation", # delegate the validation to SciPy + }, + prefer_skip_nested_validation=True, +) +def contingency_matrix( + labels_true, labels_pred, *, eps=None, sparse=False, dtype=np.int64 +): + """Build a contingency matrix describing the relationship between labels. + + Read more in the :ref:`User Guide `. 
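The Cython `expected_mutual_information` above is consumed by the chance-adjusted mutual information metric; a brief usage sketch of that public function (labelings are illustrative)::

    from sklearn.metrics import adjusted_mutual_info_score, mutual_info_score

    labels_true = [0, 0, 1, 1, 2, 2]
    labels_pred = [0, 0, 1, 1, 1, 2]
    print(mutual_info_score(labels_true, labels_pred))           # raw MI
    print(adjusted_mutual_info_score(labels_true, labels_pred))  # corrected using the expected MI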
+ + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + eps : float, default=None + If a float, that value is added to all values in the contingency + matrix. This helps to stop NaN propagation. + If ``None``, nothing is adjusted. + + sparse : bool, default=False + If `True`, return a sparse CSR contingency matrix. If `eps` is not + `None` and `sparse` is `True` will raise ValueError. + + .. versionadded:: 0.18 + + dtype : numeric type, default=np.int64 + Output dtype. Ignored if `eps` is not `None`. + + .. versionadded:: 0.24 + + Returns + ------- + contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred] + Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in + true class :math:`i` and in predicted class :math:`j`. If + ``eps is None``, the dtype of this array will be integer unless set + otherwise with the ``dtype`` argument. If ``eps`` is given, the dtype + will be float. + Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``. + + Examples + -------- + >>> from sklearn.metrics.cluster import contingency_matrix + >>> labels_true = [0, 0, 1, 1, 2, 2] + >>> labels_pred = [1, 0, 2, 1, 0, 2] + >>> contingency_matrix(labels_true, labels_pred) + array([[1, 1, 0], + [0, 1, 1], + [1, 0, 1]]) + """ + + if eps is not None and sparse: + raise ValueError("Cannot set 'eps' when sparse=True") + + classes, class_idx = np.unique(labels_true, return_inverse=True) + clusters, cluster_idx = np.unique(labels_pred, return_inverse=True) + n_classes = classes.shape[0] + n_clusters = clusters.shape[0] + # Using coo_matrix to accelerate simple histogram calculation, + # i.e. bins are consecutive integers + # Currently, coo_matrix is faster than histogram2d for simple cases + contingency = sp.coo_matrix( + (np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), + shape=(n_classes, n_clusters), + dtype=dtype, + ) + if sparse: + contingency = contingency.tocsr() + contingency.sum_duplicates() + else: + contingency = contingency.toarray() + if eps is not None: + # don't use += as contingency is integer + contingency = contingency + eps + return contingency + + +# clustering measures + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def pair_confusion_matrix(labels_true, labels_pred): + """Pair confusion matrix arising from two clusterings. + + The pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix + between two clusterings by considering all pairs of samples and counting + pairs that are assigned into the same or into different clusters under + the true and predicted clusterings [1]_. + + Considering a pair of samples that is clustered together a positive pair, + then as in binary classification the count of true negatives is + :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is + :math:`C_{11}` and false positives is :math:`C_{01}`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=integral + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,), dtype=integral + Cluster labels to evaluate. + + Returns + ------- + C : ndarray of shape (2, 2), dtype=np.int64 + The contingency matrix. + + See Also + -------- + sklearn.metrics.rand_score : Rand Score. 
+ sklearn.metrics.adjusted_rand_score : Adjusted Rand Score. + sklearn.metrics.adjusted_mutual_info_score : Adjusted Mutual Information. + + References + ---------- + .. [1] :doi:`Hubert, L., Arabie, P. "Comparing partitions." + Journal of Classification 2, 193–218 (1985). + <10.1007/BF01908075>` + + Examples + -------- + Perfectly matching labelings have all non-zero entries on the + diagonal regardless of actual label values: + + >>> from sklearn.metrics.cluster import pair_confusion_matrix + >>> pair_confusion_matrix([0, 0, 1, 1], [1, 1, 0, 0]) + array([[8, 0], + [0, 4]]... + + Labelings that assign all classes members to the same clusters + are complete but may be not always pure, hence penalized, and + have some off-diagonal non-zero entries: + + >>> pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1]) + array([[8, 2], + [0, 2]]... + + Note that the matrix is not symmetric. + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + n_samples = np.int64(labels_true.shape[0]) + + # Computation using the contingency data + contingency = contingency_matrix( + labels_true, labels_pred, sparse=True, dtype=np.int64 + ) + n_c = np.ravel(contingency.sum(axis=1)) + n_k = np.ravel(contingency.sum(axis=0)) + sum_squares = (contingency.data**2).sum() + C = np.empty((2, 2), dtype=np.int64) + C[1, 1] = sum_squares - n_samples + C[0, 1] = contingency.dot(n_k).sum() - sum_squares + C[1, 0] = contingency.transpose().dot(n_c).sum() - sum_squares + C[0, 0] = n_samples**2 - C[0, 1] - C[1, 0] - sum_squares + return C + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def rand_score(labels_true, labels_pred): + """Rand index. + + The Rand Index computes a similarity measure between two clusterings + by considering all pairs of samples and counting pairs that are + assigned in the same or different clusters in the predicted and + true clusterings [1]_ [2]_. + + The raw RI score [3]_ is: + + .. code-block:: text + + RI = (number of agreeing pairs) / (number of pairs) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=integral + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,), dtype=integral + Cluster labels to evaluate. + + Returns + ------- + RI : float + Similarity score between 0.0 and 1.0, inclusive, 1.0 stands for + perfect match. + + See Also + -------- + adjusted_rand_score: Adjusted Rand Score. + adjusted_mutual_info_score: Adjusted Mutual Information. + + References + ---------- + .. [1] :doi:`Hubert, L., Arabie, P. "Comparing partitions." + Journal of Classification 2, 193–218 (1985). + <10.1007/BF01908075>`. + + .. [2] `Wikipedia: Simple Matching Coefficient + `_ + + .. 
[3] `Wikipedia: Rand Index `_ + + Examples + -------- + Perfectly matching labelings have a score of 1 even + + >>> from sklearn.metrics.cluster import rand_score + >>> rand_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Labelings that assign all classes members to the same clusters + are complete but may not always be pure, hence penalized: + + >>> rand_score([0, 0, 1, 2], [0, 0, 1, 1]) + 0.83 + """ + contingency = pair_confusion_matrix(labels_true, labels_pred) + numerator = contingency.diagonal().sum() + denominator = contingency.sum() + + if numerator == denominator or denominator == 0: + # Special limit cases: no clustering since the data is not split; + # or trivial clustering where each document is assigned a unique + # cluster. These are perfect matches hence return 1.0. + return 1.0 + + return float(numerator / denominator) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def adjusted_rand_score(labels_true, labels_pred): + """Rand index adjusted for chance. + + The Rand Index computes a similarity measure between two clusterings + by considering all pairs of samples and counting pairs that are + assigned in the same or different clusters in the predicted and + true clusterings. + + The raw RI score is then "adjusted for chance" into the ARI score + using the following scheme:: + + ARI = (RI - Expected_RI) / (max(RI) - Expected_RI) + + The adjusted Rand index is thus ensured to have a value close to + 0.0 for random labeling independently of the number of clusters and + samples and exactly 1.0 when the clusterings are identical (up to + a permutation). The adjusted Rand index is bounded below by -0.5 for + especially discordant clusterings. + + ARI is a symmetric measure:: + + adjusted_rand_score(a, b) == adjusted_rand_score(b, a) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=int + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,), dtype=int + Cluster labels to evaluate. + + Returns + ------- + ARI : float + Similarity score between -0.5 and 1.0. Random labelings have an ARI + close to 0.0. 1.0 stands for perfect match. + + See Also + -------- + adjusted_mutual_info_score : Adjusted Mutual Information. + + References + ---------- + .. [Hubert1985] L. Hubert and P. Arabie, Comparing Partitions, + Journal of Classification 1985 + https://link.springer.com/article/10.1007%2FBF01908075 + + .. [Steinley2004] D. Steinley, Properties of the Hubert-Arabie + adjusted Rand index, Psychological Methods 2004 + + .. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index + + .. [Chacon] :doi:`Minimum adjusted Rand index for two clusterings of a given size, + 2022, J. E. Chacón and A. I. 
Rastrojo <10.1007/s11634-022-00491-w>` + + Examples + -------- + Perfectly matching labelings have a score of 1 even + + >>> from sklearn.metrics.cluster import adjusted_rand_score + >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Labelings that assign all classes members to the same clusters + are complete but may not always be pure, hence penalized:: + + >>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1]) + 0.57 + + ARI is symmetric, so labelings that have pure clusters with members + coming from the same classes but unnecessary splits are penalized:: + + >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2]) + 0.57 + + If classes members are completely split across different clusters, the + assignment is totally incomplete, hence the ARI is very low:: + + >>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + + ARI may take a negative value for especially discordant labelings that + are a worse choice than the expected value of random labels:: + + >>> adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]) + -0.5 + + See :ref:`sphx_glr_auto_examples_cluster_plot_adjusted_for_chance_measures.py` + for a more detailed example. + """ + (tn, fp), (fn, tp) = pair_confusion_matrix(labels_true, labels_pred) + # convert to Python integer types, to avoid overflow or underflow + tn, fp, fn, tp = int(tn), int(fp), int(fn), int(tp) + + # Special cases: empty data or full agreement + if fn == 0 and fp == 0: + return 1.0 + + return 2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "beta": [Interval(Real, 0, None, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): + """Compute the homogeneity and completeness and V-Measure scores at once. + + Those metrics are based on normalized conditional entropy measures of + the clustering labeling to evaluate given the knowledge of a Ground + Truth class labels of the same samples. + + A clustering result satisfies homogeneity if all of its clusters + contain only data points which are members of a single class. + + A clustering result satisfies completeness if all the data points + that are members of a given class are elements of the same cluster. + + Both scores have positive values between 0.0 and 1.0, larger values + being desirable. + + Those 3 metrics are independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score values in any way. + + V-Measure is furthermore symmetric: swapping ``labels_true`` and + ``label_pred`` will give the same score. This does not hold for + homogeneity and completeness. V-Measure is identical to + :func:`normalized_mutual_info_score` with the arithmetic averaging + method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + beta : float, default=1.0 + Ratio of weight attributed to ``homogeneity`` vs ``completeness``. + If ``beta`` is greater than 1, ``completeness`` is weighted more + strongly in the calculation. If ``beta`` is less than 1, + ``homogeneity`` is weighted more strongly. + + Returns + ------- + homogeneity : float + Score between 0.0 and 1.0. 
1.0 stands for perfectly homogeneous labeling. + + completeness : float + Score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling. + + v_measure : float + Harmonic mean of the first two. + + See Also + -------- + homogeneity_score : Homogeneity metric of cluster labeling. + completeness_score : Completeness metric of cluster labeling. + v_measure_score : V-Measure (NMI with arithmetic mean option). + + Examples + -------- + >>> from sklearn.metrics import homogeneity_completeness_v_measure + >>> y_true, y_pred = [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 2, 2] + >>> homogeneity_completeness_v_measure(y_true, y_pred) + (0.71, 0.771, 0.74) + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + + if len(labels_true) == 0: + return 1.0, 1.0, 1.0 + + entropy_C = entropy(labels_true) + entropy_K = entropy(labels_pred) + + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + MI = mutual_info_score(None, None, contingency=contingency) + + homogeneity = MI / (entropy_C) if entropy_C else 1.0 + completeness = MI / (entropy_K) if entropy_K else 1.0 + + if homogeneity + completeness == 0.0: + v_measure_score = 0.0 + else: + v_measure_score = ( + (1 + beta) + * homogeneity + * completeness + / (beta * homogeneity + completeness) + ) + + return float(homogeneity), float(completeness), float(v_measure_score) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def homogeneity_score(labels_true, labels_pred): + """Homogeneity metric of a cluster labeling given a ground truth. + + A clustering result satisfies homogeneity if all of its clusters + contain only data points which are members of a single class. + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is not symmetric: switching ``label_true`` with ``label_pred`` + will return the :func:`completeness_score` which will be different in + general. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + Returns + ------- + homogeneity : float + Score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling. + + See Also + -------- + completeness_score : Completeness metric of cluster labeling. + v_measure_score : V-Measure (NMI with arithmetic mean option). + + References + ---------- + + .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A + conditional entropy-based external cluster evaluation measure + `_ + + Examples + -------- + + Perfect labelings are homogeneous:: + + >>> from sklearn.metrics.cluster import homogeneity_score + >>> homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Non-perfect labelings that further split classes into more clusters can be + perfectly homogeneous:: + + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2])) + 1.000000 + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3])) + 1.000000 + + Clusters that include samples from different classes do not make for an + homogeneous labeling:: + + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1])) + 0.0... + >>> print("%.6f" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0])) + 0.0... 
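+
+    As an editorial sketch (not part of the upstream docstring): homogeneity
+    and completeness are duals, so swapping the argument order of one
+    reproduces the other up to floating-point noise::
+
+      >>> from sklearn.metrics.cluster import completeness_score, homogeneity_score
+      >>> y_true, y_pred = [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 2, 2]
+      >>> h = homogeneity_score(y_true, y_pred)
+      >>> c = completeness_score(y_pred, y_true)
+      >>> round(h, 6) == round(c, 6)
+      True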
+ """ + return homogeneity_completeness_v_measure(labels_true, labels_pred)[0] + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def completeness_score(labels_true, labels_pred): + """Compute completeness metric of a cluster labeling given a ground truth. + + A clustering result satisfies completeness if all the data points + that are members of a given class are elements of the same cluster. + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is not symmetric: switching ``label_true`` with ``label_pred`` + will return the :func:`homogeneity_score` which will be different in + general. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. + + Returns + ------- + completeness : float + Score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling. + + See Also + -------- + homogeneity_score : Homogeneity metric of cluster labeling. + v_measure_score : V-Measure (NMI with arithmetic mean option). + + References + ---------- + + .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A + conditional entropy-based external cluster evaluation measure + `_ + + Examples + -------- + + Perfect labelings are complete:: + + >>> from sklearn.metrics.cluster import completeness_score + >>> completeness_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Non-perfect labelings that assign all classes members to the same clusters + are still complete:: + + >>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0])) + 1.0 + >>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1])) + 0.999 + + If classes members are split across different clusters, the + assignment cannot be complete:: + + >>> print(completeness_score([0, 0, 1, 1], [0, 1, 0, 1])) + 0.0 + >>> print(completeness_score([0, 0, 0, 0], [0, 1, 2, 3])) + 0.0 + """ + return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "beta": [Interval(Real, 0, None, closed="left")], + }, + prefer_skip_nested_validation=True, +) +def v_measure_score(labels_true, labels_pred, *, beta=1.0): + """V-measure cluster labeling given a ground truth. + + This score is identical to :func:`normalized_mutual_info_score` with + the ``'arithmetic'`` option for averaging. + + The V-measure is the harmonic mean between homogeneity and completeness:: + + v = (1 + beta) * homogeneity * completeness + / (beta * homogeneity + completeness) + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching ``label_true`` with + ``label_pred`` will return the same score value. This can be useful to + measure the agreement of two independent label assignments strategies + on the same dataset when the real ground truth is not known. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,) + Ground truth class labels to be used as a reference. + + labels_pred : array-like of shape (n_samples,) + Cluster labels to evaluate. 
+ + beta : float, default=1.0 + Ratio of weight attributed to ``homogeneity`` vs ``completeness``. + If ``beta`` is greater than 1, ``completeness`` is weighted more + strongly in the calculation. If ``beta`` is less than 1, + ``homogeneity`` is weighted more strongly. + + Returns + ------- + v_measure : float + Score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling. + + See Also + -------- + homogeneity_score : Homogeneity metric of cluster labeling. + completeness_score : Completeness metric of cluster labeling. + normalized_mutual_info_score : Normalized Mutual Information. + + References + ---------- + + .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A + conditional entropy-based external cluster evaluation measure + `_ + + Examples + -------- + Perfect labelings are both homogeneous and complete, hence have score 1.0:: + + >>> from sklearn.metrics.cluster import v_measure_score + >>> v_measure_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> v_measure_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + Labelings that assign all classes members to the same clusters + are complete but not homogeneous, hence penalized:: + + >>> print("%.6f" % v_measure_score([0, 0, 1, 2], [0, 0, 1, 1])) + 0.8 + >>> print("%.6f" % v_measure_score([0, 1, 2, 3], [0, 0, 1, 1])) + 0.67 + + Labelings that have pure clusters with members coming from the same + classes are homogeneous but un-necessary splits harm completeness + and thus penalize V-measure as well:: + + >>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 0, 1, 2])) + 0.8 + >>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 1, 2, 3])) + 0.67 + + If classes members are completely split across different clusters, + the assignment is totally incomplete, hence the V-Measure is null:: + + >>> print("%.6f" % v_measure_score([0, 0, 0, 0], [0, 1, 2, 3])) + 0.0 + + Clusters that include samples from totally different classes totally + destroy the homogeneity of the labeling, hence:: + + >>> print("%.6f" % v_measure_score([0, 0, 1, 1], [0, 0, 0, 0])) + 0.0 + """ + return homogeneity_completeness_v_measure(labels_true, labels_pred, beta=beta)[2] + + +@validate_params( + { + "labels_true": ["array-like", None], + "labels_pred": ["array-like", None], + "contingency": ["array-like", "sparse matrix", None], + }, + prefer_skip_nested_validation=True, +) +def mutual_info_score(labels_true, labels_pred, *, contingency=None): + """Mutual Information between two clusterings. + + The Mutual Information is a measure of the similarity between two labels + of the same data. Where :math:`|U_i|` is the number of the samples + in cluster :math:`U_i` and :math:`|V_j|` is the number of the + samples in cluster :math:`V_j`, the Mutual Information + between clusterings :math:`U` and :math:`V` is given as: + + .. math:: + + MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N} + \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|} + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching :math:`U` (i.e + ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the + same score value. This can be useful to measure the agreement of two + independent label assignments strategies on the same dataset when the + real ground truth is not known. + + Read more in the :ref:`User Guide `. 
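+
+    As a quick editorial check of the formula above (not upstream text), the
+    score can be reproduced from the contingency counts with plain NumPy; the
+    counts below correspond to the labelings used in the Examples section
+    further down::
+
+      >>> import numpy as np
+      >>> n_ij = np.array([[2, 1], [1, 2]])
+      >>> N, a, b = n_ij.sum(), n_ij.sum(axis=1), n_ij.sum(axis=0)
+      >>> round(float((n_ij / N * np.log(N * n_ij / np.outer(a, b))).sum()), 4)
+      0.0566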
+ + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=integral + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. + + labels_pred : array-like of shape (n_samples,), dtype=integral + A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. + + contingency : {array-like, sparse matrix} of shape \ + (n_classes_true, n_classes_pred), default=None + A contingency matrix given by the + :func:`~sklearn.metrics.cluster.contingency_matrix` function. If value + is ``None``, it will be computed, otherwise the given value is used, + with ``labels_true`` and ``labels_pred`` ignored. + + Returns + ------- + mi : float + Mutual information, a non-negative value, measured in nats using the + natural logarithm. + + See Also + -------- + adjusted_mutual_info_score : Adjusted against chance Mutual Information. + normalized_mutual_info_score : Normalized Mutual Information. + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + + Examples + -------- + >>> from sklearn.metrics import mutual_info_score + >>> labels_true = [0, 1, 1, 0, 1, 0] + >>> labels_pred = [0, 1, 0, 0, 1, 1] + >>> mutual_info_score(labels_true, labels_pred) + 0.0566 + """ + if contingency is None: + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + else: + contingency = check_array( + contingency, + accept_sparse=["csr", "csc", "coo"], + dtype=[int, np.int32, np.int64], + ) + + if isinstance(contingency, np.ndarray): + # For an array + nzx, nzy = np.nonzero(contingency) + nz_val = contingency[nzx, nzy] + else: + # For a sparse matrix + nzx, nzy, nz_val = sp.find(contingency) + + contingency_sum = contingency.sum() + pi = np.ravel(contingency.sum(axis=1)) + pj = np.ravel(contingency.sum(axis=0)) + + # Since MI <= min(H(X), H(Y)), any labelling with zero entropy, i.e. containing a + # single cluster, implies MI = 0 + if pi.size == 1 or pj.size == 1: + return 0.0 + + log_contingency_nm = np.log(nz_val) + contingency_nm = nz_val / contingency_sum + # Don't need to calculate the full outer product, just for non-zeroes + outer = pi.take(nzx).astype(np.int64, copy=False) * pj.take(nzy).astype( + np.int64, copy=False + ) + log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum()) + mi = ( + contingency_nm * (log_contingency_nm - log(contingency_sum)) + + contingency_nm * log_outer + ) + mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi) + return float(np.clip(mi.sum(), 0.0, None)) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "average_method": [StrOptions({"arithmetic", "max", "min", "geometric"})], + }, + prefer_skip_nested_validation=True, +) +def adjusted_mutual_info_score( + labels_true, labels_pred, *, average_method="arithmetic" +): + """Adjusted Mutual Information between two clusterings. + + Adjusted Mutual Information (AMI) is an adjustment of the Mutual + Information (MI) score to account for chance. It accounts for the fact that + the MI is generally higher for two clusterings with a larger number of + clusters, regardless of whether there is actually more information shared. 
+ For two clusterings :math:`U` and :math:`V`, the AMI is given as:: + + AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))] + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching :math:`U` (``label_true``) + with :math:`V` (``labels_pred``) will return the same score value. This can + be useful to measure the agreement of two independent label assignments + strategies on the same dataset when the real ground truth is not known. + + Be mindful that this function is an order of magnitude slower than other + metrics, such as the Adjusted Rand Index. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets, called :math:`U` in + the above formula. + + labels_pred : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets, called :math:`V` in + the above formula. + + average_method : {'min', 'geometric', 'arithmetic', 'max'}, default='arithmetic' + How to compute the normalizer in the denominator. + + .. versionadded:: 0.20 + + .. versionchanged:: 0.22 + The default value of ``average_method`` changed from 'max' to + 'arithmetic'. + + Returns + ------- + ami: float (upperlimited by 1.0) + The AMI returns a value of 1 when the two partitions are identical + (ie perfectly matched). Random partitions (independent labellings) have + an expected AMI around 0 on average hence can be negative. The value is + in adjusted nats (based on the natural logarithm). + + See Also + -------- + adjusted_rand_score : Adjusted Rand Index. + mutual_info_score : Mutual Information (not adjusted for chance). + + References + ---------- + .. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for + Clusterings Comparison: Variants, Properties, Normalization and + Correction for Chance, JMLR + `_ + + .. [2] `Wikipedia entry for the Adjusted Mutual Information + `_ + + Examples + -------- + + Perfect labelings are both homogeneous and complete, hence have + score 1.0:: + + >>> from sklearn.metrics.cluster import adjusted_mutual_info_score + >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + If classes members are completely split across different clusters, + the assignment is totally in-complete, hence the AMI is null:: + + >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + n_samples = labels_true.shape[0] + classes = np.unique(labels_true) + clusters = np.unique(labels_pred) + + # Special limit cases: no clustering since the data is not split. + # It corresponds to both labellings having zero entropy. + # This is a perfect match hence return 1.0. + if ( + classes.shape[0] == clusters.shape[0] == 1 + or classes.shape[0] == clusters.shape[0] == 0 + ): + return 1.0 + # if there is only one class or one cluster return 0.0. 
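+    # Editorial note: when exactly one of the labelings collapses to a single
+    # label it carries zero entropy, so both MI and its expected value are 0;
+    # returning 0.0 here also sidesteps the degenerate normalizer that the
+    # 'min' and 'geometric' averaging methods would otherwise produce below.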
+ elif classes.shape[0] == 1 or clusters.shape[0] == 1: + return 0.0 + + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + # Calculate the MI for the two clusterings + mi = mutual_info_score(labels_true, labels_pred, contingency=contingency) + # Calculate the expected value for the mutual information + emi = expected_mutual_information(contingency, n_samples) + # Calculate entropy for each labeling + h_true, h_pred = entropy(labels_true), entropy(labels_pred) + normalizer = _generalized_average(h_true, h_pred, average_method) + denominator = normalizer - emi + # Avoid 0.0 / 0.0 when expectation equals maximum, i.e. a perfect match. + # normalizer should always be >= emi, but because of floating-point + # representation, sometimes emi is slightly larger. Correct this + # by preserving the sign. + if denominator < 0: + denominator = min(denominator, -np.finfo("float64").eps) + else: + denominator = max(denominator, np.finfo("float64").eps) + # The same applies analogously to mi and emi. + numerator = mi - emi + if numerator < 0: + numerator = min(numerator, -np.finfo("float64").eps) + else: + numerator = max(numerator, np.finfo("float64").eps) + return float(numerator / denominator) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "average_method": [StrOptions({"arithmetic", "max", "min", "geometric"})], + }, + prefer_skip_nested_validation=True, +) +def normalized_mutual_info_score( + labels_true, labels_pred, *, average_method="arithmetic" +): + """Normalized Mutual Information between two clusterings. + + Normalized Mutual Information (NMI) is a normalization of the Mutual + Information (MI) score to scale the results between 0 (no mutual + information) and 1 (perfect correlation). In this function, mutual + information is normalized by some generalized mean of ``H(labels_true)`` + and ``H(labels_pred))``, defined by the `average_method`. + + This measure is not adjusted for chance. Therefore + :func:`adjusted_mutual_info_score` might be preferred. + + This metric is independent of the absolute values of the labels: + a permutation of the class or cluster label values won't change the + score value in any way. + + This metric is furthermore symmetric: switching ``label_true`` with + ``label_pred`` will return the same score value. This can be useful to + measure the agreement of two independent label assignments strategies + on the same dataset when the real ground truth is not known. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets. + + labels_pred : int array-like of shape (n_samples,) + A clustering of the data into disjoint subsets. + + average_method : {'min', 'geometric', 'arithmetic', 'max'}, default='arithmetic' + How to compute the normalizer in the denominator. + + .. versionadded:: 0.20 + + .. versionchanged:: 0.22 + The default value of ``average_method`` changed from 'geometric' to + 'arithmetic'. + + Returns + ------- + nmi : float + Score between 0.0 and 1.0 in normalized nats (based on the natural + logarithm). 1.0 stands for perfectly complete labeling. + + See Also + -------- + v_measure_score : V-Measure (NMI with arithmetic mean option). + adjusted_rand_score : Adjusted Rand Index. + adjusted_mutual_info_score : Adjusted Mutual Information (adjusted + against chance). 
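+
+    Notes
+    -----
+    As an illustrative sketch added by the editor (not upstream text), the
+    arithmetic-mean normalization can be reproduced by hand from
+    :func:`mutual_info_score` and the label entropies::
+
+      >>> from sklearn.metrics.cluster import entropy, mutual_info_score
+      >>> a, b = [0, 0, 1, 1], [0, 0, 1, 2]
+      >>> mi = mutual_info_score(a, b)
+      >>> round(mi / ((entropy(a) + entropy(b)) / 2), 2)
+      0.8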
+ + Examples + -------- + + Perfect labelings are both homogeneous and complete, hence have + score 1.0:: + + >>> from sklearn.metrics.cluster import normalized_mutual_info_score + >>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + If classes members are completely split across different clusters, + the assignment is totally in-complete, hence the NMI is null:: + + >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + """ + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + classes = np.unique(labels_true) + clusters = np.unique(labels_pred) + + # Special limit cases: no clustering since the data is not split. + # It corresponds to both labellings having zero entropy. + # This is a perfect match hence return 1.0. + if ( + classes.shape[0] == clusters.shape[0] == 1 + or classes.shape[0] == clusters.shape[0] == 0 + ): + return 1.0 + + contingency = contingency_matrix(labels_true, labels_pred, sparse=True) + contingency = contingency.astype(np.float64, copy=False) + # Calculate the MI for the two clusterings + mi = mutual_info_score(labels_true, labels_pred, contingency=contingency) + + # At this point mi = 0 can't be a perfect match (the special case of a single + # cluster has been dealt with before). Hence, if mi = 0, the nmi must be 0 whatever + # the normalization. + if mi == 0: + return 0.0 + + # Calculate entropy for each labeling + h_true, h_pred = entropy(labels_true), entropy(labels_pred) + + normalizer = _generalized_average(h_true, h_pred, average_method) + return float(mi / normalizer) + + +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "sparse": ["boolean", Hidden(StrOptions({"deprecated"}))], + }, + prefer_skip_nested_validation=True, +) +def fowlkes_mallows_score(labels_true, labels_pred, *, sparse="deprecated"): + """Measure the similarity of two clusterings of a set of points. + + .. versionadded:: 0.18 + + The Fowlkes-Mallows index (FMI) is defined as the geometric mean of + the precision and recall:: + + FMI = TP / sqrt((TP + FP) * (TP + FN)) + + Where ``TP`` is the number of **True Positive** (i.e. the number of pairs of + points that belong to the same cluster in both ``labels_true`` and + ``labels_pred``), ``FP`` is the number of **False Positive** (i.e. the + number of pairs of points that belong to the same cluster in + ``labels_pred`` but not in ``labels_true``) and ``FN`` is the number of + **False Negative** (i.e. the number of pairs of points that belong to the + same cluster in ``labels_true`` but not in ``labels_pred``). + + The score ranges from 0 to 1. A high value indicates a good similarity + between two clusters. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + labels_true : array-like of shape (n_samples,), dtype=int + A clustering of the data into disjoint subsets. + + labels_pred : array-like of shape (n_samples,), dtype=int + A clustering of the data into disjoint subsets. + + sparse : bool, default=False + Compute contingency matrix internally with sparse matrix. + + .. deprecated:: 1.7 + The ``sparse`` parameter is deprecated and will be removed in 1.9. It has + no effect. + + Returns + ------- + score : float + The resulting Fowlkes-Mallows score. + + References + ---------- + .. [1] `E. B. Fowkles and C. L. Mallows, 1983. "A method for comparing two + hierarchical clusterings". Journal of the American Statistical + Association + `_ + + .. 
[2] `Wikipedia entry for the Fowlkes-Mallows Index + `_ + + Examples + -------- + + Perfect labelings are both homogeneous and complete, hence have + score 1.0:: + + >>> from sklearn.metrics.cluster import fowlkes_mallows_score + >>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1]) + 1.0 + >>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0]) + 1.0 + + If classes members are completely split across different clusters, + the assignment is totally random, hence the FMI is null:: + + >>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3]) + 0.0 + """ + # TODO(1.9): remove the sparse parameter + if sparse != "deprecated": + warnings.warn( + "The 'sparse' parameter was deprecated in 1.7 and will be removed in 1.9. " + "It has no effect. Leave it to its default value to silence this warning.", + FutureWarning, + ) + + labels_true, labels_pred = check_clusterings(labels_true, labels_pred) + (n_samples,) = labels_true.shape + + c = contingency_matrix(labels_true, labels_pred, sparse=True) + c = c.astype(np.int64, copy=False) + tk = np.dot(c.data, c.data) - n_samples + pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples + qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples + return float(np.sqrt(tk / pk) * np.sqrt(tk / qk)) if tk != 0.0 else 0.0 + + +@validate_params( + { + "labels": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def entropy(labels): + """Calculate the entropy for a labeling. + + Parameters + ---------- + labels : array-like of shape (n_samples,), dtype=int + The labels. + + Returns + ------- + entropy : float + The entropy for a labeling. + + Notes + ----- + The logarithm used is the natural logarithm (base-e). + """ + xp, is_array_api_compliant, device_ = get_namespace_and_device(labels) + labels_len = labels.shape[0] if is_array_api_compliant else len(labels) + if labels_len == 0: + return 1.0 + + pi = xp.astype(xp.unique_counts(labels)[1], _max_precision_float_dtype(xp, device_)) + + # single cluster => zero entropy + if pi.size == 1: + return 0.0 + + pi_sum = xp.sum(pi) + # log(a / b) should be calculated as log(a) - log(b) for + # possible loss of precision + # Always convert the result as a Python scalar (on CPU) instead of a device + # specific scalar array. + return float(-xp.sum((pi / pi_sum) * (xp.log(pi) - log(pi_sum)))) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_unsupervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..38cec419e73f778ecdb7bdac89e090a26cdd794a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/_unsupervised.py @@ -0,0 +1,463 @@ +"""Unsupervised evaluation metrics.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import functools +from numbers import Integral + +import numpy as np +from scipy.sparse import issparse + +from ...preprocessing import LabelEncoder +from ...utils import _safe_indexing, check_random_state, check_X_y +from ...utils._array_api import _atol_for_type +from ...utils._param_validation import ( + Interval, + StrOptions, + validate_params, +) +from ..pairwise import _VALID_METRICS, pairwise_distances, pairwise_distances_chunked + + +def check_number_of_labels(n_labels, n_samples): + """Check that number of labels are valid. + + Parameters + ---------- + n_labels : int + Number of labels. + + n_samples : int + Number of samples. 
+ """ + if not 1 < n_labels < n_samples: + raise ValueError( + "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)" + % n_labels + ) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "labels": ["array-like"], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "sample_size": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + }, + prefer_skip_nested_validation=True, +) +def silhouette_score( + X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds +): + """Compute the mean Silhouette Coefficient of all samples. + + The Silhouette Coefficient is calculated using the mean intra-cluster + distance (``a``) and the mean nearest-cluster distance (``b``) for each + sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a, + b)``. To clarify, ``b`` is the distance between a sample and the nearest + cluster that the sample is not a part of. + Note that Silhouette Coefficient is only defined if number of labels + is ``2 <= n_labels <= n_samples - 1``. + + This function returns the mean Silhouette Coefficient over all samples. + To obtain the values for each sample, use :func:`silhouette_samples`. + + The best value is 1 and the worst value is -1. Values near 0 indicate + overlapping clusters. Negative values generally indicate that a sample has + been assigned to the wrong cluster, as a different cluster is more similar. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \ + "precomputed" or (n_samples_a, n_features) otherwise + An array of pairwise distances between samples, or a feature array. + + labels : array-like of shape (n_samples,) + Predicted labels for each sample. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`~sklearn.metrics.pairwise_distances`. If ``X`` is + the distance array itself, use ``metric="precomputed"``. + + sample_size : int, default=None + The size of the sample to use when computing the Silhouette Coefficient + on a random subset of the data. + If ``sample_size is None``, no sampling is used. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for selecting a subset of samples. + Used when ``sample_size is not None``. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a scipy.spatial.distance metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + silhouette : float + Mean Silhouette Coefficient for all samples. + + References + ---------- + + .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the + Interpretation and Validation of Cluster Analysis". Computational + and Applied Mathematics 20: 53-65. + `_ + + .. [2] `Wikipedia entry on the Silhouette Coefficient + `_ + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> from sklearn.cluster import KMeans + >>> from sklearn.metrics import silhouette_score + >>> X, y = make_blobs(random_state=42) + >>> kmeans = KMeans(n_clusters=2, random_state=42) + >>> silhouette_score(X, kmeans.fit_predict(X)) + 0.49... 
+ """ + if sample_size is not None: + X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"]) + random_state = check_random_state(random_state) + indices = random_state.permutation(X.shape[0])[:sample_size] + if metric == "precomputed": + X, labels = X[indices].T[indices].T, labels[indices] + else: + X, labels = X[indices], labels[indices] + return float(np.mean(silhouette_samples(X, labels, metric=metric, **kwds))) + + +def _silhouette_reduce(D_chunk, start, labels, label_freqs): + """Accumulate silhouette statistics for vertical chunk of X. + + Parameters + ---------- + D_chunk : {array-like, sparse matrix} of shape (n_chunk_samples, n_samples) + Precomputed distances for a chunk. If a sparse matrix is provided, + only CSR format is accepted. + start : int + First index in the chunk. + labels : array-like of shape (n_samples,) + Corresponding cluster labels, encoded as {0, ..., n_clusters-1}. + label_freqs : array-like + Distribution of cluster labels in ``labels``. + """ + n_chunk_samples = D_chunk.shape[0] + # accumulate distances from each sample to each cluster + cluster_distances = np.zeros( + (n_chunk_samples, len(label_freqs)), dtype=D_chunk.dtype + ) + + if issparse(D_chunk): + if D_chunk.format != "csr": + raise TypeError( + "Expected CSR matrix. Please pass sparse matrix in CSR format." + ) + for i in range(n_chunk_samples): + indptr = D_chunk.indptr + indices = D_chunk.indices[indptr[i] : indptr[i + 1]] + sample_weights = D_chunk.data[indptr[i] : indptr[i + 1]] + sample_labels = np.take(labels, indices) + cluster_distances[i] += np.bincount( + sample_labels, weights=sample_weights, minlength=len(label_freqs) + ) + else: + for i in range(n_chunk_samples): + sample_weights = D_chunk[i] + sample_labels = labels + cluster_distances[i] += np.bincount( + sample_labels, weights=sample_weights, minlength=len(label_freqs) + ) + + # intra_index selects intra-cluster distances within cluster_distances + end = start + n_chunk_samples + intra_index = (np.arange(n_chunk_samples), labels[start:end]) + # intra_cluster_distances are averaged over cluster size outside this function + intra_cluster_distances = cluster_distances[intra_index] + # of the remaining distances we normalise and extract the minimum + cluster_distances[intra_index] = np.inf + cluster_distances /= label_freqs + inter_cluster_distances = cluster_distances.min(axis=1) + return intra_cluster_distances, inter_cluster_distances + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "labels": ["array-like"], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + }, + prefer_skip_nested_validation=True, +) +def silhouette_samples(X, labels, *, metric="euclidean", **kwds): + """Compute the Silhouette Coefficient for each sample. + + The Silhouette Coefficient is a measure of how well samples are clustered + with samples that are similar to themselves. Clustering models with a high + Silhouette Coefficient are said to be dense, where samples in the same + cluster are similar to each other, and well separated, where samples in + different clusters are not very similar to each other. + + The Silhouette Coefficient is calculated using the mean intra-cluster + distance (``a``) and the mean nearest-cluster distance (``b``) for each + sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a, + b)``. + Note that Silhouette Coefficient is only defined if number of labels + is 2 ``<= n_labels <= n_samples - 1``. + + This function returns the Silhouette Coefficient for each sample. 
+ + The best value is 1 and the worst value is -1. Values near 0 indicate + overlapping clusters. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_a, n_samples_a) if metric == \ + "precomputed" or (n_samples_a, n_features) otherwise + An array of pairwise distances between samples, or a feature array. If + a sparse matrix is provided, CSR format should be favoured avoiding + an additional copy. + + labels : array-like of shape (n_samples,) + Label values for each sample. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`~sklearn.metrics.pairwise_distances`. + If ``X`` is the distance array itself, use "precomputed" as the metric. + Precomputed distance matrices must have 0 along the diagonal. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a ``scipy.spatial.distance`` metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + silhouette : array-like of shape (n_samples,) + Silhouette Coefficients for each sample. + + References + ---------- + + .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the + Interpretation and Validation of Cluster Analysis". Computational + and Applied Mathematics 20: 53-65. + `_ + + .. [2] `Wikipedia entry on the Silhouette Coefficient + `_ + + Examples + -------- + >>> from sklearn.metrics import silhouette_samples + >>> from sklearn.datasets import make_blobs + >>> from sklearn.cluster import KMeans + >>> X, y = make_blobs(n_samples=50, random_state=42) + >>> kmeans = KMeans(n_clusters=3, random_state=42) + >>> labels = kmeans.fit_predict(X) + >>> silhouette_samples(X, labels) + array([...]) + """ + X, labels = check_X_y(X, labels, accept_sparse=["csr"]) + + # Check for non-zero diagonal entries in precomputed distance matrix + if metric == "precomputed": + error_msg = ValueError( + "The precomputed distance matrix contains non-zero " + "elements on the diagonal. Use np.fill_diagonal(X, 0)." 
+ ) + if X.dtype.kind == "f": + atol = _atol_for_type(X.dtype) + + if np.any(np.abs(X.diagonal()) > atol): + raise error_msg + elif np.any(X.diagonal() != 0): # integral dtype + raise error_msg + + le = LabelEncoder() + labels = le.fit_transform(labels) + n_samples = len(labels) + label_freqs = np.bincount(labels) + check_number_of_labels(len(le.classes_), n_samples) + + kwds["metric"] = metric + reduce_func = functools.partial( + _silhouette_reduce, labels=labels, label_freqs=label_freqs + ) + results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds)) + intra_clust_dists, inter_clust_dists = results + intra_clust_dists = np.concatenate(intra_clust_dists) + inter_clust_dists = np.concatenate(inter_clust_dists) + + denom = (label_freqs - 1).take(labels, mode="clip") + with np.errstate(divide="ignore", invalid="ignore"): + intra_clust_dists /= denom + + sil_samples = inter_clust_dists - intra_clust_dists + with np.errstate(divide="ignore", invalid="ignore"): + sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists) + # nan values are for clusters of size 1, and should be 0 + return np.nan_to_num(sil_samples) + + +@validate_params( + { + "X": ["array-like"], + "labels": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def calinski_harabasz_score(X, labels): + """Compute the Calinski and Harabasz score. + + It is also known as the Variance Ratio Criterion. + + The score is defined as ratio of the sum of between-cluster dispersion and + of within-cluster dispersion. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + A list of ``n_features``-dimensional data points. Each row corresponds + to a single data point. + + labels : array-like of shape (n_samples,) + Predicted labels for each sample. + + Returns + ------- + score : float + The resulting Calinski-Harabasz score. + + References + ---------- + .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster + analysis". Communications in Statistics + `_ + + Examples + -------- + >>> from sklearn.datasets import make_blobs + >>> from sklearn.cluster import KMeans + >>> from sklearn.metrics import calinski_harabasz_score + >>> X, _ = make_blobs(random_state=0) + >>> kmeans = KMeans(n_clusters=3, random_state=0,).fit(X) + >>> calinski_harabasz_score(X, kmeans.labels_) + 114.8... + """ + X, labels = check_X_y(X, labels) + le = LabelEncoder() + labels = le.fit_transform(labels) + + n_samples, _ = X.shape + n_labels = len(le.classes_) + + check_number_of_labels(n_labels, n_samples) + + extra_disp, intra_disp = 0.0, 0.0 + mean = np.mean(X, axis=0) + for k in range(n_labels): + cluster_k = X[labels == k] + mean_k = np.mean(cluster_k, axis=0) + extra_disp += len(cluster_k) * np.sum((mean_k - mean) ** 2) + intra_disp += np.sum((cluster_k - mean_k) ** 2) + + return float( + 1.0 + if intra_disp == 0.0 + else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.0)) + ) + + +@validate_params( + { + "X": ["array-like"], + "labels": ["array-like"], + }, + prefer_skip_nested_validation=True, +) +def davies_bouldin_score(X, labels): + """Compute the Davies-Bouldin score. + + The score is defined as the average similarity measure of each cluster with + its most similar cluster, where similarity is the ratio of within-cluster + distances to between-cluster distances. Thus, clusters which are farther + apart and less dispersed will result in a better score. 
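+
+    As a rough editorial check (not upstream text), for the toy data used in
+    the Examples section below the single cluster pair gives a ratio of about
+    0.128, matching the reported score::
+
+      >>> # cluster 0 = {(0, 1), (1, 1)}: mean distance to its centroid (0.5, 1) is 0.5
+      >>> # cluster 1 = {(3, 4)}: zero dispersion; centroid distance is sqrt(2.5**2 + 3**2)
+      >>> round((0.5 + 0.0) / (2.5**2 + 3**2) ** 0.5, 3)
+      0.128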
+ + The minimum score is zero, with lower values indicating better clustering. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + A list of ``n_features``-dimensional data points. Each row corresponds + to a single data point. + + labels : array-like of shape (n_samples,) + Predicted labels for each sample. + + Returns + ------- + score: float + The resulting Davies-Bouldin score. + + References + ---------- + .. [1] Davies, David L.; Bouldin, Donald W. (1979). + `"A Cluster Separation Measure" + `__. + IEEE Transactions on Pattern Analysis and Machine Intelligence. + PAMI-1 (2): 224-227 + + Examples + -------- + >>> from sklearn.metrics import davies_bouldin_score + >>> X = [[0, 1], [1, 1], [3, 4]] + >>> labels = [0, 0, 1] + >>> davies_bouldin_score(X, labels) + 0.12... + """ + X, labels = check_X_y(X, labels) + le = LabelEncoder() + labels = le.fit_transform(labels) + n_samples, _ = X.shape + n_labels = len(le.classes_) + check_number_of_labels(n_labels, n_samples) + + intra_dists = np.zeros(n_labels) + centroids = np.zeros((n_labels, len(X[0])), dtype=float) + for k in range(n_labels): + cluster_k = _safe_indexing(X, labels == k) + centroid = cluster_k.mean(axis=0) + centroids[k] = centroid + intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid])) + + centroid_distances = pairwise_distances(centroids) + + if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0): + return 0.0 + + centroid_distances[centroid_distances == 0] = np.inf + combined_intra_dists = intra_dists[:, None] + intra_dists + scores = np.max(combined_intra_dists / centroid_distances, axis=1) + return float(np.mean(scores)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/meson.build b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..5f25296c7540f289dc74eba4a97ddac5fad9af90 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/meson.build @@ -0,0 +1,6 @@ +py.extension_module( + '_expected_mutual_info_fast', + cython_gen.process('_expected_mutual_info_fast.pyx'), + subdir: 'sklearn/metrics/cluster', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_bicluster.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_bicluster.py new file mode 100644 index 0000000000000000000000000000000000000000..53f7805100a1313709d1d8868d45071b3066f836 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_bicluster.py @@ -0,0 +1,56 @@ +"""Testing for bicluster metrics module""" + +import numpy as np + +from sklearn.metrics import consensus_score +from sklearn.metrics.cluster._bicluster import _jaccard +from sklearn.utils._testing import assert_almost_equal + + +def test_jaccard(): + a1 = np.array([True, True, False, False]) + a2 = np.array([True, True, True, True]) + a3 = np.array([False, True, True, False]) + a4 = np.array([False, False, True, True]) + + assert _jaccard(a1, a1, a1, a1) == 1 + assert _jaccard(a1, a1, a2, a2) == 0.25 + assert _jaccard(a1, a1, a3, a3) == 1.0 / 7 + assert _jaccard(a1, a1, 
a4, a4) == 0 + + +def test_consensus_score(): + a = [[True, True, False, False], [False, False, True, True]] + b = a[::-1] + + assert consensus_score((a, a), (a, a)) == 1 + assert consensus_score((a, a), (b, b)) == 1 + assert consensus_score((a, b), (a, b)) == 1 + assert consensus_score((a, b), (b, a)) == 1 + + assert consensus_score((a, a), (b, a)) == 0 + assert consensus_score((a, a), (a, b)) == 0 + assert consensus_score((b, b), (a, b)) == 0 + assert consensus_score((b, b), (b, a)) == 0 + + +def test_consensus_score_issue2445(): + """Different number of biclusters in A and B""" + a_rows = np.array( + [ + [True, True, False, False], + [False, False, True, True], + [False, False, False, True], + ] + ) + a_cols = np.array( + [ + [True, True, False, False], + [False, False, True, True], + [False, False, False, True], + ] + ) + idx = [0, 2] + s = consensus_score((a_rows, a_cols), (a_rows[idx], a_cols[idx])) + # B contains 2 of the 3 biclusters in A, so score should be 2/3 + assert_almost_equal(s, 2.0 / 3.0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..a73670fbffce40eabaca55fc177648938cdccb26 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_common.py @@ -0,0 +1,234 @@ +from functools import partial +from itertools import chain + +import numpy as np +import pytest + +from sklearn.metrics.cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + calinski_harabasz_score, + completeness_score, + davies_bouldin_score, + fowlkes_mallows_score, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + rand_score, + silhouette_score, + v_measure_score, +) +from sklearn.utils._testing import assert_allclose + +# Dictionaries of metrics +# ------------------------ +# The goal of having those dictionaries is to have an easy way to call a +# particular metric and associate a name to each function: +# - SUPERVISED_METRICS: all supervised cluster metrics - (when given a +# ground truth value) +# - UNSUPERVISED_METRICS: all unsupervised cluster metrics +# +# Those dictionaries will be used to test systematically some invariance +# properties, e.g. invariance toward several input layout. +# + +SUPERVISED_METRICS = { + "adjusted_mutual_info_score": adjusted_mutual_info_score, + "adjusted_rand_score": adjusted_rand_score, + "rand_score": rand_score, + "completeness_score": completeness_score, + "homogeneity_score": homogeneity_score, + "mutual_info_score": mutual_info_score, + "normalized_mutual_info_score": normalized_mutual_info_score, + "v_measure_score": v_measure_score, + "fowlkes_mallows_score": fowlkes_mallows_score, +} + +UNSUPERVISED_METRICS = { + "silhouette_score": silhouette_score, + "silhouette_manhattan": partial(silhouette_score, metric="manhattan"), + "calinski_harabasz_score": calinski_harabasz_score, + "davies_bouldin_score": davies_bouldin_score, +} + +# Lists of metrics with common properties +# --------------------------------------- +# Lists of metrics with common properties are used to test systematically some +# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics +# that are symmetric with respect to their input argument y_true and y_pred. +# +# -------------------------------------------------------------------- +# Symmetric with respect to their input arguments y_true and y_pred. 
+# Symmetric metrics only apply to supervised clusters. +SYMMETRIC_METRICS = [ + "adjusted_rand_score", + "rand_score", + "v_measure_score", + "mutual_info_score", + "adjusted_mutual_info_score", + "normalized_mutual_info_score", + "fowlkes_mallows_score", +] + +NON_SYMMETRIC_METRICS = ["homogeneity_score", "completeness_score"] + +# Metrics whose upper bound is 1 +NORMALIZED_METRICS = [ + "adjusted_rand_score", + "rand_score", + "homogeneity_score", + "completeness_score", + "v_measure_score", + "adjusted_mutual_info_score", + "fowlkes_mallows_score", + "normalized_mutual_info_score", +] + + +rng = np.random.RandomState(0) +y1 = rng.randint(3, size=30) +y2 = rng.randint(3, size=30) + + +def test_symmetric_non_symmetric_union(): + assert sorted(SYMMETRIC_METRICS + NON_SYMMETRIC_METRICS) == sorted( + SUPERVISED_METRICS + ) + + +@pytest.mark.parametrize( + "metric_name, y1, y2", [(name, y1, y2) for name in SYMMETRIC_METRICS] +) +def test_symmetry(metric_name, y1, y2): + metric = SUPERVISED_METRICS[metric_name] + assert metric(y1, y2) == pytest.approx(metric(y2, y1)) + + +@pytest.mark.parametrize( + "metric_name, y1, y2", [(name, y1, y2) for name in NON_SYMMETRIC_METRICS] +) +def test_non_symmetry(metric_name, y1, y2): + metric = SUPERVISED_METRICS[metric_name] + assert metric(y1, y2) != pytest.approx(metric(y2, y1)) + + +@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS) +def test_normalized_output(metric_name): + upper_bound_1 = [0, 0, 0, 1, 1, 1] + upper_bound_2 = [0, 0, 0, 1, 1, 1] + metric = SUPERVISED_METRICS[metric_name] + assert metric([0, 0, 0, 1, 1], [0, 0, 0, 1, 2]) > 0.0 + assert metric([0, 0, 1, 1, 2], [0, 0, 1, 1, 1]) > 0.0 + assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0 + assert metric([0, 0, 0, 1, 2], [0, 1, 1, 1, 1]) < 1.0 + assert metric(upper_bound_1, upper_bound_2) == pytest.approx(1.0) + + lower_bound_1 = [0, 0, 0, 0, 0, 0] + lower_bound_2 = [0, 1, 2, 3, 4, 5] + score = np.array( + [metric(lower_bound_1, lower_bound_2), metric(lower_bound_2, lower_bound_1)] + ) + assert not (score < 0).any() + + +@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) +def test_permute_labels(metric_name): + # All clustering metrics do not change score due to permutations of labels + # that is when 0 and 1 exchanged. 
+ y_label = np.array([0, 0, 0, 1, 1, 0, 1]) + y_pred = np.array([1, 0, 1, 0, 1, 1, 0]) + if metric_name in SUPERVISED_METRICS: + metric = SUPERVISED_METRICS[metric_name] + score_1 = metric(y_pred, y_label) + assert_allclose(score_1, metric(1 - y_pred, y_label)) + assert_allclose(score_1, metric(1 - y_pred, 1 - y_label)) + assert_allclose(score_1, metric(y_pred, 1 - y_label)) + else: + metric = UNSUPERVISED_METRICS[metric_name] + X = np.random.randint(10, size=(7, 10)) + score_1 = metric(X, y_pred) + assert_allclose(score_1, metric(X, 1 - y_pred)) + + +@pytest.mark.parametrize("metric_name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) +# For all clustering metrics Input parameters can be both +# in the form of arrays lists, positive, negative or string +def test_format_invariance(metric_name): + y_true = [0, 0, 0, 0, 1, 1, 1, 1] + y_pred = [0, 1, 2, 3, 4, 5, 6, 7] + + def generate_formats(y): + y = np.array(y) + yield y, "array of ints" + yield y.tolist(), "list of ints" + yield [str(x) + "-a" for x in y.tolist()], "list of strs" + yield ( + np.array([str(x) + "-a" for x in y.tolist()], dtype=object), + "array of strs", + ) + yield y - 1, "including negative ints" + yield y + 1, "strictly positive ints" + + if metric_name in SUPERVISED_METRICS: + metric = SUPERVISED_METRICS[metric_name] + score_1 = metric(y_true, y_pred) + y_true_gen = generate_formats(y_true) + y_pred_gen = generate_formats(y_pred) + for (y_true_fmt, fmt_name), (y_pred_fmt, _) in zip(y_true_gen, y_pred_gen): + assert score_1 == metric(y_true_fmt, y_pred_fmt) + else: + metric = UNSUPERVISED_METRICS[metric_name] + X = np.random.randint(10, size=(8, 10)) + score_1 = metric(X, y_true) + assert score_1 == metric(X.astype(float), y_true) + y_true_gen = generate_formats(y_true) + for y_true_fmt, fmt_name in y_true_gen: + assert score_1 == metric(X, y_true_fmt) + + +@pytest.mark.parametrize("metric", SUPERVISED_METRICS.values()) +def test_single_sample(metric): + # only the supervised metrics support single sample + for i, j in [(0, 0), (0, 1), (1, 0), (1, 1)]: + metric([i], [j]) + + +@pytest.mark.parametrize( + "metric_name, metric_func", dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS).items() +) +def test_inf_nan_input(metric_name, metric_func): + if metric_name in SUPERVISED_METRICS: + invalids = [ + ([0, 1], [np.inf, np.inf]), + ([0, 1], [np.nan, np.nan]), + ([0, 1], [np.nan, np.inf]), + ] + else: + X = np.random.randint(10, size=(2, 10)) + invalids = [(X, [np.inf, np.inf]), (X, [np.nan, np.nan]), (X, [np.nan, np.inf])] + with pytest.raises(ValueError, match=r"contains (NaN|infinity)"): + for args in invalids: + metric_func(*args) + + +@pytest.mark.parametrize("name", chain(SUPERVISED_METRICS, UNSUPERVISED_METRICS)) +def test_returned_value_consistency(name): + """Ensure that the returned values of all metrics are consistent. + + It can only be a float. It should not be a numpy float64 or float32. 
+ """ + + rng = np.random.RandomState(0) + X = rng.randint(10, size=(20, 10)) + labels_true = rng.randint(0, 3, size=(20,)) + labels_pred = rng.randint(0, 3, size=(20,)) + + if name in SUPERVISED_METRICS: + metric = SUPERVISED_METRICS[name] + score = metric(labels_true, labels_pred) + else: + metric = UNSUPERVISED_METRICS[name] + score = metric(X, labels_pred) + + assert isinstance(score, float) + assert not isinstance(score, (np.float64, np.float32)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_supervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_supervised.py new file mode 100644 index 0000000000000000000000000000000000000000..7421b726ebe677a6845167b3b268614891b38013 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_supervised.py @@ -0,0 +1,522 @@ +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal + +from sklearn.base import config_context +from sklearn.metrics.cluster import ( + adjusted_mutual_info_score, + adjusted_rand_score, + completeness_score, + contingency_matrix, + entropy, + expected_mutual_information, + fowlkes_mallows_score, + homogeneity_completeness_v_measure, + homogeneity_score, + mutual_info_score, + normalized_mutual_info_score, + pair_confusion_matrix, + rand_score, + v_measure_score, +) +from sklearn.metrics.cluster._supervised import _generalized_average, check_clusterings +from sklearn.utils import assert_all_finite +from sklearn.utils._array_api import ( + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import _array_api_for_tests, assert_almost_equal + +score_funcs = [ + adjusted_rand_score, + rand_score, + homogeneity_score, + completeness_score, + v_measure_score, + adjusted_mutual_info_score, + normalized_mutual_info_score, +] + + +@pytest.mark.parametrize("score_func", score_funcs) +def test_error_messages_on_wrong_input(score_func): + expected = r"Found input variables with inconsistent numbers of samples: \[2, 3\]" + with pytest.raises(ValueError, match=expected): + score_func([0, 1], [1, 1, 1]) + + expected = r"labels_true must be 1D: shape is \(2" + with pytest.raises(ValueError, match=expected): + score_func([[0, 1], [1, 0]], [1, 1, 1]) + + expected = r"labels_pred must be 1D: shape is \(2" + with pytest.raises(ValueError, match=expected): + score_func([0, 1, 0], [[1, 1], [0, 0]]) + + +def test_generalized_average(): + a, b = 1, 2 + methods = ["min", "geometric", "arithmetic", "max"] + means = [_generalized_average(a, b, method) for method in methods] + assert means[0] <= means[1] <= means[2] <= means[3] + c, d = 12, 12 + means = [_generalized_average(c, d, method) for method in methods] + assert means[0] == means[1] == means[2] == means[3] + + +@pytest.mark.parametrize("score_func", score_funcs) +def test_perfect_matches(score_func): + assert score_func([], []) == pytest.approx(1.0) + assert score_func([0], [1]) == pytest.approx(1.0) + assert score_func([0, 0, 0], [0, 0, 0]) == pytest.approx(1.0) + assert score_func([0, 1, 0], [42, 7, 42]) == pytest.approx(1.0) + assert score_func([0.0, 1.0, 0.0], [42.0, 7.0, 42.0]) == pytest.approx(1.0) + assert score_func([0.0, 1.0, 2.0], [42.0, 7.0, 2.0]) == pytest.approx(1.0) + assert score_func([0, 1, 2], [42, 7, 2]) == pytest.approx(1.0) + + +@pytest.mark.parametrize( + "score_func", + [ + normalized_mutual_info_score, + adjusted_mutual_info_score, + 
], +) +@pytest.mark.parametrize("average_method", ["min", "geometric", "arithmetic", "max"]) +def test_perfect_matches_with_changing_means(score_func, average_method): + assert score_func([], [], average_method=average_method) == pytest.approx(1.0) + assert score_func([0], [1], average_method=average_method) == pytest.approx(1.0) + assert score_func( + [0, 0, 0], [0, 0, 0], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0, 1, 0], [42, 7, 42], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0.0, 1.0, 0.0], [42.0, 7.0, 42.0], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0.0, 1.0, 2.0], [42.0, 7.0, 2.0], average_method=average_method + ) == pytest.approx(1.0) + assert score_func( + [0, 1, 2], [42, 7, 2], average_method=average_method + ) == pytest.approx(1.0) + # Non-regression tests for: https://github.com/scikit-learn/scikit-learn/issues/30950 + assert score_func([0, 1], [0, 1], average_method=average_method) == pytest.approx( + 1.0 + ) + assert score_func( + [0, 1, 2, 3], [0, 1, 2, 3], average_method=average_method + ) == pytest.approx(1.0) + + +def test_homogeneous_but_not_complete_labeling(): + # homogeneous but not complete clustering + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 2, 2]) + assert_almost_equal(h, 1.00, 2) + assert_almost_equal(c, 0.69, 2) + assert_almost_equal(v, 0.81, 2) + + +def test_complete_but_not_homogeneous_labeling(): + # complete but not homogeneous clustering + h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 1, 1]) + assert_almost_equal(h, 0.58, 2) + assert_almost_equal(c, 1.00, 2) + assert_almost_equal(v, 0.73, 2) + + +def test_not_complete_and_not_homogeneous_labeling(): + # neither complete nor homogeneous but not so bad either + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) + assert_almost_equal(h, 0.67, 2) + assert_almost_equal(c, 0.42, 2) + assert_almost_equal(v, 0.52, 2) + + +def test_beta_parameter(): + # test for when beta passed to + # homogeneity_completeness_v_measure + # and v_measure_score + beta_test = 0.2 + h_test = 0.67 + c_test = 0.42 + v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test) + + h, c, v = homogeneity_completeness_v_measure( + [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test + ) + assert_almost_equal(h, h_test, 2) + assert_almost_equal(c, c_test, 2) + assert_almost_equal(v, v_test, 2) + + v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test) + assert_almost_equal(v, v_test, 2) + + +def test_non_consecutive_labels(): + # regression tests for labels with gaps + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2]) + assert_almost_equal(h, 0.67, 2) + assert_almost_equal(c, 0.42, 2) + assert_almost_equal(v, 0.52, 2) + + h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) + assert_almost_equal(h, 0.67, 2) + assert_almost_equal(c, 0.42, 2) + assert_almost_equal(v, 0.52, 2) + + ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) + ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) + assert_almost_equal(ari_1, 0.24, 2) + assert_almost_equal(ari_2, 0.24, 2) + + ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) + ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) + assert_almost_equal(ri_1, 0.66, 2) + assert_almost_equal(ri_2, 0.66, 2) + + +def uniform_labelings_scores(score_func, 
n_samples, k_range, n_runs=10, seed=42): + # Compute score for random uniform cluster labelings + random_labels = np.random.RandomState(seed).randint + scores = np.zeros((len(k_range), n_runs)) + for i, k in enumerate(k_range): + for j in range(n_runs): + labels_a = random_labels(low=0, high=k, size=n_samples) + labels_b = random_labels(low=0, high=k, size=n_samples) + scores[i, j] = score_func(labels_a, labels_b) + return scores + + +def test_adjustment_for_chance(): + # Check that adjusted scores are almost zero on random labels + n_clusters_range = [2, 10, 50, 90] + n_samples = 100 + n_runs = 10 + + scores = uniform_labelings_scores( + adjusted_rand_score, n_samples, n_clusters_range, n_runs + ) + + max_abs_scores = np.abs(scores).max(axis=1) + assert_array_almost_equal(max_abs_scores, [0.02, 0.03, 0.03, 0.02], 2) + + +def test_adjusted_mutual_info_score(): + # Compute the Adjusted Mutual Information and test against known values + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + # Mutual information + mi = mutual_info_score(labels_a, labels_b) + assert_almost_equal(mi, 0.41022, 5) + # with provided sparse contingency + C = contingency_matrix(labels_a, labels_b, sparse=True) + mi = mutual_info_score(labels_a, labels_b, contingency=C) + assert_almost_equal(mi, 0.41022, 5) + # with provided dense contingency + C = contingency_matrix(labels_a, labels_b) + mi = mutual_info_score(labels_a, labels_b, contingency=C) + assert_almost_equal(mi, 0.41022, 5) + # Expected mutual information + n_samples = C.sum() + emi = expected_mutual_information(C, n_samples) + assert_almost_equal(emi, 0.15042, 5) + # Adjusted mutual information + ami = adjusted_mutual_info_score(labels_a, labels_b) + assert_almost_equal(ami, 0.27821, 5) + ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]) + assert ami == pytest.approx(1.0) + # Test with a very large array + a110 = np.array([list(labels_a) * 110]).flatten() + b110 = np.array([list(labels_b) * 110]).flatten() + ami = adjusted_mutual_info_score(a110, b110) + assert_almost_equal(ami, 0.38, 2) + + +def test_expected_mutual_info_overflow(): + # Test for regression where contingency cell exceeds 2**16 + # leading to overflow in np.outer, resulting in EMI > 1 + assert expected_mutual_information(np.array([[70000]]), 70000) <= 1 + + +def test_int_overflow_mutual_info_fowlkes_mallows_score(): + # Test overflow in mutual_info_classif and fowlkes_mallows_score + x = np.array( + [1] * (52632 + 2529) + + [2] * (14660 + 793) + + [3] * (3271 + 204) + + [4] * (814 + 39) + + [5] * (316 + 20) + ) + y = np.array( + [0] * 52632 + + [1] * 2529 + + [0] * 14660 + + [1] * 793 + + [0] * 3271 + + [1] * 204 + + [0] * 814 + + [1] * 39 + + [0] * 316 + + [1] * 20 + ) + + assert_all_finite(mutual_info_score(x, y)) + assert_all_finite(fowlkes_mallows_score(x, y)) + + +def test_entropy(): + assert_almost_equal(entropy([0, 0, 42.0]), 0.6365141, 5) + assert_almost_equal(entropy([]), 1) + assert entropy([1, 1, 1, 1]) == 0 + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +def test_entropy_array_api(array_namespace, device, dtype_name): + xp = _array_api_for_tests(array_namespace, device) + float_labels = xp.asarray(np.asarray([0, 0, 42.0], dtype=dtype_name), device=device) + empty_int32_labels = xp.asarray([], dtype=xp.int32, device=device) + int_labels = xp.asarray([1, 1, 1, 1], 
device=device) + with config_context(array_api_dispatch=True): + assert entropy(float_labels) == pytest.approx(0.6365141, abs=1e-5) + assert entropy(empty_int32_labels) == 1 + assert entropy(int_labels) == 0 + + +def test_contingency_matrix(): + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + C = contingency_matrix(labels_a, labels_b) + C2 = np.histogram2d(labels_a, labels_b, bins=(np.arange(1, 5), np.arange(1, 5)))[0] + assert_array_almost_equal(C, C2) + C = contingency_matrix(labels_a, labels_b, eps=0.1) + assert_array_almost_equal(C, C2 + 0.1) + + +def test_contingency_matrix_sparse(): + labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) + labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) + C = contingency_matrix(labels_a, labels_b) + C_sparse = contingency_matrix(labels_a, labels_b, sparse=True).toarray() + assert_array_almost_equal(C, C_sparse) + with pytest.raises(ValueError, match="Cannot set 'eps' when sparse=True"): + contingency_matrix(labels_a, labels_b, eps=1e-10, sparse=True) + + +def test_exactly_zero_info_score(): + # Check numerical stability when information is exactly zero + for i in np.logspace(1, 4, 4).astype(int): + labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int)) + assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0) + assert v_measure_score(labels_a, labels_b) == pytest.approx(0.0) + assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0 + assert normalized_mutual_info_score(labels_a, labels_b) == pytest.approx(0.0) + for method in ["min", "geometric", "arithmetic", "max"]: + assert ( + adjusted_mutual_info_score(labels_a, labels_b, average_method=method) + == 0.0 + ) + assert normalized_mutual_info_score( + labels_a, labels_b, average_method=method + ) == pytest.approx(0.0) + + +def test_v_measure_and_mutual_information(seed=36): + # Check relation between v_measure, entropy and mutual information + for i in np.logspace(1, 4, 4).astype(int): + random_state = np.random.RandomState(seed) + labels_a, labels_b = ( + random_state.randint(0, 10, i), + random_state.randint(0, 10, i), + ) + assert_almost_equal( + v_measure_score(labels_a, labels_b), + 2.0 + * mutual_info_score(labels_a, labels_b) + / (entropy(labels_a) + entropy(labels_b)), + 0, + ) + avg = "arithmetic" + assert_almost_equal( + v_measure_score(labels_a, labels_b), + normalized_mutual_info_score(labels_a, labels_b, average_method=avg), + ) + + +def test_fowlkes_mallows_score(): + # General case + score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2]) + assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0)) + + # Perfect match but where the label names changed + perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0]) + assert_almost_equal(perfect_score, 1.0) + + # Worst case + worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5]) + assert_almost_equal(worst_score, 0.0) + + +def test_fowlkes_mallows_score_properties(): + # handcrafted example + labels_a = np.array([0, 0, 0, 1, 1, 2]) + labels_b = np.array([1, 1, 2, 2, 0, 0]) + expected = 1.0 / np.sqrt((1.0 + 3.0) * (1.0 + 2.0)) + # FMI = TP / sqrt((TP + FP) * (TP + FN)) + + score_original = fowlkes_mallows_score(labels_a, labels_b) + assert_almost_equal(score_original, expected) + + # symmetric property + score_symmetric = fowlkes_mallows_score(labels_b, labels_a) + assert_almost_equal(score_symmetric, 
expected) + + # permutation property + score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b) + assert_almost_equal(score_permuted, expected) + + # symmetric and permutation(both together) + score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3) + assert_almost_equal(score_both, expected) + + +@pytest.mark.parametrize( + "labels_true, labels_pred", + [ + (["a"] * 6, [1, 1, 0, 0, 1, 1]), + ([1] * 6, [1, 1, 0, 0, 1, 1]), + ([1, 1, 0, 0, 1, 1], ["a"] * 6), + ([1, 1, 0, 0, 1, 1], [1] * 6), + (["a"] * 6, ["a"] * 6), + ], +) +def test_mutual_info_score_positive_constant_label(labels_true, labels_pred): + # Check that MI = 0 when one or both labelling are constant + # non-regression test for #16355 + assert mutual_info_score(labels_true, labels_pred) == 0 + + +def test_check_clustering_error(): + # Test warning message for continuous values + rng = np.random.RandomState(42) + noise = rng.rand(500) + wavelength = np.linspace(0.01, 1, 500) * 1e-6 + msg = ( + "Clustering metrics expects discrete values but received " + "continuous values for label, and continuous values for " + "target" + ) + + with pytest.warns(UserWarning, match=msg): + check_clusterings(wavelength, noise) + + +def test_pair_confusion_matrix_fully_dispersed(): + # edge case: every element is its own cluster + N = 100 + clustering1 = list(range(N)) + clustering2 = clustering1 + expected = np.array([[N * (N - 1), 0], [0, 0]]) + assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected) + + +def test_pair_confusion_matrix_single_cluster(): + # edge case: only one cluster + N = 100 + clustering1 = np.zeros((N,)) + clustering2 = clustering1 + expected = np.array([[0, 0], [0, N * (N - 1)]]) + assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected) + + +def test_pair_confusion_matrix(): + # regular case: different non-trivial clusterings + n = 10 + N = n**2 + clustering1 = np.hstack([[i + 1] * n for i in range(n)]) + clustering2 = np.hstack([[i + 1] * (n + 1) for i in range(n)])[:N] + # basic quadratic implementation + expected = np.zeros(shape=(2, 2), dtype=np.int64) + for i in range(len(clustering1)): + for j in range(len(clustering2)): + if i != j: + same_cluster_1 = int(clustering1[i] == clustering1[j]) + same_cluster_2 = int(clustering2[i] == clustering2[j]) + expected[same_cluster_1, same_cluster_2] += 1 + assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected) + + +@pytest.mark.parametrize( + "clustering1, clustering2", + [(list(range(100)), list(range(100))), (np.zeros((100,)), np.zeros((100,)))], +) +def test_rand_score_edge_cases(clustering1, clustering2): + # edge case 1: every element is its own cluster + # edge case 2: only one cluster + assert_allclose(rand_score(clustering1, clustering2), 1.0) + + +def test_rand_score(): + # regular case: different non-trivial clusterings + clustering1 = [0, 0, 0, 1, 1, 1] + clustering2 = [0, 1, 0, 1, 2, 2] + # pair confusion matrix + D11 = 2 * 2 # ordered pairs (1, 3), (5, 6) + D10 = 2 * 4 # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6) + D01 = 2 * 1 # ordered pair (2, 4) + D00 = 5 * 6 - D11 - D01 - D10 # the remaining pairs + # rand score + expected_numerator = D00 + D11 + expected_denominator = D00 + D01 + D10 + D11 + expected = expected_numerator / expected_denominator + assert_allclose(rand_score(clustering1, clustering2), expected) + + +def test_adjusted_rand_score_overflow(): + """Check that large amount of data will not lead to overflow in + `adjusted_rand_score`. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20305 + """ + rng = np.random.RandomState(0) + y_true = rng.randint(0, 2, 100_000, dtype=np.int8) + y_pred = rng.randint(0, 2, 100_000, dtype=np.int8) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + adjusted_rand_score(y_true, y_pred) + + +@pytest.mark.parametrize("average_method", ["min", "arithmetic", "geometric", "max"]) +def test_normalized_mutual_info_score_bounded(average_method): + """Check that nmi returns a score between 0 (included) and 1 (excluded + for non-perfect match) + + Non-regression test for issue #13836 + """ + labels1 = [0] * 469 + labels2 = [1] + labels1[1:] + labels3 = [0, 1] + labels1[2:] + + # labels1 is constant. The mutual info between labels1 and any other labelling is 0. + nmi = normalized_mutual_info_score(labels1, labels2, average_method=average_method) + assert nmi == 0 + + # non constant, non perfect matching labels + nmi = normalized_mutual_info_score(labels2, labels3, average_method=average_method) + assert 0 <= nmi < 1 + + +# TODO(1.9): remove +@pytest.mark.parametrize("sparse", [True, False]) +def test_fowlkes_mallows_sparse_deprecated(sparse): + """Check deprecation warning for 'sparse' parameter of fowlkes_mallows_score.""" + with pytest.warns( + FutureWarning, match="The 'sparse' parameter was deprecated in 1.7" + ): + fowlkes_mallows_score([0, 1], [1, 1], sparse=sparse) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..a0420bbd406ec873022ee3a6e511c51fafd82f11 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py @@ -0,0 +1,413 @@ +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy.sparse import issparse + +from sklearn import datasets +from sklearn.metrics import pairwise_distances +from sklearn.metrics.cluster import ( + calinski_harabasz_score, + davies_bouldin_score, + silhouette_samples, + silhouette_score, +) +from sklearn.metrics.cluster._unsupervised import _silhouette_reduce +from sklearn.utils._testing import assert_array_equal +from sklearn.utils.fixes import ( + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + + +@pytest.mark.parametrize( + "sparse_container", + [None] + CSR_CONTAINERS + CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +@pytest.mark.parametrize("sample_size", [None, "half"]) +def test_silhouette(sparse_container, sample_size): + # Tests the Silhouette Coefficient. + dataset = datasets.load_iris() + X, y = dataset.data, dataset.target + if sparse_container is not None: + X = sparse_container(X) + sample_size = int(X.shape[0] / 2) if sample_size == "half" else sample_size + + D = pairwise_distances(X, metric="euclidean") + # Given that the actual labels are used, we can assume that S would be positive. + score_precomputed = silhouette_score( + D, y, metric="precomputed", sample_size=sample_size, random_state=0 + ) + score_euclidean = silhouette_score( + X, y, metric="euclidean", sample_size=sample_size, random_state=0 + ) + assert score_precomputed > 0 + assert score_euclidean > 0 + assert score_precomputed == pytest.approx(score_euclidean) + + +def test_cluster_size_1(): + # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster + # (cluster 0). 
We also test the case where there are identical samples + # as the only members of a cluster (cluster 2). To our knowledge, this case + # is not discussed in reference material, and we choose for it a sample + # score of 1. + X = [[0.0], [1.0], [1.0], [2.0], [3.0], [3.0]] + labels = np.array([0, 1, 1, 1, 2, 2]) + + # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention + # Cluster 1: intra-cluster = [.5, .5, 1] + # inter-cluster = [1, 1, 1] + # silhouette = [.5, .5, 0] + # Cluster 2: intra-cluster = [0, 0] + # inter-cluster = [arbitrary, arbitrary] + # silhouette = [1., 1.] + + silhouette = silhouette_score(X, labels) + assert not np.isnan(silhouette) + ss = silhouette_samples(X, labels) + assert_array_equal(ss, [0, 0.5, 0.5, 0, 1, 1]) + + +def test_silhouette_paper_example(): + # Explicitly check per-sample results against Rousseeuw (1987) + # Data from Table 1 + lower = [ + 5.58, + 7.00, + 6.50, + 7.08, + 7.00, + 3.83, + 4.83, + 5.08, + 8.17, + 5.83, + 2.17, + 5.75, + 6.67, + 6.92, + 4.92, + 6.42, + 5.00, + 5.58, + 6.00, + 4.67, + 6.42, + 3.42, + 5.50, + 6.42, + 6.42, + 5.00, + 3.92, + 6.17, + 2.50, + 4.92, + 6.25, + 7.33, + 4.50, + 2.25, + 6.33, + 2.75, + 6.08, + 6.67, + 4.25, + 2.67, + 6.00, + 6.17, + 6.17, + 6.92, + 6.17, + 5.25, + 6.83, + 4.50, + 3.75, + 5.75, + 5.42, + 6.08, + 5.83, + 6.67, + 3.67, + 4.75, + 3.00, + 6.08, + 6.67, + 5.00, + 5.58, + 4.83, + 6.17, + 5.67, + 6.50, + 6.92, + ] + D = np.zeros((12, 12)) + D[np.tril_indices(12, -1)] = lower + D += D.T + + names = [ + "BEL", + "BRA", + "CHI", + "CUB", + "EGY", + "FRA", + "IND", + "ISR", + "USA", + "USS", + "YUG", + "ZAI", + ] + + # Data from Figure 2 + labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1] + expected1 = { + "USA": 0.43, + "BEL": 0.39, + "FRA": 0.35, + "ISR": 0.30, + "BRA": 0.22, + "EGY": 0.20, + "ZAI": 0.19, + "CUB": 0.40, + "USS": 0.34, + "CHI": 0.33, + "YUG": 0.26, + "IND": -0.04, + } + score1 = 0.28 + + # Data from Figure 3 + labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2] + expected2 = { + "USA": 0.47, + "FRA": 0.44, + "BEL": 0.42, + "ISR": 0.37, + "EGY": 0.02, + "ZAI": 0.28, + "BRA": 0.25, + "IND": 0.17, + "CUB": 0.48, + "USS": 0.44, + "YUG": 0.31, + "CHI": 0.31, + } + score2 = 0.33 + + for labels, expected, score in [ + (labels1, expected1, score1), + (labels2, expected2, score2), + ]: + expected = [expected[name] for name in names] + # we check to 2dp because that's what's in the paper + pytest.approx( + expected, + silhouette_samples(D, np.array(labels), metric="precomputed"), + abs=1e-2, + ) + pytest.approx( + score, silhouette_score(D, np.array(labels), metric="precomputed"), abs=1e-2 + ) + + +def test_correct_labelsize(): + # Assert 1 < n_labels < n_samples + dataset = datasets.load_iris() + X = dataset.data + + # n_labels = n_samples + y = np.arange(X.shape[0]) + err_msg = ( + r"Number of labels is %d\. Valid values are 2 " + r"to n_samples - 1 \(inclusive\)" % len(np.unique(y)) + ) + with pytest.raises(ValueError, match=err_msg): + silhouette_score(X, y) + + # n_labels = 1 + y = np.zeros(X.shape[0]) + err_msg = ( + r"Number of labels is %d\. 
Valid values are 2 " + r"to n_samples - 1 \(inclusive\)" % len(np.unique(y)) + ) + with pytest.raises(ValueError, match=err_msg): + silhouette_score(X, y) + + +def test_non_encoded_labels(): + dataset = datasets.load_iris() + X = dataset.data + labels = dataset.target + assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels) + assert_array_equal( + silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels) + ) + + +def test_non_numpy_labels(): + dataset = datasets.load_iris() + X = dataset.data + y = dataset.target + assert silhouette_score(list(X), list(y)) == silhouette_score(X, y) + + +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_silhouette_nonzero_diag(dtype): + # Make sure silhouette_samples requires diagonal to be zero. + # Non-regression test for #12178 + + # Construct a zero-diagonal matrix + dists = pairwise_distances( + np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T + ) + labels = [0, 0, 0, 1, 1, 1] + + # small values on the diagonal are OK + dists[2][2] = np.finfo(dists.dtype).eps * 10 + silhouette_samples(dists, labels, metric="precomputed") + + # values bigger than eps * 100 are not + dists[2][2] = np.finfo(dists.dtype).eps * 1000 + with pytest.raises(ValueError, match="contains non-zero"): + silhouette_samples(dists, labels, metric="precomputed") + + +@pytest.mark.parametrize( + "sparse_container", + CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +def test_silhouette_samples_precomputed_sparse(sparse_container): + """Check that silhouette_samples works for sparse matrices correctly.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + y = [0, 0, 0, 0, 1, 1, 1, 1] + pdist_dense = pairwise_distances(X) + pdist_sparse = sparse_container(pdist_dense) + assert issparse(pdist_sparse) + output_with_sparse_input = silhouette_samples(pdist_sparse, y, metric="precomputed") + output_with_dense_input = silhouette_samples(pdist_dense, y, metric="precomputed") + assert_allclose(output_with_sparse_input, output_with_dense_input) + + +@pytest.mark.parametrize( + "sparse_container", + CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS, +) +def test_silhouette_samples_euclidean_sparse(sparse_container): + """Check that silhouette_samples works for sparse matrices correctly.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + y = [0, 0, 0, 0, 1, 1, 1, 1] + pdist_dense = pairwise_distances(X) + pdist_sparse = sparse_container(pdist_dense) + assert issparse(pdist_sparse) + output_with_sparse_input = silhouette_samples(pdist_sparse, y) + output_with_dense_input = silhouette_samples(pdist_dense, y) + assert_allclose(output_with_sparse_input, output_with_dense_input) + + +@pytest.mark.parametrize( + "sparse_container", CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS +) +def test_silhouette_reduce(sparse_container): + """Check for non-CSR input to private method `_silhouette_reduce`.""" + X = np.array([[0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]], dtype=np.float32).T + pdist_dense = pairwise_distances(X) + pdist_sparse = sparse_container(pdist_dense) + y = [0, 0, 0, 0, 1, 1, 1, 1] + label_freqs = np.bincount(y) + with pytest.raises( + TypeError, + match="Expected CSR matrix. 
Please pass sparse matrix in CSR format.", + ): + _silhouette_reduce(pdist_sparse, start=0, labels=y, label_freqs=label_freqs) + + +def assert_raises_on_only_one_label(func): + """Assert message when there is only one label""" + rng = np.random.RandomState(seed=0) + with pytest.raises(ValueError, match="Number of labels is"): + func(rng.rand(10, 2), np.zeros(10)) + + +def assert_raises_on_all_points_same_cluster(func): + """Assert message when all point are in different clusters""" + rng = np.random.RandomState(seed=0) + with pytest.raises(ValueError, match="Number of labels is"): + func(rng.rand(10, 2), np.arange(10)) + + +def test_calinski_harabasz_score(): + assert_raises_on_only_one_label(calinski_harabasz_score) + + assert_raises_on_all_points_same_cluster(calinski_harabasz_score) + + # Assert the value is 1. when all samples are equals + assert 1.0 == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5) + + # Assert the value is 0. when all the mean cluster are equal + assert 0.0 == calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10) + + # General case (with non numpy arrays) + X = ( + [[0, 0], [1, 1]] * 5 + + [[3, 3], [4, 4]] * 5 + + [[0, 4], [1, 3]] * 5 + + [[3, 1], [4, 0]] * 5 + ) + labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10 + pytest.approx(calinski_harabasz_score(X, labels), 45 * (40 - 4) / (5 * (4 - 1))) + + +def test_davies_bouldin_score(): + assert_raises_on_only_one_label(davies_bouldin_score) + assert_raises_on_all_points_same_cluster(davies_bouldin_score) + + # Assert the value is 0. when all samples are equals + assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx( + 0.0 + ) + + # Assert the value is 0. when all the mean cluster are equal + assert davies_bouldin_score( + [[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10 + ) == pytest.approx(0.0) + + # General case (with non numpy arrays) + X = ( + [[0, 0], [1, 1]] * 5 + + [[3, 3], [4, 4]] * 5 + + [[0, 4], [1, 3]] * 5 + + [[3, 1], [4, 0]] * 5 + ) + labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10 + pytest.approx(davies_bouldin_score(X, labels), 2 * np.sqrt(0.5) / 3) + + # Ensure divide by zero warning is not raised in general case + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + davies_bouldin_score(X, labels) + + # General case - cluster have one sample + X = [[0, 0], [2, 2], [3, 3], [5, 5]] + labels = [0, 0, 1, 2] + pytest.approx(davies_bouldin_score(X, labels), (5.0 / 4) / 3) + + +def test_silhouette_score_integer_precomputed(): + """Check that silhouette_score works for precomputed metrics that are integers. + + Non-regression test for #22107. 
+ """ + result = silhouette_score( + [[0, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed" + ) + assert result == pytest.approx(1 / 6) + + # non-zero on diagonal for ints raises an error + with pytest.raises(ValueError, match="contains non-zero"): + silhouette_score( + [[1, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1], metric="precomputed" + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/meson.build b/.venv/lib/python3.12/site-packages/sklearn/metrics/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..f0f9894cc6f59a9500a1598c9c9a94d5d6f58429 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/meson.build @@ -0,0 +1,49 @@ +# Metrics is cimported from other subpackages so this is needed for the cimport +# to work +metrics_cython_tree = [ + fs.copyfile('__init__.py') +] +# Some metrics code cimports code from utils, we may as well copy all the necessary files +metrics_cython_tree += utils_cython_tree + +_dist_metrics_pxd = custom_target( + '_dist_metrics_pxd', + output: '_dist_metrics.pxd', + input: '_dist_metrics.pxd.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # Need to install the generated pxd because it is needed in other subpackages + # Cython code, e.g. sklearn.cluster + install_dir: sklearn_dir / 'metrics', + install: true, +) +metrics_cython_tree += [_dist_metrics_pxd] + +_dist_metrics_pyx = custom_target( + '_dist_metrics_pyx', + output: '_dist_metrics.pyx', + input: '_dist_metrics.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: metrics_cython_tree, +) + +_dist_metrics = py.extension_module( + '_dist_metrics', + cython_gen.process(_dist_metrics_pyx), + dependencies: [np_dep], + subdir: 'sklearn/metrics', + install: true +) + +py.extension_module( + '_pairwise_fast', + [cython_gen.process('_pairwise_fast.pyx'), metrics_cython_tree], + dependencies: [openmp_dep], + subdir: 'sklearn/metrics', + install: true +) + +subdir('_pairwise_distances_reduction') +subdir('cluster') diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/pairwise.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/pairwise.py new file mode 100644 index 0000000000000000000000000000000000000000..050b58866c8ef589fba008c8444948b30e3416ed --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/pairwise.py @@ -0,0 +1,2675 @@ +"""Metrics for pairwise distances and affinity of sets of samples.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import math +import warnings +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy.sparse import csr_matrix, issparse +from scipy.spatial import distance + +from .. 
import config_context +from ..exceptions import DataConversionWarning +from ..preprocessing import normalize +from ..utils import check_array, gen_batches, gen_even_slices +from ..utils._array_api import ( + _fill_or_add_to_diagonal, + _find_matching_floating_dtype, + _is_numpy_namespace, + _max_precision_float_dtype, + _modify_in_place_if_numpy, + get_namespace, + get_namespace_and_device, +) +from ..utils._chunking import get_chunk_n_rows +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import ( + Hidden, + Interval, + MissingValues, + Options, + StrOptions, + validate_params, +) +from ..utils.deprecation import _deprecate_force_all_finite +from ..utils.extmath import row_norms, safe_sparse_dot +from ..utils.fixes import parse_version, sp_base_version +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _num_samples, check_non_negative +from ._pairwise_distances_reduction import ArgKmin +from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan + + +# Utility Functions +def _return_float_dtype(X, Y): + """ + 1. If dtype of X and Y is float32, then dtype float32 is returned. + 2. Else dtype float is returned. + """ + if not issparse(X) and not isinstance(X, np.ndarray): + X = np.asarray(X) + + if Y is None: + Y_dtype = X.dtype + elif not issparse(Y) and not isinstance(Y, np.ndarray): + Y = np.asarray(Y) + Y_dtype = Y.dtype + else: + Y_dtype = Y.dtype + + if X.dtype == Y_dtype == np.float32: + dtype = np.float32 + else: + dtype = float + + return X, Y, dtype + + +def check_pairwise_arrays( + X, + Y, + *, + precomputed=False, + dtype="infer_float", + accept_sparse="csr", + force_all_finite="deprecated", + ensure_all_finite=None, + ensure_2d=True, + copy=False, +): + """Set X and Y appropriately and checks inputs. + + If Y is None, it is set as a pointer to X (i.e. not a copy). + If Y is given, this does not happen. + All distance metrics should use this function first to assert that the + given parameters are correct and safe to use. + + Specifically, this function first ensures that both X and Y are arrays, + then checks that they are at least two dimensional while ensuring that + their elements are floats (or dtype if provided). Finally, the function + checks that the size of the second dimension of the two arrays is equal, or + the equivalent check for a precomputed distance matrix. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + + precomputed : bool, default=False + True if X is to be treated as precomputed distances to the samples in + Y. + + dtype : str, type, list of type or None default="infer_float" + Data type required for X and Y. If "infer_float", the dtype will be an + appropriate float type selected by _return_float_dtype. If None, the + dtype of the input is preserved. + + .. versionadded:: 0.18 + + accept_sparse : str, bool or list/tuple of str, default='csr' + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + force_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. 
+ - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.22 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan`. + + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + + ensure_2d : bool, default=True + Whether to raise an error when the input arrays are not 2-dimensional. Setting + this to `False` is necessary when using a custom metric with certain + non-numerical inputs (e.g. a list of strings). + + .. versionadded:: 1.5 + + copy : bool, default=False + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + .. versionadded:: 0.22 + + Returns + ------- + safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array equal to X, guaranteed to be a numpy array. + + safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + An array equal to Y if Y was not None, guaranteed to be a numpy array. + If Y was None, safe_Y will be a pointer to X. + """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + + xp, _ = get_namespace(X, Y) + if any([issparse(X), issparse(Y)]) or _is_numpy_namespace(xp): + X, Y, dtype_float = _return_float_dtype(X, Y) + else: + dtype_float = _find_matching_floating_dtype(X, Y, xp=xp) + + estimator = "check_pairwise_arrays" + if dtype == "infer_float": + dtype = dtype_float + + if Y is X or Y is None: + X = Y = check_array( + X, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + ensure_all_finite=ensure_all_finite, + estimator=estimator, + ensure_2d=ensure_2d, + ) + else: + X = check_array( + X, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + ensure_all_finite=ensure_all_finite, + estimator=estimator, + ensure_2d=ensure_2d, + ) + Y = check_array( + Y, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + ensure_all_finite=ensure_all_finite, + estimator=estimator, + ensure_2d=ensure_2d, + ) + + if precomputed: + if X.shape[1] != Y.shape[0]: + raise ValueError( + "Precomputed metric requires shape " + "(n_queries, n_indexed). Got (%d, %d) " + "for %d indexed." % (X.shape[0], X.shape[1], Y.shape[0]) + ) + elif ensure_2d and X.shape[1] != Y.shape[1]: + # Only check the number of features if 2d arrays are enforced. Otherwise, + # validation is left to the user for custom metrics. + raise ValueError( + "Incompatible dimension for X and Y matrices: " + "X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1]) + ) + + return X, Y + + +def check_paired_arrays(X, Y): + """Set X and Y appropriately and checks inputs for paired distances. + + All paired distance metrics should use this function first to assert that + the given parameters are correct and safe to use. + + Specifically, this function first ensures that both X and Y are arrays, + then checks that they are at least two dimensional while ensuring that + their elements are floats. 
Finally, the function checks that the size + of the dimensions of the two arrays are equal. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + + Returns + ------- + safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array equal to X, guaranteed to be a numpy array. + + safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + An array equal to Y if Y was not None, guaranteed to be a numpy array. + If Y was None, safe_Y will be a pointer to X. + """ + X, Y = check_pairwise_arrays(X, Y) + if X.shape != Y.shape: + raise ValueError( + "X and Y should be of same shape. They were respectively %r and %r long." + % (X.shape, Y.shape) + ) + return X, Y + + +# Pairwise distances +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "Y_norm_squared": ["array-like", None], + "squared": ["boolean"], + "X_norm_squared": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def euclidean_distances( + X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None +): + """ + Compute the distance matrix between each pair from a feature array X and Y. + + For efficiency reasons, the euclidean distance between a pair of row + vector x and y is computed as:: + + dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) + + This formulation has two advantages over other ways of computing distances. + First, it is computationally efficient when dealing with sparse data. + Second, if one argument varies but the other remains unchanged, then + `dot(x, x)` and/or `dot(y, y)` can be pre-computed. + + However, this is not the most precise way of doing this computation, + because this equation potentially suffers from "catastrophic cancellation". + Also, the distance matrix returned by this function may not be exactly + symmetric as required by, e.g., :mod:`scipy.spatial.distance` functions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ + default=None + An array where each row is a sample and each column is a feature. + If `None`, method uses `Y=X`. + + Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \ + or (1, n_samples_Y), default=None + Pre-computed dot-products of vectors in Y (e.g., + ``(Y**2).sum(axis=1)``) + May be ignored in some cases, see the note below. + + squared : bool, default=False + Return squared Euclidean distances. + + X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \ + or (1, n_samples_X), default=None + Pre-computed dot-products of vectors in X (e.g., + ``(X**2).sum(axis=1)``) + May be ignored in some cases, see the note below. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`. + + See Also + -------- + paired_distances : Distances between pairs of elements of X and Y. + + Notes + ----- + To achieve a better accuracy, `X_norm_squared` and `Y_norm_squared` may be + unused if they are passed as `np.float32`. 
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import euclidean_distances + >>> X = [[0, 1], [1, 1]] + >>> # distance between rows of X + >>> euclidean_distances(X, X) + array([[0., 1.], + [1., 0.]]) + >>> # get distance to origin + >>> euclidean_distances(X, [[0, 0]]) + array([[1. ], + [1.41421356]]) + """ + xp, _ = get_namespace(X, Y) + X, Y = check_pairwise_arrays(X, Y) + + if X_norm_squared is not None: + X_norm_squared = check_array(X_norm_squared, ensure_2d=False) + original_shape = X_norm_squared.shape + if X_norm_squared.shape == (X.shape[0],): + X_norm_squared = xp.reshape(X_norm_squared, (-1, 1)) + if X_norm_squared.shape == (1, X.shape[0]): + X_norm_squared = X_norm_squared.T + if X_norm_squared.shape != (X.shape[0], 1): + raise ValueError( + f"Incompatible dimensions for X of shape {X.shape} and " + f"X_norm_squared of shape {original_shape}." + ) + + if Y_norm_squared is not None: + Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False) + original_shape = Y_norm_squared.shape + if Y_norm_squared.shape == (Y.shape[0],): + Y_norm_squared = xp.reshape(Y_norm_squared, (1, -1)) + if Y_norm_squared.shape == (Y.shape[0], 1): + Y_norm_squared = Y_norm_squared.T + if Y_norm_squared.shape != (1, Y.shape[0]): + raise ValueError( + f"Incompatible dimensions for Y of shape {Y.shape} and " + f"Y_norm_squared of shape {original_shape}." + ) + + return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared) + + +def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared=False): + """Computational part of euclidean_distances + + Assumes inputs are already checked. + + If norms are passed as float32, they are unused. If arrays are passed as + float32, norms needs to be recomputed on upcast chunks. + TODO: use a float64 accumulator in row_norms to avoid the latter. + """ + xp, _, device_ = get_namespace_and_device(X, Y) + if X_norm_squared is not None and X_norm_squared.dtype != xp.float32: + XX = xp.reshape(X_norm_squared, (-1, 1)) + elif X.dtype != xp.float32: + XX = row_norms(X, squared=True)[:, None] + else: + XX = None + + if Y is X: + YY = None if XX is None else XX.T + else: + if Y_norm_squared is not None and Y_norm_squared.dtype != xp.float32: + YY = xp.reshape(Y_norm_squared, (1, -1)) + elif Y.dtype != xp.float32: + YY = row_norms(Y, squared=True)[None, :] + else: + YY = None + + if X.dtype == xp.float32 or Y.dtype == xp.float32: + # To minimize precision issues with float32, we compute the distance + # matrix on chunks of X and Y upcast to float64 + distances = _euclidean_distances_upcast(X, XX, Y, YY) + else: + # if dtype is already float64, no need to chunk and upcast + distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True) + distances += XX + distances += YY + + xp_zero = xp.asarray(0, device=device_, dtype=distances.dtype) + distances = _modify_in_place_if_numpy( + xp, xp.maximum, distances, xp_zero, out=distances + ) + + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. 
+ if X is Y: + _fill_or_add_to_diagonal(distances, 0, xp=xp, add_value=False) + + if squared: + return distances + + distances = _modify_in_place_if_numpy(xp, xp.sqrt, distances, out=distances) + return distances + + +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like", None], + "squared": ["boolean"], + "missing_values": [MissingValues(numeric_only=True)], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def nan_euclidean_distances( + X, Y=None, *, squared=False, missing_values=np.nan, copy=True +): + """Calculate the euclidean distances in the presence of missing values. + + Compute the euclidean distance between each pair of samples in X and Y, + where Y=X is assumed if Y=None. When calculating the distance between a + pair of samples, this formulation ignores feature coordinates with a + missing value in either sample and scales up the weight of the remaining + coordinates: + + .. code-block:: text + + dist(x,y) = sqrt(weight * sq. distance from present coordinates) + + where: + + .. code-block:: text + + weight = Total # of coordinates / # of present coordinates + + For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]`` is: + + .. math:: + \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)} + + If all the coordinates are missing or if there are no common present + coordinates then NaN is returned for that pair. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) + An array where each row is a sample and each column is a feature. + + Y : array-like of shape (n_samples_Y, n_features), default=None + An array where each row is a sample and each column is a feature. + If `None`, method uses `Y=X`. + + squared : bool, default=False + Return squared Euclidean distances. + + missing_values : np.nan, float or int, default=np.nan + Representation of missing value. + + copy : bool, default=True + Make and use a deep copy of X and Y (if Y exists). + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`. + + See Also + -------- + paired_distances : Distances between pairs of elements of X and Y. + + References + ---------- + * John K. Dixon, "Pattern Recognition with Partly Missing Data", + IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: + 10, pp. 617 - 621, Oct. 1979. + http://ieeexplore.ieee.org/abstract/document/4310090/ + + Examples + -------- + >>> from sklearn.metrics.pairwise import nan_euclidean_distances + >>> nan = float("NaN") + >>> X = [[0, 1], [1, nan]] + >>> nan_euclidean_distances(X, X) # distance between rows of X + array([[0. , 1.41421356], + [1.41421356, 0. ]]) + + >>> # get distance to origin + >>> nan_euclidean_distances(X, [[0, 0]]) + array([[1. 
], + [1.41421356]]) + """ + + ensure_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True + X, Y = check_pairwise_arrays( + X, Y, accept_sparse=False, ensure_all_finite=ensure_all_finite, copy=copy + ) + # Get missing mask for X + missing_X = _get_mask(X, missing_values) + + # Get missing mask for Y + missing_Y = missing_X if Y is X else _get_mask(Y, missing_values) + + # set missing values to zero + X[missing_X] = 0 + Y[missing_Y] = 0 + + distances = euclidean_distances(X, Y, squared=True) + + # Adjust distances for missing values + XX = X * X + YY = Y * Y + distances -= np.dot(XX, missing_Y.T) + distances -= np.dot(missing_X, YY.T) + + np.clip(distances, 0, None, out=distances) + + if X is Y: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + np.fill_diagonal(distances, 0.0) + + present_X = 1 - missing_X + present_Y = present_X if Y is X else ~missing_Y + present_count = np.dot(present_X, present_Y.T) + distances[present_count == 0] = np.nan + # avoid divide by zero + np.maximum(1, present_count, out=present_count) + distances /= present_count + distances *= X.shape[1] + + if not squared: + np.sqrt(distances, out=distances) + + return distances + + +def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): + """Euclidean distances between X and Y. + + Assumes X and Y have float32 dtype. + Assumes XX and YY have float64 dtype or are None. + + X and Y are upcast to float64 by chunks, which size is chosen to limit + memory increase by approximately 10% (at least 10MiB). + """ + xp, _, device_ = get_namespace_and_device(X, Y) + n_samples_X = X.shape[0] + n_samples_Y = Y.shape[0] + n_features = X.shape[1] + + distances = xp.empty((n_samples_X, n_samples_Y), dtype=xp.float32, device=device_) + + if batch_size is None: + x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1 + y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1 + + # Allow 10% more memory than X, Y and the distance matrix take (at + # least 10MiB) + maxmem = max( + ( + (x_density * n_samples_X + y_density * n_samples_Y) * n_features + + (x_density * n_samples_X * y_density * n_samples_Y) + ) + / 10, + 10 * 2**17, + ) + + # The increase amount of memory in 8-byte blocks is: + # - x_density * batch_size * n_features (copy of chunk of X) + # - y_density * batch_size * n_features (copy of chunk of Y) + # - batch_size * batch_size (chunk of distance matrix) + # Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem + # xd=x_density and yd=y_density + tmp = (x_density + y_density) * n_features + batch_size = (-tmp + math.sqrt(tmp**2 + 4 * maxmem)) / 2 + batch_size = max(int(batch_size), 1) + + x_batches = gen_batches(n_samples_X, batch_size) + xp_max_float = _max_precision_float_dtype(xp=xp, device=device_) + for i, x_slice in enumerate(x_batches): + X_chunk = xp.astype(X[x_slice, :], xp_max_float) + if XX is None: + XX_chunk = row_norms(X_chunk, squared=True)[:, None] + else: + XX_chunk = XX[x_slice] + + y_batches = gen_batches(n_samples_Y, batch_size) + + for j, y_slice in enumerate(y_batches): + if X is Y and j < i: + # when X is Y the distance matrix is symmetric so we only need + # to compute half of it. 
+ d = distances[y_slice, x_slice].T + + else: + Y_chunk = xp.astype(Y[y_slice, :], xp_max_float) + if YY is None: + YY_chunk = row_norms(Y_chunk, squared=True)[None, :] + else: + YY_chunk = YY[:, y_slice] + + d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True) + d += XX_chunk + d += YY_chunk + + distances[x_slice, y_slice] = xp.astype(d, xp.float32, copy=False) + + return distances + + +def _argmin_min_reduce(dist, start): + # `start` is specified in the signature but not used. This is because the higher + # order `pairwise_distances_chunked` function needs reduction functions that are + # passed as argument to have a two arguments signature. + indices = dist.argmin(axis=1) + values = dist[np.arange(dist.shape[0]), indices] + return indices, values + + +def _argmin_reduce(dist, start): + # `start` is specified in the signature but not used. This is because the higher + # order `pairwise_distances_chunked` function needs reduction functions that are + # passed as argument to have a two arguments signature. + return dist.argmin(axis=1) + + +_VALID_METRICS = [ + "euclidean", + "l2", + "l1", + "manhattan", + "cityblock", + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "mahalanobis", + "matching", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalsneath", + "sqeuclidean", + "yule", + "wminkowski", + "nan_euclidean", + "haversine", +] +if sp_base_version < parse_version("1.17"): # pragma: no cover + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + _VALID_METRICS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): # pragma: no cover + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + _VALID_METRICS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + _VALID_METRICS += ["matching"] + +_NAN_METRICS = ["nan_euclidean"] + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "metric": [ + StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())), + callable, + ], + "metric_kwargs": [dict, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def pairwise_distances_argmin_min( + X, Y, *, axis=1, metric="euclidean", metric_kwargs=None +): + """Compute minimum distances between one point and a set of points. + + This function computes for each row in X, the index of the row of Y which + is closest (according to the specified distance). The minimal distances are + also returned. + + This is mostly equivalent to calling:: + + (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), + pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) + + but uses much less memory, and is faster for large arrays. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Array containing points. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + Array containing points. + + axis : int, default=1 + Axis along which the argmin and distances are to be computed. + + metric : str or callable, default='euclidean' + Metric to use for distance computation. Any metric from scikit-learn + or :mod:`scipy.spatial.distance` can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. 
The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'nan_euclidean'] + + - from :mod:`scipy.spatial.distance`: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for :mod:`scipy.spatial.distance` for details on these + metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + .. note:: + `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead). + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + argmin : ndarray + Y[argmin[i], :] is the row in Y that is closest to X[i, :]. + + distances : ndarray + The array of minimum distances. `distances[i]` is the distance between + the i-th row in X and the argmin[i]-th row in Y. + + See Also + -------- + pairwise_distances : Distances between every pair of samples of X and Y. + pairwise_distances_argmin : Same as `pairwise_distances_argmin_min` but only + returns the argmins. + + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_distances_argmin_min + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> argmin, distances = pairwise_distances_argmin_min(X, Y) + >>> argmin + array([0, 1]) + >>> distances + array([1., 1.]) + """ + ensure_all_finite = "allow-nan" if metric == "nan_euclidean" else True + X, Y = check_pairwise_arrays(X, Y, ensure_all_finite=ensure_all_finite) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if ArgKmin.is_usable_for(X, Y, metric): + # This is an adaptor for one "sqeuclidean" specification. + # For this backend, we can directly use "sqeuclidean". + if metric_kwargs.get("squared", False) and metric == "euclidean": + metric = "sqeuclidean" + metric_kwargs = {} + + values, indices = ArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=True, + ) + values = values.flatten() + indices = indices.flatten() + else: + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. + + # Turn off check for finiteness because this is costly and because arrays + # have already been validated. 
+ with config_context(assume_finite=True): + indices, values = zip( + *pairwise_distances_chunked( + X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + ) + ) + indices = np.concatenate(indices) + values = np.concatenate(values) + + return indices, values + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "metric": [ + StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())), + callable, + ], + "metric_kwargs": [dict, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): + """Compute minimum distances between one point and a set of points. + + This function computes for each row in X, the index of the row of Y which + is closest (according to the specified distance). + + This is mostly equivalent to calling:: + + pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis) + + but uses much less memory, and is faster for large arrays. + + This function works with dense 2D arrays only. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Array containing points. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) + Arrays containing points. + + axis : int, default=1 + Axis along which the argmin and distances are to be computed. + + metric : str or callable, default="euclidean" + Metric to use for distance computation. Any metric from scikit-learn + or :mod:`scipy.spatial.distance` can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'nan_euclidean'] + + - from :mod:`scipy.spatial.distance`: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', + 'yule'] + + See the documentation for :mod:`scipy.spatial.distance` for details on these + metrics. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + .. note:: + `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead). + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + argmin : numpy.ndarray + Y[argmin[i], :] is the row in Y that is closest to X[i, :]. + + See Also + -------- + pairwise_distances : Distances between every pair of samples of X and Y. + pairwise_distances_argmin_min : Same as `pairwise_distances_argmin` but also + returns the distances. 
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_distances_argmin + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> pairwise_distances_argmin(X, Y) + array([0, 1]) + """ + ensure_all_finite = "allow-nan" if metric == "nan_euclidean" else True + X, Y = check_pairwise_arrays(X, Y, ensure_all_finite=ensure_all_finite) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if ArgKmin.is_usable_for(X, Y, metric): + # This is an adaptor for one "sqeuclidean" specification. + # For this backend, we can directly use "sqeuclidean". + if metric_kwargs.get("squared", False) and metric == "euclidean": + metric = "sqeuclidean" + metric_kwargs = {} + + indices = ArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=False, + ) + indices = indices.flatten() + else: + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. + + # Turn off check for finiteness because this is costly and because arrays + # have already been validated. + with config_context(assume_finite=True): + indices = np.concatenate( + list( + # This returns a np.ndarray generator whose arrays we need + # to flatten into one. + pairwise_distances_chunked( + X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs + ) + ) + ) + + return indices + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix", None]}, + prefer_skip_nested_validation=True, +) +def haversine_distances(X, Y=None): + """Compute the Haversine distance between samples in X and Y. + + The Haversine (or great circle) distance is the angular distance between + two points on the surface of a sphere. The first coordinate of each point + is assumed to be the latitude, the second is the longitude, given + in radians. The dimension of the data must be 2. + + .. math:: + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2) + + \\cos(x_{lat})\\cos(y_{lat})\\ + sin^2((x_{lon} - y_{lon}) / 2)}] + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, 2) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, 2), default=None + An optional second feature array. If `None`, uses `Y=X`. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + The distance matrix. + + Notes + ----- + As the Earth is nearly spherical, the haversine formula provides a good + approximation of the distance between two points of the Earth surface, with + a less than 1% error on average. + + Examples + -------- + We want to calculate the distance between the Ezeiza Airport + (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, + France). + + >>> from sklearn.metrics.pairwise import haversine_distances + >>> from math import radians + >>> bsas = [-34.83333, -58.5166646] + >>> paris = [49.0083899664, 2.53844117956] + >>> bsas_in_radians = [radians(_) for _ in bsas] + >>> paris_in_radians = [radians(_) for _ in paris] + >>> result = haversine_distances([bsas_in_radians, paris_in_radians]) + >>> result * 6371000/1000 # multiply by Earth radius to get kilometers + array([[ 0. , 11099.54035582], + [11099.54035582, 0. 
]]) + """ + from ..metrics import DistanceMetric + + return DistanceMetric.get_metric("haversine").pairwise(X, Y) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + }, + prefer_skip_nested_validation=True, +) +def manhattan_distances(X, Y=None): + """Compute the L1 distances between the vectors in X and Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + An array where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An array where each row is a sample and each column is a feature. + If `None`, method uses `Y=X`. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Pairwise L1 distances. + + Notes + ----- + When X and/or Y are CSR sparse matrices and they are not already + in canonical format, this function modifies them in-place to + make them canonical. + + Examples + -------- + >>> from sklearn.metrics.pairwise import manhattan_distances + >>> manhattan_distances([[3]], [[3]]) + array([[0.]]) + >>> manhattan_distances([[3]], [[2]]) + array([[1.]]) + >>> manhattan_distances([[2]], [[3]]) + array([[1.]]) + >>> manhattan_distances([[1, 2], [3, 4]],\ + [[1, 2], [0, 3]]) + array([[0., 2.], + [4., 4.]]) + """ + X, Y = check_pairwise_arrays(X, Y) + + if issparse(X) or issparse(Y): + X = csr_matrix(X, copy=False) + Y = csr_matrix(Y, copy=False) + X.sum_duplicates() # this also sorts indices in-place + Y.sum_duplicates() + D = np.zeros((X.shape[0], Y.shape[0])) + _sparse_manhattan(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, D) + return D + + return distance.cdist(X, Y, "cityblock") + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + }, + prefer_skip_nested_validation=True, +) +def cosine_distances(X, Y=None): + """Compute cosine distance between samples in X and Y. + + Cosine distance is defined as 1.0 minus the cosine similarity. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Matrix `X`. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ + default=None + Matrix `Y`. + + Returns + ------- + distances : ndarray of shape (n_samples_X, n_samples_Y) + Returns the cosine distance between samples in X and Y. + + See Also + -------- + cosine_similarity : Compute cosine similarity between samples in X and Y. + scipy.spatial.distance.cosine : Dense matrices only. + + Examples + -------- + >>> from sklearn.metrics.pairwise import cosine_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> cosine_distances(X, Y) + array([[1. , 1. ], + [0.422, 0.183]]) + """ + xp, _ = get_namespace(X, Y) + + # 1.0 - cosine_similarity(X, Y) without copy + S = cosine_similarity(X, Y) + S *= -1 + S += 1 + S = xp.clip(S, 0.0, 2.0) + if X is Y or Y is None: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + _fill_or_add_to_diagonal(S, 0.0, xp, add_value=False) + return S + + +# Paired distances +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=True, +) +def paired_euclidean_distances(X, Y): + """Compute the paired euclidean distances between X and Y. 
+ + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input array/matrix X. + + Y : {array-like, sparse matrix} of shape (n_samples, n_features) + Input array/matrix Y. + + Returns + ------- + distances : ndarray of shape (n_samples,) + Output array/matrix containing the calculated paired euclidean + distances. + + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_euclidean_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> paired_euclidean_distances(X, Y) + array([1., 1.]) + """ + X, Y = check_paired_arrays(X, Y) + return row_norms(X - Y) + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=True, +) +def paired_manhattan_distances(X, Y): + """Compute the paired L1 distances between X and Y. + + Distances are calculated between (X[0], Y[0]), (X[1], Y[1]), ..., + (X[n_samples], Y[n_samples]). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + An array-like where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples, n_features) + An array-like where each row is a sample and each column is a feature. + + Returns + ------- + distances : ndarray of shape (n_samples,) + L1 paired distances between the row vectors of `X` + and the row vectors of `Y`. + + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_manhattan_distances + >>> import numpy as np + >>> X = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]]) + >>> Y = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]]) + >>> paired_manhattan_distances(X, Y) + array([1., 2., 1.]) + """ + X, Y = check_paired_arrays(X, Y) + diff = X - Y + if issparse(diff): + diff.data = np.abs(diff.data) + return np.squeeze(np.array(diff.sum(axis=1))) + else: + return np.abs(diff).sum(axis=-1) + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]}, + prefer_skip_nested_validation=True, +) +def paired_cosine_distances(X, Y): + """ + Compute the paired cosine distances between X and Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + An array where each row is a sample and each column is a feature. + + Y : {array-like, sparse matrix} of shape (n_samples, n_features) + An array where each row is a sample and each column is a feature. + + Returns + ------- + distances : ndarray of shape (n_samples,) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`, where `distances[i]` is the + distance between `X[i]` and `Y[i]`. + + Notes + ----- + The cosine distance is equivalent to the half the squared + euclidean distance if each sample is normalized to unit norm. 
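A quick numerical check of this note (an illustrative sketch; cosine similarity here is <x, y> / (||x|| * ||y||)):

>>> import numpy as np
>>> from sklearn.preprocessing import normalize
>>> from sklearn.metrics.pairwise import paired_cosine_distances
>>> X = np.array([[1.0, 0.0], [2.0, 2.0]])
>>> Y = np.array([[1.0, 1.0], [0.0, 3.0]])
>>> half_sq_euclidean = 0.5 * np.sum((normalize(X) - normalize(Y)) ** 2, axis=1)
>>> bool(np.allclose(paired_cosine_distances(X, Y), half_sq_euclidean))
True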
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_cosine_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> paired_cosine_distances(X, Y) + array([0.5 , 0.184]) + """ + X, Y = check_paired_arrays(X, Y) + return 0.5 * row_norms(normalize(X) - normalize(Y), squared=True) + + +PAIRED_DISTANCES = { + "cosine": paired_cosine_distances, + "euclidean": paired_euclidean_distances, + "l2": paired_euclidean_distances, + "l1": paired_manhattan_distances, + "manhattan": paired_manhattan_distances, + "cityblock": paired_manhattan_distances, +} + + +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like"], + "metric": [StrOptions(set(PAIRED_DISTANCES)), callable], + }, + prefer_skip_nested_validation=True, +) +def paired_distances(X, Y, *, metric="euclidean", **kwds): + """ + Compute the paired distances between X and Y. + + Compute the distances between (X[0], Y[0]), (X[1], Y[1]), etc... + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Array 1 for distance computation. + + Y : ndarray of shape (n_samples, n_features) + Array 2 for distance computation. + + metric : str or callable, default="euclidean" + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + specified in PAIRED_DISTANCES, including "euclidean", + "manhattan", or "cosine". + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from `X` as input and return a value indicating + the distance between them. + + **kwds : dict + Unused parameters. + + Returns + ------- + distances : ndarray of shape (n_samples,) + Returns the distances between the row vectors of `X` + and the row vectors of `Y`. + + See Also + -------- + sklearn.metrics.pairwise_distances : Computes the distance between every pair of + samples. + + Examples + -------- + >>> from sklearn.metrics.pairwise import paired_distances + >>> X = [[0, 1], [1, 1]] + >>> Y = [[0, 1], [2, 1]] + >>> paired_distances(X, Y) + array([0., 1.]) + """ + + if metric in PAIRED_DISTANCES: + func = PAIRED_DISTANCES[metric] + return func(X, Y) + elif callable(metric): + # Check the matrix first (it is usually done by the metric) + X, Y = check_paired_arrays(X, Y) + distances = np.zeros(len(X)) + for i in range(len(X)): + distances[i] = metric(X[i], Y[i]) + return distances + + +# Kernels +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "dense_output": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def linear_kernel(X, Y=None, dense_output=True): + """ + Compute the linear kernel between X and Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + dense_output : bool, default=True + Whether to return dense output even when the input is sparse. If + ``False``, the output is sparse if both input arrays are sparse. + + .. versionadded:: 0.20 + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The Gram matrix of the linear kernel, i.e. `X @ Y.T`. 
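As a small illustration of ``dense_output`` (an illustrative sketch): with two sparse inputs and ``dense_output=False``, the Gram matrix comes back in sparse format.

>>> import numpy as np
>>> from scipy.sparse import csr_matrix, issparse
>>> from sklearn.metrics.pairwise import linear_kernel
>>> X = csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0]]))
>>> K = linear_kernel(X, X, dense_output=False)
>>> issparse(K)
True
>>> K.toarray()
array([[1., 0.],
       [0., 4.]])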
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import linear_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> linear_kernel(X, Y) + array([[0., 0.], + [1., 2.]]) + """ + X, Y = check_pairwise_arrays(X, Y) + return safe_sparse_dot(X, Y.T, dense_output=dense_output) + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "degree": [Interval(Real, 1, None, closed="left")], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + Hidden(np.ndarray), + ], + "coef0": [Interval(Real, None, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1): + """ + Compute the polynomial kernel between X and Y. + + .. code-block:: text + + K(X, Y) = (gamma + coef0) ^ degree + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + degree : float, default=3 + Kernel degree. + + gamma : float, default=None + Coefficient of the vector inner product. If None, defaults to 1.0 / n_features. + + coef0 : float, default=1 + Constant offset added to scaled inner product. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The polynomial kernel. + + Examples + -------- + >>> from sklearn.metrics.pairwise import polynomial_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> polynomial_kernel(X, Y, degree=2) + array([[1. , 1. ], + [1.77, 2.77]]) + """ + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = safe_sparse_dot(X, Y.T, dense_output=True) + K *= gamma + K += coef0 + K **= degree + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + Hidden(np.ndarray), + ], + "coef0": [Interval(Real, None, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def sigmoid_kernel(X, Y=None, gamma=None, coef0=1): + """Compute the sigmoid kernel between X and Y. + + .. code-block:: text + + K(X, Y) = tanh(gamma + coef0) + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=None + Coefficient of the vector inner product. If None, defaults to 1.0 / n_features. + + coef0 : float, default=1 + Constant offset added to scaled inner product. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + Sigmoid kernel between two arrays. 
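Written out with the inner product made explicit, the two kernels above are K(x, y) = (gamma <x, y> + coef0) ** degree and K(x, y) = tanh(gamma <x, y> + coef0). A small sketch checking both against a by-hand computation (gamma defaults to 1 / n_features when left as None):

>>> import numpy as np
>>> from sklearn.metrics.pairwise import polynomial_kernel, sigmoid_kernel
>>> x = np.array([[1.0, 1.0, 1.0]])
>>> y = np.array([[1.0, 1.0, 0.0]])
>>> gamma = 1.0 / x.shape[1]
>>> bool(np.allclose(polynomial_kernel(x, y, degree=2),
...                  (gamma * (x @ y.T) + 1) ** 2))
True
>>> bool(np.allclose(sigmoid_kernel(x, y),
...                  np.tanh(gamma * (x @ y.T) + 1)))
True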
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import sigmoid_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> sigmoid_kernel(X, Y) + array([[0.76, 0.76], + [0.87, 0.93]]) + """ + xp, _ = get_namespace(X, Y) + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = safe_sparse_dot(X, Y.T, dense_output=True) + K *= gamma + K += coef0 + # compute tanh in-place for numpy + K = _modify_in_place_if_numpy(xp, xp.tanh, K, out=K) + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "gamma": [ + Interval(Real, 0, None, closed="left"), + None, + Hidden(np.ndarray), + ], + }, + prefer_skip_nested_validation=True, +) +def rbf_kernel(X, Y=None, gamma=None): + """Compute the rbf (gaussian) kernel between X and Y. + + .. code-block:: text + + K(x, y) = exp(-gamma ||x-y||^2) + + for each pair of rows x in X and y in Y. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=None + If None, defaults to 1.0 / n_features. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The RBF kernel. + + Examples + -------- + >>> from sklearn.metrics.pairwise import rbf_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> rbf_kernel(X, Y) + array([[0.71, 0.51], + [0.51, 0.71]]) + """ + xp, _ = get_namespace(X, Y) + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = euclidean_distances(X, Y, squared=True) + K *= -gamma + # exponentiate K in-place when using numpy + K = _modify_in_place_if_numpy(xp, xp.exp, K, out=K) + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "gamma": [ + Interval(Real, 0, None, closed="neither"), + Hidden(np.ndarray), + None, + ], + }, + prefer_skip_nested_validation=True, +) +def laplacian_kernel(X, Y=None, gamma=None): + """Compute the laplacian kernel between X and Y. + + The laplacian kernel is defined as: + + .. code-block:: text + + K(x, y) = exp(-gamma ||x-y||_1) + + for each pair of rows x in X and y in Y. + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + A feature array. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=None + If None, defaults to 1.0 / n_features. Otherwise it should be strictly positive. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The kernel matrix. 
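Both kernels are an element-wise exponential of a negated, scaled distance matrix, which can be checked directly against the distance helpers defined earlier in this module (an illustrative sketch):

>>> import numpy as np
>>> from sklearn.metrics.pairwise import (euclidean_distances,
...                                       manhattan_distances,
...                                       rbf_kernel, laplacian_kernel)
>>> X = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
>>> Y = np.array([[1.0, 0.0, 0.0], [1.0, 1.0, 0.0]])
>>> gamma = 0.5
>>> bool(np.allclose(rbf_kernel(X, Y, gamma=gamma),
...                  np.exp(-gamma * euclidean_distances(X, Y, squared=True))))
True
>>> bool(np.allclose(laplacian_kernel(X, Y, gamma=gamma),
...                  np.exp(-gamma * manhattan_distances(X, Y))))
True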
+ + Examples + -------- + >>> from sklearn.metrics.pairwise import laplacian_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> laplacian_kernel(X, Y) + array([[0.71, 0.51], + [0.51, 0.71]]) + """ + X, Y = check_pairwise_arrays(X, Y) + if gamma is None: + gamma = 1.0 / X.shape[1] + + K = -gamma * manhattan_distances(X, Y) + np.exp(K, K) # exponentiate K in-place + return K + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "dense_output": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def cosine_similarity(X, Y=None, dense_output=True): + """Compute cosine similarity between samples in X and Y. + + Cosine similarity, or the cosine kernel, computes similarity as the + normalized dot product of X and Y: + + .. code-block:: text + + K(X, Y) = / (||X||*||Y||) + + On L2-normalized data, this function is equivalent to linear_kernel. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) + Input data. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ + default=None + Input data. If ``None``, the output will be the pairwise + similarities between all samples in ``X``. + + dense_output : bool, default=True + Whether to return dense output even when the input is sparse. If + ``False``, the output is sparse if both input arrays are sparse. + + .. versionadded:: 0.17 + parameter ``dense_output`` for dense output. + + Returns + ------- + similarities : ndarray or sparse matrix of shape (n_samples_X, n_samples_Y) + Returns the cosine similarity between samples in X and Y. + + Examples + -------- + >>> from sklearn.metrics.pairwise import cosine_similarity + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> cosine_similarity(X, Y) + array([[0. , 0. ], + [0.577, 0.816]]) + """ + X, Y = check_pairwise_arrays(X, Y) + + X_normalized = normalize(X, copy=True) + if X is Y: + Y_normalized = X_normalized + else: + Y_normalized = normalize(Y, copy=True) + + K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output) + + return K + + +@validate_params( + {"X": ["array-like"], "Y": ["array-like", None]}, + prefer_skip_nested_validation=True, +) +def additive_chi2_kernel(X, Y=None): + """Compute the additive chi-squared kernel between observations in X and Y. + + The chi-squared kernel is computed between each pair of rows in X and Y. X + and Y have to be non-negative. This kernel is most commonly applied to + histograms. + + The chi-squared kernel is given by: + + .. code-block:: text + + k(x, y) = -Sum [(x - y)^2 / (x + y)] + + It can be interpreted as a weighted difference per entry. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) + A feature array. + + Y : array-like of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + Returns + ------- + kernel : array-like of shape (n_samples_X, n_samples_Y) + The kernel matrix. + + See Also + -------- + chi2_kernel : The exponentiated version of the kernel, which is usually + preferable. + sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation + to this kernel. + + Notes + ----- + As the negative of a distance, this kernel is only conditionally positive + definite. + + References + ---------- + * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. 
+ Local features and kernels for classification of texture and object + categories: A comprehensive study + International Journal of Computer Vision 2007 + https://hal.archives-ouvertes.fr/hal-00171412/document + + Examples + -------- + >>> from sklearn.metrics.pairwise import additive_chi2_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> additive_chi2_kernel(X, Y) + array([[-1., -2.], + [-2., -1.]]) + """ + xp, _, device_ = get_namespace_and_device(X, Y) + X, Y = check_pairwise_arrays(X, Y, accept_sparse=False) + if xp.any(X < 0): + raise ValueError("X contains negative values.") + if Y is not X and xp.any(Y < 0): + raise ValueError("Y contains negative values.") + + if _is_numpy_namespace(xp): + result = np.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype) + _chi2_kernel_fast(X, Y, result) + return result + else: + dtype = _find_matching_floating_dtype(X, Y, xp=xp) + xb = X[:, None, :] + yb = Y[None, :, :] + nom = -((xb - yb) ** 2) + denom = xb + yb + nom = xp.where(denom == 0, xp.asarray(0, dtype=dtype, device=device_), nom) + denom = xp.where(denom == 0, xp.asarray(1, dtype=dtype, device=device_), denom) + return xp.sum(nom / denom, axis=2) + + +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like", None], + "gamma": [Interval(Real, 0, None, closed="neither"), Hidden(np.ndarray)], + }, + prefer_skip_nested_validation=True, +) +def chi2_kernel(X, Y=None, gamma=1.0): + """Compute the exponential chi-squared kernel between X and Y. + + The chi-squared kernel is computed between each pair of rows in X and Y. X + and Y have to be non-negative. This kernel is most commonly applied to + histograms. + + The chi-squared kernel is given by: + + .. code-block:: text + + k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)]) + + It can be interpreted as a weighted difference per entry. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples_X, n_features) + A feature array. + + Y : array-like of shape (n_samples_Y, n_features), default=None + An optional second feature array. If `None`, uses `Y=X`. + + gamma : float, default=1 + Scaling parameter of the chi2 kernel. + + Returns + ------- + kernel : ndarray of shape (n_samples_X, n_samples_Y) + The kernel matrix. + + See Also + -------- + additive_chi2_kernel : The additive version of this kernel. + sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation + to the additive version of this kernel. + + References + ---------- + * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. + Local features and kernels for classification of texture and object + categories: A comprehensive study + International Journal of Computer Vision 2007 + https://hal.archives-ouvertes.fr/hal-00171412/document + + Examples + -------- + >>> from sklearn.metrics.pairwise import chi2_kernel + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> chi2_kernel(X, Y) + array([[0.368, 0.135], + [0.135, 0.368]]) + """ + xp, _ = get_namespace(X, Y) + K = additive_chi2_kernel(X, Y) + K *= gamma + if _is_numpy_namespace(xp): + return np.exp(K, out=K) + return xp.exp(K) + + +# Helper functions - distance +PAIRWISE_DISTANCE_FUNCTIONS = { + # If updating this dictionary, update the doc in both distance_metrics() + # and also in pairwise_distances()! 
+ "cityblock": manhattan_distances, + "cosine": cosine_distances, + "euclidean": euclidean_distances, + "haversine": haversine_distances, + "l2": euclidean_distances, + "l1": manhattan_distances, + "manhattan": manhattan_distances, + "precomputed": None, # HACK: precomputed is always allowed, never called + "nan_euclidean": nan_euclidean_distances, +} + + +def distance_metrics(): + """Valid metrics for pairwise_distances. + + This function simply returns the valid pairwise distance metrics. + It exists to allow for a description of the mapping for + each of the valid strings. + + The valid distance metrics, and the function they map to, are: + + =============== ======================================== + metric Function + =============== ======================================== + 'cityblock' metrics.pairwise.manhattan_distances + 'cosine' metrics.pairwise.cosine_distances + 'euclidean' metrics.pairwise.euclidean_distances + 'haversine' metrics.pairwise.haversine_distances + 'l1' metrics.pairwise.manhattan_distances + 'l2' metrics.pairwise.euclidean_distances + 'manhattan' metrics.pairwise.manhattan_distances + 'nan_euclidean' metrics.pairwise.nan_euclidean_distances + =============== ======================================== + + Read more in the :ref:`User Guide `. + + Returns + ------- + distance_metrics : dict + Returns valid metrics for pairwise_distances. + """ + return PAIRWISE_DISTANCE_FUNCTIONS + + +def _dist_wrapper(dist_func, dist_matrix, slice_, *args, **kwargs): + """Write in-place to a slice of a distance matrix.""" + dist_matrix[:, slice_] = dist_func(*args, **kwargs) + + +def _parallel_pairwise(X, Y, func, n_jobs, **kwds): + """Break the pairwise matrix in n_jobs even slices + and compute them using multithreading.""" + + if Y is None: + Y = X + X, Y, dtype = _return_float_dtype(X, Y) + + if effective_n_jobs(n_jobs) == 1: + return func(X, Y, **kwds) + + # enforce a threading backend to prevent data communication overhead + fd = delayed(_dist_wrapper) + ret = np.empty((X.shape[0], Y.shape[0]), dtype=dtype, order="F") + Parallel(backend="threading", n_jobs=n_jobs)( + fd(func, ret, s, X, Y[s], **kwds) + for s in gen_even_slices(_num_samples(Y), effective_n_jobs(n_jobs)) + ) + + if (X is Y or Y is None) and func is euclidean_distances: + # zeroing diagonal for euclidean norm. + # TODO: do it also for other norms. + np.fill_diagonal(ret, 0) + + return ret + + +def _pairwise_callable(X, Y, metric, ensure_all_finite=True, **kwds): + """Handle the callable case for pairwise_{distances,kernels}.""" + X, Y = check_pairwise_arrays( + X, + Y, + dtype=None, + ensure_all_finite=ensure_all_finite, + # No input dimension checking done for custom metrics (left to user) + ensure_2d=False, + ) + + if X is Y: + # Only calculate metric for upper triangle + out = np.zeros((X.shape[0], Y.shape[0]), dtype="float") + iterator = itertools.combinations(range(X.shape[0]), 2) + for i, j in iterator: + # scipy has not yet implemented 1D sparse slices; once implemented this can + # be removed and `arr[ind]` can be simply used. + x = X[[i], :] if issparse(X) else X[i] + y = Y[[j], :] if issparse(Y) else Y[j] + out[i, j] = metric(x, y, **kwds) + + # Make symmetric + # NB: out += out.T will produce incorrect results + out = out + out.T + + # Calculate diagonal + # NB: nonzero diagonals are allowed for both metrics and kernels + for i in range(X.shape[0]): + # scipy has not yet implemented 1D sparse slices; once implemented this can + # be removed and `arr[ind]` can be simply used. 
+ x = X[[i], :] if issparse(X) else X[i] + out[i, i] = metric(x, x, **kwds) + + else: + # Calculate all cells + out = np.empty((X.shape[0], Y.shape[0]), dtype="float") + iterator = itertools.product(range(X.shape[0]), range(Y.shape[0])) + for i, j in iterator: + # scipy has not yet implemented 1D sparse slices; once implemented this can + # be removed and `arr[ind]` can be simply used. + x = X[[i], :] if issparse(X) else X[i] + y = Y[[j], :] if issparse(Y) else Y[j] + out[i, j] = metric(x, y, **kwds) + + return out + + +def _check_chunk_size(reduced, chunk_size): + """Checks chunk is a sequence of expected size or a tuple of same.""" + if reduced is None: + return + is_tuple = isinstance(reduced, tuple) + if not is_tuple: + reduced = (reduced,) + if any(isinstance(r, tuple) or not hasattr(r, "__iter__") for r in reduced): + raise TypeError( + "reduce_func returned %r. Expected sequence(s) of length %d." + % (reduced if is_tuple else reduced[0], chunk_size) + ) + if any(_num_samples(r) != chunk_size for r in reduced): + actual_size = tuple(_num_samples(r) for r in reduced) + raise ValueError( + "reduce_func returned object of length %s. " + "Expected same length as input: %d." + % (actual_size if is_tuple else actual_size[0], chunk_size) + ) + + +def _precompute_metric_params(X, Y, metric=None, **kwds): + """Precompute data-derived metric parameters if not provided.""" + if metric == "seuclidean" and "V" not in kwds: + if X is Y: + V = np.var(X, axis=0, ddof=1) + else: + raise ValueError( + "The 'V' parameter is required for the seuclidean metric " + "when Y is passed." + ) + return {"V": V} + if metric == "mahalanobis" and "VI" not in kwds: + if X is Y: + VI = np.linalg.inv(np.cov(X.T)).T + else: + raise ValueError( + "The 'VI' parameter is required for the mahalanobis metric " + "when Y is passed." + ) + return {"VI": VI} + return {} + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "reduce_func": [callable, None], + "metric": [StrOptions({"precomputed"}.union(_VALID_METRICS)), callable], + "n_jobs": [Integral, None], + "working_memory": [Interval(Real, 0, None, closed="left"), None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def pairwise_distances_chunked( + X, + Y=None, + *, + reduce_func=None, + metric="euclidean", + n_jobs=None, + working_memory=None, + **kwds, +): + """Generate a distance matrix chunk by chunk with optional reduction. + + In cases where not all of a pairwise distance matrix needs to be + stored at once, this is used to calculate pairwise distances in + ``working_memory``-sized chunks. If ``reduce_func`` is given, it is + run on each chunk and its return values are concatenated into lists, + arrays or sparse matrices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_features) + Array of pairwise distances between samples, or a feature array. + The shape the array should be (n_samples_X, n_samples_X) if + metric='precomputed' and (n_samples_X, n_features) otherwise. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. Only allowed if + metric != "precomputed". + + reduce_func : callable, default=None + The function which is applied on each chunk of the distance matrix, + reducing it to needed values. 
``reduce_func(D_chunk, start)`` + is called repeatedly, where ``D_chunk`` is a contiguous vertical + slice of the pairwise distance matrix, starting at row ``start``. + It should return one of: None; an array, a list, or a sparse matrix + of length ``D_chunk.shape[0]``; or a tuple of such objects. + Returning None is useful for in-place operations, rather than + reductions. + + If None, pairwise_distances_chunked returns a generator of vertical + chunks of the distance matrix. + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`scipy.spatial.distance.pdist` for its metric parameter, + or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on + each pair of instances (rows) and the resulting value recorded. + The callable should take two arrays from X as input and return a + value indicating the distance between them. + + n_jobs : int, default=None + The number of jobs to use for the computation. This works by + breaking down the pairwise matrix into n_jobs even slices and + computing them in parallel. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + working_memory : float, default=None + The sought maximum memory for temporary distance matrix chunks. + When None (default), the value of + ``sklearn.get_config()['working_memory']`` is used. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a :mod:`scipy.spatial.distance` metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Yields + ------ + D_chunk : {ndarray, sparse matrix} + A contiguous slice of distance matrix, optionally processed by + ``reduce_func``. + + Examples + -------- + Without reduce_func: + + >>> import numpy as np + >>> from sklearn.metrics import pairwise_distances_chunked + >>> X = np.random.RandomState(0).rand(5, 3) + >>> D_chunk = next(pairwise_distances_chunked(X)) + >>> D_chunk + array([[0. , 0.295, 0.417, 0.197, 0.572], + [0.295, 0. , 0.576, 0.419, 0.764], + [0.417, 0.576, 0. , 0.449, 0.903], + [0.197, 0.419, 0.449, 0. , 0.512], + [0.572, 0.764, 0.903, 0.512, 0. ]]) + + Retrieve all neighbors and average distance within radius r: + + >>> r = .2 + >>> def reduce_func(D_chunk, start): + ... neigh = [np.flatnonzero(d < r) for d in D_chunk] + ... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1) + ... return neigh, avg_dist + >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func) + >>> neigh, avg_dist = next(gen) + >>> neigh + [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])] + >>> avg_dist + array([0.039, 0. , 0. , 0.039, 0. ]) + + Where r is defined per sample, we need to make use of ``start``: + + >>> r = [.2, .4, .4, .3, .1] + >>> def reduce_func(D_chunk, start): + ... neigh = [np.flatnonzero(d < r[i]) + ... for i, d in enumerate(D_chunk, start)] + ... return neigh + >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func)) + >>> neigh + [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])] + + Force row-by-row generation by reducing ``working_memory``: + + >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func, + ... 
working_memory=0) + >>> next(gen) + [array([0, 3])] + >>> next(gen) + [array([0, 1])] + """ + n_samples_X = _num_samples(X) + if metric == "precomputed": + slices = (slice(0, n_samples_X),) + else: + if Y is None: + Y = X + # We get as many rows as possible within our working_memory budget to + # store len(Y) distances in each row of output. + # + # Note: + # - this will get at least 1 row, even if 1 row of distances will + # exceed working_memory. + # - this does not account for any temporary memory usage while + # calculating distances (e.g. difference of vectors in manhattan + # distance. + chunk_n_rows = get_chunk_n_rows( + row_bytes=8 * _num_samples(Y), + max_n_rows=n_samples_X, + working_memory=working_memory, + ) + slices = gen_batches(n_samples_X, chunk_n_rows) + + # precompute data-derived metric params + params = _precompute_metric_params(X, Y, metric=metric, **kwds) + kwds.update(**params) + + for sl in slices: + if sl.start == 0 and sl.stop == n_samples_X: + X_chunk = X # enable optimised paths for X is Y + else: + X_chunk = X[sl] + D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds) + if (X is Y or Y is None) and PAIRWISE_DISTANCE_FUNCTIONS.get( + metric, None + ) is euclidean_distances: + # zeroing diagonal, taking care of aliases of "euclidean", + # i.e. "l2" + D_chunk.flat[sl.start :: _num_samples(X) + 1] = 0 + if reduce_func is not None: + chunk_size = D_chunk.shape[0] + D_chunk = reduce_func(D_chunk, sl.start) + _check_chunk_size(D_chunk, chunk_size) + yield D_chunk + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], + "n_jobs": [Integral, None], + "force_all_finite": [ + "boolean", + StrOptions({"allow-nan"}), + Hidden(StrOptions({"deprecated"})), + ], + "ensure_all_finite": ["boolean", StrOptions({"allow-nan"}), Hidden(None)], + }, + prefer_skip_nested_validation=True, +) +def pairwise_distances( + X, + Y=None, + metric="euclidean", + *, + n_jobs=None, + force_all_finite="deprecated", + ensure_all_finite=None, + **kwds, +): + """Compute the distance matrix from a feature array X and optional Y. + + This function takes one or two feature arrays or a distance matrix, and returns + a distance matrix. + + - If `X` is a feature array, of shape (n_samples_X, n_features), and: + + - `Y` is `None` and `metric` is not 'precomputed', the pairwise distances + between `X` and itself are returned. + - `Y` is a feature array of shape (n_samples_Y, n_features), the pairwise + distances between `X` and `Y` is returned. + + - If `X` is a distance matrix, of shape (n_samples_X, n_samples_X), `metric` + should be 'precomputed'. `Y` is thus ignored and `X` is returned as is. + + If the input is a collection of non-numeric data (e.g. a list of strings or a + boolean array), a custom metric must be passed. + + This method provides a safe way to take a distance matrix as input, while + preserving compatibility with many other algorithms that take a vector + array. + + Valid values for metric are: + + - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'nan_euclidean']. All metrics support sparse matrix + inputs except 'nan_euclidean'. 
+ + - From :mod:`scipy.spatial.distance`: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', + 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', + 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']. + These metrics do not support sparse matrix inputs. + + .. note:: + `'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11. + + .. note:: + `'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead). + + Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are + valid :mod:`scipy.spatial.distance` metrics), the scikit-learn implementation + will be used, which is faster and has support for sparse matrices (except + for 'cityblock'). For a verbose description of the metrics from + scikit-learn, see :func:`sklearn.metrics.pairwise.distance_metrics` + function. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_features) + Array of pairwise distances between samples, or a feature array. + The shape of the array should be (n_samples_X, n_samples_X) if + metric == "precomputed" and (n_samples_X, n_features) otherwise. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + An optional second feature array. Only allowed if + metric != "precomputed". + + metric : str or callable, default='euclidean' + The metric to use when calculating distance between instances in a + feature array. If metric is a string, it must be one of the options + allowed by :func:`scipy.spatial.distance.pdist` for its metric parameter, or + a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. + If metric is "precomputed", X is assumed to be a distance matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays from X as input and return a value indicating + the distance between them. + + n_jobs : int, default=None + The number of jobs to use for the computation. This works by breaking + down the pairwise matrix into n_jobs even slices and computing them + using multithreading. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + The "euclidean" and "cosine" metrics rely heavily on BLAS which is already + multithreaded. So, increasing `n_jobs` would likely cause oversubscription + and quickly degrade performance. + + force_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored + for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 0.22 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan`. + + .. deprecated:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite` and will be removed + in 1.8. + + ensure_all_finite : bool or 'allow-nan', default=True + Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored + for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. 
The + possibilities are: + + - True: Force all values of array to be finite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. + + .. versionadded:: 1.6 + `force_all_finite` was renamed to `ensure_all_finite`. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the distance function. + If using a scipy.spatial.distance metric, the parameters are still + metric dependent. See the scipy docs for usage examples. + + Returns + ------- + D : ndarray of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_samples_Y) + A distance matrix D such that D_{i, j} is the distance between the + ith and jth vectors of the given matrix X, if Y is None. + If Y is not None, then D_{i, j} is the distance between the ith array + from X and the jth array from Y. + + See Also + -------- + pairwise_distances_chunked : Performs the same calculation as this + function, but returns a generator of chunks of the distance matrix, in + order to limit memory usage. + sklearn.metrics.pairwise.paired_distances : Computes the distances between + corresponding elements of two arrays. + + Notes + ----- + If metric is a callable, no restrictions are placed on `X` and `Y` dimensions. + + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_distances + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> pairwise_distances(X, Y, metric='sqeuclidean') + array([[1., 2.], + [2., 1.]]) + """ + ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) + + if metric == "precomputed": + X, _ = check_pairwise_arrays( + X, Y, precomputed=True, ensure_all_finite=ensure_all_finite + ) + + whom = ( + "`pairwise_distances`. Precomputed distance " + " need to have non-negative values." 
+ ) + check_non_negative(X, whom=whom) + return X + elif metric in PAIRWISE_DISTANCE_FUNCTIONS: + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + elif callable(metric): + func = partial( + _pairwise_callable, + metric=metric, + ensure_all_finite=ensure_all_finite, + **kwds, + ) + else: + if issparse(X) or issparse(Y): + raise TypeError("scipy distance metrics do not support sparse matrices.") + + dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else "infer_float" + + if dtype is bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)): + msg = "Data was converted to boolean for metric %s" % metric + warnings.warn(msg, DataConversionWarning) + + X, Y = check_pairwise_arrays( + X, Y, dtype=dtype, ensure_all_finite=ensure_all_finite + ) + + # precompute data-derived metric params + params = _precompute_metric_params(X, Y, metric=metric, **kwds) + kwds.update(**params) + + if effective_n_jobs(n_jobs) == 1 and X is Y: + return distance.squareform(distance.pdist(X, metric=metric, **kwds)) + func = partial(distance.cdist, metric=metric, **kwds) + + return _parallel_pairwise(X, Y, func, n_jobs, **kwds) + + +# These distances require boolean arrays, when using scipy.spatial.distance +PAIRWISE_BOOLEAN_FUNCTIONS = [ + "dice", + "jaccard", + "rogerstanimoto", + "russellrao", + "sokalsneath", + "yule", +] +if sp_base_version < parse_version("1.17"): + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + PAIRWISE_BOOLEAN_FUNCTIONS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + PAIRWISE_BOOLEAN_FUNCTIONS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + PAIRWISE_BOOLEAN_FUNCTIONS += ["matching"] + +# Helper functions - distance +PAIRWISE_KERNEL_FUNCTIONS = { + # If updating this dictionary, update the doc in both distance_metrics() + # and also in pairwise_distances()! + "additive_chi2": additive_chi2_kernel, + "chi2": chi2_kernel, + "linear": linear_kernel, + "polynomial": polynomial_kernel, + "poly": polynomial_kernel, + "rbf": rbf_kernel, + "laplacian": laplacian_kernel, + "sigmoid": sigmoid_kernel, + "cosine": cosine_similarity, +} + + +def kernel_metrics(): + """Valid metrics for pairwise_kernels. + + This function simply returns the valid pairwise distance metrics. + It exists, however, to allow for a verbose description of the mapping for + each of the valid strings. + + The valid distance metrics, and the function they map to, are: + =============== ======================================== + metric Function + =============== ======================================== + 'additive_chi2' sklearn.pairwise.additive_chi2_kernel + 'chi2' sklearn.pairwise.chi2_kernel + 'linear' sklearn.pairwise.linear_kernel + 'poly' sklearn.pairwise.polynomial_kernel + 'polynomial' sklearn.pairwise.polynomial_kernel + 'rbf' sklearn.pairwise.rbf_kernel + 'laplacian' sklearn.pairwise.laplacian_kernel + 'sigmoid' sklearn.pairwise.sigmoid_kernel + 'cosine' sklearn.pairwise.cosine_similarity + =============== ======================================== + + Read more in the :ref:`User Guide `. + + Returns + ------- + kernel_metrics : dict + Returns valid metrics for pairwise_kernels. 
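For instance (an illustrative sketch), the mapping can be used to look up the callable behind a kernel name, which is what `pairwise_kernels` dispatches to for string metrics:

>>> from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
>>> kernel_metrics()["rbf"] is rbf_kernel
True
>>> sorted(kernel_metrics())
['additive_chi2', 'chi2', 'cosine', 'laplacian', 'linear', 'poly', 'polynomial', 'rbf', 'sigmoid']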
+ """ + return PAIRWISE_KERNEL_FUNCTIONS + + +KERNEL_PARAMS = { + "additive_chi2": (), + "chi2": frozenset(["gamma"]), + "cosine": (), + "linear": (), + "poly": frozenset(["gamma", "degree", "coef0"]), + "polynomial": frozenset(["gamma", "degree", "coef0"]), + "rbf": frozenset(["gamma"]), + "laplacian": frozenset(["gamma"]), + "sigmoid": frozenset(["gamma", "coef0"]), +} + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "metric": [ + StrOptions(set(PAIRWISE_KERNEL_FUNCTIONS) | {"precomputed"}), + callable, + ], + "filter_params": ["boolean"], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=True, +) +def pairwise_kernels( + X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds +): + """Compute the kernel between arrays X and optional array Y. + + This function takes one or two feature arrays or a kernel matrix, and returns + a kernel matrix. + + - If `X` is a feature array, of shape (n_samples_X, n_features), and: + + - `Y` is `None` and `metric` is not 'precomputed', the pairwise kernels + between `X` and itself are returned. + - `Y` is a feature array of shape (n_samples_Y, n_features), the pairwise + kernels between `X` and `Y` is returned. + + - If `X` is a kernel matrix, of shape (n_samples_X, n_samples_X), `metric` + should be 'precomputed'. `Y` is thus ignored and `X` is returned as is. + + This method provides a safe way to take a kernel matrix as input, while + preserving compatibility with many other algorithms that take a vector + array. + + Valid values for metric are: + ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf', + 'laplacian', 'sigmoid', 'cosine'] + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \ + (n_samples_X, n_features) + Array of pairwise kernels between samples, or a feature array. + The shape of the array should be (n_samples_X, n_samples_X) if + metric == "precomputed" and (n_samples_X, n_features) otherwise. + + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None + A second feature array only if X has shape (n_samples_X, n_features). + + metric : str or callable, default="linear" + The metric to use when calculating kernel between instances in a + feature array. If metric is a string, it must be one of the metrics + in ``pairwise.PAIRWISE_KERNEL_FUNCTIONS``. + If metric is "precomputed", X is assumed to be a kernel matrix. + Alternatively, if metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two rows from X as input and return the corresponding + kernel value as a single number. This means that callables from + :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on + matrices, not single samples. Use the string identifying the kernel + instead. + + filter_params : bool, default=False + Whether to filter invalid parameters or not. + + n_jobs : int, default=None + The number of jobs to use for the computation. This works by breaking + down the pairwise matrix into n_jobs even slices and computing them + using multithreading. + + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + **kwds : optional keyword parameters + Any further parameters are passed directly to the kernel function. 
+ + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y) + A kernel matrix K such that K_{i, j} is the kernel between the + ith and jth vectors of the given matrix X, if Y is None. + If Y is not None, then K_{i, j} is the kernel between the ith array + from X and the jth array from Y. + + Notes + ----- + If metric is a callable, no restrictions are placed on `X` and `Y` dimensions. + + Examples + -------- + >>> from sklearn.metrics.pairwise import pairwise_kernels + >>> X = [[0, 0, 0], [1, 1, 1]] + >>> Y = [[1, 0, 0], [1, 1, 0]] + >>> pairwise_kernels(X, Y, metric='linear') + array([[0., 0.], + [1., 2.]]) + """ + # import GPKernel locally to prevent circular imports + from ..gaussian_process.kernels import Kernel as GPKernel + + if metric == "precomputed": + X, _ = check_pairwise_arrays(X, Y, precomputed=True) + return X + elif isinstance(metric, GPKernel): + func = metric.__call__ + elif metric in PAIRWISE_KERNEL_FUNCTIONS: + if filter_params: + kwds = {k: kwds[k] for k in kwds if k in KERNEL_PARAMS[metric]} + func = PAIRWISE_KERNEL_FUNCTIONS[metric] + elif callable(metric): + func = partial(_pairwise_callable, metric=metric, **kwds) + + return _parallel_pairwise(X, Y, func, n_jobs, **kwds) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_classification.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..b66353e5ecfab4973aca5456473dbb947b86b0a9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_classification.py @@ -0,0 +1,3397 @@ +import re +import warnings +from functools import partial +from itertools import chain, permutations, product + +import numpy as np +import pytest +from scipy import linalg +from scipy.spatial.distance import hamming as sp_hamming +from scipy.stats import bernoulli + +from sklearn import datasets, svm +from sklearn.datasets import make_multilabel_classification +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + class_likelihood_ratios, + classification_report, + cohen_kappa_score, + confusion_matrix, + f1_score, + fbeta_score, + hamming_loss, + hinge_loss, + jaccard_score, + log_loss, + make_scorer, + matthews_corrcoef, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, + zero_one_loss, +) +from sklearn.metrics._classification import _check_targets, d2_log_loss_score +from sklearn.model_selection import cross_val_score +from sklearn.preprocessing import LabelBinarizer, label_binarize +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.extmath import _nanaverage +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import check_random_state + +############################################################################### +# Utilities for testing + + +def 
make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel="linear", probability=True, random_state=0) + y_pred_proba = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? + y_pred_proba = y_pred_proba[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + return y_true, y_pred, y_pred_proba + + +############################################################################### +# Tests + + +def test_classification_report_dictionary_output(): + # Test performance report with dictionary output + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = { + "setosa": { + "precision": 0.82608695652173914, + "recall": 0.79166666666666663, + "f1-score": 0.8085106382978724, + "support": 24, + }, + "versicolor": { + "precision": 0.33333333333333331, + "recall": 0.096774193548387094, + "f1-score": 0.15000000000000002, + "support": 31, + }, + "virginica": { + "precision": 0.41860465116279072, + "recall": 0.90000000000000002, + "f1-score": 0.57142857142857151, + "support": 20, + }, + "macro avg": { + "f1-score": 0.5099797365754813, + "precision": 0.5260083136726211, + "recall": 0.596146953405018, + "support": 75, + }, + "accuracy": 0.5333333333333333, + "weighted avg": { + "f1-score": 0.47310435663627154, + "precision": 0.5137535108414785, + "recall": 0.5333333333333333, + "support": 75, + }, + } + + report = classification_report( + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + output_dict=True, + ) + + # assert the 2 dicts are equal. 
+ assert report.keys() == expected_report.keys() + for key in expected_report: + if key == "accuracy": + assert isinstance(report[key], float) + assert report[key] == expected_report[key] + else: + assert report[key].keys() == expected_report[key].keys() + for metric in expected_report[key]: + assert_almost_equal(expected_report[key][metric], report[key][metric]) + + assert isinstance(expected_report["setosa"]["precision"], float) + assert isinstance(expected_report["macro avg"]["precision"], float) + assert isinstance(expected_report["setosa"]["support"], int) + assert isinstance(expected_report["macro avg"]["support"], int) + + +def test_classification_report_output_dict_empty_input(): + report = classification_report(y_true=[], y_pred=[], output_dict=True) + expected_report = { + "accuracy": 0.0, + "macro avg": { + "f1-score": np.nan, + "precision": np.nan, + "recall": np.nan, + "support": 0, + }, + "weighted avg": { + "f1-score": np.nan, + "precision": np.nan, + "recall": np.nan, + "support": 0, + }, + } + assert isinstance(report, dict) + # assert the 2 dicts are equal. + assert report.keys() == expected_report.keys() + for key in expected_report: + if key == "accuracy": + assert isinstance(report[key], float) + assert report[key] == expected_report[key] + else: + assert report[key].keys() == expected_report[key].keys() + for metric in expected_report[key]: + assert_almost_equal(expected_report[key][metric], report[key][metric]) + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_classification_report_zero_division_warning(zero_division): + y_true, y_pred = ["a", "b", "c"], ["a", "b", "d"] + with warnings.catch_warnings(record=True) as record: + classification_report( + y_true, y_pred, zero_division=zero_division, output_dict=True + ) + if zero_division == "warn": + assert len(record) > 1 + for item in record: + msg = "Use `zero_division` parameter to control this behavior." + assert msg in str(item.message) + else: + assert not record + + +@pytest.mark.parametrize( + "labels, show_micro_avg", [([0], True), ([0, 1], False), ([0, 1, 2], False)] +) +def test_classification_report_labels_subset_superset(labels, show_micro_avg): + """Check the behaviour of passing `labels` as a superset or subset of the labels. + WHen a superset, we expect to show the "accuracy" in the report while it should be + the micro-averaging if this is a subset. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27927 + """ + + y_true, y_pred = [0, 1], [0, 1] + + report = classification_report(y_true, y_pred, labels=labels, output_dict=True) + if show_micro_avg: + assert "micro avg" in report + assert "accuracy" not in report + else: # accuracy should be shown + assert "accuracy" in report + assert "micro avg" not in report + + +def test_multilabel_accuracy_score_subset_accuracy(): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + assert accuracy_score(y1, y2) == 0.5 + assert accuracy_score(y1, y1) == 1 + assert accuracy_score(y2, y2) == 1 + assert accuracy_score(y2, np.logical_not(y2)) == 0 + assert accuracy_score(y1, np.logical_not(y1)) == 0 + assert accuracy_score(y1, np.zeros(y1.shape)) == 0 + assert accuracy_score(y2, np.zeros(y1.shape)) == 0 + + +def test_precision_recall_f1_score_binary(): + # Test Precision Recall and F1 Score for binary classification task + y_true, y_pred, _ = make_prediction(binary=True) + + # detailed measures for each class + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.73, 0.85], 2) + assert_array_almost_equal(r, [0.88, 0.68], 2) + assert_array_almost_equal(f, [0.80, 0.76], 2) + assert_array_equal(s, [25, 25]) + + # individual scoring function that can be used for grid search: in the + # binary class case the score is the value of the measure for the positive + # class (e.g. label == 1). This is deprecated for average != 'binary'. + for kwargs in [{}, {"average": "binary"}]: + with warnings.catch_warnings(): + warnings.simplefilter("error") + + ps = precision_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(ps, 0.85, 2) + + rs = recall_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(rs, 0.68, 2) + + fs = f1_score(y_true, y_pred, **kwargs) + assert_array_almost_equal(fs, 0.76, 2) + + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, **kwargs), + (1 + 2**2) * ps * rs / (2**2 * ps + rs), + 2, + ) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f_binary_single_class(): + # Test precision, recall and F-scores behave with a single positive or + # negative class + # Such a case may occur with non-stratified cross-validation + assert 1.0 == precision_score([1, 1], [1, 1]) + assert 1.0 == recall_score([1, 1], [1, 1]) + assert 1.0 == f1_score([1, 1], [1, 1]) + assert 1.0 == fbeta_score([1, 1], [1, 1], beta=0) + + assert 0.0 == precision_score([-1, -1], [-1, -1]) + assert 0.0 == recall_score([-1, -1], [-1, -1]) + assert 0.0 == f1_score([-1, -1], [-1, -1]) + assert 0.0 == fbeta_score([-1, -1], [-1, -1], beta=float("inf")) + assert fbeta_score([-1, -1], [-1, -1], beta=float("inf")) == pytest.approx( + fbeta_score([-1, -1], [-1, -1], beta=1e5) + ) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f_extra_labels(): + # Test handling of explicit additional (not in input) labels to PRF + y_true = [1, 3, 3, 2] + y_pred = [1, 1, 3, 2] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + # No average: zeros in array + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], average=None) + assert_array_almost_equal([0.0, 1.0, 1.0, 0.5, 
0.0], actual) + + # Macro average is changed + actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], average="macro") + assert_array_almost_equal(np.mean([0.0, 1.0, 1.0, 0.5, 0.0]), actual) + + # No effect otherwise + for average in ["micro", "weighted", "samples"]: + if average == "samples" and i == 0: + continue + assert_almost_equal( + recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4], average=average), + recall_score(y_true, y_pred, labels=None, average=average), + ) + + # Error when introducing invalid label in multilabel case + # (although it would only affect performance if average='macro'/None) + for average in [None, "macro", "micro", "samples"]: + with pytest.raises(ValueError): + recall_score(y_true_bin, y_pred_bin, labels=np.arange(6), average=average) + with pytest.raises(ValueError): + recall_score( + y_true_bin, y_pred_bin, labels=np.arange(-1, 4), average=average + ) + + # tests non-regression on issue #10307 + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + p, r, f, _ = precision_recall_fscore_support( + y_true, y_pred, average="samples", labels=[0, 1] + ) + assert_almost_equal(np.array([p, r, f]), np.array([3 / 4, 1, 5 / 6])) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f_ignored_labels(): + # Test a subset of labels may be requested for PRF + y_true = [1, 1, 2, 3] + y_pred = [1, 3, 3, 3] + y_true_bin = label_binarize(y_true, classes=np.arange(5)) + y_pred_bin = label_binarize(y_pred, classes=np.arange(5)) + data = [(y_true, y_pred), (y_true_bin, y_pred_bin)] + + for i, (y_true, y_pred) in enumerate(data): + recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3]) + recall_all = partial(recall_score, y_true, y_pred, labels=None) + + assert_array_almost_equal([0.5, 1.0], recall_13(average=None)) + assert_almost_equal((0.5 + 1.0) / 2, recall_13(average="macro")) + assert_almost_equal((0.5 * 2 + 1.0 * 1) / 3, recall_13(average="weighted")) + assert_almost_equal(2.0 / 3, recall_13(average="micro")) + + # ensure the above were meaningful tests: + for average in ["macro", "weighted", "micro"]: + assert recall_13(average=average) != recall_all(average=average) + + +def test_average_precision_score_non_binary_class(): + """Test multiclass-multiouptut for `average_precision_score`.""" + y_true = np.array( + [ + [2, 2, 1], + [1, 2, 0], + [0, 1, 2], + [1, 2, 1], + [2, 0, 1], + [1, 2, 1], + ] + ) + y_score = np.array( + [ + [0.7, 0.2, 0.1], + [0.4, 0.3, 0.3], + [0.1, 0.8, 0.1], + [0.2, 0.3, 0.5], + [0.4, 0.4, 0.2], + [0.1, 0.2, 0.7], + ] + ) + err_msg = "multiclass-multioutput format is not supported" + with pytest.raises(ValueError, match=err_msg): + average_precision_score(y_true, y_score, pos_label=2) + + +@pytest.mark.parametrize( + "y_true, y_score", + [ + ( + [0, 0, 1, 2], + np.array( + [ + [0.7, 0.2, 0.1], + [0.4, 0.3, 0.3], + [0.1, 0.8, 0.1], + [0.2, 0.3, 0.5], + ] + ), + ), + ( + [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], + [0, 0.1, 0.1, 0.4, 0.5, 0.6, 0.6, 0.9, 0.9, 1, 1], + ), + ], +) +def test_average_precision_score_duplicate_values(y_true, y_score): + """ + Duplicate values with precision-recall require a different + processing than when computing the AUC of a ROC, because the + precision-recall curve is a decreasing curve + The following situation corresponds to a perfect + test statistic, the average_precision_score should be 1. 
+ """ + assert average_precision_score(y_true, y_score) == 1 + + +@pytest.mark.parametrize( + "y_true, y_score", + [ + ( + [2, 2, 1, 1, 0], + np.array( + [ + [0.2, 0.3, 0.5], + [0.2, 0.3, 0.5], + [0.4, 0.5, 0.3], + [0.4, 0.5, 0.3], + [0.8, 0.5, 0.3], + ] + ), + ), + ( + [0, 1, 1], + [0.5, 0.5, 0.6], + ), + ], +) +def test_average_precision_score_tied_values(y_true, y_score): + # Here if we go from left to right in y_true, the 0 values are + # separated from the 1 values, so it appears that we've + # correctly sorted our classifications. But in fact the first two + # values have the same score (0.5) and so the first two values + # could be swapped around, creating an imperfect sorting. This + # imperfection should come through in the end score, making it less + # than one. + assert average_precision_score(y_true, y_score) != 1.0 + + +def test_precision_recall_f_unused_pos_label(): + # Check warning that pos_label unused when set to non-default value + # but average != 'binary'; even if data is binary. + + msg = ( + r"Note that pos_label \(set to 2\) is " + r"ignored when average != 'binary' \(got 'macro'\). You " + r"may use labels=\[pos_label\] to specify a single " + "positive class." + ) + with pytest.warns(UserWarning, match=msg): + precision_recall_fscore_support( + [1, 2, 1], [1, 2, 2], pos_label=2, average="macro" + ) + + +def test_confusion_matrix_binary(): + # Test confusion matrix - binary classification case + y_true, y_pred, _ = make_prediction(binary=True) + + def test(y_true, y_pred): + cm = confusion_matrix(y_true, y_pred) + assert_array_equal(cm, [[22, 3], [8, 17]]) + + tp, fp, fn, tn = cm.flatten() + num = tp * tn - fp * fn + den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + + true_mcc = 0 if den == 0 else num / den + mcc = matthews_corrcoef(y_true, y_pred) + assert_array_almost_equal(mcc, true_mcc, decimal=2) + assert_array_almost_equal(mcc, 0.57, decimal=2) + + test(y_true, y_pred) + test([str(y) for y in y_true], [str(y) for y in y_pred]) + + +def test_multilabel_confusion_matrix_binary(): + # Test multilabel confusion matrix - binary classification case + y_true, y_pred, _ = make_prediction(binary=True) + + def test(y_true, y_pred): + cm = multilabel_confusion_matrix(y_true, y_pred) + assert_array_equal(cm, [[[17, 8], [3, 22]], [[22, 3], [8, 17]]]) + + test(y_true, y_pred) + test([str(y) for y in y_true], [str(y) for y in y_pred]) + + +def test_multilabel_confusion_matrix_multiclass(): + # Test multilabel confusion matrix - multi-class case + y_true, y_pred, _ = make_prediction(binary=False) + + def test(y_true, y_pred, string_type=False): + # compute confusion matrix with default labels introspection + cm = multilabel_confusion_matrix(y_true, y_pred) + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[38, 6], [28, 3]], [[30, 25], [2, 18]]] + ) + + # compute confusion matrix with explicit label ordering + labels = ["0", "2", "1"] if string_type else [0, 2, 1] + cm = multilabel_confusion_matrix(y_true, y_pred, labels=labels) + assert_array_equal( + cm, [[[47, 4], [5, 19]], [[30, 25], [2, 18]], [[38, 6], [28, 3]]] + ) + + # compute confusion matrix with super set of present labels + labels = ["0", "2", "1", "3"] if string_type else [0, 2, 1, 3] + cm = multilabel_confusion_matrix(y_true, y_pred, labels=labels) + assert_array_equal( + cm, + [ + [[47, 4], [5, 19]], + [[30, 25], [2, 18]], + [[38, 6], [28, 3]], + [[75, 0], [0, 0]], + ], + ) + + test(y_true, y_pred) + test([str(y) for y in y_true], [str(y) for y in y_pred], string_type=True) + + 
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_multilabel_confusion_matrix_multilabel(csc_container, csr_container): + # Test multilabel confusion matrix - multilabel-indicator case + + y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + y_true_csr = csr_container(y_true) + y_pred_csr = csr_container(y_pred) + y_true_csc = csc_container(y_true) + y_pred_csc = csc_container(y_pred) + + # cross test different types + sample_weight = np.array([2, 1, 3]) + real_cm = [[[1, 0], [1, 1]], [[1, 0], [1, 1]], [[0, 2], [1, 0]]] + trues = [y_true, y_true_csr, y_true_csc] + preds = [y_pred, y_pred_csr, y_pred_csc] + + for y_true_tmp in trues: + for y_pred_tmp in preds: + cm = multilabel_confusion_matrix(y_true_tmp, y_pred_tmp) + assert_array_equal(cm, real_cm) + + # test support for samplewise + cm = multilabel_confusion_matrix(y_true, y_pred, samplewise=True) + assert_array_equal(cm, [[[1, 0], [1, 1]], [[1, 1], [0, 1]], [[0, 1], [2, 0]]]) + + # test support for labels + cm = multilabel_confusion_matrix(y_true, y_pred, labels=[2, 0]) + assert_array_equal(cm, [[[0, 2], [1, 0]], [[1, 0], [1, 1]]]) + + # test support for labels with samplewise + cm = multilabel_confusion_matrix(y_true, y_pred, labels=[2, 0], samplewise=True) + assert_array_equal(cm, [[[0, 0], [1, 1]], [[1, 1], [0, 0]], [[0, 1], [1, 0]]]) + + # test support for sample_weight with sample_wise + cm = multilabel_confusion_matrix( + y_true, y_pred, sample_weight=sample_weight, samplewise=True + ) + assert_array_equal(cm, [[[2, 0], [2, 2]], [[1, 1], [0, 1]], [[0, 3], [6, 0]]]) + + +def test_multilabel_confusion_matrix_errors(): + y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]]) + y_pred = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) + + # Bad sample_weight + with pytest.raises(ValueError, match="inconsistent numbers of samples"): + multilabel_confusion_matrix(y_true, y_pred, sample_weight=[1, 2]) + with pytest.raises(ValueError, match="should be a 1d array"): + multilabel_confusion_matrix( + y_true, y_pred, sample_weight=[[1, 2, 3], [2, 3, 4], [3, 4, 5]] + ) + + # Bad labels + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[-1]) + err_msg = r"All labels must be in \[0, n labels\)" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix(y_true, y_pred, labels=[3]) + + # Using samplewise outside multilabel + with pytest.raises(ValueError, match="Samplewise metrics"): + multilabel_confusion_matrix([0, 1, 2], [1, 2, 0], samplewise=True) + + # Bad y_type + err_msg = "multiclass-multioutput is not supported" + with pytest.raises(ValueError, match=err_msg): + multilabel_confusion_matrix([[0, 1, 2], [2, 1, 0]], [[1, 2, 0], [1, 0, 2]]) + + +@pytest.mark.parametrize( + "normalize, cm_dtype, expected_results", + [ + ("true", "f", 0.333333333), + ("pred", "f", 0.333333333), + ("all", "f", 0.1111111111), + (None, "i", 2), + ], +) +def test_confusion_matrix_normalize(normalize, cm_dtype, expected_results): + y_test = [0, 1, 2] * 6 + y_pred = list(chain(*permutations([0, 1, 2]))) + cm = confusion_matrix(y_test, y_pred, normalize=normalize) + assert_allclose(cm, expected_results) + assert cm.dtype.kind == cm_dtype + + +def test_confusion_matrix_normalize_single_class(): + y_test = [0, 0, 0, 0, 1, 1, 1, 1] + y_pred = [0, 0, 0, 0, 0, 0, 0, 0] + + cm_true = confusion_matrix(y_test, y_pred, 
normalize="true") + assert cm_true.sum() == pytest.approx(2.0) + + # additionally check that no warnings are raised due to a division by zero + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + cm_pred = confusion_matrix(y_test, y_pred, normalize="pred") + + assert cm_pred.sum() == pytest.approx(1.0) + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + confusion_matrix(y_pred, y_test, normalize="true") + + +def test_confusion_matrix_single_label(): + """Test `confusion_matrix` warns when only one label found.""" + y_test = [0, 0, 0, 0] + y_pred = [0, 0, 0, 0] + + with pytest.warns(UserWarning, match="A single label was found in"): + confusion_matrix(y_pred, y_test) + + +@pytest.mark.parametrize( + "params, warn_msg", + [ + # When `fp == 0` and `tp != 0`, LR+ is undefined + ( + { + "y_true": np.array([1, 1, 1, 0, 0, 0]), + "y_pred": np.array([1, 1, 1, 0, 0, 0]), + }, + "`positive_likelihood_ratio` is ill-defined and set to `np.nan`.", + ), + # When `fp == 0` and `tp == 0`, LR+ is undefined + ( + { + "y_true": np.array([1, 1, 1, 0, 0, 0]), + "y_pred": np.array([0, 0, 0, 0, 0, 0]), + }, + ( + "No samples were predicted for the positive class and " + "`positive_likelihood_ratio` is set to `np.nan`." + ), + ), + # When `tn == 0`, LR- is undefined + ( + { + "y_true": np.array([1, 1, 1, 0, 0, 0]), + "y_pred": np.array([0, 0, 0, 1, 1, 1]), + }, + "`negative_likelihood_ratio` is ill-defined and set to `np.nan`.", + ), + # When `tp + fn == 0` both ratios are undefined + ( + { + "y_true": np.array([0, 0, 0, 0, 0, 0]), + "y_pred": np.array([1, 1, 1, 0, 0, 0]), + }, + "No samples of the positive class are present in `y_true`.", + ), + ], +) +def test_likelihood_ratios_warnings(params, warn_msg): + # likelihood_ratios must raise warnings when at + # least one of the ratios is ill-defined. 
+ + with pytest.warns(UserWarning, match=warn_msg): + class_likelihood_ratios(**params) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + { + "y_true": np.array([0, 1, 0, 1, 0]), + "y_pred": np.array([1, 1, 0, 0, 2]), + }, + ( + "class_likelihood_ratios only supports binary classification " + "problems, got targets of type: multiclass" + ), + ), + ], +) +def test_likelihood_ratios_errors(params, err_msg): + # likelihood_ratios must raise error when attempting + # non-binary classes to avoid Simpson's paradox + with pytest.raises(ValueError, match=err_msg): + class_likelihood_ratios(**params) + + +def test_likelihood_ratios(): + # Build confusion matrix with tn=9, fp=8, fn=1, tp=2, + # sensitivity=2/3, specificity=9/17, prevalence=3/20, + # LR+=34/24, LR-=17/27 + y_true = np.array([1] * 3 + [0] * 17) + y_pred = np.array([1] * 2 + [0] * 10 + [1] * 8) + + pos, neg = class_likelihood_ratios(y_true, y_pred) + assert_allclose(pos, 34 / 24) + assert_allclose(neg, 17 / 27) + + # Build limit case with y_pred = y_true + pos, neg = class_likelihood_ratios(y_true, y_true) + assert_array_equal(pos, np.nan * 2) + assert_allclose(neg, np.zeros(2), rtol=1e-12) + + # Ignore last 5 samples to get tn=9, fp=3, fn=1, tp=2, + # sensitivity=2/3, specificity=9/12, prevalence=3/20, + # LR+=24/9, LR-=12/27 + sample_weight = np.array([1.0] * 15 + [0.0] * 5) + pos, neg = class_likelihood_ratios(y_true, y_pred, sample_weight=sample_weight) + assert_allclose(pos, 24 / 9) + assert_allclose(neg, 12 / 27) + + +# TODO(1.9): remove test +@pytest.mark.parametrize("raise_warning", [True, False]) +def test_likelihood_ratios_raise_warning_deprecation(raise_warning): + """Test that class_likelihood_ratios raises a `FutureWarning` when `raise_warning` + param is set.""" + y_true = np.array([1, 0]) + y_pred = np.array([1, 0]) + + msg = "`raise_warning` was deprecated in version 1.7 and will be removed in 1.9." 
+ with pytest.warns(FutureWarning, match=msg): + class_likelihood_ratios(y_true, y_pred, raise_warning=raise_warning) + + +def test_likelihood_ratios_replace_undefined_by_worst(): + """Test that class_likelihood_ratios returns the worst scores `1.0` for both LR+ and + LR- when `replace_undefined_by=1` is set.""" + # This data causes fp=0 (0 false positives) in the confusion_matrix and a division + # by zero that affects the positive_likelihood_ratio: + y_true = np.array([1, 1, 0]) + y_pred = np.array([1, 0, 0]) + + positive_likelihood_ratio, _ = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=1 + ) + assert positive_likelihood_ratio == pytest.approx(1.0) + + # This data causes tn=0 (0 true negatives) in the confusion_matrix and a division + # by zero that affects the negative_likelihood_ratio: + y_true = np.array([1, 0, 0]) + y_pred = np.array([1, 1, 1]) + + _, negative_likelihood_ratio = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=1 + ) + assert negative_likelihood_ratio == pytest.approx(1.0) + + +@pytest.mark.parametrize( + "replace_undefined_by", + [ + {"LR+": 0.0}, + {"LR-": 0.0}, + {"LR+": -5.0, "LR-": 0.0}, + {"LR+": 1.0, "LR-": "nan"}, + {"LR+": 0.0, "LR-": 0.0}, + {"LR+": 1.0, "LR-": 2.0}, + ], +) +def test_likelihood_ratios_wrong_dict_replace_undefined_by(replace_undefined_by): + """Test that class_likelihood_ratios raises a `ValueError` if the input dict for + `replace_undefined_by` is in the wrong format or contains impossible values.""" + y_true = np.array([1, 0]) + y_pred = np.array([1, 0]) + + msg = "The dictionary passed as `replace_undefined_by` needs to be in the form" + with pytest.raises(ValueError, match=msg): + class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=replace_undefined_by + ) + + +@pytest.mark.parametrize( + "replace_undefined_by, expected", + [ + ({"LR+": 1.0, "LR-": 1.0}, 1.0), + ({"LR+": np.inf, "LR-": 0.0}, np.inf), + ({"LR+": 2.0, "LR-": 0.0}, 2.0), + ({"LR+": np.nan, "LR-": np.nan}, np.nan), + (np.nan, np.nan), + ], +) +def test_likelihood_ratios_replace_undefined_by_0_fp(replace_undefined_by, expected): + """Test that the `replace_undefined_by` param returns the right value for the + positive_likelihood_ratio as defined by the user.""" + # This data causes fp=0 (0 false positives) in the confusion_matrix and a division + # by zero that affects the positive_likelihood_ratio: + y_true = np.array([1, 1, 0]) + y_pred = np.array([1, 0, 0]) + + positive_likelihood_ratio, _ = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=replace_undefined_by + ) + + if np.isnan(expected): + assert np.isnan(positive_likelihood_ratio) + else: + assert positive_likelihood_ratio == pytest.approx(expected) + + +@pytest.mark.parametrize( + "replace_undefined_by, expected", + [ + ({"LR+": 1.0, "LR-": 1.0}, 1.0), + ({"LR+": np.inf, "LR-": 0.0}, 0.0), + ({"LR+": np.inf, "LR-": 0.5}, 0.5), + ({"LR+": np.nan, "LR-": np.nan}, np.nan), + (np.nan, np.nan), + ], +) +def test_likelihood_ratios_replace_undefined_by_0_tn(replace_undefined_by, expected): + """Test that the `replace_undefined_by` param returns the right value for the + negative_likelihood_ratio as defined by the user.""" + # This data causes tn=0 (0 true negatives) in the confusion_matrix and a division + # by zero that affects the negative_likelihood_ratio: + y_true = np.array([1, 0, 0]) + y_pred = np.array([1, 1, 1]) + + _, negative_likelihood_ratio = class_likelihood_ratios( + y_true, y_pred, replace_undefined_by=replace_undefined_by + ) + + if 
np.isnan(expected): + assert np.isnan(negative_likelihood_ratio) + else: + assert negative_likelihood_ratio == pytest.approx(expected) + + +def test_cohen_kappa(): + # These label vectors reproduce the contingency matrix from Artstein and + # Poesio (2008), Table 1: np.array([[20, 20], [10, 50]]). + y1 = np.array([0] * 40 + [1] * 60) + y2 = np.array([0] * 20 + [1] * 20 + [0] * 10 + [1] * 50) + kappa = cohen_kappa_score(y1, y2) + assert_almost_equal(kappa, 0.348, decimal=3) + assert kappa == cohen_kappa_score(y2, y1) + + # Add spurious labels and ignore them. + y1 = np.append(y1, [2] * 4) + y2 = np.append(y2, [2] * 4) + assert cohen_kappa_score(y1, y2, labels=[0, 1]) == kappa + + assert_almost_equal(cohen_kappa_score(y1, y1), 1.0) + + # Multiclass example: Artstein and Poesio, Table 4. + y1 = np.array([0] * 46 + [1] * 44 + [2] * 10) + y2 = np.array([0] * 52 + [1] * 32 + [2] * 16) + assert_almost_equal(cohen_kappa_score(y1, y2), 0.8013, decimal=4) + + # Weighting example: none, linear, quadratic. + y1 = np.array([0] * 46 + [1] * 44 + [2] * 10) + y2 = np.array([0] * 50 + [1] * 40 + [2] * 10) + assert_almost_equal(cohen_kappa_score(y1, y2), 0.9315, decimal=4) + assert_almost_equal(cohen_kappa_score(y1, y2, weights="linear"), 0.9412, decimal=4) + assert_almost_equal( + cohen_kappa_score(y1, y2, weights="quadratic"), 0.9541, decimal=4 + ) + + +def test_cohen_kappa_score_error_wrong_label(): + """Test that correct error is raised when users pass labels that are not in y1.""" + labels = [1, 2] + y1 = np.array(["a"] * 5 + ["b"] * 5) + y2 = np.array(["b"] * 10) + with pytest.raises( + ValueError, match="At least one label in `labels` must be present in `y1`" + ): + cohen_kappa_score(y1, y2, labels=labels) + + +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +@pytest.mark.parametrize("y_true, y_pred", [([0], [0])]) +@pytest.mark.parametrize( + "metric", + [ + f1_score, + partial(fbeta_score, beta=1), + precision_score, + recall_score, + ], +) +def test_zero_division_nan_no_warning(metric, y_true, y_pred, zero_division): + """Check the behaviour of `zero_division` when setting to 0, 1 or np.nan. + No warnings should be raised. + """ + with warnings.catch_warnings(): + warnings.simplefilter("error") + result = metric(y_true, y_pred, zero_division=zero_division) + + if np.isnan(zero_division): + assert np.isnan(result) + else: + assert result == zero_division + + +@pytest.mark.parametrize("y_true, y_pred", [([0], [0])]) +@pytest.mark.parametrize( + "metric", + [ + f1_score, + partial(fbeta_score, beta=1), + precision_score, + recall_score, + ], +) +def test_zero_division_nan_warning(metric, y_true, y_pred): + """Check the behaviour of `zero_division` when setting to "warn". + A `UndefinedMetricWarning` should be raised. + """ + with pytest.warns(UndefinedMetricWarning): + result = metric(y_true, y_pred, zero_division="warn") + assert result == 0.0 + + +def test_matthews_corrcoef_against_numpy_corrcoef(global_random_seed): + rng = np.random.RandomState(global_random_seed) + y_true = rng.randint(0, 2, size=20) + y_pred = rng.randint(0, 2, size=20) + + assert_almost_equal( + matthews_corrcoef(y_true, y_pred), np.corrcoef(y_true, y_pred)[0, 1], 10 + ) + + +def test_matthews_corrcoef_against_jurman(global_random_seed): + # Check that the multiclass matthews_corrcoef agrees with the definition + # presented in Jurman, Riccadonna, Furlanello, (2012). 
A Comparison of MCC + # and CEN Error Measures in MultiClass Prediction + rng = np.random.RandomState(global_random_seed) + y_true = rng.randint(0, 2, size=20) + y_pred = rng.randint(0, 2, size=20) + sample_weight = rng.rand(20) + + C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) + N = len(C) + cov_ytyp = sum( + [ + C[k, k] * C[m, l] - C[l, k] * C[k, m] + for k in range(N) + for m in range(N) + for l in range(N) + ] + ) + cov_ytyt = sum( + [ + C[:, k].sum() + * np.sum([C[g, f] for f in range(N) for g in range(N) if f != k]) + for k in range(N) + ] + ) + cov_ypyp = np.sum( + [ + C[k, :].sum() + * np.sum([C[f, g] for f in range(N) for g in range(N) if f != k]) + for k in range(N) + ] + ) + mcc_jurman = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) + mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight) + + assert_almost_equal(mcc_ours, mcc_jurman, 10) + + +def test_matthews_corrcoef(global_random_seed): + rng = np.random.RandomState(global_random_seed) + y_true = ["a" if i == 0 else "b" for i in rng.randint(0, 2, size=20)] + + # corrcoef of same vectors must be 1 + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + + # corrcoef, when the two vectors are opposites of each other, should be -1 + y_true_inv = ["b" if i == "a" else "a" for i in y_true] + assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1) + + y_true_inv2 = label_binarize(y_true, classes=["a", "b"]) + y_true_inv2 = np.where(y_true_inv2, "a", "b") + assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1) + + # For the zero vector case, the corrcoef cannot be calculated and should + # output 0 + assert_almost_equal(matthews_corrcoef([0, 0, 0, 0], [0, 0, 0, 0]), 0.0) + + # And also for any other vector with 0 variance + assert_almost_equal(matthews_corrcoef(y_true, ["a"] * len(y_true)), 0.0) + + # These two vectors have 0 correlation and hence mcc should be 0 + y_1 = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1] + y_2 = [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1] + assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0) + + # Check that sample weight is able to selectively exclude + mask = [1] * 10 + [0] * 10 + # Now the first half of the vector elements are alone given a weight of 1 + # and hence the mcc will not be a perfect 0 as in the previous case + with pytest.raises(AssertionError): + assert_almost_equal(matthews_corrcoef(y_1, y_2, sample_weight=mask), 0.0) + + +def test_matthews_corrcoef_multiclass(global_random_seed): + rng = np.random.RandomState(global_random_seed) + ord_a = ord("a") + n_classes = 4 + y_true = [chr(ord_a + i) for i in rng.randint(0, n_classes, size=20)] + + # corrcoef of same vectors must be 1 + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + + # with multiclass > 2 it is not possible to achieve -1 + y_true = [0, 0, 1, 1, 2, 2] + y_pred_bad = [2, 2, 0, 0, 1, 1] + assert_almost_equal(matthews_corrcoef(y_true, y_pred_bad), -0.5) + + # Maximizing false positives and negatives minimizes the MCC + # The minimum will be different for depending on the input + y_true = [0, 0, 1, 1, 2, 2] + y_pred_min = [1, 1, 0, 0, 0, 0] + assert_almost_equal(matthews_corrcoef(y_true, y_pred_min), -12 / np.sqrt(24 * 16)) + + # Zero variance will result in an mcc of zero + y_true = [0, 1, 2] + y_pred = [3, 3, 3] + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) + + # Also for ground truth with zero variance + y_true = [3, 3, 3] + y_pred = [0, 1, 2] + assert_almost_equal(matthews_corrcoef(y_true, y_pred), 0.0) + + # 
These two vectors have 0 correlation and hence mcc should be 0 + y_1 = [0, 1, 2, 0, 1, 2, 0, 1, 2] + y_2 = [1, 1, 1, 2, 2, 2, 0, 0, 0] + assert_almost_equal(matthews_corrcoef(y_1, y_2), 0.0) + + # We can test that binary assumptions hold using the multiclass computation + # by masking the weight of samples not in the first two classes + + # Masking the last label should let us get an MCC of -1 + y_true = [0, 0, 1, 1, 2] + y_pred = [1, 1, 0, 0, 2] + sample_weight = [1, 1, 1, 1, 0] + assert_almost_equal( + matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight), -1 + ) + + # For the zero vector case, the corrcoef cannot be calculated and should + # output 0 + y_true = [0, 0, 1, 2] + y_pred = [0, 0, 1, 2] + sample_weight = [1, 1, 0, 0] + assert_almost_equal( + matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight), 0.0 + ) + + +@pytest.mark.parametrize("n_points", [100, 10000]) +def test_matthews_corrcoef_overflow(n_points, global_random_seed): + # https://github.com/scikit-learn/scikit-learn/issues/9622 + rng = np.random.RandomState(global_random_seed) + + def mcc_safe(y_true, y_pred): + conf_matrix = confusion_matrix(y_true, y_pred) + true_pos = conf_matrix[1, 1] + false_pos = conf_matrix[1, 0] + false_neg = conf_matrix[0, 1] + n_points = len(y_true) + pos_rate = (true_pos + false_neg) / n_points + activity = (true_pos + false_pos) / n_points + mcc_numerator = true_pos / n_points - pos_rate * activity + mcc_denominator = activity * pos_rate * (1 - activity) * (1 - pos_rate) + return mcc_numerator / np.sqrt(mcc_denominator) + + def random_ys(n_points): # binary + x_true = rng.random_sample(n_points) + x_pred = x_true + 0.2 * (rng.random_sample(n_points) - 0.5) + y_true = x_true > 0.5 + y_pred = x_pred > 0.5 + return y_true, y_pred + + arr = np.repeat([0.0, 1.0], n_points) # binary + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + arr = np.repeat([0.0, 1.0, 2.0], n_points) # multiclass + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + + y_true, y_pred = random_ys(n_points) + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + assert_almost_equal(matthews_corrcoef(y_true, y_pred), mcc_safe(y_true, y_pred)) + + +def test_precision_recall_f1_score_multiclass(): + # Test Precision Recall and F1 Score for multiclass classification task + y_true, y_pred, _ = make_prediction(binary=False) + + # compute scores with default labels introspection + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.83, 0.33, 0.42], 2) + assert_array_almost_equal(r, [0.79, 0.09, 0.90], 2) + assert_array_almost_equal(f, [0.81, 0.15, 0.57], 2) + assert_array_equal(s, [24, 31, 20]) + + # averaging tests + ps = precision_score(y_true, y_pred, pos_label=1, average="micro") + assert_array_almost_equal(ps, 0.53, 2) + + rs = recall_score(y_true, y_pred, average="micro") + assert_array_almost_equal(rs, 0.53, 2) + + fs = f1_score(y_true, y_pred, average="micro") + assert_array_almost_equal(fs, 0.53, 2) + + ps = precision_score(y_true, y_pred, average="macro") + assert_array_almost_equal(ps, 0.53, 2) + + rs = recall_score(y_true, y_pred, average="macro") + assert_array_almost_equal(rs, 0.60, 2) + + fs = f1_score(y_true, y_pred, average="macro") + assert_array_almost_equal(fs, 0.51, 2) + + ps = precision_score(y_true, y_pred, average="weighted") + assert_array_almost_equal(ps, 0.51, 2) + + rs = recall_score(y_true, y_pred, average="weighted") + assert_array_almost_equal(rs, 0.53, 2) + + fs = f1_score(y_true, y_pred, average="weighted") 
+ assert_array_almost_equal(fs, 0.47, 2) + + with pytest.raises(ValueError): + precision_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + recall_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + f1_score(y_true, y_pred, average="samples") + with pytest.raises(ValueError): + fbeta_score(y_true, y_pred, average="samples", beta=0.5) + + # same prediction but with and explicit label ordering + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[0, 2, 1], average=None + ) + assert_array_almost_equal(p, [0.83, 0.41, 0.33], 2) + assert_array_almost_equal(r, [0.79, 0.90, 0.10], 2) + assert_array_almost_equal(f, [0.81, 0.57, 0.15], 2) + assert_array_equal(s, [24, 20, 31]) + + +@pytest.mark.parametrize("average", ["samples", "micro", "macro", "weighted", None]) +def test_precision_refcall_f1_score_multilabel_unordered_labels(average): + # test that labels need not be sorted in the multilabel case + y_true = np.array([[1, 1, 0, 0]]) + y_pred = np.array([[0, 0, 1, 1]]) + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average + ) + assert_array_equal(p, 0) + assert_array_equal(r, 0) + assert_array_equal(f, 0) + if average is None: + assert_array_equal(s, [0, 1, 1, 0]) + + +def test_precision_recall_f1_score_binary_averaged(): + y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]) + y_pred = np.array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1]) + + # compute scores with default labels introspection + ps, rs, fs, _ = precision_recall_fscore_support(y_true, y_pred, average=None) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="macro") + assert p == np.mean(ps) + assert r == np.mean(rs) + assert f == np.mean(fs) + p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted") + support = np.bincount(y_true) + assert p == np.average(ps, weights=support) + assert r == np.average(rs, weights=support) + assert f == np.average(fs, weights=support) + + +def test_zero_precision_recall(): + # Check that pathological cases do not bring NaNs + + old_error_settings = np.seterr(all="raise") + + try: + y_true = np.array([0, 1, 2, 0, 1, 2]) + y_pred = np.array([2, 0, 1, 1, 2, 0]) + + assert_almost_equal(precision_score(y_true, y_pred, average="macro"), 0.0, 2) + assert_almost_equal(recall_score(y_true, y_pred, average="macro"), 0.0, 2) + assert_almost_equal(f1_score(y_true, y_pred, average="macro"), 0.0, 2) + + finally: + np.seterr(**old_error_settings) + + +def test_confusion_matrix_multiclass_subset_labels(): + # Test confusion matrix - multi-class case with subset of labels + y_true, y_pred, _ = make_prediction(binary=False) + + # compute confusion matrix with only first two labels considered + cm = confusion_matrix(y_true, y_pred, labels=[0, 1]) + assert_array_equal(cm, [[19, 4], [4, 3]]) + + # compute confusion matrix with explicit label ordering for only subset + # of labels + cm = confusion_matrix(y_true, y_pred, labels=[2, 1]) + assert_array_equal(cm, [[18, 2], [24, 3]]) + + # a label not in y_true should result in zeros for that row/column + extra_label = np.max(y_true) + 1 + cm = confusion_matrix(y_true, y_pred, labels=[2, extra_label]) + assert_array_equal(cm, [[18, 0], [0, 0]]) + + +@pytest.mark.parametrize( + "labels, err_msg", + [ + ([], "'labels' should contains at least one label."), + ([3, 4], "At least one label specified must be in y_true"), + ], + ids=["empty list", "unknown labels"], +) +def 
test_confusion_matrix_error(labels, err_msg): + y_true, y_pred, _ = make_prediction(binary=False) + with pytest.raises(ValueError, match=err_msg): + confusion_matrix(y_true, y_pred, labels=labels) + + +@pytest.mark.parametrize( + "labels", (None, [0, 1], [0, 1, 2]), ids=["None", "binary", "multiclass"] +) +def test_confusion_matrix_on_zero_length_input(labels): + expected_n_classes = len(labels) if labels else 0 + expected = np.zeros((expected_n_classes, expected_n_classes), dtype=int) + cm = confusion_matrix([], [], labels=labels) + assert_array_equal(cm, expected) + + +def test_confusion_matrix_dtype(): + y = [0, 1, 1] + weight = np.ones(len(y)) + # confusion_matrix returns int64 by default + cm = confusion_matrix(y, y) + assert cm.dtype == np.int64 + # The dtype of confusion_matrix is always 64 bit + for dtype in [np.bool_, np.int32, np.uint64]: + cm = confusion_matrix(y, y, sample_weight=weight.astype(dtype, copy=False)) + assert cm.dtype == np.int64 + for dtype in [np.float32, np.float64, None, object]: + cm = confusion_matrix(y, y, sample_weight=weight.astype(dtype, copy=False)) + assert cm.dtype == np.float64 + + # np.iinfo(np.uint32).max should be accumulated correctly + weight = np.full(len(y), 4294967295, dtype=np.uint32) + cm = confusion_matrix(y, y, sample_weight=weight) + assert cm[0, 0] == 4294967295 + assert cm[1, 1] == 8589934590 + + # np.iinfo(np.int64).max should cause an overflow + weight = np.full(len(y), 9223372036854775807, dtype=np.int64) + cm = confusion_matrix(y, y, sample_weight=weight) + assert cm[0, 0] == 9223372036854775807 + assert cm[1, 1] == -2 + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_confusion_matrix_pandas_nullable(dtype): + """Checks that confusion_matrix works with pandas nullable dtypes. + + Non-regression test for gh-25635. 
+ """ + pd = pytest.importorskip("pandas") + + y_ndarray = np.array([1, 0, 0, 1, 0, 1, 1, 0, 1]) + y_true = pd.Series(y_ndarray, dtype=dtype) + y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64") + + output = confusion_matrix(y_true, y_predicted) + expected_output = confusion_matrix(y_ndarray, y_predicted) + + assert_array_equal(output, expected_output) + + +def test_classification_report_multiclass(): + # Test performance report + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = """\ + precision recall f1-score support + + setosa 0.83 0.79 0.81 24 + versicolor 0.33 0.10 0.15 31 + virginica 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report( + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + ) + assert report == expected_report + + +def test_classification_report_multiclass_balanced(): + y_true, y_pred = [0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2] + + expected_report = """\ + precision recall f1-score support + + 0 0.33 0.33 0.33 3 + 1 0.33 0.33 0.33 3 + 2 0.33 0.33 0.33 3 + + accuracy 0.33 9 + macro avg 0.33 0.33 0.33 9 +weighted avg 0.33 0.33 0.33 9 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_multiclass_with_label_detection(): + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with label detection + expected_report = """\ + precision recall f1-score support + + 0 0.83 0.79 0.81 24 + 1 0.33 0.10 0.15 31 + 2 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_multiclass_with_digits(): + # Test performance report with added digits in floating point values + iris = datasets.load_iris() + y_true, y_pred, _ = make_prediction(dataset=iris, binary=False) + + # print classification report with class names + expected_report = """\ + precision recall f1-score support + + setosa 0.82609 0.79167 0.80851 24 + versicolor 0.33333 0.09677 0.15000 31 + virginica 0.41860 0.90000 0.57143 20 + + accuracy 0.53333 75 + macro avg 0.52601 0.59615 0.50998 75 +weighted avg 0.51375 0.53333 0.47310 75 +""" + report = classification_report( + y_true, + y_pred, + labels=np.arange(len(iris.target_names)), + target_names=iris.target_names, + digits=5, + ) + assert report == expected_report + + +def test_classification_report_multiclass_with_string_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + y_true = np.array(["blue", "green", "red"])[y_true] + y_pred = np.array(["blue", "green", "red"])[y_pred] + + expected_report = """\ + precision recall f1-score support + + blue 0.83 0.79 0.81 24 + green 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + expected_report = """\ + precision recall f1-score support + + a 0.83 0.79 0.81 24 + b 0.33 0.10 0.15 31 + c 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred, target_names=["a", "b", "c"]) + 
assert report == expected_report + + +def test_classification_report_multiclass_with_unicode_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array(["blue\xa2", "green\xa2", "red\xa2"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = """\ + precision recall f1-score support + + blue\xa2 0.83 0.79 0.81 24 + green\xa2 0.33 0.10 0.15 31 + red\xa2 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 +weighted avg 0.51 0.53 0.47 75 +""" + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_multiclass_with_long_string_label(): + y_true, y_pred, _ = make_prediction(binary=False) + + labels = np.array(["blue", "green" * 5, "red"]) + y_true = labels[y_true] + y_pred = labels[y_pred] + + expected_report = """\ + precision recall f1-score support + + blue 0.83 0.79 0.81 24 +greengreengreengreengreen 0.33 0.10 0.15 31 + red 0.42 0.90 0.57 20 + + accuracy 0.53 75 + macro avg 0.53 0.60 0.51 75 + weighted avg 0.51 0.53 0.47 75 +""" + + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_classification_report_labels_target_names_unequal_length(): + y_true = [0, 0, 2, 0, 0] + y_pred = [0, 2, 2, 0, 0] + target_names = ["class 0", "class 1", "class 2"] + + msg = "labels size, 2, does not match size of target_names, 3" + with pytest.warns(UserWarning, match=msg): + classification_report(y_true, y_pred, labels=[0, 2], target_names=target_names) + + +def test_classification_report_no_labels_target_names_unequal_length(): + y_true = [0, 0, 2, 0, 0] + y_pred = [0, 2, 2, 0, 0] + target_names = ["class 0", "class 1", "class 2"] + + err_msg = ( + "Number of classes, 2, does not " + "match size of target_names, 3. 
" + "Try specifying the labels parameter" + ) + with pytest.raises(ValueError, match=err_msg): + classification_report(y_true, y_pred, target_names=target_names) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_multilabel_classification_report(): + n_classes = 4 + n_samples = 50 + + _, y_true = make_multilabel_classification( + n_features=1, n_samples=n_samples, n_classes=n_classes, random_state=0 + ) + + _, y_pred = make_multilabel_classification( + n_features=1, n_samples=n_samples, n_classes=n_classes, random_state=1 + ) + + expected_report = """\ + precision recall f1-score support + + 0 0.50 0.67 0.57 24 + 1 0.51 0.74 0.61 27 + 2 0.29 0.08 0.12 26 + 3 0.52 0.56 0.54 27 + + micro avg 0.50 0.51 0.50 104 + macro avg 0.45 0.51 0.46 104 +weighted avg 0.45 0.51 0.46 104 + samples avg 0.46 0.42 0.40 104 +""" + + report = classification_report(y_true, y_pred) + assert report == expected_report + + +def test_multilabel_zero_one_loss_subset(): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + assert zero_one_loss(y1, y2) == 0.5 + assert zero_one_loss(y1, y1) == 0 + assert zero_one_loss(y2, y2) == 0 + assert zero_one_loss(y2, np.logical_not(y2)) == 1 + assert zero_one_loss(y1, np.logical_not(y1)) == 1 + assert zero_one_loss(y1, np.zeros(y1.shape)) == 1 + assert zero_one_loss(y2, np.zeros(y1.shape)) == 1 + + +def test_multilabel_hamming_loss(): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + w = np.array([1, 3]) + + assert hamming_loss(y1, y2) == 1 / 6 + assert hamming_loss(y1, y1) == 0 + assert hamming_loss(y2, y2) == 0 + assert hamming_loss(y2, 1 - y2) == 1 + assert hamming_loss(y1, 1 - y1) == 1 + assert hamming_loss(y1, np.zeros(y1.shape)) == 4 / 6 + assert hamming_loss(y2, np.zeros(y1.shape)) == 0.5 + assert hamming_loss(y1, y2, sample_weight=w) == 1.0 / 12 + assert hamming_loss(y1, 1 - y2, sample_weight=w) == 11.0 / 12 + assert hamming_loss(y1, np.zeros_like(y1), sample_weight=w) == 2.0 / 3 + # sp_hamming only works with 1-D arrays + assert hamming_loss(y1[0], y2[0]) == sp_hamming(y1[0], y2[0]) + + +def test_jaccard_score_validation(): + y_true = np.array([0, 1, 0, 1, 1]) + y_pred = np.array([0, 1, 0, 1, 1]) + err_msg = r"pos_label=2 is not a valid label. It should be one of \[0, 1\]" + with pytest.raises(ValueError, match=err_msg): + jaccard_score(y_true, y_pred, average="binary", pos_label=2) + + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + msg1 = ( + r"Target is multilabel-indicator but average='binary'. " + r"Please choose another average setting, one of \[None, " + r"'micro', 'macro', 'weighted', 'samples'\]." + ) + with pytest.raises(ValueError, match=msg1): + jaccard_score(y_true, y_pred, average="binary", pos_label=-1) + + y_true = np.array([0, 1, 1, 0, 2]) + y_pred = np.array([1, 1, 1, 1, 0]) + msg2 = ( + r"Target is multiclass but average='binary'. Please choose " + r"another average setting, one of \[None, 'micro', 'macro', " + r"'weighted'\]." + ) + with pytest.raises(ValueError, match=msg2): + jaccard_score(y_true, y_pred, average="binary") + msg3 = "Samplewise metrics are not available outside of multilabel classification." + with pytest.raises(ValueError, match=msg3): + jaccard_score(y_true, y_pred, average="samples") + + msg = ( + r"Note that pos_label \(set to 3\) is ignored when " + r"average != 'binary' \(got 'micro'\). 
You may use " + r"labels=\[pos_label\] to specify a single positive " + "class." + ) + with pytest.warns(UserWarning, match=msg): + jaccard_score(y_true, y_pred, average="micro", pos_label=3) + + +def test_multilabel_jaccard_score(recwarn): + # Dense label indicator matrix format + y1 = np.array([[0, 1, 1], [1, 0, 1]]) + y2 = np.array([[0, 0, 1], [1, 0, 1]]) + + # size(y1 \inter y2) = [1, 2] + # size(y1 \union y2) = [2, 2] + + assert jaccard_score(y1, y2, average="samples") == 0.75 + assert jaccard_score(y1, y1, average="samples") == 1 + assert jaccard_score(y2, y2, average="samples") == 1 + assert jaccard_score(y2, np.logical_not(y2), average="samples") == 0 + assert jaccard_score(y1, np.logical_not(y1), average="samples") == 0 + assert jaccard_score(y1, np.zeros(y1.shape), average="samples") == 0 + assert jaccard_score(y2, np.zeros(y1.shape), average="samples") == 0 + + y_true = np.array([[0, 1, 1], [1, 0, 0]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + # average='macro' + assert_almost_equal(jaccard_score(y_true, y_pred, average="macro"), 2.0 / 3) + # average='micro' + assert_almost_equal(jaccard_score(y_true, y_pred, average="micro"), 3.0 / 5) + # average='samples' + assert_almost_equal(jaccard_score(y_true, y_pred, average="samples"), 7.0 / 12) + assert_almost_equal( + jaccard_score(y_true, y_pred, average="samples", labels=[0, 2]), 1.0 / 2 + ) + assert_almost_equal( + jaccard_score(y_true, y_pred, average="samples", labels=[1, 2]), 1.0 / 2 + ) + # average=None + assert_array_equal( + jaccard_score(y_true, y_pred, average=None), np.array([1.0 / 2, 1.0, 1.0 / 2]) + ) + + y_true = np.array([[0, 1, 1], [1, 0, 1]]) + y_pred = np.array([[1, 1, 1], [1, 0, 1]]) + assert_almost_equal(jaccard_score(y_true, y_pred, average="macro"), 5.0 / 6) + # average='weighted' + assert_almost_equal(jaccard_score(y_true, y_pred, average="weighted"), 7.0 / 8) + + msg2 = "Got 4 > 2" + with pytest.raises(ValueError, match=msg2): + jaccard_score(y_true, y_pred, labels=[4], average="macro") + msg3 = "Got -1 < 0" + with pytest.raises(ValueError, match=msg3): + jaccard_score(y_true, y_pred, labels=[-1], average="macro") + + msg = ( + "Jaccard is ill-defined and being set to 0.0 in labels " + "with no true or predicted samples." + ) + + with pytest.warns(UndefinedMetricWarning, match=msg): + assert ( + jaccard_score(np.array([[0, 1]]), np.array([[0, 1]]), average="macro") + == 0.5 + ) + + msg = ( + "Jaccard is ill-defined and being set to 0.0 in samples " + "with no true or predicted labels." 
+ ) + + with pytest.warns(UndefinedMetricWarning, match=msg): + assert ( + jaccard_score( + np.array([[0, 0], [1, 1]]), + np.array([[0, 0], [1, 1]]), + average="samples", + ) + == 0.5 + ) + + assert not list(recwarn) + + +def test_multiclass_jaccard_score(recwarn): + y_true = ["ant", "ant", "cat", "cat", "ant", "cat", "bird", "bird"] + y_pred = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "cat"] + labels = ["ant", "bird", "cat"] + lb = LabelBinarizer() + lb.fit(labels) + y_true_bin = lb.transform(y_true) + y_pred_bin = lb.transform(y_pred) + multi_jaccard_score = partial(jaccard_score, y_true, y_pred) + bin_jaccard_score = partial(jaccard_score, y_true_bin, y_pred_bin) + multi_labels_list = [ + ["ant", "bird"], + ["ant", "cat"], + ["cat", "bird"], + ["ant"], + ["bird"], + ["cat"], + None, + ] + bin_labels_list = [[0, 1], [0, 2], [2, 1], [0], [1], [2], None] + + # other than average='samples'/'none-samples', test everything else here + for average in ("macro", "weighted", "micro", None): + for m_label, b_label in zip(multi_labels_list, bin_labels_list): + assert_almost_equal( + multi_jaccard_score(average=average, labels=m_label), + bin_jaccard_score(average=average, labels=b_label), + ) + + y_true = np.array([[0, 0], [0, 0], [0, 0]]) + y_pred = np.array([[0, 0], [0, 0], [0, 0]]) + with ignore_warnings(): + assert jaccard_score(y_true, y_pred, average="weighted") == 0 + + assert not list(recwarn) + + +def test_average_binary_jaccard_score(recwarn): + # tp=0, fp=0, fn=1, tn=0 + assert jaccard_score([1], [0], average="binary") == 0.0 + # tp=0, fp=0, fn=0, tn=1 + msg = ( + "Jaccard is ill-defined and being set to 0.0 due to " + "no true or predicted samples" + ) + with pytest.warns(UndefinedMetricWarning, match=msg): + assert jaccard_score([0, 0], [0, 0], average="binary") == 0.0 + + # tp=1, fp=0, fn=0, tn=0 (pos_label=0) + assert jaccard_score([0], [0], pos_label=0, average="binary") == 1.0 + y_true = np.array([1, 0, 1, 1, 0]) + y_pred = np.array([1, 0, 1, 1, 1]) + assert_almost_equal(jaccard_score(y_true, y_pred, average="binary"), 3.0 / 4) + assert_almost_equal( + jaccard_score(y_true, y_pred, average="binary", pos_label=0), 1.0 / 2 + ) + + assert not list(recwarn) + + +def test_jaccard_score_zero_division_warning(): + # check that we raised a warning with default behavior if a zero division + # happens + y_true = np.array([[1, 0, 1], [0, 0, 0]]) + y_pred = np.array([[0, 0, 0], [0, 0, 0]]) + msg = ( + "Jaccard is ill-defined and being set to 0.0 in " + "samples with no true or predicted labels." + " Use `zero_division` parameter to control this behavior." 
+ ) + with pytest.warns(UndefinedMetricWarning, match=msg): + score = jaccard_score(y_true, y_pred, average="samples", zero_division="warn") + assert score == pytest.approx(0.0) + + +@pytest.mark.parametrize("zero_division, expected_score", [(0, 0), (1, 0.5)]) +def test_jaccard_score_zero_division_set_value(zero_division, expected_score): + # check that we don't issue warning by passing the zero_division parameter + y_true = np.array([[1, 0, 1], [0, 0, 0]]) + y_pred = np.array([[0, 0, 0], [0, 0, 0]]) + with warnings.catch_warnings(): + warnings.simplefilter("error", UndefinedMetricWarning) + score = jaccard_score( + y_true, y_pred, average="samples", zero_division=zero_division + ) + assert score == pytest.approx(expected_score) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f1_score_multilabel_1(): + # Test precision_recall_f1_score on a crafted multilabel example + # First crafted example + + y_true = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1]]) + y_pred = np.array([[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 1, 0]]) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + + # tp = [0, 1, 1, 0] + # fn = [1, 0, 0, 1] + # fp = [1, 1, 0, 0] + # Check per class + + assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) + assert_array_almost_equal(s, [1, 1, 1, 1], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2) + + # Check macro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") + assert_almost_equal(p, 1.5 / 4) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2.5 / 1.5 * 0.25) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="macro"), np.mean(f2) + ) + + # Check micro + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") + assert_almost_equal(p, 0.5) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 0.5) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="micro"), + (1 + 4) * p * r / (4 * p + r), + ) + + # Check weighted + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") + assert_almost_equal(p, 1.5 / 4) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2.5 / 1.5 * 0.25) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="weighted"), + np.average(f2, weights=support), + ) + # Check samples + # |h(x_i) inter y_i | = [0, 1, 1] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [1, 1, 2] + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") + assert_almost_equal(p, 0.5) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 0.5) + assert s is None + assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="samples"), 0.5) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_precision_recall_f1_score_multilabel_2(): + # Test precision_recall_f1_score on a crafted multilabel example 2 + # Second crafted example + y_true = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 1, 0]]) + y_pred = np.array([[0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 0, 0]]) + + # tp = [ 0. 1. 0. 0.] + # fp = [ 1. 0. 0. 2.] + # fn = [ 1. 1. 1. 0.] 
+ + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) + assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2) + assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None) + support = s + assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") + assert_almost_equal(p, 0.25) + assert_almost_equal(r, 0.25) + assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="micro"), + (1 + 4) * p * r / (4 * p + r), + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") + assert_almost_equal(p, 0.25) + assert_almost_equal(r, 0.125) + assert_almost_equal(f, 2 / 12) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="macro"), np.mean(f2) + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") + assert_almost_equal(p, 2 / 4) + assert_almost_equal(r, 1 / 4) + assert_almost_equal(f, 2 / 3 * 2 / 4) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="weighted"), + np.average(f2, weights=support), + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") + # Check samples + # |h(x_i) inter y_i | = [0, 0, 1] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [1, 1, 2] + + assert_almost_equal(p, 1 / 6) + assert_almost_equal(r, 1 / 6) + assert_almost_equal(f, 2 / 4 * 1 / 3) + assert s is None + assert_almost_equal( + fbeta_score(y_true, y_pred, beta=2, average="samples"), 0.1666, 2 + ) + + +@pytest.mark.filterwarnings(r"ignore::sklearn.exceptions.UndefinedMetricWarning") +@pytest.mark.parametrize( + "zero_division, zero_division_expected", + [("warn", 0), (0, 0), (1, 1), (np.nan, np.nan)], +) +def test_precision_recall_f1_score_with_an_empty_prediction( + zero_division, zero_division_expected +): + y_true = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 1, 0]]) + y_pred = np.array([[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 1, 0]]) + + # true_pos = [ 0. 1. 1. 0.] + # false_pos = [ 0. 0. 0. 1.] + # false_neg = [ 1. 1. 0. 0.] 
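+    #
+    # Illustrative derivation: precision = tp / (tp + fp) is ill-defined for
+    # class 0 (nothing predicted, tp + fp == 0) and recall = tp / (tp + fn) is
+    # ill-defined for class 3 (no true samples, tp + fn == 0); both fall back
+    # to `zero_division`.  The remaining entries are
+    #   precision = [_, 1/1, 1/1, 0/1] and recall = [0/1, 1/2, 1/1, _].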
+ + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average=None, zero_division=zero_division + ) + + assert_array_almost_equal(p, [zero_division_expected, 1.0, 1.0, 0.0], 2) + assert_array_almost_equal(r, [0.0, 0.5, 1.0, zero_division_expected], 2) + expected_f = 0 + assert_array_almost_equal(f, [expected_f, 1 / 1.5, 1, expected_f], 2) + assert_array_almost_equal(s, [1, 2, 1, 0], 2) + + f2 = fbeta_score(y_true, y_pred, beta=2, average=None, zero_division=zero_division) + support = s + assert_array_almost_equal(f2, [expected_f, 0.55, 1, expected_f], 2) + + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average="macro", zero_division=zero_division + ) + + value_to_sum = 0 if np.isnan(zero_division_expected) else zero_division_expected + values_to_average = 3 + (not np.isnan(zero_division_expected)) + + assert_almost_equal(p, (2 + value_to_sum) / values_to_average) + assert_almost_equal(r, (1.5 + value_to_sum) / values_to_average) + expected_f = (2 / 3 + 1) / 4 + assert_almost_equal(f, expected_f) + assert s is None + assert_almost_equal( + fbeta_score( + y_true, + y_pred, + beta=2, + average="macro", + zero_division=zero_division, + ), + _nanaverage(f2, weights=None), + ) + + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average="micro", zero_division=zero_division + ) + assert_almost_equal(p, 2 / 3) + assert_almost_equal(r, 0.5) + assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5)) + assert s is None + assert_almost_equal( + fbeta_score( + y_true, y_pred, beta=2, average="micro", zero_division=zero_division + ), + (1 + 4) * p * r / (4 * p + r), + ) + + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average="weighted", zero_division=zero_division + ) + assert_almost_equal(p, 3 / 4 if zero_division_expected == 0 else 1.0) + assert_almost_equal(r, 0.5) + values_to_average = 4 + assert_almost_equal(f, (2 * 2 / 3 + 1) / values_to_average) + assert s is None + assert_almost_equal( + fbeta_score( + y_true, y_pred, beta=2, average="weighted", zero_division=zero_division + ), + _nanaverage(f2, weights=support), + ) + + p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") + # |h(x_i) inter y_i | = [0, 0, 2] + # |y_i| = [1, 1, 2] + # |h(x_i)| = [0, 1, 2] + assert_almost_equal(p, 1 / 3) + assert_almost_equal(r, 1 / 3) + assert_almost_equal(f, 1 / 3) + assert s is None + expected_result = 0.333 + assert_almost_equal( + fbeta_score( + y_true, y_pred, beta=2, average="samples", zero_division=zero_division + ), + expected_result, + 2, + ) + + +@pytest.mark.parametrize("beta", [1]) +@pytest.mark.parametrize("average", ["macro", "micro", "weighted", "samples"]) +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +def test_precision_recall_f1_no_labels(beta, average, zero_division): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + p, r, f, s = precision_recall_fscore_support( + y_true, + y_pred, + average=average, + beta=beta, + zero_division=zero_division, + ) + fbeta = fbeta_score( + y_true, + y_pred, + beta=beta, + average=average, + zero_division=zero_division, + ) + assert s is None + + # if zero_division = nan, check that all metrics are nan and exit + if np.isnan(zero_division): + for metric in [p, r, f, fbeta]: + assert np.isnan(metric) + return + + zero_division = float(zero_division) + assert_almost_equal(p, zero_division) + assert_almost_equal(r, zero_division) + assert_almost_equal(f, zero_division) + + 
assert_almost_equal(fbeta, float(zero_division)) + + +@pytest.mark.parametrize("average", ["macro", "micro", "weighted", "samples"]) +def test_precision_recall_f1_no_labels_check_warnings(average): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + func = precision_recall_fscore_support + with pytest.warns(UndefinedMetricWarning): + p, r, f, s = func(y_true, y_pred, average=average, beta=1.0) + + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert s is None + + with pytest.warns(UndefinedMetricWarning): + fbeta = fbeta_score(y_true, y_pred, average=average, beta=1.0) + + assert_almost_equal(fbeta, 0) + + +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +def test_precision_recall_f1_no_labels_average_none(zero_division): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + # tp = [0, 0, 0] + # fn = [0, 0, 0] + # fp = [0, 0, 0] + # support = [0, 0, 0] + # |y_hat_i inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |y_hat_i| = [0, 0, 0] + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + p, r, f, s = precision_recall_fscore_support( + y_true, + y_pred, + average=None, + beta=1.0, + zero_division=zero_division, + ) + fbeta = fbeta_score( + y_true, y_pred, beta=1.0, average=None, zero_division=zero_division + ) + + zero_division = np.float64(zero_division) + assert_array_almost_equal(p, [zero_division, zero_division, zero_division], 2) + assert_array_almost_equal(r, [zero_division, zero_division, zero_division], 2) + assert_array_almost_equal(f, [zero_division, zero_division, zero_division], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + + assert_array_almost_equal(fbeta, [zero_division, zero_division, zero_division], 2) + + +def test_precision_recall_f1_no_labels_average_none_warn(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + # tp = [0, 0, 0] + # fn = [0, 0, 0] + # fp = [0, 0, 0] + # support = [0, 0, 0] + # |y_hat_i inter y_i | = [0, 0, 0] + # |y_i| = [0, 0, 0] + # |y_hat_i| = [0, 0, 0] + + with pytest.warns(UndefinedMetricWarning): + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, average=None, beta=1 + ) + + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + + with pytest.warns(UndefinedMetricWarning): + fbeta = fbeta_score(y_true, y_pred, beta=1, average=None) + + assert_array_almost_equal(fbeta, [0, 0, 0], 2) + + +def test_prf_warnings(): + # average of per-label scores + f, w = precision_recall_fscore_support, UndefinedMetricWarning + for average in [None, "weighted", "macro"]: + msg = ( + "Precision is ill-defined and " + "being set to 0.0 in labels with no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([0, 1, 2], [1, 1, 2], average=average) + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 in labels with no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([1, 1, 2], [0, 1, 2], average=average) + + # average of per-sample scores + msg = ( + "Precision is ill-defined and " + "being set to 0.0 in samples with no predicted labels." + " Use `zero_division` parameter to control" + " this behavior." 
+ ) + with pytest.warns(w, match=msg): + f(np.array([[1, 0], [1, 0]]), np.array([[1, 0], [0, 0]]), average="samples") + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 in samples with no true labels." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f(np.array([[1, 0], [0, 0]]), np.array([[1, 0], [1, 0]]), average="samples") + + # single score: micro-average + msg = ( + "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f(np.array([[1, 1], [1, 1]]), np.array([[0, 0], [0, 0]]), average="micro") + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f(np.array([[0, 0], [0, 0]]), np.array([[1, 1], [1, 1]]), average="micro") + + # single positive label + msg = ( + "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([1, 1], [-1, -1], average="binary") + + msg = ( + "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + with pytest.warns(w, match=msg): + f([-1, -1], [1, 1], average="binary") + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + precision_recall_fscore_support([0, 0], [0, 0], average="binary") + msg = ( + "F-score is ill-defined and being set to 0.0 due to no true nor " + "predicted samples. Use `zero_division` parameter to control this" + " behavior." + ) + assert str(record.pop().message) == msg + msg = ( + "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + assert str(record.pop().message) == msg + msg = ( + "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." 
+ ) + assert str(record.pop().message) == msg + + +@pytest.mark.parametrize("zero_division", [0, 1, np.nan]) +def test_prf_no_warnings_if_zero_division_set(zero_division): + with warnings.catch_warnings(): + warnings.simplefilter("error") + + # average of per-label scores + for average in [None, "weighted", "macro"]: + precision_recall_fscore_support( + [0, 1, 2], [1, 1, 2], average=average, zero_division=zero_division + ) + + precision_recall_fscore_support( + [1, 1, 2], [0, 1, 2], average=average, zero_division=zero_division + ) + + # average of per-sample scores + precision_recall_fscore_support( + np.array([[1, 0], [1, 0]]), + np.array([[1, 0], [0, 0]]), + average="samples", + zero_division=zero_division, + ) + + precision_recall_fscore_support( + np.array([[1, 0], [0, 0]]), + np.array([[1, 0], [1, 0]]), + average="samples", + zero_division=zero_division, + ) + + # single score: micro-average + precision_recall_fscore_support( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + + precision_recall_fscore_support( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + + # single positive label + precision_recall_fscore_support( + [1, 1], [-1, -1], average="binary", zero_division=zero_division + ) + + precision_recall_fscore_support( + [-1, -1], [1, 1], average="binary", zero_division=zero_division + ) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + precision_recall_fscore_support( + [0, 0], [0, 0], average="binary", zero_division=zero_division + ) + assert len(record) == 0 + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_recall_warnings(zero_division): + with warnings.catch_warnings(): + warnings.simplefilter("error") + + recall_score( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + recall_score( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + else: + assert len(record) == 0 + + recall_score([0, 0], [0, 0]) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Recall is ill-defined and " + "being set to 0.0 due to no true samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_precision_warnings(zero_division): + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + precision_score( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." + ) + else: + assert len(record) == 0 + + precision_score([0, 0], [0, 0]) + if zero_division == "warn": + assert ( + str(record.pop().message) == "Precision is ill-defined and " + "being set to 0.0 due to no predicted samples." + " Use `zero_division` parameter to control" + " this behavior." 
+ ) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + precision_score( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + + +@pytest.mark.parametrize("zero_division", ["warn", 0, 1, np.nan]) +def test_fscore_warnings(zero_division): + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always") + + for score in [f1_score, partial(fbeta_score, beta=2)]: + score( + np.array([[1, 1], [1, 1]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + assert len(record) == 0 + + score( + np.array([[0, 0], [0, 0]]), + np.array([[1, 1], [1, 1]]), + average="micro", + zero_division=zero_division, + ) + assert len(record) == 0 + + score( + np.array([[0, 0], [0, 0]]), + np.array([[0, 0], [0, 0]]), + average="micro", + zero_division=zero_division, + ) + if zero_division == "warn": + assert ( + str(record.pop().message) == "F-score is ill-defined and " + "being set to 0.0 due to no true nor predicted " + "samples. Use `zero_division` parameter to " + "control this behavior." + ) + else: + assert len(record) == 0 + + +def test_prf_average_binary_data_non_binary(): + # Error if user does not explicitly set non-binary average mode + y_true_mc = [1, 2, 3, 3] + y_pred_mc = [1, 2, 3, 1] + msg_mc = ( + r"Target is multiclass but average='binary'. Please " + r"choose another average setting, one of \[" + r"None, 'micro', 'macro', 'weighted'\]." + ) + y_true_ind = np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]]) + y_pred_ind = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) + msg_ind = ( + r"Target is multilabel-indicator but average='binary'. Please " + r"choose another average setting, one of \[" + r"None, 'micro', 'macro', 'weighted', 'samples'\]." + ) + + for y_true, y_pred, msg in [ + (y_true_mc, y_pred_mc, msg_mc), + (y_true_ind, y_pred_ind, msg_ind), + ]: + for metric in [ + precision_score, + recall_score, + f1_score, + partial(fbeta_score, beta=2), + ]: + with pytest.raises(ValueError, match=msg): + metric(y_true, y_pred) + + +def test__check_targets(): + # Check that _check_targets correctly merges target types, squeezes + # output and fails if input lengths differ. 
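+    #
+    # Rough sketch of the contract exercised here (the private helper returns
+    # a (merged_type, y_true, y_pred) triple or raises ValueError):
+    #   _check_targets([0, 1, 1], [1, 0, 1])        # -> ("binary", ...)
+    #   _check_targets([0, 1, 1], [2, 0, 1])        # -> ("multiclass", ...)
+    #   _check_targets([0, 1, 1], [0.1, 0.8, 0.9])  # -> ValueError (binary vs continuous)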
+ IND = "multilabel-indicator" + MC = "multiclass" + BIN = "binary" + CNT = "continuous" + MMC = "multiclass-multioutput" + MCN = "continuous-multioutput" + # all of length 3 + EXAMPLES = [ + (IND, np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]])), + # must not be considered binary + (IND, np.array([[0, 1], [1, 0], [1, 1]])), + (MC, [2, 3, 1]), + (BIN, [0, 1, 1]), + (CNT, [0.0, 1.5, 1.0]), + (MC, np.array([[2], [3], [1]])), + (BIN, np.array([[0], [1], [1]])), + (CNT, np.array([[0.0], [1.5], [1.0]])), + (MMC, np.array([[0, 2], [1, 3], [2, 3]])), + (MCN, np.array([[0.5, 2.0], [1.1, 3.0], [2.0, 3.0]])), + ] + # expected type given input types, or None for error + # (types will be tried in either order) + EXPECTED = { + (IND, IND): IND, + (MC, MC): MC, + (BIN, BIN): BIN, + (MC, IND): None, + (BIN, IND): None, + (BIN, MC): MC, + # Disallowed types + (CNT, CNT): None, + (MMC, MMC): None, + (MCN, MCN): None, + (IND, CNT): None, + (MC, CNT): None, + (BIN, CNT): None, + (MMC, CNT): None, + (MCN, CNT): None, + (IND, MMC): None, + (MC, MMC): None, + (BIN, MMC): None, + (MCN, MMC): None, + (IND, MCN): None, + (MC, MCN): None, + (BIN, MCN): None, + } + + for (type1, y1), (type2, y2) in product(EXAMPLES, repeat=2): + try: + expected = EXPECTED[type1, type2] + except KeyError: + expected = EXPECTED[type2, type1] + if expected is None: + with pytest.raises(ValueError): + _check_targets(y1, y2) + + if type1 != type2: + err_msg = ( + "Classification metrics can't handle a mix " + "of {0} and {1} targets".format(type1, type2) + ) + with pytest.raises(ValueError, match=err_msg): + _check_targets(y1, y2) + + else: + if type1 not in (BIN, MC, IND): + err_msg = "{0} is not supported".format(type1) + with pytest.raises(ValueError, match=err_msg): + _check_targets(y1, y2) + + else: + merged_type, y1out, y2out = _check_targets(y1, y2) + assert merged_type == expected + if merged_type.startswith("multilabel"): + assert y1out.format == "csr" + assert y2out.format == "csr" + else: + assert_array_equal(y1out, np.squeeze(y1)) + assert_array_equal(y2out, np.squeeze(y2)) + with pytest.raises(ValueError): + _check_targets(y1[:-1], y2) + + # Make sure seq of seq is not supported + y1 = [(1, 2), (0, 2, 3)] + y2 = [(2,), (0, 2)] + msg = ( + "You appear to be using a legacy multi-label data representation. " + "Sequence of sequences are no longer supported; use a binary array" + " or sparse matrix instead - the MultiLabelBinarizer" + " transformer can convert to this format." 
+ ) + with pytest.raises(ValueError, match=msg): + _check_targets(y1, y2) + + +def test__check_targets_multiclass_with_both_y_true_and_y_pred_binary(): + # https://github.com/scikit-learn/scikit-learn/issues/8098 + y_true = [0, 1] + y_pred = [0, -1] + assert _check_targets(y_true, y_pred)[0] == "multiclass" + + +def test_hinge_loss_binary(): + y_true = np.array([-1, 1, 1, -1]) + pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) + assert hinge_loss(y_true, pred_decision) == 1.2 / 4 + + y_true = np.array([0, 2, 2, 0]) + pred_decision = np.array([-8.5, 0.5, 1.5, -0.3]) + assert hinge_loss(y_true, pred_decision) == 1.2 / 4 + + +def test_hinge_loss_multiclass(): + pred_decision = np.array( + [ + [+0.36, -0.17, -0.58, -0.99], + [-0.54, -0.37, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + [-0.54, -0.38, -0.48, -0.58], + [-2.36, -0.79, -0.27, +0.24], + [-1.45, -0.58, -0.38, -0.17], + ] + ) + y_true = np.array([0, 1, 2, 1, 3, 2]) + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][1] + pred_decision[1][2], + 1 - pred_decision[2][2] + pred_decision[2][3], + 1 - pred_decision[3][1] + pred_decision[3][2], + 1 - pred_decision[4][3] + pred_decision[4][2], + 1 - pred_decision[5][2] + pred_decision[5][3], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert hinge_loss(y_true, pred_decision) == dummy_hinge_loss + + +def test_hinge_loss_multiclass_missing_labels_with_labels_none(): + y_true = np.array([0, 1, 2, 2]) + pred_decision = np.array( + [ + [+1.27, 0.034, -0.68, -1.40], + [-1.45, -0.58, -0.38, -0.17], + [-2.36, -0.79, -0.27, +0.24], + [-2.36, -0.79, -0.27, +0.24], + ] + ) + error_message = ( + "Please include all labels in y_true or pass labels as third argument" + ) + with pytest.raises(ValueError, match=error_message): + hinge_loss(y_true, pred_decision) + + +def test_hinge_loss_multiclass_no_consistent_pred_decision_shape(): + # test for inconsistency between multiclass problem and pred_decision + # argument + y_true = np.array([2, 1, 0, 1, 0, 1, 1]) + pred_decision = np.array([0, 1, 2, 1, 0, 2, 1]) + error_message = ( + "The shape of pred_decision cannot be 1d array" + "with a multiclass target. pred_decision shape " + "must be (n_samples, n_classes), that is " + "(7, 3). Got: (7,)" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision) + + # test for inconsistency between pred_decision shape and labels number + pred_decision = np.array([[0, 1], [0, 1], [0, 1], [0, 1], [2, 0], [0, 1], [1, 0]]) + labels = [0, 1, 2] + error_message = ( + "The shape of pred_decision is not " + "consistent with the number of classes. " + "With a multiclass target, pred_decision " + "shape must be (n_samples, n_classes), that is " + "(7, 3). 
Got: (7, 2)" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + hinge_loss(y_true=y_true, pred_decision=pred_decision, labels=labels) + + +def test_hinge_loss_multiclass_with_missing_labels(): + pred_decision = np.array( + [ + [+0.36, -0.17, -0.58, -0.99], + [-0.55, -0.38, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + [-0.55, -0.38, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + ] + ) + y_true = np.array([0, 1, 2, 1, 2]) + labels = np.array([0, 1, 2, 3]) + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][1] + pred_decision[1][2], + 1 - pred_decision[2][2] + pred_decision[2][3], + 1 - pred_decision[3][1] + pred_decision[3][2], + 1 - pred_decision[4][2] + pred_decision[4][3], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert hinge_loss(y_true, pred_decision, labels=labels) == dummy_hinge_loss + + +def test_hinge_loss_multiclass_missing_labels_only_two_unq_in_y_true(): + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/17630 + # check that we can compute the hinge loss when providing an array + # with labels allowing to not have all labels in y_true + pred_decision = np.array( + [ + [+0.36, -0.17, -0.58], + [-0.15, -0.58, -0.48], + [-1.45, -0.58, -0.38], + [-0.55, -0.78, -0.42], + [-1.45, -0.58, -0.38], + ] + ) + y_true = np.array([0, 2, 2, 0, 2]) + labels = np.array([0, 1, 2]) + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][2] + pred_decision[1][0], + 1 - pred_decision[2][2] + pred_decision[2][1], + 1 - pred_decision[3][0] + pred_decision[3][2], + 1 - pred_decision[4][2] + pred_decision[4][1], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert_almost_equal( + hinge_loss(y_true, pred_decision, labels=labels), dummy_hinge_loss + ) + + +def test_hinge_loss_multiclass_invariance_lists(): + # Currently, invariance of string and integer labels cannot be tested + # in common invariance tests because invariance tests for multiclass + # decision functions is not implemented yet. 
+ y_true = ["blue", "green", "red", "green", "white", "red"] + pred_decision = [ + [+0.36, -0.17, -0.58, -0.99], + [-0.55, -0.38, -0.48, -0.58], + [-1.45, -0.58, -0.38, -0.17], + [-0.55, -0.38, -0.48, -0.58], + [-2.36, -0.79, -0.27, +0.24], + [-1.45, -0.58, -0.38, -0.17], + ] + dummy_losses = np.array( + [ + 1 - pred_decision[0][0] + pred_decision[0][1], + 1 - pred_decision[1][1] + pred_decision[1][2], + 1 - pred_decision[2][2] + pred_decision[2][3], + 1 - pred_decision[3][1] + pred_decision[3][2], + 1 - pred_decision[4][3] + pred_decision[4][2], + 1 - pred_decision[5][2] + pred_decision[5][3], + ] + ) + np.clip(dummy_losses, 0, None, out=dummy_losses) + dummy_hinge_loss = np.mean(dummy_losses) + assert hinge_loss(y_true, pred_decision) == dummy_hinge_loss + + +def test_log_loss(): + # binary case with symbolic labels ("no" < "yes") + y_true = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = np.array( + [[0.5, 0.5], [0.1, 0.9], [0.01, 0.99], [0.9, 0.1], [0.75, 0.25], [0.001, 0.999]] + ) + loss = log_loss(y_true, y_pred) + loss_true = -np.mean(bernoulli.logpmf(np.array(y_true) == "yes", y_pred[:, 1])) + assert_allclose(loss, loss_true) + + # multiclass case; adapted from http://bit.ly/RJJHWA + y_true = [1, 0, 2] + y_pred = [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]] + loss = log_loss(y_true, y_pred, normalize=True) + assert_allclose(loss, 0.6904911) + + # check that we got all the shapes and axes right + # by doubling the length of y_true and y_pred + y_true *= 2 + y_pred *= 2 + loss = log_loss(y_true, y_pred, normalize=False) + assert_allclose(loss, 0.6904911 * 6) + + # raise error if number of classes are not equal. + y_true = [1, 0, 2] + y_pred = [[0.3, 0.7], [0.6, 0.4], [0.4, 0.6]] + with pytest.raises(ValueError): + log_loss(y_true, y_pred) + + # raise error if labels do not contain all values of y_true + y_true = ["a", "b", "c"] + y_pred = [[0.9, 0.1, 0.0], [0.1, 0.9, 0.0], [0.1, 0.1, 0.8]] + labels = ["a", "c", "d"] + error_str = ( + "y_true contains values {'b'} not belonging to the passed " + "labels ['a', 'c', 'd']." + ) + with pytest.raises(ValueError, match=re.escape(error_str)): + log_loss(y_true, y_pred, labels=labels) + + # case when y_true is a string array object + y_true = ["ham", "spam", "spam", "ham"] + y_pred = [[0.3, 0.7], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3]] + loss = log_loss(y_true, y_pred) + assert_allclose(loss, 0.7469410) + + # test labels option + + y_true = [2, 2] + y_pred = [[0.2, 0.8], [0.6, 0.4]] + y_score = np.array([[0.1, 0.9], [0.1, 0.9]]) + error_str = ( + "y_true contains only one label (2). Please provide the list of all " + "expected class labels explicitly through the labels argument." 
+ ) + with pytest.raises(ValueError, match=re.escape(error_str)): + log_loss(y_true, y_pred) + + y_pred = [[0.2, 0.8], [0.6, 0.4], [0.7, 0.3]] + error_str = "Found input variables with inconsistent numbers of samples: [3, 2]" + with pytest.raises(ValueError, match=re.escape(error_str)): + log_loss(y_true, y_pred) + + # works when the labels argument is used + + true_log_loss = -np.mean(np.log(y_score[:, 1])) + calculated_log_loss = log_loss(y_true, y_score, labels=[1, 2]) + assert_allclose(calculated_log_loss, true_log_loss) + + # ensure labels work when len(np.unique(y_true)) != y_pred.shape[1] + y_true = [1, 2, 2] + y_score2 = [[0.7, 0.1, 0.2], [0.2, 0.7, 0.1], [0.1, 0.7, 0.2]] + loss = log_loss(y_true, y_score2, labels=[1, 2, 3]) + assert_allclose(loss, -np.log(0.7)) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32, np.float16]) +def test_log_loss_eps(dtype): + """Check the behaviour internal eps that changes depending on the input dtype. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24315 + """ + y_true = np.array([0, 1], dtype=dtype) + y_pred = np.array([1, 0], dtype=dtype) + + loss = log_loss(y_true, y_pred) + assert np.isfinite(loss) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32, np.float16]) +def test_log_loss_not_probabilities_warning(dtype): + """Check that log_loss raises a warning when y_pred values don't sum to 1.""" + y_true = np.array([0, 1, 1, 0]) + y_pred = np.array([[0.2, 0.7], [0.6, 0.3], [0.4, 0.7], [0.8, 0.3]], dtype=dtype) + + with pytest.warns(UserWarning, match="The y_prob values do not sum to one."): + log_loss(y_true, y_pred) + + +@pytest.mark.parametrize( + "y_true, y_pred", + [ + ([0, 1, 0], [0, 1, 0]), + ([0, 1, 0], [[1, 0], [0, 1], [1, 0]]), + ([0, 1, 2], [[1, 0, 0], [0, 1, 0], [0, 0, 1]]), + ], +) +def test_log_loss_perfect_predictions(y_true, y_pred): + """Check that log_loss returns 0 for perfect predictions.""" + # Because of the clipping, the result is not exactly 0 + assert log_loss(y_true, y_pred) == pytest.approx(0) + + +def test_log_loss_pandas_input(): + # case when input is a pandas series and dataframe gh-5715 + y_tr = np.array(["ham", "spam", "spam", "ham"]) + y_pr = np.array([[0.3, 0.7], [0.6, 0.4], [0.4, 0.6], [0.7, 0.3]]) + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TrueInputType, PredInputType in types: + # y_pred dataframe, y_true series + y_true, y_pred = TrueInputType(y_tr), PredInputType(y_pr) + loss = log_loss(y_true, y_pred) + assert_allclose(loss, 0.7469410) + + +def test_log_loss_warnings(): + expected_message = re.escape( + "Labels passed were ['spam', 'eggs', 'ham']. But this function " + "assumes labels are ordered lexicographically. " + "Pass the ordered labels=['eggs', 'ham', 'spam'] and ensure that " + "the columns of y_prob correspond to this ordering." 
+ ) + with pytest.warns(UserWarning, match=expected_message): + log_loss( + ["eggs", "spam", "ham"], + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + labels=["spam", "eggs", "ham"], + ) + + +def test_brier_score_loss_binary(): + # Check brier_score_loss function + y_true = np.array([0, 1, 1, 0, 1, 1]) + y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95]) + true_score = linalg.norm(y_true - y_prob) ** 2 / len(y_true) + + assert_almost_equal(brier_score_loss(y_true, y_true), 0.0) + assert_almost_equal(brier_score_loss(y_true, y_prob), true_score) + assert_almost_equal(brier_score_loss(1.0 + y_true, y_prob), true_score) + assert_almost_equal(brier_score_loss(2 * y_true - 1, y_prob), true_score) + + # check that using (n_samples, 2) y_prob or y_true gives the same score + y_prob_reshaped = np.column_stack((1 - y_prob, y_prob)) + y_true_reshaped = np.column_stack((1 - y_true, y_true)) + assert_almost_equal(brier_score_loss(y_true, y_prob_reshaped), true_score) + assert_almost_equal(brier_score_loss(y_true_reshaped, y_prob_reshaped), true_score) + + # check scale_by_half argument + assert_almost_equal( + brier_score_loss(y_true, y_prob, scale_by_half="auto"), true_score + ) + assert_almost_equal( + brier_score_loss(y_true, y_prob, scale_by_half=True), true_score + ) + assert_almost_equal( + brier_score_loss(y_true, y_prob, scale_by_half=False), 2 * true_score + ) + + # calculate correctly when there's only one class in y_true + assert_almost_equal(brier_score_loss([-1], [0.4]), 0.4**2) + assert_almost_equal(brier_score_loss([0], [0.4]), 0.4**2) + assert_almost_equal(brier_score_loss([1], [0.4]), (1 - 0.4) ** 2) + assert_almost_equal(brier_score_loss(["foo"], [0.4], pos_label="bar"), 0.4**2) + assert_almost_equal( + brier_score_loss(["foo"], [0.4], pos_label="foo"), + (1 - 0.4) ** 2, + ) + + +def test_brier_score_loss_multiclass(): + # test cases for multi-class + assert_almost_equal( + brier_score_loss( + ["eggs", "spam", "ham"], + [[1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]], + labels=["eggs", "ham", "spam", "yams"], + ), + 2 / 3, + ) + + assert_almost_equal( + brier_score_loss( + [1, 0, 2], [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]] + ), + 0.41333333, + ) + + # check perfect predictions for 3 classes + assert_almost_equal( + brier_score_loss( + [0, 1, 2], [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + ), + 0, + ) + + # check perfectly incorrect predictions for 3 classes + assert_almost_equal( + brier_score_loss( + [0, 1, 2], [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0]] + ), + 2, + ) + + +def test_brier_score_loss_invalid_inputs(): + # binary case + y_true = np.array([0, 1, 1, 0, 1, 1]) + y_prob = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95]) + with pytest.raises(ValueError): + # bad length of y_prob + brier_score_loss(y_true, y_prob[1:]) + with pytest.raises(ValueError): + # y_pred has value greater than 1 + brier_score_loss(y_true, y_prob + 1.0) + with pytest.raises(ValueError): + # y_pred has value less than 0 + brier_score_loss(y_true, y_prob - 1.0) + + # multiclass case + y_true = np.array([1, 0, 2]) + y_prob = np.array([[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]]) + with pytest.raises(ValueError): + # bad length of y_pred + brier_score_loss(y_true, y_prob[1:]) + with pytest.raises(ValueError): + # y_pred has value greater than 1 + brier_score_loss(y_true, y_prob + 1.0) + with pytest.raises(ValueError): + # y_pred has value less than 0 + brier_score_loss(y_true, y_prob - 1.0) + + # raise an error for multiclass y_true and binary y_prob + y_true = np.array([0, 1, 2, 0]) + 
y_prob = np.array([0.8, 0.6, 0.4, 0.2]) + error_message = re.escape( + "The type of the target inferred from y_true is multiclass " + "but should be binary according to the shape of y_prob." + ) + with pytest.raises(ValueError, match=error_message): + brier_score_loss(y_true, y_prob) + + # raise an error for wrong number of classes + y_true = [0, 1, 2] + y_prob = [[1, 0], [0, 1], [0, 1]] + error_message = ( + "y_true and y_prob contain different number of " + "classes: 3 vs 2. Please provide the true " + "labels explicitly through the labels argument. " + "Classes found in " + "y_true: [0 1 2]" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_prob) + + y_true = ["eggs", "spam", "ham"] + y_prob = [[1, 0, 0], [0, 1, 0], [0, 1, 0]] + labels = ["eggs", "spam", "ham", "yams"] + error_message = ( + "The number of classes in labels is different " + "from that in y_prob. Classes found in " + "labels: ['eggs' 'ham' 'spam' 'yams']" + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_prob, labels=labels) + + # raise error message when there's only one class in y_true + y_true = ["eggs"] + y_prob = [[0.9, 0.1]] + error_message = ( + "y_true contains only one label (eggs). Please " + "provide the list of all expected class labels explicitly through the " + "labels argument." + ) + with pytest.raises(ValueError, match=re.escape(error_message)): + brier_score_loss(y_true, y_prob) + + # error is fixed when labels is specified + assert_almost_equal(brier_score_loss(y_true, y_prob, labels=["eggs", "ham"]), 0.01) + + +def test_brier_score_loss_warnings(): + expected_message = re.escape( + "Labels passed were ['spam', 'eggs', 'ham']. But this function " + "assumes labels are ordered lexicographically. " + "Pass the ordered labels=['eggs', 'ham', 'spam'] and ensure that " + "the columns of y_prob correspond to this ordering." + ) + with pytest.warns(UserWarning, match=expected_message): + brier_score_loss( + ["eggs", "spam", "ham"], + [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + labels=["spam", "eggs", "ham"], + ) + + +def test_balanced_accuracy_score_unseen(): + msg = "y_pred contains classes not in y_true" + with pytest.warns(UserWarning, match=msg): + balanced_accuracy_score([0, 0, 0], [0, 0, 1]) + + +@pytest.mark.parametrize( + "y_true,y_pred", + [ + (["a", "b", "a", "b"], ["a", "a", "a", "b"]), + (["a", "b", "c", "b"], ["a", "a", "a", "b"]), + (["a", "a", "a", "b"], ["a", "b", "c", "b"]), + ], +) +def test_balanced_accuracy_score(y_true, y_pred): + macro_recall = recall_score( + y_true, y_pred, average="macro", labels=np.unique(y_true) + ) + with ignore_warnings(): + # Warnings are tested in test_balanced_accuracy_score_unseen + balanced = balanced_accuracy_score(y_true, y_pred) + assert balanced == pytest.approx(macro_recall) + adjusted = balanced_accuracy_score(y_true, y_pred, adjusted=True) + chance = balanced_accuracy_score(y_true, np.full_like(y_true, y_true[0])) + assert adjusted == (balanced - chance) / (1 - chance) + + +@pytest.mark.parametrize( + "metric", + [ + jaccard_score, + f1_score, + partial(fbeta_score, beta=0.5), + precision_recall_fscore_support, + precision_score, + recall_score, + brier_score_loss, + ], +) +@pytest.mark.parametrize( + "classes", [(False, True), (0, 1), (0.0, 1.0), ("zero", "one")] +) +def test_classification_metric_pos_label_types(metric, classes): + """Check that the metric works with different types of `pos_label`. 
+ + We can expect `pos_label` to be a bool, an integer, a float, a string. + No error should be raised for those types. + """ + rng = np.random.RandomState(42) + n_samples, pos_label = 10, classes[-1] + y_true = rng.choice(classes, size=n_samples, replace=True) + if metric is brier_score_loss: + # brier score loss requires probabilities + y_pred = rng.uniform(size=n_samples) + else: + y_pred = y_true.copy() + result = metric(y_true, y_pred, pos_label=pos_label) + assert not np.any(np.isnan(result)) + + +@pytest.mark.parametrize( + "y_true, y_pred, expected_score", + [ + (np.array([0, 1]), np.array([1, 0]), 0.0), + (np.array([0, 1]), np.array([0, 1]), 1.0), + (np.array([0, 1]), np.array([0, 0]), 0.0), + (np.array([0, 0]), np.array([0, 0]), 1.0), + ], +) +def test_f1_for_small_binary_inputs_with_zero_division(y_true, y_pred, expected_score): + """Check the behaviour of `zero_division` for f1-score. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26965 + """ + assert f1_score(y_true, y_pred, zero_division=1.0) == pytest.approx(expected_score) + + +@pytest.mark.parametrize( + "scoring", + [ + make_scorer(f1_score, zero_division=np.nan), + make_scorer(fbeta_score, beta=2, zero_division=np.nan), + make_scorer(precision_score, zero_division=np.nan), + make_scorer(recall_score, zero_division=np.nan), + ], +) +def test_classification_metric_division_by_zero_nan_validaton(scoring): + """Check that we validate `np.nan` properly for classification metrics. + + With `n_jobs=2` in cross-validation, the `np.nan` used for the singleton will be + different in the sub-process and we should not use the `is` operator but + `math.isnan`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27563 + """ + X, y = datasets.make_classification(random_state=0) + classifier = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y) + cross_val_score(classifier, X, y, scoring=scoring, n_jobs=2, error_score="raise") + + +def test_d2_log_loss_score(): + y_true = [0, 0, 0, 1, 1, 1] + y_true_string = ["no", "no", "no", "yes", "yes", "yes"] + y_pred = np.array( + [ + [0.5, 0.5], + [0.9, 0.1], + [0.4, 0.6], + [0.6, 0.4], + [0.35, 0.65], + [0.01, 0.99], + ] + ) + y_pred_null = np.array( + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true=y_true, y_pred=y_pred) + log_likelihood = log_loss(y_true=y_true, y_pred=y_pred, normalize=False) + log_likelihood_null = log_loss(y_true=y_true, y_pred=y_pred_null, normalize=False) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert d2_score == pytest.approx(d2_score_true) + + # check that using sample weight also gives the correct d2 score + sample_weight = np.array([2, 1, 3, 4, 3, 1]) + y_pred_null[:, 0] = sample_weight[:3].sum() / sample_weight.sum() + y_pred_null[:, 1] = sample_weight[3:].sum() / sample_weight.sum() + d2_score = d2_log_loss_score( + y_true=y_true, y_pred=y_pred, sample_weight=sample_weight + ) + log_likelihood = log_loss( + y_true=y_true, + y_pred=y_pred, + sample_weight=sample_weight, + normalize=False, + ) + log_likelihood_null = log_loss( + y_true=y_true, + y_pred=y_pred_null, + sample_weight=sample_weight, + normalize=False, + ) + d2_score_true = 1 - log_likelihood / log_likelihood_null + assert d2_score == pytest.approx(d2_score_true) + + # check if good predictions give a relatively higher value for the d2 score + y_pred = np.array( + [ + [0.9, 0.1], + [0.8, 0.2], + [0.9, 0.1], + [0.1, 0.9], + 
[0.2, 0.8], + [0.1, 0.9], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) + + # check if poor predictions gives a relatively low value for the d2 score + y_pred = np.array( + [ + [0.5, 0.5], + [0.1, 0.9], + [0.1, 0.9], + [0.9, 0.1], + [0.75, 0.25], + [0.1, 0.9], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + # check that a similar value is obtained for string labels + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == pytest.approx(d2_score) + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 + y_true = [0, 0, 0, 1, 1, 1] + y_pred = np.array( + [ + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + [0.5, 0.5], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 + + # check if simply using the average of the classes as the predictions + # gives a d2 score of 0 when the positive class has a higher proportion + y_true = [0, 1, 1, 1] + y_true_string = ["no", "yes", "yes", "yes"] + y_pred = np.array([[0.25, 0.75], [0.25, 0.75], [0.25, 0.75], [0.25, 0.75]]) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score == 0 + d2_score_string = d2_log_loss_score(y_true_string, y_pred) + assert d2_score_string == 0 + sample_weight = [2, 2, 2, 2] + d2_score_with_sample_weight = d2_log_loss_score( + y_true, y_pred, sample_weight=sample_weight + ) + assert d2_score_with_sample_weight == 0 + + # check that the d2 scores seem correct when more than 2 + # labels are specified + y_true = ["high", "high", "low", "neutral"] + sample_weight = [1.4, 0.6, 0.8, 0.2] + + y_pred = np.array( + [ + [0.8, 0.1, 0.1], + [0.8, 0.1, 0.1], + [0.1, 0.8, 0.1], + [0.1, 0.1, 0.8], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert 0.5 < d2_score < 1.0 + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert 0.5 < d2_score < 1.0 + + y_pred = np.array( + [ + [0.2, 0.5, 0.3], + [0.1, 0.7, 0.2], + [0.1, 0.1, 0.8], + [0.2, 0.7, 0.1], + ] + ) + d2_score = d2_log_loss_score(y_true, y_pred) + assert d2_score < 0 + d2_score = d2_log_loss_score(y_true, y_pred, sample_weight=sample_weight) + assert d2_score < 0 + + +def test_d2_log_loss_score_missing_labels(): + """Check that d2_log_loss_score works when not all labels are present in y_true + + non-regression test for https://github.com/scikit-learn/scikit-learn/issues/30713 + """ + y_true = [2, 0, 2, 0] + labels = [0, 1, 2] + sample_weight = [1.4, 0.6, 0.7, 0.3] + y_pred = np.tile([1, 0, 0], (4, 1)) + + log_loss_obs = log_loss(y_true, y_pred, sample_weight=sample_weight, labels=labels) + + # Null model consists of weighted average of the classes. 
+ # Given that the sum of the weights is 3, + # - weighted average of 0s is (0.6 + 0.3) / 3 = 0.3 + # - weighted average of 1s is 0 + # - weighted average of 2s is (1.4 + 0.7) / 3 = 0.7 + y_pred_null = np.tile([0.3, 0, 0.7], (4, 1)) + log_loss_null = log_loss( + y_true, y_pred_null, sample_weight=sample_weight, labels=labels + ) + + expected_d2_score = 1 - log_loss_obs / log_loss_null + d2_score = d2_log_loss_score( + y_true, y_pred, sample_weight=sample_weight, labels=labels + ) + assert_allclose(d2_score, expected_d2_score) + + +def test_d2_log_loss_score_label_order(): + """Check that d2_log_loss_score doesn't depend on the order of the labels.""" + y_true = [2, 0, 2, 0] + y_pred = np.tile([1, 0, 0], (4, 1)) + + d2_score = d2_log_loss_score(y_true, y_pred, labels=[0, 1, 2]) + d2_score_other = d2_log_loss_score(y_true, y_pred, labels=[0, 2, 1]) + + assert_allclose(d2_score, d2_score_other) + + +def test_d2_log_loss_score_raises(): + """Test that d2_log_loss_score raises the appropriate errors on + invalid inputs.""" + y_true = [0, 1, 2] + y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]] + err = "contain different number of classes" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error if the number of classes in labels do not match the number + # of classes in y_pred. + y_true = [0, 1, 2] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + labels = [0, 1, 2] + err = "number of classes in labels is different" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) + + # check error if y_true and y_pred do not have equal lengths + y_true = [0, 1, 2] + y_pred = [[0.5, 0.5, 0.5], [0.6, 0.3, 0.1]] + err = "inconsistent numbers of samples" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check warning for samples < 2 + y_true = [1] + y_pred = [[0.5, 0.5]] + err = "score is not well-defined" + with pytest.warns(UndefinedMetricWarning, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label + y_true = [1, 1, 1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + err = "y_true contains only one label" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred) + + # check error when y_true only has 1 label and labels also has + # only 1 label + y_true = [1, 1, 1] + labels = [1] + y_pred = [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] + err = "The labels array needs to contain at least two" + with pytest.raises(ValueError, match=err): + d2_log_loss_score(y_true, y_pred, labels=labels) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..39522876e8f24589174fa6ce2f6890ad552e5899 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_common.py @@ -0,0 +1,2348 @@ +import math +from functools import partial +from inspect import signature +from itertools import chain, permutations, product + +import numpy as np +import pytest + +from sklearn._config import config_context +from sklearn.datasets import make_multilabel_classification +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + cohen_kappa_score, + confusion_matrix, + coverage_error, + d2_absolute_error_score, + d2_pinball_score, + d2_tweedie_score, + dcg_score, 
+ det_curve, + explained_variance_score, + f1_score, + fbeta_score, + hamming_loss, + hinge_loss, + jaccard_score, + label_ranking_average_precision_score, + label_ranking_loss, + log_loss, + matthews_corrcoef, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_gamma_deviance, + mean_pinball_loss, + mean_poisson_deviance, + mean_squared_error, + mean_squared_log_error, + mean_tweedie_deviance, + median_absolute_error, + multilabel_confusion_matrix, + ndcg_score, + precision_recall_curve, + precision_score, + r2_score, + recall_score, + roc_auc_score, + roc_curve, + root_mean_squared_error, + root_mean_squared_log_error, + top_k_accuracy_score, + zero_one_loss, +) +from sklearn.metrics._base import _average_binary_score +from sklearn.metrics.pairwise import ( + additive_chi2_kernel, + chi2_kernel, + cosine_distances, + cosine_similarity, + euclidean_distances, + linear_kernel, + paired_cosine_distances, + paired_euclidean_distances, + polynomial_kernel, + rbf_kernel, + sigmoid_kernel, +) +from sklearn.preprocessing import LabelBinarizer +from sklearn.utils import shuffle +from sklearn.utils._array_api import ( + _atol_for_type, + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import ( + _array_api_for_tests, + assert_allclose, + assert_almost_equal, + assert_array_equal, + assert_array_less, + ignore_warnings, +) +from sklearn.utils.fixes import COO_CONTAINERS, parse_version, sp_version +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import _num_samples, check_random_state + +# Note toward developers about metric testing +# ------------------------------------------- +# It is often possible to write one general test for several metrics: +# +# - invariance properties, e.g. invariance to sample order +# - common behavior for an argument, e.g. the "normalize" with value True +# will return the mean of the metrics and with value False will return +# the sum of the metrics. +# +# In order to improve the overall metric testing, it is a good idea to write +# first a specific test for the given metric and then add a general test for +# all metrics that have the same behavior. +# +# Two types of datastructures are used in order to implement this system: +# dictionaries of metrics and lists of metrics with common properties. +# +# Dictionaries of metrics +# ------------------------ +# The goal of having those dictionaries is to have an easy way to call a +# particular metric and associate a name to each function: +# +# - REGRESSION_METRICS: all regression metrics. +# - CLASSIFICATION_METRICS: all classification metrics +# which compare a ground truth and the estimated targets as returned by a +# classifier. +# - THRESHOLDED_METRICS: all classification metrics which +# compare a ground truth and a score, e.g. estimated probabilities or +# decision function (format might vary) +# +# Those dictionaries will be used to test systematically some invariance +# properties, e.g. invariance toward several input layout. 
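+# As a rough illustration of the pattern, a shared property test can simply
+# loop over one of these dictionaries, e.g.:
+#
+#     for name, metric in REGRESSION_METRICS.items():
+#         perm = rng.permutation(len(y_true))
+#         assert_allclose(metric(y_true, y_pred),
+#                         metric(y_true[perm], y_pred[perm]))
+#
+# (here rng, y_true and y_pred stand for any random state and regression
+# targets; sample-order invariance holds for all regression metrics.)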
+# + +REGRESSION_METRICS = { + "max_error": max_error, + "mean_absolute_error": mean_absolute_error, + "mean_squared_error": mean_squared_error, + "mean_squared_log_error": mean_squared_log_error, + "mean_pinball_loss": mean_pinball_loss, + "median_absolute_error": median_absolute_error, + "mean_absolute_percentage_error": mean_absolute_percentage_error, + "explained_variance_score": explained_variance_score, + "r2_score": partial(r2_score, multioutput="variance_weighted"), + "root_mean_squared_error": root_mean_squared_error, + "root_mean_squared_log_error": root_mean_squared_log_error, + "mean_normal_deviance": partial(mean_tweedie_deviance, power=0), + "mean_poisson_deviance": mean_poisson_deviance, + "mean_gamma_deviance": mean_gamma_deviance, + "mean_compound_poisson_deviance": partial(mean_tweedie_deviance, power=1.4), + "d2_tweedie_score": partial(d2_tweedie_score, power=1.4), + "d2_pinball_score": d2_pinball_score, + "d2_absolute_error_score": d2_absolute_error_score, +} + +CLASSIFICATION_METRICS = { + "accuracy_score": accuracy_score, + "balanced_accuracy_score": balanced_accuracy_score, + "adjusted_balanced_accuracy_score": partial(balanced_accuracy_score, adjusted=True), + "unnormalized_accuracy_score": partial(accuracy_score, normalize=False), + # `confusion_matrix` returns absolute values and hence behaves unnormalized + # . Naming it with an unnormalized_ prefix is necessary for this module to + # skip sample_weight scaling checks which will fail for unnormalized + # metrics. + "unnormalized_confusion_matrix": confusion_matrix, + "normalized_confusion_matrix": lambda *args, **kwargs: ( + confusion_matrix(*args, **kwargs).astype("float") + / confusion_matrix(*args, **kwargs).sum(axis=1)[:, np.newaxis] + ), + "unnormalized_multilabel_confusion_matrix": multilabel_confusion_matrix, + "unnormalized_multilabel_confusion_matrix_sample": partial( + multilabel_confusion_matrix, samplewise=True + ), + "hamming_loss": hamming_loss, + "zero_one_loss": zero_one_loss, + "unnormalized_zero_one_loss": partial(zero_one_loss, normalize=False), + # These are needed to test averaging + "jaccard_score": jaccard_score, + "precision_score": precision_score, + "recall_score": recall_score, + "f1_score": f1_score, + "f2_score": partial(fbeta_score, beta=2), + "f0.5_score": partial(fbeta_score, beta=0.5), + "matthews_corrcoef_score": matthews_corrcoef, + "weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5), + "weighted_f1_score": partial(f1_score, average="weighted"), + "weighted_f2_score": partial(fbeta_score, average="weighted", beta=2), + "weighted_precision_score": partial(precision_score, average="weighted"), + "weighted_recall_score": partial(recall_score, average="weighted"), + "weighted_jaccard_score": partial(jaccard_score, average="weighted"), + "micro_f0.5_score": partial(fbeta_score, average="micro", beta=0.5), + "micro_f1_score": partial(f1_score, average="micro"), + "micro_f2_score": partial(fbeta_score, average="micro", beta=2), + "micro_precision_score": partial(precision_score, average="micro"), + "micro_recall_score": partial(recall_score, average="micro"), + "micro_jaccard_score": partial(jaccard_score, average="micro"), + "macro_f0.5_score": partial(fbeta_score, average="macro", beta=0.5), + "macro_f1_score": partial(f1_score, average="macro"), + "macro_f2_score": partial(fbeta_score, average="macro", beta=2), + "macro_precision_score": partial(precision_score, average="macro"), + "macro_recall_score": partial(recall_score, average="macro"), + 
"macro_jaccard_score": partial(jaccard_score, average="macro"), + "samples_f0.5_score": partial(fbeta_score, average="samples", beta=0.5), + "samples_f1_score": partial(f1_score, average="samples"), + "samples_f2_score": partial(fbeta_score, average="samples", beta=2), + "samples_precision_score": partial(precision_score, average="samples"), + "samples_recall_score": partial(recall_score, average="samples"), + "samples_jaccard_score": partial(jaccard_score, average="samples"), + "cohen_kappa_score": cohen_kappa_score, +} + + +def precision_recall_curve_padded_thresholds(*args, **kwargs): + """ + The dimensions of precision-recall pairs and the threshold array as + returned by the precision_recall_curve do not match. See + func:`sklearn.metrics.precision_recall_curve` + + This prevents implicit conversion of return value triple to an higher + dimensional np.array of dtype('float64') (it will be of dtype('object) + instead). This again is needed for assert_array_equal to work correctly. + + As a workaround we pad the threshold array with NaN values to match + the dimension of precision and recall arrays respectively. + """ + precision, recall, thresholds = precision_recall_curve(*args, **kwargs) + + pad_threshholds = len(precision) - len(thresholds) + + return np.array( + [ + precision, + recall, + np.pad( + thresholds.astype(np.float64), + pad_width=(0, pad_threshholds), + mode="constant", + constant_values=[np.nan], + ), + ] + ) + + +CURVE_METRICS = { + "roc_curve": roc_curve, + "precision_recall_curve": precision_recall_curve_padded_thresholds, + "det_curve": det_curve, +} + +THRESHOLDED_METRICS = { + "coverage_error": coverage_error, + "label_ranking_loss": label_ranking_loss, + "log_loss": log_loss, + "unnormalized_log_loss": partial(log_loss, normalize=False), + "hinge_loss": hinge_loss, + "brier_score_loss": brier_score_loss, + "roc_auc_score": roc_auc_score, # default: average="macro" + "weighted_roc_auc": partial(roc_auc_score, average="weighted"), + "samples_roc_auc": partial(roc_auc_score, average="samples"), + "micro_roc_auc": partial(roc_auc_score, average="micro"), + "ovr_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovr"), + "weighted_ovr_roc_auc": partial( + roc_auc_score, average="weighted", multi_class="ovr" + ), + "ovo_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovo"), + "weighted_ovo_roc_auc": partial( + roc_auc_score, average="weighted", multi_class="ovo" + ), + "partial_roc_auc": partial(roc_auc_score, max_fpr=0.5), + "average_precision_score": average_precision_score, # default: average="macro" + "weighted_average_precision_score": partial( + average_precision_score, average="weighted" + ), + "samples_average_precision_score": partial( + average_precision_score, average="samples" + ), + "micro_average_precision_score": partial(average_precision_score, average="micro"), + "label_ranking_average_precision_score": label_ranking_average_precision_score, + "ndcg_score": ndcg_score, + "dcg_score": dcg_score, + "top_k_accuracy_score": top_k_accuracy_score, +} + +ALL_METRICS = dict() +ALL_METRICS.update(THRESHOLDED_METRICS) +ALL_METRICS.update(CLASSIFICATION_METRICS) +ALL_METRICS.update(REGRESSION_METRICS) +ALL_METRICS.update(CURVE_METRICS) + +# Lists of metrics with common properties +# --------------------------------------- +# Lists of metrics with common properties are used to test systematically some +# functionalities and invariance, e.g. 
SYMMETRIC_METRICS lists all metrics that +# are symmetric with respect to their input argument y_true and y_pred. +# +# When you add a new metric or functionality, check if a general test +# is already written. + +# Those metrics don't support binary inputs +METRIC_UNDEFINED_BINARY = { + "samples_f0.5_score", + "samples_f1_score", + "samples_f2_score", + "samples_precision_score", + "samples_recall_score", + "samples_jaccard_score", + "coverage_error", + "unnormalized_multilabel_confusion_matrix_sample", + "label_ranking_loss", + "label_ranking_average_precision_score", + "dcg_score", + "ndcg_score", +} + +# Those metrics don't support multiclass inputs +METRIC_UNDEFINED_MULTICLASS = { + "micro_roc_auc", + "samples_roc_auc", + "partial_roc_auc", + "roc_auc_score", + "weighted_roc_auc", + "jaccard_score", + # with default average='binary', multiclass is prohibited + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + # curves + "roc_curve", + "precision_recall_curve", + "det_curve", +} + +# Metric undefined with "binary" or "multiclass" input +METRIC_UNDEFINED_BINARY_MULTICLASS = METRIC_UNDEFINED_BINARY.union( + METRIC_UNDEFINED_MULTICLASS +) + +# Metrics with an "average" argument +METRICS_WITH_AVERAGING = { + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + "jaccard_score", +} + +# Threshold-based metrics with an "average" argument +THRESHOLDED_METRICS_WITH_AVERAGING = { + "roc_auc_score", + "average_precision_score", + "partial_roc_auc", +} + +# Metrics with a "pos_label" argument +METRICS_WITH_POS_LABEL = { + "roc_curve", + "precision_recall_curve", + "det_curve", + "brier_score_loss", + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + "jaccard_score", + "average_precision_score", + "weighted_average_precision_score", + "micro_average_precision_score", + "samples_average_precision_score", +} + +# Metrics with a "labels" argument +# TODO: Handle multi_class metrics that has a labels argument as well as a +# decision function argument. 
e.g hinge_loss +METRICS_WITH_LABELS = { + "unnormalized_confusion_matrix", + "normalized_confusion_matrix", + "roc_curve", + "precision_recall_curve", + "det_curve", + "precision_score", + "recall_score", + "f1_score", + "f2_score", + "f0.5_score", + "jaccard_score", + "weighted_f0.5_score", + "weighted_f1_score", + "weighted_f2_score", + "weighted_precision_score", + "weighted_recall_score", + "weighted_jaccard_score", + "micro_f0.5_score", + "micro_f1_score", + "micro_f2_score", + "micro_precision_score", + "micro_recall_score", + "micro_jaccard_score", + "macro_f0.5_score", + "macro_f1_score", + "macro_f2_score", + "macro_precision_score", + "macro_recall_score", + "macro_jaccard_score", + "unnormalized_multilabel_confusion_matrix", + "unnormalized_multilabel_confusion_matrix_sample", + "cohen_kappa_score", + "log_loss", + "brier_score_loss", +} + +# Metrics with a "normalize" option +METRICS_WITH_NORMALIZE_OPTION = { + "accuracy_score", + "top_k_accuracy_score", + "zero_one_loss", +} + +# Threshold-based metrics with "multilabel-indicator" format support +THRESHOLDED_MULTILABEL_METRICS = { + "log_loss", + "unnormalized_log_loss", + "brier_score_loss", + "roc_auc_score", + "weighted_roc_auc", + "samples_roc_auc", + "micro_roc_auc", + "partial_roc_auc", + "average_precision_score", + "weighted_average_precision_score", + "samples_average_precision_score", + "micro_average_precision_score", + "coverage_error", + "label_ranking_loss", + "ndcg_score", + "dcg_score", + "label_ranking_average_precision_score", +} + +# Classification metrics with "multilabel-indicator" format +MULTILABELS_METRICS = { + "accuracy_score", + "unnormalized_accuracy_score", + "hamming_loss", + "zero_one_loss", + "unnormalized_zero_one_loss", + "weighted_f0.5_score", + "weighted_f1_score", + "weighted_f2_score", + "weighted_precision_score", + "weighted_recall_score", + "weighted_jaccard_score", + "macro_f0.5_score", + "macro_f1_score", + "macro_f2_score", + "macro_precision_score", + "macro_recall_score", + "macro_jaccard_score", + "micro_f0.5_score", + "micro_f1_score", + "micro_f2_score", + "micro_precision_score", + "micro_recall_score", + "micro_jaccard_score", + "unnormalized_multilabel_confusion_matrix", + "samples_f0.5_score", + "samples_f1_score", + "samples_f2_score", + "samples_precision_score", + "samples_recall_score", + "samples_jaccard_score", +} + +# Regression metrics with "multioutput-continuous" format support +MULTIOUTPUT_METRICS = { + "mean_absolute_error", + "median_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "r2_score", + "root_mean_squared_error", + "root_mean_squared_log_error", + "explained_variance_score", + "mean_absolute_percentage_error", + "mean_pinball_loss", + "d2_pinball_score", + "d2_absolute_error_score", +} + +# Symmetric with respect to their input arguments y_true and y_pred +# metric(y_true, y_pred) == metric(y_pred, y_true). 
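+# For example (illustrative, not part of the grouping itself):
+# accuracy_score([0, 1, 1], [1, 1, 0]) == accuracy_score([1, 1, 0], [0, 1, 1])
+# (both equal 1/3), whereas recall_score is asymmetric in general because
+# swapping y_true and y_pred exchanges the roles of precision and recall.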
+SYMMETRIC_METRICS = { + "accuracy_score", + "unnormalized_accuracy_score", + "hamming_loss", + "zero_one_loss", + "unnormalized_zero_one_loss", + "micro_jaccard_score", + "macro_jaccard_score", + "jaccard_score", + "samples_jaccard_score", + "f1_score", + "micro_f1_score", + "macro_f1_score", + "weighted_recall_score", + "mean_squared_log_error", + "root_mean_squared_error", + "root_mean_squared_log_error", + # P = R = F = accuracy in multiclass case + "micro_f0.5_score", + "micro_f1_score", + "micro_f2_score", + "micro_precision_score", + "micro_recall_score", + "matthews_corrcoef_score", + "mean_absolute_error", + "mean_squared_error", + "median_absolute_error", + "max_error", + # Pinball loss is only symmetric for alpha=0.5 which is the default. + "mean_pinball_loss", + "cohen_kappa_score", + "mean_normal_deviance", +} + +# Asymmetric with respect to their input arguments y_true and y_pred +# metric(y_true, y_pred) != metric(y_pred, y_true). +NOT_SYMMETRIC_METRICS = { + "balanced_accuracy_score", + "adjusted_balanced_accuracy_score", + "explained_variance_score", + "r2_score", + "unnormalized_confusion_matrix", + "normalized_confusion_matrix", + "roc_curve", + "precision_recall_curve", + "det_curve", + "precision_score", + "recall_score", + "f2_score", + "f0.5_score", + "weighted_f0.5_score", + "weighted_f1_score", + "weighted_f2_score", + "weighted_precision_score", + "weighted_jaccard_score", + "unnormalized_multilabel_confusion_matrix", + "macro_f0.5_score", + "macro_f2_score", + "macro_precision_score", + "macro_recall_score", + "hinge_loss", + "mean_gamma_deviance", + "mean_poisson_deviance", + "mean_compound_poisson_deviance", + "d2_tweedie_score", + "d2_pinball_score", + "d2_absolute_error_score", + "mean_absolute_percentage_error", +} + + +# No Sample weight support +METRICS_WITHOUT_SAMPLE_WEIGHT = { + "median_absolute_error", + "max_error", + "ovo_roc_auc", + "weighted_ovo_roc_auc", +} + +METRICS_REQUIRE_POSITIVE_Y = { + "mean_poisson_deviance", + "mean_gamma_deviance", + "mean_compound_poisson_deviance", + "d2_tweedie_score", +} + +# Metrics involving y = log(1+x) +METRICS_WITH_LOG1P_Y = { + "mean_squared_log_error", + "root_mean_squared_log_error", +} + + +def _require_positive_targets(y1, y2): + """Make targets strictly positive""" + offset = abs(min(y1.min(), y2.min())) + 1 + y1 += offset + y2 += offset + return y1, y2 + + +def _require_log1p_targets(y1, y2): + """Make targets strictly larger than -1""" + offset = abs(min(y1.min(), y2.min())) - 0.99 + y1 = y1.astype(np.float64) + y2 = y2.astype(np.float64) + y1 += offset + y2 += offset + return y1, y2 + + +def test_symmetry_consistency(): + # We shouldn't forget any metrics + assert ( + SYMMETRIC_METRICS + | NOT_SYMMETRIC_METRICS + | set(THRESHOLDED_METRICS) + | METRIC_UNDEFINED_BINARY_MULTICLASS + ) == set(ALL_METRICS) + + assert (SYMMETRIC_METRICS & NOT_SYMMETRIC_METRICS) == set() + + +@pytest.mark.parametrize("name", sorted(SYMMETRIC_METRICS)) +def test_symmetric_metric(name): + # Test the symmetry of score and loss functions + random_state = check_random_state(0) + y_true = random_state.randint(0, 2, size=(20,)) + y_pred = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + + elif name in METRICS_WITH_LOG1P_Y: + y_true, y_pred = _require_log1p_targets(y_true, y_pred) + + y_true_bin = random_state.randint(0, 2, size=(20, 25)) + y_pred_bin = random_state.randint(0, 2, size=(20, 25)) + + metric = ALL_METRICS[name] + if name in 
METRIC_UNDEFINED_BINARY: + if name in MULTILABELS_METRICS: + assert_allclose( + metric(y_true_bin, y_pred_bin), + metric(y_pred_bin, y_true_bin), + err_msg="%s is not symmetric" % name, + ) + else: + assert False, "This case is currently unhandled" + else: + assert_allclose( + metric(y_true, y_pred), + metric(y_pred, y_true), + err_msg="%s is not symmetric" % name, + ) + + +@pytest.mark.parametrize("name", sorted(NOT_SYMMETRIC_METRICS)) +def test_not_symmetric_metric(name): + # Test the symmetry of score and loss functions + random_state = check_random_state(0) + metric = ALL_METRICS[name] + + # The metric can be accidentally symmetric on a random draw. + # We run several random draws to check that at least of them + # gives an asymmetric result. + always_symmetric = True + for _ in range(5): + y_true = random_state.randint(0, 2, size=(20,)) + y_pred = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + + nominal = metric(y_true, y_pred) + swapped = metric(y_pred, y_true) + if not np.allclose(nominal, swapped): + always_symmetric = False + break + + if always_symmetric: + raise ValueError(f"{name} seems to be symmetric") + + +def test_symmetry_tests(): + # check test_symmetric_metric and test_not_symmetric_metric + sym = "accuracy_score" + not_sym = "recall_score" + # test_symmetric_metric passes on a symmetric metric + # but fails on a not symmetric metric + test_symmetric_metric(sym) + with pytest.raises(AssertionError, match=f"{not_sym} is not symmetric"): + test_symmetric_metric(not_sym) + # test_not_symmetric_metric passes on a not symmetric metric + # but fails on a symmetric metric + test_not_symmetric_metric(not_sym) + with pytest.raises(ValueError, match=f"{sym} seems to be symmetric"): + test_not_symmetric_metric(sym) + + +@pytest.mark.parametrize( + "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_sample_order_invariance(name): + random_state = check_random_state(0) + y_true = random_state.randint(0, 2, size=(20,)) + y_pred = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + elif name in METRICS_WITH_LOG1P_Y: + y_true, y_pred = _require_log1p_targets(y_true, y_pred) + + y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0) + + with ignore_warnings(): + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + +def test_sample_order_invariance_multilabel_and_multioutput(): + random_state = check_random_state(0) + + # Generate some data + y_true = random_state.randint(0, 2, size=(20, 25)) + y_pred = random_state.randint(0, 2, size=(20, 25)) + y_score = random_state.uniform(size=y_true.shape) + + # Some metrics (e.g. 
log_loss) require y_score to be probabilities (sum to 1) + y_score /= y_score.sum(axis=1, keepdims=True) + + y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle( + y_true, y_pred, y_score, random_state=0 + ) + + for name in MULTILABELS_METRICS: + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + for name in THRESHOLDED_MULTILABEL_METRICS: + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_score), + metric(y_true_shuffle, y_score_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + for name in MULTIOUTPUT_METRICS: + metric = ALL_METRICS[name] + assert_allclose( + metric(y_true, y_score), + metric(y_true_shuffle, y_score_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + assert_allclose( + metric(y_true, y_pred), + metric(y_true_shuffle, y_pred_shuffle), + err_msg="%s is not sample order invariant" % name, + ) + + +@pytest.mark.parametrize( + "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_format_invariance_with_1d_vectors(name): + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20,)) + y2 = random_state.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y1, y2 = _require_positive_targets(y1, y2) + elif name in METRICS_WITH_LOG1P_Y: + y1, y2 = _require_log1p_targets(y1, y2) + + y1_list = list(y1) + y2_list = list(y2) + + y1_1d, y2_1d = np.array(y1), np.array(y2) + assert_array_equal(y1_1d.ndim, 1) + assert_array_equal(y2_1d.ndim, 1) + y1_column = np.reshape(y1_1d, (-1, 1)) + y2_column = np.reshape(y2_1d, (-1, 1)) + y1_row = np.reshape(y1_1d, (1, -1)) + y2_row = np.reshape(y2_1d, (1, -1)) + + with ignore_warnings(): + metric = ALL_METRICS[name] + + measure = metric(y1, y2) + + assert_allclose( + metric(y1_list, y2_list), + measure, + err_msg="%s is not representation invariant with list" % name, + ) + + assert_allclose( + metric(y1_1d, y2_1d), + measure, + err_msg="%s is not representation invariant with np-array-1d" % name, + ) + + assert_allclose( + metric(y1_column, y2_column), + measure, + err_msg="%s is not representation invariant with np-array-column" % name, + ) + + # Mix format support + assert_allclose( + metric(y1_1d, y2_list), + measure, + err_msg="%s is not representation invariant with mix np-array-1d and list" + % name, + ) + + assert_allclose( + metric(y1_list, y2_1d), + measure, + err_msg="%s is not representation invariant with mix np-array-1d and list" + % name, + ) + + assert_allclose( + metric(y1_1d, y2_column), + measure, + err_msg=( + "%s is not representation invariant with mix " + "np-array-1d and np-array-column" + ) + % name, + ) + + assert_allclose( + metric(y1_column, y2_1d), + measure, + err_msg=( + "%s is not representation invariant with mix " + "np-array-1d and np-array-column" + ) + % name, + ) + + assert_allclose( + metric(y1_list, y2_column), + measure, + err_msg=( + "%s is not representation invariant with mix list and np-array-column" + ) + % name, + ) + + assert_allclose( + metric(y1_column, y2_list), + measure, + err_msg=( + "%s is not representation invariant with mix list and np-array-column" + ) + % name, + ) + + # These mix representations aren't allowed + with pytest.raises(ValueError): + metric(y1_1d, y2_row) + with pytest.raises(ValueError): + metric(y1_row, y2_1d) + with pytest.raises(ValueError): + metric(y1_list, y2_row) + with pytest.raises(ValueError): + metric(y1_row, y2_list) + 
with pytest.raises(ValueError): + metric(y1_column, y2_row) + with pytest.raises(ValueError): + metric(y1_row, y2_column) + + # NB: We do not test for y1_row, y2_row as these may be + # interpreted as multilabel or multioutput data. + if name not in ( + MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTILABELS_METRICS + ): + if "roc_auc" in name: + # for consistency between the `roc_cuve` and `roc_auc_score` + # np.nan is returned and an `UndefinedMetricWarning` is raised + with pytest.warns(UndefinedMetricWarning): + assert math.isnan(metric(y1_row, y2_row)) + else: + with pytest.raises(ValueError): + metric(y1_row, y2_row) + + +@pytest.mark.parametrize( + "name", sorted(set(CLASSIFICATION_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_classification_invariance_string_vs_numbers_labels(name): + # Ensure that classification metrics with string labels are invariant + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20,)) + y2 = random_state.randint(0, 2, size=(20,)) + + y1_str = np.array(["eggs", "spam"])[y1] + y2_str = np.array(["eggs", "spam"])[y2] + + pos_label_str = "spam" + labels_str = ["eggs", "spam"] + + with ignore_warnings(): + metric = CLASSIFICATION_METRICS[name] + measure_with_number = metric(y1, y2) + + # Ugly, but handle case with a pos_label and label + metric_str = metric + if name in METRICS_WITH_POS_LABEL: + metric_str = partial(metric_str, pos_label=pos_label_str) + + measure_with_str = metric_str(y1_str, y2_str) + + assert_array_equal( + measure_with_number, + measure_with_str, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O")) + assert_array_equal( + measure_with_number, + measure_with_strobj, + err_msg="{0} failed string object vs number invariance test".format(name), + ) + + if name in METRICS_WITH_LABELS: + metric_str = partial(metric_str, labels=labels_str) + measure_with_str = metric_str(y1_str, y2_str) + assert_array_equal( + measure_with_number, + measure_with_str, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O")) + assert_array_equal( + measure_with_number, + measure_with_strobj, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + +@pytest.mark.parametrize("name", THRESHOLDED_METRICS) +def test_thresholded_invariance_string_vs_numbers_labels(name): + # Ensure that thresholded metrics with string labels are invariant + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20,)) + y2 = random_state.randint(0, 2, size=(20,)) + + y1_str = np.array(["eggs", "spam"])[y1] + + pos_label_str = "spam" + + with ignore_warnings(): + metric = THRESHOLDED_METRICS[name] + if name not in METRIC_UNDEFINED_BINARY: + # Ugly, but handle case with a pos_label and label + metric_str = metric + if name in METRICS_WITH_POS_LABEL: + metric_str = partial(metric_str, pos_label=pos_label_str) + + measure_with_number = metric(y1, y2) + measure_with_str = metric_str(y1_str, y2) + assert_array_equal( + measure_with_number, + measure_with_str, + err_msg="{0} failed string vs number invariance test".format(name), + ) + + measure_with_strobj = metric_str(y1_str.astype("O"), y2) + assert_array_equal( + measure_with_number, + measure_with_strobj, + err_msg="{0} failed string object vs number invariance test".format( + name + ), + ) + else: + # TODO those metrics doesn't support string label yet + with 
pytest.raises(ValueError): + metric(y1_str, y2) + with pytest.raises(ValueError): + metric(y1_str.astype("O"), y2) + + +invalids_nan_inf = [ + ([0, 1], [np.inf, np.inf]), + ([0, 1], [np.nan, np.nan]), + ([0, 1], [np.nan, np.inf]), + ([0, 1], [np.inf, 1]), + ([0, 1], [np.nan, 1]), +] + + +@pytest.mark.parametrize( + "metric", chain(THRESHOLDED_METRICS.values(), REGRESSION_METRICS.values()) +) +@pytest.mark.parametrize("y_true, y_score", invalids_nan_inf) +def test_regression_thresholded_inf_nan_input(metric, y_true, y_score): + # Reshape since coverage_error only accepts 2D arrays. + if metric == coverage_error: + y_true = [y_true] + y_score = [y_score] + with pytest.raises(ValueError, match=r"contains (NaN|infinity)"): + metric(y_true, y_score) + + +@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values()) +@pytest.mark.parametrize( + "y_true, y_score", + invalids_nan_inf + + + # Add an additional case for classification only + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/6809 + [ + ([np.nan, 1, 2], [1, 2, 3]), + ([np.inf, 1, 2], [1, 2, 3]), + ], +) +def test_classification_inf_nan_input(metric, y_true, y_score): + """check that classification metrics raise a message mentioning the + occurrence of non-finite values in the target vectors.""" + if not np.isfinite(y_true).all(): + input_name = "y_true" + if np.isnan(y_true).any(): + unexpected_value = "NaN" + else: + unexpected_value = "infinity or a value too large" + else: + input_name = "y_pred" + if np.isnan(y_score).any(): + unexpected_value = "NaN" + else: + unexpected_value = "infinity or a value too large" + + err_msg = f"Input {input_name} contains {unexpected_value}" + + with pytest.raises(ValueError, match=err_msg): + metric(y_true, y_score) + + +@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values()) +def test_classification_binary_continuous_input(metric): + """check that classification metrics raise a message of mixed type data + with continuous/binary target vectors.""" + y_true, y_score = ["a", "b", "a"], [0.1, 0.2, 0.3] + err_msg = ( + "Classification metrics can't handle a mix of binary and continuous targets" + ) + with pytest.raises(ValueError, match=err_msg): + metric(y_true, y_score) + + +def check_single_sample(name): + # Non-regression test: scores should work with a single sample. + # This is important for leave-one-out cross validation. + # Score functions tested are those that formerly called np.squeeze, + # which turns an array of size 1 into a 0-d array (!). 
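+ # For instance, np.squeeze(np.array([1.0])) returns a 0-d array of shape (),
+ # not a length-one 1-d array, which broke downstream shape assumptions.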
+ metric = ALL_METRICS[name] + + # assert that no exception is thrown + if name in METRICS_REQUIRE_POSITIVE_Y: + values = [1, 2] + elif name in METRICS_WITH_LOG1P_Y: + values = [-0.7, 1] + else: + values = [0, 1] + for i, j in product(values, repeat=2): + metric([i], [j]) + + +def check_single_sample_multioutput(name): + metric = ALL_METRICS[name] + for i, j, k, l in product([0, 1], repeat=4): + metric(np.array([[i, j]]), np.array([[k, l]])) + + +# filter many metric specific warnings +@pytest.mark.filterwarnings("ignore") +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS) + # Those metrics are not always defined with one sample + # or in multiclass classification + - METRIC_UNDEFINED_BINARY_MULTICLASS + - set(THRESHOLDED_METRICS) + ), +) +def test_single_sample(name): + check_single_sample(name) + + +# filter many metric specific warnings +@pytest.mark.filterwarnings("ignore") +@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS | MULTILABELS_METRICS)) +def test_single_sample_multioutput(name): + check_single_sample_multioutput(name) + + +@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS)) +def test_multioutput_number_of_output_differ(name): + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0], [1, 0], [0, 0]]) + + metric = ALL_METRICS[name] + with pytest.raises(ValueError): + metric(y_true, y_pred) + + +@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS)) +def test_multioutput_regression_invariance_to_dimension_shuffling(name): + # test invariance to dimension shuffling + random_state = check_random_state(0) + y_true = random_state.uniform(0, 2, size=(20, 5)) + y_pred = random_state.uniform(0, 2, size=(20, 5)) + + metric = ALL_METRICS[name] + error = metric(y_true, y_pred) + + for _ in range(3): + perm = random_state.permutation(y_true.shape[1]) + assert_allclose( + metric(y_true[:, perm], y_pred[:, perm]), + error, + err_msg="%s is not dimension shuffling invariant" % (name), + ) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.UndefinedMetricWarning") +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_multilabel_representation_invariance(coo_container): + # Generate some data + n_classes = 4 + n_samples = 50 + + _, y1 = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=0, + n_samples=n_samples, + allow_unlabeled=True, + ) + _, y2 = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=1, + n_samples=n_samples, + allow_unlabeled=True, + ) + + # To make sure at least one empty label is present + y1 = np.vstack([y1, [[0] * n_classes]]) + y2 = np.vstack([y2, [[0] * n_classes]]) + + y1_sparse_indicator = coo_container(y1) + y2_sparse_indicator = coo_container(y2) + + y1_list_array_indicator = list(y1) + y2_list_array_indicator = list(y2) + + y1_list_list_indicator = [list(a) for a in y1_list_array_indicator] + y2_list_list_indicator = [list(a) for a in y2_list_array_indicator] + + for name in MULTILABELS_METRICS: + metric = ALL_METRICS[name] + + # XXX cruel hack to work with partial functions + if isinstance(metric, partial): + metric.__module__ = "tmp" + metric.__name__ = name + + measure = metric(y1, y2) + + # Check representation invariance + assert_allclose( + metric(y1_sparse_indicator, y2_sparse_indicator), + measure, + err_msg=( + "%s failed representation invariance between " + "dense and sparse indicator formats." 
+ ) + % name, + ) + assert_almost_equal( + metric(y1_list_list_indicator, y2_list_list_indicator), + measure, + err_msg=( + "%s failed representation invariance " + "between dense array and list of list " + "indicator formats." + ) + % name, + ) + assert_almost_equal( + metric(y1_list_array_indicator, y2_list_array_indicator), + measure, + err_msg=( + "%s failed representation invariance " + "between dense and list of array " + "indicator formats." + ) + % name, + ) + + +@pytest.mark.parametrize("name", sorted(MULTILABELS_METRICS)) +def test_raise_value_error_multilabel_sequences(name): + # make sure the multilabel-sequence format raises ValueError + multilabel_sequences = [ + [[1], [2], [0, 1]], + [(), (2), (0, 1)], + [[]], + [()], + np.array([[], [1, 2]], dtype="object"), + ] + + metric = ALL_METRICS[name] + for seq in multilabel_sequences: + with pytest.raises(ValueError): + metric(seq, seq) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION)) +def test_normalize_option_binary_classification(name): + # Test in the binary case + n_classes = 2 + n_samples = 20 + random_state = check_random_state(0) + + y_true = random_state.randint(0, n_classes, size=(n_samples,)) + y_pred = random_state.randint(0, n_classes, size=(n_samples,)) + y_score = random_state.normal(size=y_true.shape) + + metrics = ALL_METRICS[name] + pred = y_score if name in THRESHOLDED_METRICS else y_pred + measure_normalized = metrics(y_true, pred, normalize=True) + measure_not_normalized = metrics(y_true, pred, normalize=False) + + assert_array_less( + -1.0 * measure_normalized, + 0, + err_msg="We failed to test correctly the normalize option", + ) + + assert_allclose( + measure_normalized, + measure_not_normalized / n_samples, + err_msg=f"Failed with {name}", + ) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION)) +def test_normalize_option_multiclass_classification(name): + # Test in the multiclass case + n_classes = 4 + n_samples = 20 + random_state = check_random_state(0) + + y_true = random_state.randint(0, n_classes, size=(n_samples,)) + y_pred = random_state.randint(0, n_classes, size=(n_samples,)) + y_score = random_state.uniform(size=(n_samples, n_classes)) + + metrics = ALL_METRICS[name] + pred = y_score if name in THRESHOLDED_METRICS else y_pred + measure_normalized = metrics(y_true, pred, normalize=True) + measure_not_normalized = metrics(y_true, pred, normalize=False) + + assert_array_less( + -1.0 * measure_normalized, + 0, + err_msg="We failed to test correctly the normalize option", + ) + + assert_allclose( + measure_normalized, + measure_not_normalized / n_samples, + err_msg=f"Failed with {name}", + ) + + +@pytest.mark.parametrize( + "name", sorted(METRICS_WITH_NORMALIZE_OPTION.intersection(MULTILABELS_METRICS)) +) +def test_normalize_option_multilabel_classification(name): + # Test in the multilabel case + n_classes = 4 + n_samples = 100 + random_state = check_random_state(0) + + # for both random_state 0 and 1, y_true and y_pred has at least one + # unlabelled entry + _, y_true = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=0, + allow_unlabeled=True, + n_samples=n_samples, + ) + _, y_pred = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=1, + allow_unlabeled=True, + n_samples=n_samples, + ) + + y_score = random_state.uniform(size=y_true.shape) + + # To make sure at least one empty label is present + y_true += [0] * n_classes + y_pred += [0] * n_classes + + metrics = 
ALL_METRICS[name] + pred = y_score if name in THRESHOLDED_METRICS else y_pred + measure_normalized = metrics(y_true, pred, normalize=True) + measure_not_normalized = metrics(y_true, pred, normalize=False) + + assert_array_less( + -1.0 * measure_normalized, + 0, + err_msg="We failed to test correctly the normalize option", + ) + + assert_allclose( + measure_normalized, + measure_not_normalized / n_samples, + err_msg=f"Failed with {name}", + ) + + +def _check_averaging( + metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel +): + n_samples, n_classes = y_true_binarize.shape + + # No averaging + label_measure = metric(y_true, y_pred, average=None) + assert_allclose( + label_measure, + [ + metric(y_true_binarize[:, i], y_pred_binarize[:, i]) + for i in range(n_classes) + ], + ) + + # Micro measure + micro_measure = metric(y_true, y_pred, average="micro") + assert_allclose( + micro_measure, metric(y_true_binarize.ravel(), y_pred_binarize.ravel()) + ) + + # Macro measure + macro_measure = metric(y_true, y_pred, average="macro") + assert_allclose(macro_measure, np.mean(label_measure)) + + # Weighted measure + weights = np.sum(y_true_binarize, axis=0, dtype=int) + + if np.sum(weights) != 0: + weighted_measure = metric(y_true, y_pred, average="weighted") + assert_allclose(weighted_measure, np.average(label_measure, weights=weights)) + else: + weighted_measure = metric(y_true, y_pred, average="weighted") + assert_allclose(weighted_measure, 0) + + # Sample measure + if is_multilabel: + sample_measure = metric(y_true, y_pred, average="samples") + assert_allclose( + sample_measure, + np.mean( + [ + metric(y_true_binarize[i], y_pred_binarize[i]) + for i in range(n_samples) + ] + ), + ) + + with pytest.raises(ValueError): + metric(y_true, y_pred, average="unknown") + with pytest.raises(ValueError): + metric(y_true, y_pred, average="garbage") + + +def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score): + is_multilabel = type_of_target(y_true).startswith("multilabel") + + metric = ALL_METRICS[name] + + if name in METRICS_WITH_AVERAGING: + _check_averaging( + metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel + ) + elif name in THRESHOLDED_METRICS_WITH_AVERAGING: + _check_averaging( + metric, y_true, y_score, y_true_binarize, y_score, is_multilabel + ) + else: + raise ValueError("Metric is not recorded as having an average option") + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) +def test_averaging_multiclass(name): + n_samples, n_classes = 50, 3 + random_state = check_random_state(0) + y_true = random_state.randint(0, n_classes, size=(n_samples,)) + y_pred = random_state.randint(0, n_classes, size=(n_samples,)) + y_score = random_state.uniform(size=(n_samples, n_classes)) + + lb = LabelBinarizer().fit(y_true) + y_true_binarize = lb.transform(y_true) + y_pred_binarize = lb.transform(y_pred) + + check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) + + +@pytest.mark.parametrize( + "name", sorted(METRICS_WITH_AVERAGING | THRESHOLDED_METRICS_WITH_AVERAGING) +) +def test_averaging_multilabel(name): + n_samples, n_classes = 40, 5 + _, y = make_multilabel_classification( + n_features=1, + n_classes=n_classes, + random_state=5, + n_samples=n_samples, + allow_unlabeled=False, + ) + y_true = y[:20] + y_pred = y[20:] + y_score = check_random_state(0).normal(size=(20, n_classes)) + y_true_binarize = y_true + y_pred_binarize = y_pred + + check_averaging(name, y_true, y_true_binarize, y_pred, 
y_pred_binarize, y_score) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) +def test_averaging_multilabel_all_zeroes(name): + y_true = np.zeros((20, 3)) + y_pred = np.zeros((20, 3)) + y_score = np.zeros((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred + + check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) + + +def test_averaging_binary_multilabel_all_zeroes(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred + # Test _average_binary_score for weight.sum() == 0 + binary_metric = lambda y_true, y_score, average="macro": _average_binary_score( + precision_score, y_true, y_score, average + ) + _check_averaging( + binary_metric, + y_true, + y_pred, + y_true_binarize, + y_pred_binarize, + is_multilabel=True, + ) + + +@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) +def test_averaging_multilabel_all_ones(name): + y_true = np.ones((20, 3)) + y_pred = np.ones((20, 3)) + y_score = np.ones((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred + + check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) + + +def check_sample_weight_invariance(name, metric, y1, y2): + rng = np.random.RandomState(0) + sample_weight = rng.randint(1, 10, size=len(y1)) + + # top_k_accuracy_score always lead to a perfect score for k > 1 in the + # binary case + metric = partial(metric, k=1) if name == "top_k_accuracy_score" else metric + + # check that unit weights gives the same score as no weight + unweighted_score = metric(y1, y2, sample_weight=None) + + assert_allclose( + unweighted_score, + metric(y1, y2, sample_weight=np.ones(shape=len(y1))), + err_msg="For %s sample_weight=None is not equivalent to sample_weight=ones" + % name, + ) + + # check that the weighted and unweighted scores are unequal + weighted_score = metric(y1, y2, sample_weight=sample_weight) + + # use context manager to supply custom error message + with pytest.raises(AssertionError): + assert_allclose(unweighted_score, weighted_score) + raise ValueError( + "Unweighted and weighted scores are unexpectedly " + "almost equal (%s) and (%s) " + "for %s" % (unweighted_score, weighted_score, name) + ) + + # check that sample_weight can be a list + weighted_score_list = metric(y1, y2, sample_weight=sample_weight.tolist()) + assert_allclose( + weighted_score, + weighted_score_list, + err_msg=( + "Weighted scores for array and list " + "sample_weight input are not equal (%s != %s) for %s" + ) + % (weighted_score, weighted_score_list, name), + ) + + # check that integer weights is the same as repeated samples + repeat_weighted_score = metric( + np.repeat(y1, sample_weight, axis=0), + np.repeat(y2, sample_weight, axis=0), + sample_weight=None, + ) + assert_allclose( + weighted_score, + repeat_weighted_score, + err_msg="Weighting %s is not equal to repeating samples" % name, + ) + + # check that ignoring a fraction of the samples is equivalent to setting + # the corresponding weights to zero + sample_weight_subset = sample_weight[1::2] + sample_weight_zeroed = np.copy(sample_weight) + sample_weight_zeroed[::2] = 0 + y1_subset = y1[1::2] + y2_subset = y2[1::2] + weighted_score_subset = metric( + y1_subset, y2_subset, sample_weight=sample_weight_subset + ) + weighted_score_zeroed = metric(y1, y2, sample_weight=sample_weight_zeroed) + assert_allclose( + weighted_score_subset, + weighted_score_zeroed, + err_msg=( + "Zeroing weights does not give the same result as " + "removing the 
corresponding samples (%s != %s) for %s" + ) + % (weighted_score_zeroed, weighted_score_subset, name), + ) + + if not name.startswith("unnormalized"): + # check that the score is invariant under scaling of the weights by a + # common factor + for scaling in [2, 0.3]: + assert_allclose( + weighted_score, + metric(y1, y2, sample_weight=sample_weight * scaling), + err_msg="%s sample_weight is not invariant under scaling" % name, + ) + + # Check that if number of samples in y_true and sample_weight are not + # equal, meaningful error is raised. + error_message = ( + r"Found input variables with inconsistent numbers of " + r"samples: \[{}, {}, {}\]".format( + _num_samples(y1), _num_samples(y2), _num_samples(sample_weight) * 2 + ) + ) + with pytest.raises(ValueError, match=error_message): + metric(y1, y2, sample_weight=np.hstack([sample_weight, sample_weight])) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS).intersection(set(REGRESSION_METRICS)) + - METRICS_WITHOUT_SAMPLE_WEIGHT + ), +) +def test_regression_sample_weight_invariance(name): + n_samples = 50 + random_state = check_random_state(0) + # regression + y_true = random_state.random_sample(size=(n_samples,)) + y_pred = random_state.random_sample(size=(n_samples,)) + metric = ALL_METRICS[name] + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS).intersection(set(REGRESSION_METRICS)) + - METRICS_WITHOUT_SAMPLE_WEIGHT + ), +) +def test_regression_with_invalid_sample_weight(name): + # Check that `sample_weight` with incorrect length raises error + n_samples = 50 + random_state = check_random_state(0) + y_true = random_state.random_sample(size=(n_samples,)) + y_pred = random_state.random_sample(size=(n_samples,)) + metric = ALL_METRICS[name] + + sample_weight = random_state.random_sample(size=(n_samples - 1,)) + with pytest.raises(ValueError, match="Found input variables with inconsistent"): + metric(y_true, y_pred, sample_weight=sample_weight) + + sample_weight = random_state.random_sample(size=(n_samples * 2,)).reshape( + (n_samples, 2) + ) + with pytest.raises(ValueError, match="Sample weights must be 1D array or scalar"): + metric(y_true, y_pred, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS) + - set(REGRESSION_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT + - METRIC_UNDEFINED_BINARY + ), +) +def test_binary_sample_weight_invariance(name): + # binary + n_samples = 50 + random_state = check_random_state(0) + y_true = random_state.randint(0, 2, size=(n_samples,)) + y_pred = random_state.randint(0, 2, size=(n_samples,)) + y_score = random_state.random_sample(size=(n_samples,)) + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted( + set(ALL_METRICS) + - set(REGRESSION_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT + - METRIC_UNDEFINED_BINARY_MULTICLASS + ), +) +def test_multiclass_sample_weight_invariance(name): + # multiclass + n_samples = 50 + random_state = check_random_state(0) + y_true = random_state.randint(0, 5, size=(n_samples,)) + y_pred = random_state.randint(0, 5, size=(n_samples,)) + y_score = random_state.random_sample(size=(n_samples, 5)) + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + # softmax + temp = np.exp(-y_score) + y_score_norm = temp / temp.sum(axis=-1).reshape(-1, 1) + 
check_sample_weight_invariance(name, metric, y_true, y_score_norm) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted( + (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT + ), +) +def test_multilabel_sample_weight_invariance(name): + # multilabel indicator + random_state = check_random_state(0) + _, ya = make_multilabel_classification( + n_features=1, n_classes=10, random_state=0, n_samples=50, allow_unlabeled=False + ) + _, yb = make_multilabel_classification( + n_features=1, n_classes=10, random_state=1, n_samples=50, allow_unlabeled=False + ) + y_true = np.vstack([ya, yb]) + y_pred = np.vstack([ya, ya]) + y_score = random_state.uniform(size=y_true.shape) + + # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1) + y_score /= y_score.sum(axis=1, keepdims=True) + + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +@pytest.mark.parametrize( + "name", + sorted(MULTIOUTPUT_METRICS - METRICS_WITHOUT_SAMPLE_WEIGHT), +) +def test_multioutput_sample_weight_invariance(name): + random_state = check_random_state(0) + y_true = random_state.uniform(0, 2, size=(20, 5)) + y_pred = random_state.uniform(0, 2, size=(20, 5)) + + metric = ALL_METRICS[name] + check_sample_weight_invariance(name, metric, y_true, y_pred) + + +def test_no_averaging_labels(): + # test labels argument when not using averaging + # in multi-class and multi-label cases + y_true_multilabel = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) + y_pred_multilabel = np.array([[0, 0, 1, 1], [0, 1, 1, 0]]) + y_true_multiclass = np.array([0, 1, 2]) + y_pred_multiclass = np.array([0, 2, 3]) + labels = np.array([3, 0, 1, 2]) + _, inverse_labels = np.unique(labels, return_inverse=True) + + for name in METRICS_WITH_AVERAGING: + for y_true, y_pred in [ + [y_true_multiclass, y_pred_multiclass], + [y_true_multilabel, y_pred_multilabel], + ]: + if name not in MULTILABELS_METRICS and y_pred.ndim > 1: + continue + + metric = ALL_METRICS[name] + + score_labels = metric(y_true, y_pred, labels=labels, average=None) + score = metric(y_true, y_pred, average=None) + assert_array_equal(score_labels, score[inverse_labels]) + + +@pytest.mark.parametrize( + "name", sorted(MULTILABELS_METRICS - {"unnormalized_multilabel_confusion_matrix"}) +) +def test_multilabel_label_permutations_invariance(name): + random_state = check_random_state(0) + n_samples, n_classes = 20, 4 + + y_true = random_state.randint(0, 2, size=(n_samples, n_classes)) + y_score = random_state.randint(0, 2, size=(n_samples, n_classes)) + + metric = ALL_METRICS[name] + score = metric(y_true, y_score) + + for perm in permutations(range(n_classes), n_classes): + y_score_perm = y_score[:, perm] + y_true_perm = y_true[:, perm] + + current_score = metric(y_true_perm, y_score_perm) + assert_almost_equal(score, current_score) + + +@pytest.mark.parametrize( + "name", sorted(THRESHOLDED_MULTILABEL_METRICS | MULTIOUTPUT_METRICS) +) +def test_thresholded_multilabel_multioutput_permutations_invariance(name): + random_state = check_random_state(0) + n_samples, n_classes = 20, 4 + y_true = random_state.randint(0, 2, size=(n_samples, n_classes)) + y_score = random_state.uniform(size=y_true.shape) + + # Some metrics (e.g. 
log_loss) require y_score to be probabilities (sum to 1) + y_score /= y_score.sum(axis=1, keepdims=True) + + # Makes sure all samples have at least one label. This works around errors + # when running metrics where average="sample" + y_true[y_true.sum(1) == 4, 0] = 0 + y_true[y_true.sum(1) == 0, 0] = 1 + + metric = ALL_METRICS[name] + score = metric(y_true, y_score) + + for perm in permutations(range(n_classes), n_classes): + y_score_perm = y_score[:, perm] + y_true_perm = y_true[:, perm] + + current_score = metric(y_true_perm, y_score_perm) + if metric == mean_absolute_percentage_error: + assert np.isfinite(current_score) + assert current_score > 1e6 + # Here we are not comparing the values in case of MAPE because + # whenever y_true value is exactly zero, the MAPE value doesn't + # signify anything. Thus, in this case we are just expecting + # very large finite value. + else: + assert_almost_equal(score, current_score) + + +@pytest.mark.parametrize( + "name", sorted(set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +) +def test_thresholded_metric_permutation_invariance(name): + n_samples, n_classes = 100, 3 + random_state = check_random_state(0) + + y_score = random_state.rand(n_samples, n_classes) + temp = np.exp(-y_score) + y_score = temp / temp.sum(axis=-1).reshape(-1, 1) + y_true = random_state.randint(0, n_classes, size=n_samples) + + metric = ALL_METRICS[name] + score = metric(y_true, y_score) + for perm in permutations(range(n_classes), n_classes): + inverse_perm = np.zeros(n_classes, dtype=int) + inverse_perm[list(perm)] = np.arange(n_classes) + y_score_perm = y_score[:, inverse_perm] + y_true_perm = np.take(perm, y_true) + + current_score = metric(y_true_perm, y_score_perm) + assert_almost_equal(score, current_score) + + +@pytest.mark.parametrize("metric_name", CLASSIFICATION_METRICS) +def test_metrics_consistent_type_error(metric_name): + # check that an understable message is raised when the type between y_true + # and y_pred mismatch + rng = np.random.RandomState(42) + y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=object) + y2 = rng.randint(0, 2, size=y1.size) + + err_msg = "Labels in y_true and y_pred should be of the same type." + with pytest.raises(TypeError, match=err_msg): + CLASSIFICATION_METRICS[metric_name](y1, y2) + + +@pytest.mark.parametrize( + "metric, y_pred_threshold", + [ + (average_precision_score, True), + (brier_score_loss, True), + (f1_score, False), + (partial(fbeta_score, beta=1), False), + (jaccard_score, False), + (precision_recall_curve, True), + (precision_score, False), + (recall_score, False), + (roc_curve, True), + ], +) +@pytest.mark.parametrize("dtype_y_str", [str, object]) +def test_metrics_pos_label_error_str(metric, y_pred_threshold, dtype_y_str): + # check that the error message if `pos_label` is not specified and the + # targets is made of strings. + rng = np.random.RandomState(42) + y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=dtype_y_str) + y2 = rng.randint(0, 2, size=y1.size) + + if not y_pred_threshold: + y2 = np.array(["spam", "eggs"], dtype=dtype_y_str)[y2] + + err_msg_pos_label_None = ( + "y_true takes value in {'eggs', 'spam'} and pos_label is not " + "specified: either make y_true take value in {0, 1} or {-1, 1} or " + "pass pos_label explicit" + ) + err_msg_pos_label_1 = ( + r"pos_label=1 is not a valid label. 
It should be one of \['eggs', 'spam'\]" + ) + + pos_label_default = signature(metric).parameters["pos_label"].default + + err_msg = err_msg_pos_label_1 if pos_label_default == 1 else err_msg_pos_label_None + with pytest.raises(ValueError, match=err_msg): + metric(y1, y2) + + +def check_array_api_metric( + metric, array_namespace, device, dtype_name, a_np, b_np, **metric_kwargs +): + xp = _array_api_for_tests(array_namespace, device) + + a_xp = xp.asarray(a_np, device=device) + b_xp = xp.asarray(b_np, device=device) + + metric_np = metric(a_np, b_np, **metric_kwargs) + + if metric_kwargs.get("sample_weight") is not None: + metric_kwargs["sample_weight"] = xp.asarray( + metric_kwargs["sample_weight"], device=device + ) + + multioutput = metric_kwargs.get("multioutput") + if isinstance(multioutput, np.ndarray): + metric_kwargs["multioutput"] = xp.asarray(multioutput, device=device) + + # When array API dispatch is disabled, and np.asarray works (for example PyTorch + # with CPU device), calling the metric function with such numpy compatible inputs + # should work (albeit by implicitly converting to numpy arrays instead of + # dispatching to the array library). + try: + np.asarray(a_xp) + np.asarray(b_xp) + numpy_as_array_works = True + except (TypeError, RuntimeError, ValueError): + # PyTorch with CUDA device and CuPy raise TypeError consistently. + # array-api-strict chose to raise RuntimeError instead. NumPy raises + # a ValueError if the `__array__` dunder does not return an array. + # Exception type may need to be updated in the future for other libraries. + numpy_as_array_works = False + + if numpy_as_array_works: + metric_xp = metric(a_xp, b_xp, **metric_kwargs) + assert_allclose( + metric_xp, + metric_np, + atol=_atol_for_type(dtype_name), + ) + metric_xp_mixed_1 = metric(a_np, b_xp, **metric_kwargs) + assert_allclose( + metric_xp_mixed_1, + metric_np, + atol=_atol_for_type(dtype_name), + ) + metric_xp_mixed_2 = metric(a_xp, b_np, **metric_kwargs) + assert_allclose( + metric_xp_mixed_2, + metric_np, + atol=_atol_for_type(dtype_name), + ) + + with config_context(array_api_dispatch=True): + metric_xp = metric(a_xp, b_xp, **metric_kwargs) + + assert_allclose( + _convert_to_numpy(xp.asarray(metric_xp), xp), + metric_np, + atol=_atol_for_type(dtype_name), + ) + + +def check_array_api_binary_classification_metric( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([0, 0, 1, 1]) + y_pred_np = np.array([0, 1, 0, 1]) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + ) + + sample_weight = np.array([0.0, 0.1, 2.0, 1.0], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + ) + + +def check_array_api_multiclass_classification_metric( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([0, 1, 2, 3]) + y_pred_np = np.array([0, 1, 0, 2]) + + additional_params = { + "average": ("micro", "macro", "weighted"), + "beta": (0.2, 0.5, 0.8), + } + metric_kwargs_combinations = _get_metric_kwargs_for_array_api_testing( + metric=metric, + params=additional_params, + ) + for metric_kwargs in metric_kwargs_combinations: + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + **metric_kwargs, + ) + + sample_weight = np.array([0.0, 0.1, 2.0, 1.0], dtype=dtype_name) + + 
check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + **metric_kwargs, + ) + + +def check_array_api_multilabel_classification_metric( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([[1, 1], [0, 1], [0, 0]], dtype=dtype_name) + y_pred_np = np.array([[1, 1], [1, 1], [1, 1]], dtype=dtype_name) + + additional_params = { + "average": ("micro", "macro", "weighted"), + "beta": (0.2, 0.5, 0.8), + } + metric_kwargs_combinations = _get_metric_kwargs_for_array_api_testing( + metric=metric, + params=additional_params, + ) + for metric_kwargs in metric_kwargs_combinations: + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + **metric_kwargs, + ) + + sample_weight = np.array([0.0, 0.1, 2.0], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + **metric_kwargs, + ) + + +def check_array_api_regression_metric(metric, array_namespace, device, dtype_name): + func_name = metric.func.__name__ if isinstance(metric, partial) else metric.__name__ + if func_name == "mean_poisson_deviance" and sp_version < parse_version("1.14.0"): + pytest.skip( + "mean_poisson_deviance's dependency `xlogy` is available as of scipy 1.14.0" + ) + + y_true_np = np.array([2.0, 0.1, 1.0, 4.0], dtype=dtype_name) + y_pred_np = np.array([0.5, 0.5, 2, 2], dtype=dtype_name) + + metric_kwargs = {} + metric_params = signature(metric).parameters + + if "sample_weight" in metric_params: + metric_kwargs["sample_weight"] = None + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + **metric_kwargs, + ) + + if "sample_weight" in metric_params: + metric_kwargs["sample_weight"] = np.array( + [0.1, 2.0, 1.5, 0.5], dtype=dtype_name + ) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + **metric_kwargs, + ) + + +def check_array_api_regression_metric_multioutput( + metric, array_namespace, device, dtype_name +): + y_true_np = np.array([[1, 3, 2], [1, 2, 2]], dtype=dtype_name) + y_pred_np = np.array([[1, 4, 4], [1, 1, 1]], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=None, + ) + + sample_weight = np.array([0.1, 2.0], dtype=dtype_name) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + sample_weight=sample_weight, + ) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + multioutput=np.array([0.1, 0.3, 0.7], dtype=dtype_name), + ) + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=y_true_np, + b_np=y_pred_np, + multioutput="raw_values", + ) + + +def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name): + X_np = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=dtype_name) + Y_np = np.array([[0.2, 0.3, 0.4], [0.5, 0.6, 0.7]], dtype=dtype_name) + + metric_kwargs = {} + if "dense_output" in signature(metric).parameters: + metric_kwargs["dense_output"] = False + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=X_np, + b_np=Y_np, + **metric_kwargs, + ) + metric_kwargs["dense_output"] = 
True + + check_array_api_metric( + metric, + array_namespace, + device, + dtype_name, + a_np=X_np, + b_np=Y_np, + **metric_kwargs, + ) + + +array_api_metric_checkers = { + accuracy_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + f1_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + fbeta_score: [ + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + jaccard_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + multilabel_confusion_matrix: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + precision_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + recall_score: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + zero_one_loss: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + hamming_loss: [ + check_array_api_binary_classification_metric, + check_array_api_multiclass_classification_metric, + check_array_api_multilabel_classification_metric, + ], + mean_tweedie_deviance: [check_array_api_regression_metric], + partial(mean_tweedie_deviance, power=-0.5): [check_array_api_regression_metric], + partial(mean_tweedie_deviance, power=1.5): [check_array_api_regression_metric], + r2_score: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + cosine_similarity: [check_array_api_metric_pairwise], + explained_variance_score: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_absolute_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_pinball_loss: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_squared_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + mean_squared_log_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + d2_tweedie_score: [ + check_array_api_regression_metric, + ], + paired_cosine_distances: [check_array_api_metric_pairwise], + mean_poisson_deviance: [check_array_api_regression_metric], + additive_chi2_kernel: [check_array_api_metric_pairwise], + mean_gamma_deviance: [check_array_api_regression_metric], + max_error: [check_array_api_regression_metric], + mean_absolute_percentage_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + chi2_kernel: [check_array_api_metric_pairwise], + paired_euclidean_distances: [check_array_api_metric_pairwise], + cosine_distances: [check_array_api_metric_pairwise], + euclidean_distances: [check_array_api_metric_pairwise], + linear_kernel: [check_array_api_metric_pairwise], + polynomial_kernel: [check_array_api_metric_pairwise], + rbf_kernel: 
[check_array_api_metric_pairwise], + root_mean_squared_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + root_mean_squared_log_error: [ + check_array_api_regression_metric, + check_array_api_regression_metric_multioutput, + ], + sigmoid_kernel: [check_array_api_metric_pairwise], +} + + +def yield_metric_checker_combinations(metric_checkers=array_api_metric_checkers): + for metric, checkers in metric_checkers.items(): + for checker in checkers: + yield metric, checker + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize("metric, check_func", yield_metric_checker_combinations()) +def test_array_api_compliance(metric, array_namespace, device, dtype_name, check_func): + check_func(metric, array_namespace, device, dtype_name) + + +@pytest.mark.parametrize("df_lib_name", ["pandas", "polars"]) +@pytest.mark.parametrize("metric_name", sorted(ALL_METRICS)) +def test_metrics_dataframe_series(metric_name, df_lib_name): + df_lib = pytest.importorskip(df_lib_name) + + y_pred = df_lib.Series([0.0, 1.0, 0, 1.0]) + y_true = df_lib.Series([1.0, 0.0, 0.0, 0.0]) + + metric = ALL_METRICS[metric_name] + try: + expected_metric = metric(y_pred.to_numpy(), y_true.to_numpy()) + except ValueError: + pytest.skip(f"{metric_name} can not deal with 1d inputs") + + assert_allclose(metric(y_pred, y_true), expected_metric) + + +def _get_metric_kwargs_for_array_api_testing(metric, params): + """Helper function to enable specifying a variety of additional params and + their corresponding values, so that they can be passed to a metric function + when testing for array api compliance.""" + metric_kwargs_combinations = [{}] + for param, values in params.items(): + if param not in signature(metric).parameters: + continue + + new_combinations = [] + for kwargs in metric_kwargs_combinations: + for value in values: + new_kwargs = kwargs.copy() + new_kwargs[param] = value + new_combinations.append(new_kwargs) + + metric_kwargs_combinations = new_combinations + + return metric_kwargs_combinations + + +@pytest.mark.parametrize("name", sorted(ALL_METRICS)) +def test_returned_value_consistency(name): + """Ensure that the returned values of all metrics are consistent. + + It can either be a float, a numpy array, or a tuple of floats or numpy arrays. + It should not be a numpy float64 or float32. 
+ """ + + rng = np.random.RandomState(0) + y_true = rng.randint(0, 2, size=(20,)) + y_pred = rng.randint(0, 2, size=(20,)) + + if name in METRICS_REQUIRE_POSITIVE_Y: + y_true, y_pred = _require_positive_targets(y_true, y_pred) + + if name in METRIC_UNDEFINED_BINARY: + y_true = rng.randint(0, 2, size=(20, 3)) + y_pred = rng.randint(0, 2, size=(20, 3)) + + metric = ALL_METRICS[name] + score = metric(y_true, y_pred) + + assert isinstance(score, (float, np.ndarray, tuple)) + assert not isinstance(score, (np.float64, np.float32)) + + if isinstance(score, tuple): + assert all(isinstance(v, float) for v in score) or all( + isinstance(v, np.ndarray) for v in score + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_dist_metrics.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_dist_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..f93d3b984bdb7c218d0517ca9e6c21ec930f96fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_dist_metrics.py @@ -0,0 +1,431 @@ +import copy +import itertools +import pickle + +import numpy as np +import pytest +from scipy.spatial.distance import cdist + +from sklearn.metrics import DistanceMetric +from sklearn.metrics._dist_metrics import ( + BOOL_METRICS, + DEPRECATED_METRICS, + DistanceMetric32, + DistanceMetric64, +) +from sklearn.utils import check_random_state +from sklearn.utils._testing import ( + assert_allclose, + create_memmap_backed_data, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def dist_func(x1, x2, p): + return np.sum((x1 - x2) ** p) ** (1.0 / p) + + +rng = check_random_state(0) +d = 4 +n1 = 20 +n2 = 25 +X64 = rng.random_sample((n1, d)) +Y64 = rng.random_sample((n2, d)) +X32 = X64.astype("float32") +Y32 = Y64.astype("float32") + +[X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64]) + +# make boolean arrays: ones and zeros +X_bool = (X64 < 0.3).astype(np.float64) # quite sparse +Y_bool = (Y64 < 0.7).astype(np.float64) # not too sparse + +[X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool]) + + +V = rng.random_sample((d, d)) +VI = np.dot(V, V.T) + +METRICS_DEFAULT_PARAMS = [ + ("euclidean", {}), + ("cityblock", {}), + ("minkowski", dict(p=(0.5, 1, 1.5, 2, 3))), + ("chebyshev", {}), + ("seuclidean", dict(V=(rng.random_sample(d),))), + ("mahalanobis", dict(VI=(VI,))), + ("hamming", {}), + ("canberra", {}), + ("braycurtis", {}), + ("minkowski", dict(p=(0.5, 1, 1.5, 3), w=(rng.random_sample(d),))), +] + + +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cdist(metric_param_grid, X, Y, csr_container): + metric, param_grid = metric_param_grid + keys = param_grid.keys() + X_csr, Y_csr = csr_container(X), csr_container(Y) + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + rtol_dict = {} + if metric == "mahalanobis" and X.dtype == np.float32: + # Computation of mahalanobis differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. 
+ # TODO: Inspect slight numerical discrepancy + # with scipy + rtol_dict = {"rtol": 1e-6} + + D_scipy_cdist = cdist(X, Y, metric, **kwargs) + + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) + + # DistanceMetric.pairwise must be consistent for all + # combinations of formats in {sparse, dense}. + D_sklearn = dm.pairwise(X, Y) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + D_sklearn = dm.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist, **rtol_dict) + + +@pytest.mark.parametrize("metric", BOOL_METRICS) +@pytest.mark.parametrize( + "X_bool, Y_bool", [(X_bool, Y_bool), (X_bool_mmap, Y_bool_mmap)] +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cdist_bool_metric(metric, X_bool, Y_bool, csr_container): + if metric in DEPRECATED_METRICS: + with ignore_warnings(category=DeprecationWarning): + # Some metrics can be deprecated depending on the scipy version. + # But if they are present, we still want to test whether + # scikit-learn gives the same result, whether or not they are + # deprecated. + D_scipy_cdist = cdist(X_bool, Y_bool, metric) + else: + D_scipy_cdist = cdist(X_bool, Y_bool, metric) + + dm = DistanceMetric.get_metric(metric) + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert_allclose(D_sklearn, D_scipy_cdist) + + # DistanceMetric.pairwise must be consistent + # on all combinations of format in {sparse, dense}². + X_bool_csr, Y_bool_csr = csr_container(X_bool), csr_container(Y_bool) + + D_sklearn = dm.pairwise(X_bool, Y_bool) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool_csr, Y_bool_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool, Y_bool_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + D_sklearn = dm.pairwise(X_bool_csr, Y_bool) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_cdist) + + +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +@pytest.mark.parametrize("X", [X64, X32, X_mmap]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pdist(metric_param_grid, X, csr_container): + metric, param_grid = metric_param_grid + keys = param_grid.keys() + X_csr = csr_container(X) + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + rtol_dict = {} + if metric == "mahalanobis" and X.dtype == np.float32: + # Computation of mahalanobis differs between + # the scipy and scikit-learn implementation. + # Hence, we increase the relative tolerance. 
+ # TODO: Inspect slight numerical discrepancy + # with scipy + rtol_dict = {"rtol": 1e-6} + + D_scipy_pdist = cdist(X, X, metric, **kwargs) + + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) + D_sklearn = dm.pairwise(X) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_scipy_pdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + + D_sklearn_csr = dm.pairwise(X_csr, X_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn_csr, D_scipy_pdist, **rtol_dict) + + +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +def test_distance_metrics_dtype_consistency(metric_param_grid): + # DistanceMetric must return similar distances for both float32 and float64 + # input data. + metric, param_grid = metric_param_grid + keys = param_grid.keys() + + # Choose rtol to make sure that this test is robust to changes in the random + # seed in the module-level test data generation code. + rtol = 1e-5 + + for vals in itertools.product(*param_grid.values()): + kwargs = dict(zip(keys, vals)) + dm64 = DistanceMetric.get_metric(metric, np.float64, **kwargs) + dm32 = DistanceMetric.get_metric(metric, np.float32, **kwargs) + + D64 = dm64.pairwise(X64) + D32 = dm32.pairwise(X32) + + assert D64.dtype == np.float64 + assert D32.dtype == np.float32 + + # assert_allclose introspects the dtype of the input arrays to decide + # which rtol value to use by default but in this case we know that D32 + # is not computed with the same precision so we set rtol manually. + assert_allclose(D64, D32, rtol=rtol) + + D64 = dm64.pairwise(X64, Y64) + D32 = dm32.pairwise(X32, Y32) + assert_allclose(D64, D32, rtol=rtol) + + +@pytest.mark.parametrize("metric", BOOL_METRICS) +@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pdist_bool_metrics(metric, X_bool, csr_container): + if metric in DEPRECATED_METRICS: + with ignore_warnings(category=DeprecationWarning): + # Some metrics can be deprecated depending on the scipy version. + # But if they are present, we still want to test whether + # scikit-learn gives the same result, whether or not they are + # deprecated. 
+ D_scipy_pdist = cdist(X_bool, X_bool, metric) + else: + D_scipy_pdist = cdist(X_bool, X_bool, metric) + + dm = DistanceMetric.get_metric(metric) + D_sklearn = dm.pairwise(X_bool) + assert_allclose(D_sklearn, D_scipy_pdist) + + X_bool_csr = csr_container(X_bool) + D_sklearn = dm.pairwise(X_bool_csr) + assert_allclose(D_sklearn, D_scipy_pdist) + + +@pytest.mark.parametrize("writable_kwargs", [True, False]) +@pytest.mark.parametrize( + "metric_param_grid", METRICS_DEFAULT_PARAMS, ids=lambda params: params[0] +) +@pytest.mark.parametrize("X", [X64, X32]) +def test_pickle(writable_kwargs, metric_param_grid, X): + metric, param_grid = metric_param_grid + keys = param_grid.keys() + for vals in itertools.product(*param_grid.values()): + if any(isinstance(val, np.ndarray) for val in vals): + vals = copy.deepcopy(vals) + for val in vals: + if isinstance(val, np.ndarray): + val.setflags(write=writable_kwargs) + kwargs = dict(zip(keys, vals)) + dm = DistanceMetric.get_metric(metric, X.dtype, **kwargs) + D1 = dm.pairwise(X) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X) + assert_allclose(D1, D2) + + +@pytest.mark.parametrize("metric", BOOL_METRICS) +@pytest.mark.parametrize("X_bool", [X_bool, X_bool_mmap]) +def test_pickle_bool_metrics(metric, X_bool): + dm = DistanceMetric.get_metric(metric) + D1 = dm.pairwise(X_bool) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X_bool) + assert_allclose(D1, D2) + + +@pytest.mark.parametrize("X, Y", [(X64, Y64), (X32, Y32), (X_mmap, Y_mmap)]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_haversine_metric(X, Y, csr_container): + # The Haversine DistanceMetric only works on 2 features. + X = np.asarray(X[:, :2]) + Y = np.asarray(Y[:, :2]) + + X_csr, Y_csr = csr_container(X), csr_container(Y) + + # Haversine is not supported by scipy.special.distance.{cdist,pdist} + # So we reimplement it to have a reference. 
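+ # The reference implemented below is the standard haversine formula for
+ # points given as (latitude, longitude) in radians:
+ #   d = 2 * arcsin(sqrt(sin((lat1 - lat2) / 2) ** 2
+ #                       + cos(lat1) * cos(lat2) * sin((lon1 - lon2) / 2) ** 2))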
+ def haversine_slow(x1, x2): + return 2 * np.arcsin( + np.sqrt( + np.sin(0.5 * (x1[0] - x2[0])) ** 2 + + np.cos(x1[0]) * np.cos(x2[0]) * np.sin(0.5 * (x1[1] - x2[1])) ** 2 + ) + ) + + D_reference = np.zeros((X_csr.shape[0], Y_csr.shape[0])) + for i, xi in enumerate(X): + for j, yj in enumerate(Y): + D_reference[i, j] = haversine_slow(xi, yj) + + haversine = DistanceMetric.get_metric("haversine", X.dtype) + + D_sklearn = haversine.pairwise(X, Y) + assert_allclose( + haversine.dist_to_rdist(D_sklearn), np.sin(0.5 * D_reference) ** 2, rtol=1e-6 + ) + + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X_csr, Y) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_reference) + + D_sklearn = haversine.pairwise(X, Y_csr) + assert D_sklearn.flags.c_contiguous + assert_allclose(D_sklearn, D_reference) + + +def test_pyfunc_metric(): + X = np.random.random((10, 3)) + + euclidean = DistanceMetric.get_metric("euclidean") + pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func, p=2) + + # Check if both callable metric and predefined metric initialized + # DistanceMetric object is picklable + euclidean_pkl = pickle.loads(pickle.dumps(euclidean)) + pyfunc_pkl = pickle.loads(pickle.dumps(pyfunc)) + + D1 = euclidean.pairwise(X) + D2 = pyfunc.pairwise(X) + + D1_pkl = euclidean_pkl.pairwise(X) + D2_pkl = pyfunc_pkl.pairwise(X) + + assert_allclose(D1, D2) + assert_allclose(D1_pkl, D2_pkl) + + +def test_input_data_size(): + # Regression test for #6288 + # Previously, a metric requiring a particular input dimension would fail + def custom_metric(x, y): + assert x.shape[0] == 3 + return np.sum((x - y) ** 2) + + rng = check_random_state(0) + X = rng.rand(10, 3) + + pyfunc = DistanceMetric.get_metric("pyfunc", func=custom_metric) + eucl = DistanceMetric.get_metric("euclidean") + assert_allclose(pyfunc.pairwise(X), eucl.pairwise(X) ** 2) + + +def test_readonly_kwargs(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/21685 + + rng = check_random_state(0) + + weights = rng.rand(100) + VI = rng.rand(10, 10) + weights.setflags(write=False) + VI.setflags(write=False) + + # Those distances metrics have to support readonly buffers. + DistanceMetric.get_metric("seuclidean", V=weights) + DistanceMetric.get_metric("mahalanobis", VI=VI) + + +@pytest.mark.parametrize( + "w, err_type, err_msg", + [ + (np.array([1, 1.5, -13]), ValueError, "w cannot contain negative weights"), + (np.array([1, 1.5, np.nan]), ValueError, "w contains NaN"), + *[ + ( + csr_container([[1, 1.5, 1]]), + TypeError, + "Sparse data was passed for w, but dense data is required", + ) + for csr_container in CSR_CONTAINERS + ], + (np.array(["a", "b", "c"]), ValueError, "could not convert string to float"), + (np.array([]), ValueError, "a minimum of 1 is required"), + ], +) +def test_minkowski_metric_validate_weights_values(w, err_type, err_msg): + with pytest.raises(err_type, match=err_msg): + DistanceMetric.get_metric("minkowski", p=3, w=w) + + +def test_minkowski_metric_validate_weights_size(): + w2 = rng.random_sample(d + 1) + dm = DistanceMetric.get_metric("minkowski", p=3, w=w2) + msg = ( + "MinkowskiDistance: the size of w must match " + f"the number of features \\({X64.shape[1]}\\). " + f"Currently len\\(w\\)={w2.shape[0]}." 
+ ) + with pytest.raises(ValueError, match=msg): + dm.pairwise(X64, Y64) + + +@pytest.mark.parametrize("metric, metric_kwargs", METRICS_DEFAULT_PARAMS) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_get_metric_dtype(metric, metric_kwargs, dtype): + specialized_cls = { + np.float32: DistanceMetric32, + np.float64: DistanceMetric64, + }[dtype] + + # We don't need the entire grid, just one for a sanity check + metric_kwargs = {k: v[0] for k, v in metric_kwargs.items()} + generic_type = type(DistanceMetric.get_metric(metric, dtype, **metric_kwargs)) + specialized_type = type(specialized_cls.get_metric(metric, **metric_kwargs)) + + assert generic_type is specialized_type + + +def test_get_metric_bad_dtype(): + dtype = np.int32 + msg = r"Unexpected dtype .* provided. Please select a dtype from" + with pytest.raises(ValueError, match=msg): + DistanceMetric.get_metric("manhattan", dtype) + + +def test_minkowski_metric_validate_bad_p_parameter(): + msg = "p must be greater than 0" + with pytest.raises(ValueError, match=msg): + DistanceMetric.get_metric("minkowski", p=0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise.py new file mode 100644 index 0000000000000000000000000000000000000000..4c1ba4b2f7d5280235ed2038ac2bd933db4b701d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise.py @@ -0,0 +1,1683 @@ +import warnings +from types import GeneratorType + +import numpy as np +import pytest +from numpy import linalg +from scipy.sparse import issparse +from scipy.spatial.distance import ( + cdist, + cityblock, + cosine, + minkowski, + pdist, + squareform, +) + +from sklearn import config_context +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics.pairwise import ( + PAIRED_DISTANCES, + PAIRWISE_BOOLEAN_FUNCTIONS, + PAIRWISE_DISTANCE_FUNCTIONS, + PAIRWISE_KERNEL_FUNCTIONS, + _euclidean_distances_upcast, + additive_chi2_kernel, + check_paired_arrays, + check_pairwise_arrays, + chi2_kernel, + cosine_distances, + cosine_similarity, + euclidean_distances, + haversine_distances, + laplacian_kernel, + linear_kernel, + manhattan_distances, + nan_euclidean_distances, + paired_cosine_distances, + paired_distances, + paired_euclidean_distances, + paired_manhattan_distances, + pairwise_distances, + pairwise_distances_argmin, + pairwise_distances_argmin_min, + pairwise_distances_chunked, + pairwise_kernels, + polynomial_kernel, + rbf_kernel, + sigmoid_kernel, +) +from sklearn.preprocessing import normalize +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, +) +from sklearn.utils.parallel import Parallel, delayed + + +def test_pairwise_distances_for_dense_data(global_dtype): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + S = pairwise_distances(X, metric="euclidean") + S2 = euclidean_distances(X) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + # Euclidean distance, with Y != X. 
+ Y = rng.random_sample((2, 4)).astype(global_dtype, copy=False) + S = pairwise_distances(X, Y, metric="euclidean") + S2 = euclidean_distances(X, Y) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + # Check to ensure NaNs work with pairwise_distances. + X_masked = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + Y_masked = rng.random_sample((2, 4)).astype(global_dtype, copy=False) + X_masked[0, 0] = np.nan + Y_masked[0, 0] = np.nan + S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean") + S2_masked = nan_euclidean_distances(X_masked, Y_masked) + assert_allclose(S_masked, S2_masked) + assert S_masked.dtype == S2_masked.dtype == global_dtype + + # Test with tuples as X and Y + X_tuples = tuple([tuple([v for v in row]) for row in X]) + Y_tuples = tuple([tuple([v for v in row]) for row in Y]) + S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + # Test haversine distance + # The data should be valid latitude and longitude + # haversine converts to float64 currently so we don't check dtypes. + X = rng.random_sample((5, 2)).astype(global_dtype, copy=False) + X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2 + X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi + S = pairwise_distances(X, metric="haversine") + S2 = haversine_distances(X) + assert_allclose(S, S2) + + # Test haversine distance, with Y != X + Y = rng.random_sample((2, 2)).astype(global_dtype, copy=False) + Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2 + Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi + S = pairwise_distances(X, Y, metric="haversine") + S2 = haversine_distances(X, Y) + assert_allclose(S, S2) + + # "cityblock" uses scikit-learn metric, cityblock (function) is + # scipy.spatial. + # The metric functions from scipy converts to float64 so we don't check the dtypes. + S = pairwise_distances(X, metric="cityblock") + S2 = pairwise_distances(X, metric=cityblock) + assert S.shape[0] == S.shape[1] + assert S.shape[0] == X.shape[0] + assert_allclose(S, S2) + + # The manhattan metric should be equivalent to cityblock. + S = pairwise_distances(X, Y, metric="manhattan") + S2 = pairwise_distances(X, Y, metric=cityblock) + assert S.shape[0] == X.shape[0] + assert S.shape[1] == Y.shape[0] + assert_allclose(S, S2) + + # Test cosine as a string metric versus cosine callable + # The string "cosine" uses sklearn.metric, + # while the function cosine is scipy.spatial + S = pairwise_distances(X, Y, metric="cosine") + S2 = pairwise_distances(X, Y, metric=cosine) + assert S.shape[0] == X.shape[0] + assert S.shape[1] == Y.shape[0] + assert_allclose(S, S2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +@pytest.mark.parametrize("bsr_container", BSR_CONTAINERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_for_sparse_data( + coo_container, csc_container, bsr_container, csr_container, global_dtype +): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + Y = rng.random_sample((2, 4)).astype(global_dtype, copy=False) + + # Test with sparse X and Y, + # currently only supported for Euclidean, L1 and cosine. 
+ X_sparse = csr_container(X) + Y_sparse = csr_container(Y) + + S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") + S2 = euclidean_distances(X_sparse, Y_sparse) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") + S2 = cosine_distances(X_sparse, Y_sparse) + assert_allclose(S, S2) + assert S.dtype == S2.dtype == global_dtype + + S = pairwise_distances(X_sparse, csc_container(Y), metric="manhattan") + S2 = manhattan_distances(bsr_container(X), coo_container(Y)) + assert_allclose(S, S2) + if global_dtype == np.float64: + assert S.dtype == S2.dtype == global_dtype + else: + # TODO Fix manhattan_distances to preserve dtype. + # currently pairwise_distances uses manhattan_distances but converts the result + # back to the input dtype + with pytest.raises(AssertionError): + assert S.dtype == S2.dtype == global_dtype + + S2 = manhattan_distances(X, Y) + assert_allclose(S, S2) + if global_dtype == np.float64: + assert S.dtype == S2.dtype == global_dtype + else: + # TODO Fix manhattan_distances to preserve dtype. + # currently pairwise_distances uses manhattan_distances but converts the result + # back to the input dtype + with pytest.raises(AssertionError): + assert S.dtype == S2.dtype == global_dtype + + # Test with scipy.spatial.distance metric, with a kwd + kwds = {"p": 2.0} + S = pairwise_distances(X, Y, metric="minkowski", **kwds) + S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) + assert_allclose(S, S2) + + # same with Y = None + kwds = {"p": 2.0} + S = pairwise_distances(X, metric="minkowski", **kwds) + S2 = pairwise_distances(X, metric=minkowski, **kwds) + assert_allclose(S, S2) + + # Test that scipy distance metrics throw an error if sparse matrix given + with pytest.raises(TypeError): + pairwise_distances(X_sparse, metric="minkowski") + with pytest.raises(TypeError): + pairwise_distances(X, Y_sparse, metric="minkowski") + + +# Some scipy metrics are deprecated (depending on the scipy version) but we +# still want to test them. 
+@ignore_warnings(category=DeprecationWarning) +@pytest.mark.parametrize("metric", PAIRWISE_BOOLEAN_FUNCTIONS) +def test_pairwise_boolean_distance(metric): + # test that we convert to boolean arrays for boolean distances + rng = np.random.RandomState(0) + X = rng.randn(5, 4) + Y = X.copy() + Y[0, 0] = 1 - Y[0, 0] + + # ignore conversion to boolean in pairwise_distances + with ignore_warnings(category=DataConversionWarning): + for Z in [Y, None]: + res = pairwise_distances(X, Z, metric=metric) + np.nan_to_num(res, nan=0, posinf=0, neginf=0, copy=False) + assert np.sum(res != 0) == 0 + + # non-boolean arrays are converted to boolean for boolean + # distance metrics with a data conversion warning + msg = "Data was converted to boolean for metric %s" % metric + with pytest.warns(DataConversionWarning, match=msg): + pairwise_distances(X, metric=metric) + + # Check that the warning is raised if X is boolean by Y is not boolean: + with pytest.warns(DataConversionWarning, match=msg): + pairwise_distances(X.astype(bool), Y=Y, metric=metric) + + # Check that no warning is raised if X is already boolean and Y is None: + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + pairwise_distances(X.astype(bool), metric=metric) + + +def test_no_data_conversion_warning(): + # No warnings issued if metric is not a boolean distance function + rng = np.random.RandomState(0) + X = rng.randn(5, 4) + with warnings.catch_warnings(): + warnings.simplefilter("error", DataConversionWarning) + pairwise_distances(X, metric="minkowski") + + +@pytest.mark.parametrize("func", [pairwise_distances, pairwise_kernels]) +def test_pairwise_precomputed(func): + # Test correct shape + with pytest.raises(ValueError, match=".* shape .*"): + func(np.zeros((5, 3)), metric="precomputed") + # with two args + with pytest.raises(ValueError, match=".* shape .*"): + func(np.zeros((5, 3)), np.zeros((4, 4)), metric="precomputed") + # even if shape[1] agrees (although thus second arg is spurious) + with pytest.raises(ValueError, match=".* shape .*"): + func(np.zeros((5, 3)), np.zeros((4, 3)), metric="precomputed") + + # Test not copied (if appropriate dtype) + S = np.zeros((5, 5)) + S2 = func(S, metric="precomputed") + assert S is S2 + # with two args + S = np.zeros((5, 3)) + S2 = func(S, np.zeros((3, 3)), metric="precomputed") + assert S is S2 + + # Test always returns float dtype + S = func(np.array([[1]], dtype="int"), metric="precomputed") + assert "f" == S.dtype.kind + + # Test converts list to array-like + S = func([[1.0]], metric="precomputed") + assert isinstance(S, np.ndarray) + + +def test_pairwise_precomputed_non_negative(): + # Test non-negative values + with pytest.raises(ValueError, match=".* non-negative values.*"): + pairwise_distances(np.full((5, 5), -1), metric="precomputed") + + +_minkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1} + + +def callable_rbf_kernel(x, y, **kwds): + # Callable version of pairwise.rbf_kernel. 
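+ # pairwise_kernels (and pairwise_distances) evaluate a callable metric on one
+ # pair of 1d samples at a time, so the rows are promoted to 2d arrays here and
+ # the resulting single-element kernel matrix is unpacked back to a scalar.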
+ K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) + # unpack the output since this is a scalar packed in a 0-dim array + return K.item() + + +@pytest.mark.parametrize( + "func, metric, kwds", + [ + (pairwise_distances, "euclidean", {}), + ( + pairwise_distances, + minkowski, + _minkowski_kwds, + ), + ( + pairwise_distances, + "minkowski", + _minkowski_kwds, + ), + (pairwise_kernels, "polynomial", {"degree": 1}), + (pairwise_kernels, callable_rbf_kernel, {"gamma": 0.1}), + ], +) +@pytest.mark.parametrize("dtype", [np.float64, np.float32, int]) +def test_pairwise_parallel(func, metric, kwds, dtype): + rng = np.random.RandomState(0) + X = np.array(5 * rng.random_sample((5, 4)), dtype=dtype) + Y = np.array(5 * rng.random_sample((3, 4)), dtype=dtype) + + S = func(X, metric=metric, n_jobs=1, **kwds) + S2 = func(X, metric=metric, n_jobs=2, **kwds) + assert_allclose(S, S2) + + S = func(X, Y, metric=metric, n_jobs=1, **kwds) + S2 = func(X, Y, metric=metric, n_jobs=2, **kwds) + assert_allclose(S, S2) + + +def test_pairwise_callable_nonstrict_metric(): + # paired_distances should allow callable metric where metric(x, x) != 0 + # Knowing that the callable is a strict metric would allow the diagonal to + # be left uncalculated and set to 0. + assert pairwise_distances([[1.0]], metric=lambda x, y: 5)[0, 0] == 5 + + +# Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. +@pytest.mark.parametrize( + "metric", + ["rbf", "laplacian", "sigmoid", "polynomial", "linear", "chi2", "additive_chi2"], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_kernels(metric, csr_container): + # Test the pairwise_kernels helper function. + + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) + function = PAIRWISE_KERNEL_FUNCTIONS[metric] + # Test with Y=None + K1 = pairwise_kernels(X, metric=metric) + K2 = function(X) + assert_allclose(K1, K2) + # Test with Y=Y + K1 = pairwise_kernels(X, Y=Y, metric=metric) + K2 = function(X, Y=Y) + assert_allclose(K1, K2) + # Test with tuples as X and Y + X_tuples = tuple([tuple([v for v in row]) for row in X]) + Y_tuples = tuple([tuple([v for v in row]) for row in Y]) + K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) + assert_allclose(K1, K2) + + # Test with sparse X and Y + X_sparse = csr_container(X) + Y_sparse = csr_container(Y) + if metric in ["chi2", "additive_chi2"]: + # these don't support sparse matrices yet + return + K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) + assert_allclose(K1, K2) + + +def test_pairwise_kernels_callable(): + # Test the pairwise_kernels helper function + # with a callable function, with given keywords. 
+ rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) + + metric = callable_rbf_kernel + kwds = {"gamma": 0.1} + K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds) + K2 = rbf_kernel(X, Y=Y, **kwds) + assert_allclose(K1, K2) + + # callable function, X=Y + K1 = pairwise_kernels(X, Y=X, metric=metric, **kwds) + K2 = rbf_kernel(X, Y=X, **kwds) + assert_allclose(K1, K2) + + +def test_pairwise_kernels_filter_param(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) + K = rbf_kernel(X, Y, gamma=0.1) + params = {"gamma": 0.1, "blabla": ":)"} + K2 = pairwise_kernels(X, Y, metric="rbf", filter_params=True, **params) + assert_allclose(K, K2) + + with pytest.raises(TypeError): + pairwise_kernels(X, Y, metric="rbf", **params) + + +@pytest.mark.parametrize("metric, func", PAIRED_DISTANCES.items()) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_paired_distances(metric, func, csr_container): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)) + # Euclidean distance, with Y != X. + Y = rng.random_sample((5, 4)) + + S = paired_distances(X, Y, metric=metric) + S2 = func(X, Y) + assert_allclose(S, S2) + S3 = func(csr_container(X), csr_container(Y)) + assert_allclose(S, S3) + if metric in PAIRWISE_DISTANCE_FUNCTIONS: + # Check the pairwise_distances implementation + # gives the same value + distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y) + distances = np.diag(distances) + assert_allclose(distances, S) + + +def test_paired_distances_callable(global_dtype): + # Test the paired_distance helper function + # with the callable implementation + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + # Euclidean distance, with Y != X. 
+ Y = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + + S = paired_distances(X, Y, metric="manhattan") + S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0)) + assert_allclose(S, S2) + + # Test that a value error is raised when the lengths of X and Y should not + # differ + Y = rng.random_sample((3, 4)) + with pytest.raises(ValueError): + paired_distances(X, Y) + + +@pytest.mark.parametrize("dok_container", DOK_CONTAINERS) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_argmin_min(dok_container, csr_container, global_dtype): + # Check pairwise minimum distances computation for any metric + X = np.asarray([[0], [1]], dtype=global_dtype) + Y = np.asarray([[-2], [3]], dtype=global_dtype) + + Xsp = dok_container(X) + Ysp = csr_container(Y, dtype=global_dtype) + + expected_idx = [0, 1] + expected_vals = [2, 2] + expected_vals_sq = [4, 4] + + # euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") + idx2 = pairwise_distances_argmin(X, Y, metric="euclidean") + assert_allclose(idx, expected_idx) + assert_allclose(idx2, expected_idx) + assert_allclose(vals, expected_vals) + # sparse matrix case + idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") + idxsp2 = pairwise_distances_argmin(Xsp, Ysp, metric="euclidean") + assert_allclose(idxsp, expected_idx) + assert_allclose(idxsp2, expected_idx) + assert_allclose(valssp, expected_vals) + # We don't want np.matrix here + assert type(idxsp) == np.ndarray + assert type(valssp) == np.ndarray + + # Squared Euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="sqeuclidean") + idx2, vals2 = pairwise_distances_argmin_min( + X, Y, metric="euclidean", metric_kwargs={"squared": True} + ) + idx3 = pairwise_distances_argmin(X, Y, metric="sqeuclidean") + idx4 = pairwise_distances_argmin( + X, Y, metric="euclidean", metric_kwargs={"squared": True} + ) + + assert_allclose(vals, expected_vals_sq) + assert_allclose(vals2, expected_vals_sq) + + assert_allclose(idx, expected_idx) + assert_allclose(idx2, expected_idx) + assert_allclose(idx3, expected_idx) + assert_allclose(idx4, expected_idx) + + # Non-euclidean scikit-learn metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") + idx2 = pairwise_distances_argmin(X, Y, metric="manhattan") + assert_allclose(idx, expected_idx) + assert_allclose(idx2, expected_idx) + assert_allclose(vals, expected_vals) + # sparse matrix case + idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan") + idxsp2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan") + assert_allclose(idxsp, expected_idx) + assert_allclose(idxsp2, expected_idx) + assert_allclose(valssp, expected_vals) + + # Non-euclidean Scipy distance (callable) + idx, vals = pairwise_distances_argmin_min( + X, Y, metric=minkowski, metric_kwargs={"p": 2} + ) + assert_allclose(idx, expected_idx) + assert_allclose(vals, expected_vals) + + # Non-euclidean Scipy distance (string) + idx, vals = pairwise_distances_argmin_min( + X, Y, metric="minkowski", metric_kwargs={"p": 2} + ) + assert_allclose(idx, expected_idx) + assert_allclose(vals, expected_vals) + + # Compare with naive implementation + rng = np.random.RandomState(0) + X = rng.randn(97, 149) + Y = rng.randn(111, 149) + + dist = pairwise_distances(X, Y, metric="manhattan") + dist_orig_ind = dist.argmin(axis=0) + dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))] + + dist_chunked_ind, dist_chunked_val = 
pairwise_distances_argmin_min( + X, Y, axis=0, metric="manhattan" + ) + assert_allclose(dist_orig_ind, dist_chunked_ind, rtol=1e-7) + assert_allclose(dist_orig_val, dist_chunked_val, rtol=1e-7) + + # Changing the axis and permuting datasets must give the same results + argmin_0, dist_0 = pairwise_distances_argmin_min(X, Y, axis=0) + argmin_1, dist_1 = pairwise_distances_argmin_min(Y, X, axis=1) + + assert_allclose(dist_0, dist_1) + assert_array_equal(argmin_0, argmin_1) + + argmin_0, dist_0 = pairwise_distances_argmin_min(X, X, axis=0) + argmin_1, dist_1 = pairwise_distances_argmin_min(X, X, axis=1) + + assert_allclose(dist_0, dist_1) + assert_array_equal(argmin_0, argmin_1) + + # Changing the axis and permuting datasets must give the same results + argmin_0 = pairwise_distances_argmin(X, Y, axis=0) + argmin_1 = pairwise_distances_argmin(Y, X, axis=1) + + assert_array_equal(argmin_0, argmin_1) + + argmin_0 = pairwise_distances_argmin(X, X, axis=0) + argmin_1 = pairwise_distances_argmin(X, X, axis=1) + + assert_array_equal(argmin_0, argmin_1) + + # F-contiguous arrays must be supported and must return identical results. + argmin_C_contiguous = pairwise_distances_argmin(X, Y) + argmin_F_contiguous = pairwise_distances_argmin( + np.asfortranarray(X), np.asfortranarray(Y) + ) + + assert_array_equal(argmin_C_contiguous, argmin_F_contiguous) + + +def _reduce_func(dist, start): + return dist[:, :100] + + +def test_pairwise_distances_chunked_reduce(global_dtype): + rng = np.random.RandomState(0) + X = rng.random_sample((400, 4)).astype(global_dtype, copy=False) + # Reduced Euclidean distance + S = pairwise_distances(X)[:, :100] + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=_reduce_func, working_memory=2**-16 + ) + assert isinstance(S_chunks, GeneratorType) + S_chunks = list(S_chunks) + assert len(S_chunks) > 1 + assert S_chunks[0].dtype == X.dtype + + # atol is for diagonal where S is explicitly zeroed on the diagonal + assert_allclose(np.vstack(S_chunks), S, atol=1e-7) + + +def test_pairwise_distances_chunked_reduce_none(global_dtype): + # check that the reduce func is allowed to return None + rng = np.random.RandomState(0) + X = rng.random_sample((10, 4)).astype(global_dtype, copy=False) + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=lambda dist, start: None, working_memory=2**-16 + ) + assert isinstance(S_chunks, GeneratorType) + S_chunks = list(S_chunks) + assert len(S_chunks) > 1 + assert all(chunk is None for chunk in S_chunks) + + +@pytest.mark.parametrize( + "good_reduce", + [ + lambda D, start: list(D), + lambda D, start: np.array(D), + lambda D, start: (list(D), list(D)), + ] + + [ + lambda D, start, scipy_csr_type=scipy_csr_type: scipy_csr_type(D) + for scipy_csr_type in CSR_CONTAINERS + ] + + [ + lambda D, start, scipy_dok_type=scipy_dok_type: ( + scipy_dok_type(D), + np.array(D), + list(D), + ) + for scipy_dok_type in DOK_CONTAINERS + ], +) +def test_pairwise_distances_chunked_reduce_valid(good_reduce): + X = np.arange(10).reshape(-1, 1) + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=good_reduce, working_memory=64 + ) + next(S_chunks) + + +@pytest.mark.parametrize( + ("bad_reduce", "err_type", "message"), + [ + ( + lambda D, s: np.concatenate([D, D[-1:]]), + ValueError, + r"length 11\..* input: 10\.", + ), + ( + lambda D, s: (D, np.concatenate([D, D[-1:]])), + ValueError, + r"length \(10, 11\)\..* input: 10\.", + ), + (lambda D, s: (D[:9], D), ValueError, r"length \(9, 10\)\..* input: 10\."), + ( + lambda D, s: 7, + TypeError, + 
r"returned 7\. Expected sequence\(s\) of length 10\.", + ), + ( + lambda D, s: (7, 8), + TypeError, + r"returned \(7, 8\)\. Expected sequence\(s\) of length 10\.", + ), + ( + lambda D, s: (np.arange(10), 9), + TypeError, + r", 9\)\. Expected sequence\(s\) of length 10\.", + ), + ], +) +def test_pairwise_distances_chunked_reduce_invalid( + global_dtype, bad_reduce, err_type, message +): + X = np.arange(10).reshape(-1, 1).astype(global_dtype, copy=False) + S_chunks = pairwise_distances_chunked( + X, None, reduce_func=bad_reduce, working_memory=64 + ) + with pytest.raises(err_type, match=message): + next(S_chunks) + + +def check_pairwise_distances_chunked(X, Y, working_memory, metric="euclidean"): + gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, metric=metric) + assert isinstance(gen, GeneratorType) + blockwise_distances = list(gen) + Y = X if Y is None else Y + min_block_mib = len(Y) * 8 * 2**-20 + + for block in blockwise_distances: + memory_used = block.nbytes + assert memory_used <= max(working_memory, min_block_mib) * 2**20 + + blockwise_distances = np.vstack(blockwise_distances) + S = pairwise_distances(X, Y, metric=metric) + assert_allclose(blockwise_distances, S, atol=1e-7) + + +@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean")) +def test_pairwise_distances_chunked_diagonal(metric, global_dtype): + rng = np.random.RandomState(0) + X = rng.normal(size=(1000, 10), scale=1e10).astype(global_dtype, copy=False) + chunks = list(pairwise_distances_chunked(X, working_memory=1, metric=metric)) + assert len(chunks) > 1 + assert_allclose(np.diag(np.vstack(chunks)), 0, rtol=1e-10) + + +@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean")) +def test_parallel_pairwise_distances_diagonal(metric, global_dtype): + rng = np.random.RandomState(0) + X = rng.normal(size=(1000, 10), scale=1e10).astype(global_dtype, copy=False) + distances = pairwise_distances(X, metric=metric, n_jobs=2) + assert_allclose(np.diag(distances), 0, atol=1e-10) + + +@pytest.mark.filterwarnings("ignore:Could not adhere to working_memory config") +def test_pairwise_distances_chunked(global_dtype): + # Test the pairwise_distance helper function. + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((200, 4)).astype(global_dtype, copy=False) + check_pairwise_distances_chunked(X, None, working_memory=1, metric="euclidean") + # Test small amounts of memory + for power in range(-16, 0): + check_pairwise_distances_chunked( + X, None, working_memory=2**power, metric="euclidean" + ) + # X as list + check_pairwise_distances_chunked( + X.tolist(), None, working_memory=1, metric="euclidean" + ) + # Euclidean distance, with Y != X. + Y = rng.random_sample((100, 4)).astype(global_dtype, copy=False) + check_pairwise_distances_chunked(X, Y, working_memory=1, metric="euclidean") + check_pairwise_distances_chunked( + X.tolist(), Y.tolist(), working_memory=1, metric="euclidean" + ) + # absurdly large working_memory + check_pairwise_distances_chunked(X, Y, working_memory=10000, metric="euclidean") + # "cityblock" uses scikit-learn metric, cityblock (function) is + # scipy.spatial. 
+ check_pairwise_distances_chunked(X, Y, working_memory=1, metric="cityblock") + + # Test precomputed returns all at once + D = pairwise_distances(X) + gen = pairwise_distances_chunked(D, working_memory=2**-16, metric="precomputed") + assert isinstance(gen, GeneratorType) + assert next(gen) is D + with pytest.raises(StopIteration): + next(gen) + + +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_known_result(x_array_constr, y_array_constr): + # Check the pairwise Euclidean distances computation on known result + X = x_array_constr([[0]]) + Y = y_array_constr([[1], [2]]) + D = euclidean_distances(X, Y) + assert_allclose(D, [[1.0, 2.0]]) + + +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_with_norms(global_dtype, y_array_constr): + # check that we still get the right answers with {X,Y}_norm_squared + # and that we get a wrong answer with wrong {X,Y}_norm_squared + rng = np.random.RandomState(0) + X = rng.random_sample((10, 10)).astype(global_dtype, copy=False) + Y = rng.random_sample((20, 10)).astype(global_dtype, copy=False) + + # norms will only be used if their dtype is float64 + X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) + Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) + + Y = y_array_constr(Y) + + D1 = euclidean_distances(X, Y) + D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq) + D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq) + D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, Y_norm_squared=Y_norm_sq) + assert_allclose(D2, D1) + assert_allclose(D3, D1) + assert_allclose(D4, D1) + + # check we get the wrong answer with wrong {X,Y}_norm_squared + wrong_D = euclidean_distances( + X, + Y, + X_norm_squared=np.zeros_like(X_norm_sq), + Y_norm_squared=np.zeros_like(Y_norm_sq), + ) + with pytest.raises(AssertionError): + assert_allclose(wrong_D, D1) + + +@pytest.mark.parametrize("symmetric", [True, False]) +def test_euclidean_distances_float32_norms(global_random_seed, symmetric): + # Non-regression test for #27621 + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((10, 10)) + Y = X if symmetric else rng.random_sample((20, 10)) + X_norm_sq = (X.astype(np.float32) ** 2).sum(axis=1).reshape(1, -1) + Y_norm_sq = (Y.astype(np.float32) ** 2).sum(axis=1).reshape(1, -1) + D1 = euclidean_distances(X, Y) + D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq) + D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq) + D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, Y_norm_squared=Y_norm_sq) + assert_allclose(D2, D1) + assert_allclose(D3, D1) + assert_allclose(D4, D1) + + +def test_euclidean_distances_norm_shapes(): + # Check all accepted shapes for the norms or appropriate error messages. 
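+ # The precomputed squared norms may be passed as 1d arrays of shape
+ # (n_samples,), as column vectors of shape (n_samples, 1) or as row vectors of
+ # shape (1, n_samples); all three must give the same distances, while norms
+ # with a mismatched number of samples must raise an informative error.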
+ rng = np.random.RandomState(0) + X = rng.random_sample((10, 10)) + Y = rng.random_sample((20, 10)) + + X_norm_squared = (X**2).sum(axis=1) + Y_norm_squared = (Y**2).sum(axis=1) + + D1 = euclidean_distances( + X, Y, X_norm_squared=X_norm_squared, Y_norm_squared=Y_norm_squared + ) + D2 = euclidean_distances( + X, + Y, + X_norm_squared=X_norm_squared.reshape(-1, 1), + Y_norm_squared=Y_norm_squared.reshape(-1, 1), + ) + D3 = euclidean_distances( + X, + Y, + X_norm_squared=X_norm_squared.reshape(1, -1), + Y_norm_squared=Y_norm_squared.reshape(1, -1), + ) + + assert_allclose(D2, D1) + assert_allclose(D3, D1) + + with pytest.raises(ValueError, match="Incompatible dimensions for X"): + euclidean_distances(X, Y, X_norm_squared=X_norm_squared[:5]) + with pytest.raises(ValueError, match="Incompatible dimensions for Y"): + euclidean_distances(X, Y, Y_norm_squared=Y_norm_squared[:5]) + + +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances(global_dtype, x_array_constr, y_array_constr): + # check that euclidean distances gives same result as scipy cdist + # when X and Y != X are provided + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + X[X < 0.8] = 0 + Y = rng.random_sample((10, 10)).astype(global_dtype, copy=False) + Y[Y < 0.8] = 0 + + expected = cdist(X, Y) + + X = x_array_constr(X) + Y = y_array_constr(Y) + distances = euclidean_distances(X, Y) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. + assert_allclose(distances, expected, rtol=1e-6) + assert distances.dtype == global_dtype + + +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_sym(global_dtype, x_array_constr): + # check that euclidean distances gives same result as scipy pdist + # when only X is provided + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(global_dtype, copy=False) + X[X < 0.8] = 0 + + expected = squareform(pdist(X)) + + X = x_array_constr(X) + distances = euclidean_distances(X) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. 
+ assert_allclose(distances, expected, rtol=1e-6) + assert distances.dtype == global_dtype + + +@pytest.mark.parametrize("batch_size", [None, 5, 7, 101]) +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +@pytest.mark.parametrize( + "y_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_upcast(batch_size, x_array_constr, y_array_constr): + # check batches handling when Y != X (#13910) + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(np.float32) + X[X < 0.8] = 0 + Y = rng.random_sample((10, 10)).astype(np.float32) + Y[Y < 0.8] = 0 + + expected = cdist(X, Y) + + X = x_array_constr(X) + Y = y_array_constr(Y) + distances = _euclidean_distances_upcast(X, Y=Y, batch_size=batch_size) + distances = np.sqrt(np.maximum(distances, 0)) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. + assert_allclose(distances, expected, rtol=1e-6) + + +@pytest.mark.parametrize("batch_size", [None, 5, 7, 101]) +@pytest.mark.parametrize( + "x_array_constr", + [np.array] + CSR_CONTAINERS, + ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS], +) +def test_euclidean_distances_upcast_sym(batch_size, x_array_constr): + # check batches handling when X is Y (#13910) + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)).astype(np.float32) + X[X < 0.8] = 0 + + expected = squareform(pdist(X)) + + X = x_array_constr(X) + distances = _euclidean_distances_upcast(X, Y=X, batch_size=batch_size) + distances = np.sqrt(np.maximum(distances, 0)) + + # the default rtol=1e-7 is too close to the float32 precision + # and fails due to rounding errors. + assert_allclose(distances, expected, rtol=1e-6) + + +@pytest.mark.parametrize( + "dtype, eps, rtol", + [ + (np.float32, 1e-4, 1e-5), + pytest.param( + np.float64, + 1e-8, + 0.99, + marks=pytest.mark.xfail(reason="failing due to lack of precision"), + ), + ], +) +@pytest.mark.parametrize("dim", [1, 1000000]) +def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim): + # check that euclidean distances is correct with float32 input thanks to + # upcasting. On float64 there are still precision issues. + X = np.array([[1.0] * dim], dtype=dtype) + Y = np.array([[1.0 + eps] * dim], dtype=dtype) + + distances = euclidean_distances(X, Y) + expected = cdist(X, Y) + + assert_allclose(distances, expected, rtol=1e-5) + + +@pytest.mark.parametrize("squared", [True, False]) +def test_nan_euclidean_distances_equal_to_euclidean_distance(squared): + # with no nan values + rng = np.random.RandomState(1337) + X = rng.randn(3, 4) + Y = rng.randn(4, 4) + + normal_distance = euclidean_distances(X, Y=Y, squared=squared) + nan_distance = nan_euclidean_distances(X, Y=Y, squared=squared) + assert_allclose(normal_distance, nan_distance) + + +@pytest.mark.parametrize("X", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]])]) +@pytest.mark.parametrize("Y", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]]), None]) +def test_nan_euclidean_distances_infinite_values(X, Y): + with pytest.raises(ValueError) as excinfo: + nan_euclidean_distances(X, Y=Y) + + exp_msg = "Input contains infinity or a value too large for dtype('float64')." 
+ assert exp_msg == str(excinfo.value) + + +@pytest.mark.parametrize( + "X, X_diag, missing_value", + [ + (np.array([[0, 1], [1, 0]]), np.sqrt(2), np.nan), + (np.array([[0, 1], [1, np.nan]]), np.sqrt(2), np.nan), + (np.array([[np.nan, 1], [1, np.nan]]), np.nan, np.nan), + (np.array([[np.nan, 1], [np.nan, 0]]), np.sqrt(2), np.nan), + (np.array([[0, np.nan], [1, np.nan]]), np.sqrt(2), np.nan), + (np.array([[0, 1], [1, 0]]), np.sqrt(2), -1), + (np.array([[0, 1], [1, -1]]), np.sqrt(2), -1), + (np.array([[-1, 1], [1, -1]]), np.nan, -1), + (np.array([[-1, 1], [-1, 0]]), np.sqrt(2), -1), + (np.array([[0, -1], [1, -1]]), np.sqrt(2), -1), + ], +) +def test_nan_euclidean_distances_2x2(X, X_diag, missing_value): + exp_dist = np.array([[0.0, X_diag], [X_diag, 0]]) + + dist = nan_euclidean_distances(X, missing_values=missing_value) + assert_allclose(exp_dist, dist) + + dist_sq = nan_euclidean_distances(X, squared=True, missing_values=missing_value) + assert_allclose(exp_dist**2, dist_sq) + + dist_two = nan_euclidean_distances(X, X, missing_values=missing_value) + assert_allclose(exp_dist, dist_two) + + dist_two_copy = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) + assert_allclose(exp_dist, dist_two_copy) + + +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_nan_euclidean_distances_complete_nan(missing_value): + X = np.array([[missing_value, missing_value], [0, 1]]) + + exp_dist = np.array([[np.nan, np.nan], [np.nan, 0]]) + + dist = nan_euclidean_distances(X, missing_values=missing_value) + assert_allclose(exp_dist, dist) + + dist = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) + assert_allclose(exp_dist, dist) + + +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_nan_euclidean_distances_not_trival(missing_value): + X = np.array( + [ + [1.0, missing_value, 3.0, 4.0, 2.0], + [missing_value, 4.0, 6.0, 1.0, missing_value], + [3.0, missing_value, missing_value, missing_value, 1.0], + ] + ) + + Y = np.array( + [ + [missing_value, 7.0, 7.0, missing_value, 2.0], + [missing_value, missing_value, 5.0, 4.0, 7.0], + [missing_value, missing_value, missing_value, 4.0, 5.0], + ] + ) + + # Check for symmetry + D1 = nan_euclidean_distances(X, Y, missing_values=missing_value) + D2 = nan_euclidean_distances(Y, X, missing_values=missing_value) + + assert_almost_equal(D1, D2.T) + + # Check with explicit formula and squared=True + assert_allclose( + nan_euclidean_distances( + X[:1], Y[:1], squared=True, missing_values=missing_value + ), + [[5.0 / 2.0 * ((7 - 3) ** 2 + (2 - 2) ** 2)]], + ) + + # Check with explicit formula and squared=False + assert_allclose( + nan_euclidean_distances( + X[1:2], Y[1:2], squared=False, missing_values=missing_value + ), + [[np.sqrt(5.0 / 2.0 * ((6 - 5) ** 2 + (1 - 4) ** 2))]], + ) + + # Check when Y = X is explicitly passed + D3 = nan_euclidean_distances(X, missing_values=missing_value) + D4 = nan_euclidean_distances(X, X, missing_values=missing_value) + D5 = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) + assert_allclose(D3, D4) + assert_allclose(D4, D5) + + # Check copy = True against copy = False + D6 = nan_euclidean_distances(X, Y, copy=True) + D7 = nan_euclidean_distances(X, Y, copy=False) + assert_allclose(D6, D7) + + +@pytest.mark.parametrize("missing_value", [np.nan, -1]) +def test_nan_euclidean_distances_one_feature_match_positive(missing_value): + # First feature is the only feature that is non-nan and in both + # samples. 
The result of `nan_euclidean_distances` with squared=True + # should be non-negative. The non-squared version should all be close to 0. + X = np.array( + [ + [-122.27, 648.0, missing_value, 37.85], + [-122.27, missing_value, 2.34701493, missing_value], + ] + ) + + dist_squared = nan_euclidean_distances( + X, missing_values=missing_value, squared=True + ) + assert np.all(dist_squared >= 0) + + dist = nan_euclidean_distances(X, missing_values=missing_value, squared=False) + assert_allclose(dist, 0.0) + + +def test_cosine_distances(): + # Check the pairwise Cosine distances computation + rng = np.random.RandomState(1337) + x = np.abs(rng.rand(910)) + XA = np.vstack([x, x]) + D = cosine_distances(XA) + assert_allclose(D, [[0.0, 0.0], [0.0, 0.0]], atol=1e-10) + # check that all elements are in [0, 2] + assert np.all(D >= 0.0) + assert np.all(D <= 2.0) + # check that diagonal elements are equal to 0 + assert_allclose(D[np.diag_indices_from(D)], [0.0, 0.0]) + + XB = np.vstack([x, -x]) + D2 = cosine_distances(XB) + # check that all elements are in [0, 2] + assert np.all(D2 >= 0.0) + assert np.all(D2 <= 2.0) + # check that diagonal elements are equal to 0 and non diagonal to 2 + assert_allclose(D2, [[0.0, 2.0], [2.0, 0.0]]) + + # check large random matrix + X = np.abs(rng.rand(1000, 5000)) + D = cosine_distances(X) + # check that diagonal elements are equal to 0 + assert_allclose(D[np.diag_indices_from(D)], [0.0] * D.shape[0]) + assert np.all(D >= 0.0) + assert np.all(D <= 2.0) + + +def test_haversine_distances(): + # Check haversine distance with distances computation + def slow_haversine_distances(x, y): + diff_lat = y[0] - x[0] + diff_lon = y[1] - x[1] + a = np.sin(diff_lat / 2) ** 2 + ( + np.cos(x[0]) * np.cos(y[0]) * np.sin(diff_lon / 2) ** 2 + ) + c = 2 * np.arcsin(np.sqrt(a)) + return c + + rng = np.random.RandomState(0) + X = rng.random_sample((5, 2)) + Y = rng.random_sample((10, 2)) + D1 = np.array([[slow_haversine_distances(x, y) for y in Y] for x in X]) + D2 = haversine_distances(X, Y) + assert_allclose(D1, D2) + # Test haversine distance does not accept X where n_feature != 2 + X = rng.random_sample((10, 3)) + err_msg = "Haversine distance only valid in 2 dimensions" + with pytest.raises(ValueError, match=err_msg): + haversine_distances(X) + + +# Paired distances + + +def test_paired_euclidean_distances(): + # Check the paired Euclidean distances computation + X = [[0], [0]] + Y = [[1], [2]] + D = paired_euclidean_distances(X, Y) + assert_allclose(D, [1.0, 2.0]) + + +def test_paired_manhattan_distances(): + # Check the paired manhattan distances computation + X = [[0], [0]] + Y = [[1], [2]] + D = paired_manhattan_distances(X, Y) + assert_allclose(D, [1.0, 2.0]) + + +def test_paired_cosine_distances(): + # Check the paired manhattan distances computation + X = [[0], [0]] + Y = [[1], [2]] + D = paired_cosine_distances(X, Y) + assert_allclose(D, [0.5, 0.5]) + + +def test_chi_square_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((10, 4)) + K_add = additive_chi2_kernel(X, Y) + gamma = 0.1 + K = chi2_kernel(X, Y, gamma=gamma) + assert K.dtype == float + for i, x in enumerate(X): + for j, y in enumerate(Y): + chi2 = -np.sum((x - y) ** 2 / (x + y)) + chi2_exp = np.exp(gamma * chi2) + assert_almost_equal(K_add[i, j], chi2) + assert_almost_equal(K[i, j], chi2_exp) + + # check diagonal is ones for data with itself + K = chi2_kernel(Y) + assert_array_equal(np.diag(K), 1) + # check off-diagonal is < 1 but > 0: + assert np.all(K > 0) + assert 
np.all(K - np.diag(np.diag(K)) < 1) + # check that float32 is preserved + X = rng.random_sample((5, 4)).astype(np.float32) + Y = rng.random_sample((10, 4)).astype(np.float32) + K = chi2_kernel(X, Y) + assert K.dtype == np.float32 + + # check integer type gets converted, + # check that zeros are handled + X = rng.random_sample((10, 4)).astype(np.int32) + K = chi2_kernel(X, X) + assert np.isfinite(K).all() + assert K.dtype == float + + # check that kernel of similar things is greater than dissimilar ones + X = [[0.3, 0.7], [1.0, 0]] + Y = [[0, 1], [0.9, 0.1]] + K = chi2_kernel(X, Y) + assert K[0, 0] > K[0, 1] + assert K[1, 1] > K[1, 0] + + # test negative input + with pytest.raises(ValueError): + chi2_kernel([[0, -1]]) + with pytest.raises(ValueError): + chi2_kernel([[0, -1]], [[-1, -1]]) + with pytest.raises(ValueError): + chi2_kernel([[0, 1]], [[-1, -1]]) + + # different n_features in X and Y + with pytest.raises(ValueError): + chi2_kernel([[0, 1]], [[0.2, 0.2, 0.6]]) + + +@pytest.mark.parametrize( + "kernel", + ( + linear_kernel, + polynomial_kernel, + rbf_kernel, + laplacian_kernel, + sigmoid_kernel, + cosine_similarity, + ), +) +def test_kernel_symmetry(kernel): + # Valid kernels should be symmetric + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = kernel(X, X) + assert_allclose(K, K.T, 15) + + +@pytest.mark.parametrize( + "kernel", + ( + linear_kernel, + polynomial_kernel, + rbf_kernel, + laplacian_kernel, + sigmoid_kernel, + cosine_similarity, + ), +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_kernel_sparse(kernel, csr_container): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + X_sparse = csr_container(X) + K = kernel(X, X) + K2 = kernel(X_sparse, X_sparse) + assert_allclose(K, K2) + + +def test_linear_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = linear_kernel(X, X) + # the diagonal elements of a linear kernel are their squared norm + assert_allclose(K.flat[::6], [linalg.norm(x) ** 2 for x in X]) + + +def test_rbf_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = rbf_kernel(X, X) + # the diagonal elements of a rbf kernel are 1 + assert_allclose(K.flat[::6], np.ones(5)) + + +def test_laplacian_kernel(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + K = laplacian_kernel(X, X) + # the diagonal elements of a laplacian kernel are 1 + assert_allclose(np.diag(K), np.ones(5)) + + # off-diagonal elements are < 1 but > 0: + assert np.all(K > 0) + assert np.all(K - np.diag(np.diag(K)) < 1) + + +@pytest.mark.parametrize( + "metric, pairwise_func", + [("linear", linear_kernel), ("cosine", cosine_similarity)], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_similarity_sparse_output(metric, pairwise_func, csr_container): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((3, 4)) + Xcsr = csr_container(X) + Ycsr = csr_container(Y) + + # should be sparse + K1 = pairwise_func(Xcsr, Ycsr, dense_output=False) + assert issparse(K1) + + # should be dense, and equal to K1 + K2 = pairwise_func(X, Y, dense_output=True) + assert not issparse(K2) + assert_allclose(K1.toarray(), K2) + + # show the kernel output equal to the sparse.toarray() + K3 = pairwise_kernels(X, Y=Y, metric=metric) + assert_allclose(K1.toarray(), K3) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cosine_similarity(csr_container): + # Test the cosine_similarity. 
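+ # Cosine similarity of two vectors is their dot product once each row is
+ # scaled to unit L2 norm, so it must coincide with a linear kernel computed on
+ # L2-normalized inputs, for dense and sparse data alike.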
+ + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((3, 4)) + Xcsr = csr_container(X) + Ycsr = csr_container(Y) + + for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)): + # Test that the cosine is kernel is equal to a linear kernel when data + # has been previously normalized by L2-norm. + K1 = pairwise_kernels(X_, Y=Y_, metric="cosine") + X_ = normalize(X_) + if Y_ is not None: + Y_ = normalize(Y_) + K2 = pairwise_kernels(X_, Y=Y_, metric="linear") + assert_allclose(K1, K2) + + +def test_check_dense_matrices(): + # Ensure that pairwise array check works for dense matrices. + # Check that if XB is None, XB is returned as reference to XA + XA = np.resize(np.arange(40), (5, 8)) + XA_checked, XB_checked = check_pairwise_arrays(XA, None) + assert XA_checked is XB_checked + assert_array_equal(XA, XA_checked) + + +def test_check_XB_returned(): + # Ensure that if XA and XB are given correctly, they return as equal. + # Check that if XB is not None, it is returned equal. + # Note that the second dimension of XB is the same as XA. + XA = np.resize(np.arange(40), (5, 8)) + XB = np.resize(np.arange(32), (4, 8)) + XA_checked, XB_checked = check_pairwise_arrays(XA, XB) + assert_array_equal(XA, XA_checked) + assert_array_equal(XB, XB_checked) + + XB = np.resize(np.arange(40), (5, 8)) + XA_checked, XB_checked = check_paired_arrays(XA, XB) + assert_array_equal(XA, XA_checked) + assert_array_equal(XB, XB_checked) + + +def test_check_different_dimensions(): + # Ensure an error is raised if the dimensions are different. + XA = np.resize(np.arange(45), (5, 9)) + XB = np.resize(np.arange(32), (4, 8)) + with pytest.raises(ValueError): + check_pairwise_arrays(XA, XB) + + XB = np.resize(np.arange(4 * 9), (4, 9)) + with pytest.raises(ValueError): + check_paired_arrays(XA, XB) + + +def test_check_invalid_dimensions(): + # Ensure an error is raised on 1D input arrays. + # The modified tests are not 1D. In the old test, the array was internally + # converted to 2D anyways + XA = np.arange(45).reshape(9, 5) + XB = np.arange(32).reshape(4, 8) + with pytest.raises(ValueError): + check_pairwise_arrays(XA, XB) + XA = np.arange(45).reshape(9, 5) + XB = np.arange(32).reshape(4, 8) + with pytest.raises(ValueError): + check_pairwise_arrays(XA, XB) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_check_sparse_arrays(csr_container): + # Ensures that checks return valid sparse matrices. + rng = np.random.RandomState(0) + XA = rng.random_sample((5, 4)) + XA_sparse = csr_container(XA) + XB = rng.random_sample((5, 4)) + XB_sparse = csr_container(XB) + XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse) + # compare their difference because testing csr matrices for + # equality with '==' does not work as expected. + assert issparse(XA_checked) + assert abs(XA_sparse - XA_checked).sum() == 0 + assert issparse(XB_checked) + assert abs(XB_sparse - XB_checked).sum() == 0 + + XA_checked, XA_2_checked = check_pairwise_arrays(XA_sparse, XA_sparse) + assert issparse(XA_checked) + assert abs(XA_sparse - XA_checked).sum() == 0 + assert issparse(XA_2_checked) + assert abs(XA_2_checked - XA_checked).sum() == 0 + + +def tuplify(X): + # Turns a numpy matrix (any n-dimensional array) into tuples. + s = X.shape + if len(s) > 1: + # Tuplify each sub-array in the input. + return tuple(tuplify(row) for row in X) + else: + # Single dimension input, just return tuple of contents. 
+ return tuple(r for r in X) + + +def test_check_tuple_input(): + # Ensures that checks return valid tuples. + rng = np.random.RandomState(0) + XA = rng.random_sample((5, 4)) + XA_tuples = tuplify(XA) + XB = rng.random_sample((5, 4)) + XB_tuples = tuplify(XB) + XA_checked, XB_checked = check_pairwise_arrays(XA_tuples, XB_tuples) + assert_array_equal(XA_tuples, XA_checked) + assert_array_equal(XB_tuples, XB_checked) + + +def test_check_preserve_type(): + # Ensures that type float32 is preserved. + XA = np.resize(np.arange(40), (5, 8)).astype(np.float32) + XB = np.resize(np.arange(40), (5, 8)).astype(np.float32) + + XA_checked, XB_checked = check_pairwise_arrays(XA, None) + assert XA_checked.dtype == np.float32 + + # both float32 + XA_checked, XB_checked = check_pairwise_arrays(XA, XB) + assert XA_checked.dtype == np.float32 + assert XB_checked.dtype == np.float32 + + # mismatched A + XA_checked, XB_checked = check_pairwise_arrays(XA.astype(float), XB) + assert XA_checked.dtype == float + assert XB_checked.dtype == float + + # mismatched B + XA_checked, XB_checked = check_pairwise_arrays(XA, XB.astype(float)) + assert XA_checked.dtype == float + assert XB_checked.dtype == float + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) +@pytest.mark.parametrize( + "dist_function", [pairwise_distances, pairwise_distances_chunked] +) +def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function): + # check that pairwise_distances give the same result in sequential and + # parallel, when metric has data-derived parameters. + with config_context(working_memory=0.1): # to have more than 1 chunk + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + + expected_dist = squareform(pdist(X, metric=metric)) + dist = np.vstack(tuple(dist_function(X, metric=metric, n_jobs=n_jobs))) + + assert_allclose(dist, expected_dist) + + +@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) +def test_pairwise_distances_data_derived_params_error(metric): + # check that pairwise_distances raises an error when Y is passed but + # metric has data-derived params that are not provided by the user. 
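# A short illustration (assuming the documented behaviour of pairwise_distances
# forwarding extra keyword arguments to the metric) of how callers avoid the
# error checked below: when both X and Y are given, data-derived parameters
# such as 'V' for "seuclidean" must be supplied explicitly.
import numpy as np
from sklearn.metrics import pairwise_distances

_rng = np.random.RandomState(0)
_X, _Y = _rng.random_sample((10, 3)), _rng.random_sample((12, 3))
_V = np.var(np.vstack([_X, _Y]), axis=0, ddof=1)  # per-feature variances
_D = pairwise_distances(_X, _Y, metric="seuclidean", V=_V)  # no ValueError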
+ rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + Y = rng.random_sample((100, 10)) + + with pytest.raises( + ValueError, + match=rf"The '(V|VI)' parameter is required for the {metric} metric", + ): + pairwise_distances(X, Y, metric=metric) + + +@pytest.mark.parametrize( + "metric", + [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "hamming", + "mahalanobis", + "minkowski", + "seuclidean", + "sqeuclidean", + "cityblock", + "cosine", + "euclidean", + ], +) +@pytest.mark.parametrize("y_is_x", [True, False], ids=["Y is X", "Y is not X"]) +def test_numeric_pairwise_distances_datatypes(metric, global_dtype, y_is_x): + # Check that pairwise distances gives the same result as pdist and cdist + # regardless of input datatype when using any scipy metric for comparing + # numeric vectors + # + # This test is necessary because pairwise_distances used to throw an + # error when using metric='seuclidean' and the input data was not + # of type np.float64 (#15730) + + rng = np.random.RandomState(0) + + X = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + + params = {} + if y_is_x: + Y = X + expected_dist = squareform(pdist(X, metric=metric)) + else: + Y = rng.random_sample((5, 4)).astype(global_dtype, copy=False) + expected_dist = cdist(X, Y, metric=metric) + # precompute parameters for seuclidean & mahalanobis when x is not y + if metric == "seuclidean": + params = {"V": np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=np.float64)} + elif metric == "mahalanobis": + params = {"VI": np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} + + dist = pairwise_distances(X, Y, metric=metric, **params) + + assert_allclose(dist, expected_dist) + + +@pytest.mark.parametrize( + "pairwise_distances_func", + [pairwise_distances, pairwise_distances_argmin, pairwise_distances_argmin_min], +) +def test_nan_euclidean_support(pairwise_distances_func): + """Check that `nan_euclidean` is lenient with `nan` values.""" + + X = [[0, 1], [1, np.nan], [2, 3], [3, 5]] + output = pairwise_distances_func(X, X, metric="nan_euclidean") + + assert not np.isnan(output).any() + + +def test_nan_euclidean_constant_input_argmin(): + """Check that the behavior of constant input is the same in the case of + full of nan vector and full of zero vector. + """ + + X_nan = [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]] + argmin_nan = pairwise_distances_argmin(X_nan, X_nan, metric="nan_euclidean") + + X_const = [[0, 0], [0, 0], [0, 0]] + argmin_const = pairwise_distances_argmin(X_const, X_const, metric="nan_euclidean") + + assert_allclose(argmin_nan, argmin_const) + + +@pytest.mark.parametrize( + "X,Y,expected_distance", + [ + ( + ["a", "ab", "abc"], + None, + [[0.0, 1.0, 2.0], [1.0, 0.0, 1.0], [2.0, 1.0, 0.0]], + ), + ( + ["a", "ab", "abc"], + ["a", "ab"], + [[0.0, 1.0], [1.0, 0.0], [2.0, 1.0]], + ), + ], +) +def test_pairwise_dist_custom_metric_for_string(X, Y, expected_distance): + """Check pairwise_distances with lists of strings as input.""" + + def dummy_string_similarity(x, y): + return np.abs(len(x) - len(y)) + + actual_distance = pairwise_distances(X=X, Y=Y, metric=dummy_string_similarity) + assert_allclose(actual_distance, expected_distance) + + +def test_pairwise_dist_custom_metric_for_bool(): + """Check that pairwise_distances does not convert boolean input to float + when using a custom metric. 
+ """ + + def dummy_bool_dist(v1, v2): + # dummy distance func using `&` and thus relying on the input data being boolean + return 1 - (v1 & v2).sum() / (v1 | v2).sum() + + X = np.array([[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]], dtype=bool) + + expected_distance = np.array( + [ + [0.0, 0.5, 0.75], + [0.5, 0.0, 0.5], + [0.75, 0.5, 0.0], + ] + ) + + actual_distance = pairwise_distances(X=X, metric=dummy_bool_dist) + assert_allclose(actual_distance, expected_distance) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_manhattan_readonly_dataset(csr_container): + # Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/7981 + matrices1 = [csr_container(np.ones((5, 5)))] + matrices2 = [csr_container(np.ones((5, 5)))] + # Joblib memory maps datasets which makes them read-only. + # The following call was reporting as failing in #7981, but this must pass. + Parallel(n_jobs=2, max_nbytes=0)( + delayed(manhattan_distances)(m1, m2) for m1, m2 in zip(matrices1, matrices2) + ) + + +# TODO(1.8): remove +def test_force_all_finite_rename_warning(): + X = np.random.uniform(size=(10, 10)) + Y = np.random.uniform(size=(10, 10)) + + msg = "'force_all_finite' was renamed to 'ensure_all_finite'" + + with pytest.warns(FutureWarning, match=msg): + check_pairwise_arrays(X, Y, force_all_finite=True) + + with pytest.warns(FutureWarning, match=msg): + pairwise_distances(X, Y, force_all_finite=True) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..0ea6d5d094d5602ce4e3d4161b398d42c65677e6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -0,0 +1,1643 @@ +import itertools +import re +import warnings +from functools import partial + +import numpy as np +import pytest +from scipy.spatial.distance import cdist + +from sklearn.metrics import euclidean_distances, pairwise_distances +from sklearn.metrics._pairwise_distances_reduction import ( + ArgKmin, + ArgKminClassMode, + BaseDistancesReductionDispatcher, + RadiusNeighbors, + RadiusNeighborsClassMode, + sqeuclidean_row_norms, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_array_equal, + create_memmap_backed_data, +) +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.parallel import _get_threadpool_controller + +# Common supported metric between scipy.spatial.distance.cdist +# and BaseDistanceReductionDispatcher. +# This allows constructing tests to check consistency of results +# of concrete BaseDistanceReductionDispatcher on some metrics using APIs +# from scipy and numpy. +CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "cityblock", + "euclidean", + "minkowski", + "seuclidean", +] + + +def _get_metric_params_list(metric: str, n_features: int, seed: int = 1): + """Return list of dummy DistanceMetric kwargs for tests.""" + + # Distinguishing on cases not to compute unneeded datastructures. 
+ rng = np.random.RandomState(seed) + + if metric == "minkowski": + minkowski_kwargs = [ + dict(p=1.5), + dict(p=2), + dict(p=3), + dict(p=np.inf), + dict(p=3, w=rng.rand(n_features)), + ] + + return minkowski_kwargs + + if metric == "seuclidean": + return [dict(V=rng.rand(n_features))] + + # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. + # In those cases, no kwargs is needed. + return [{}] + + +def assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, +): + """Check that the distances of common neighbors are equal up to tolerance. + + This does not check if there are missing neighbors in either result set. + Missingness is handled by assert_no_missing_neighbors. + """ + # Compute a mapping from indices to distances for each result set and + # check that the computed neighbors with matching indices are within + # the expected distance tolerance. + indices_to_dist_a = dict(zip(indices_row_a, dist_row_a)) + indices_to_dist_b = dict(zip(indices_row_b, dist_row_b)) + + common_indices = set(indices_row_a).intersection(set(indices_row_b)) + for idx in common_indices: + dist_a = indices_to_dist_a[idx] + dist_b = indices_to_dist_b[idx] + try: + assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol) + except AssertionError as e: + # Wrap exception to provide more context while also including + # the original exception with the computed absolute and + # relative differences. + raise AssertionError( + f"Query vector with index {query_idx} lead to different distances" + f" for common neighbor with index {idx}:" + f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and" + f" rtol={rtol})" + ) from e + + +def assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, +): + """Compare the indices of neighbors in two results sets. + + Any neighbor index with a distance below the precision threshold should + match one in the other result set. We ignore the last few neighbors beyond + the threshold as those can typically be missing due to rounding errors. + + For radius queries, the threshold is just the radius minus the expected + precision level. + + For k-NN queries, it is the maximum distance to the k-th neighbor minus the + expected precision level. + """ + mask_a = dist_row_a < threshold + mask_b = dist_row_b < threshold + missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b) + missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a) + if len(missing_from_a) > 0 or len(missing_from_b) > 0: + raise AssertionError( + f"Query vector with index {query_idx} lead to mismatched result indices:\n" + f"neighbors in b missing from a: {missing_from_a}\n" + f"neighbors in a missing from b: {missing_from_b}\n" + f"dist_row_a={dist_row_a}\n" + f"dist_row_b={dist_row_b}\n" + f"indices_row_a={indices_row_a}\n" + f"indices_row_b={indices_row_b}\n" + ) + + +def assert_compatible_argkmin_results( + neighbors_dists_a, + neighbors_dists_b, + neighbors_indices_a, + neighbors_indices_b, + rtol=1e-5, + atol=1e-6, +): + """Assert that argkmin results are valid up to rounding errors. + + This function asserts that the results of argkmin queries are valid up to: + - rounding error tolerance on distance values; + - permutations of indices for distances values that differ up to the + expected precision level. + + Furthermore, the distances must be sorted. 
+ + To be used for testing neighbors queries on float32 datasets: we accept + neighbors rank swaps only if they are caused by small rounding errors on + the distance computations. + """ + is_sorted = lambda a: np.all(a[:-1] <= a[1:]) + + assert ( + neighbors_dists_a.shape + == neighbors_dists_b.shape + == neighbors_indices_a.shape + == neighbors_indices_b.shape + ), "Arrays of results have incompatible shapes." + + n_queries, _ = neighbors_dists_a.shape + + # Asserting equality results one row at a time + for query_idx in range(n_queries): + dist_row_a = neighbors_dists_a[query_idx] + dist_row_b = neighbors_dists_b[query_idx] + indices_row_a = neighbors_indices_a[query_idx] + indices_row_b = neighbors_indices_b[query_idx] + + assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" + assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" + + assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, + ) + + # Check that any neighbor with distances below the rounding error + # threshold have matching indices. The threshold is the distance to the + # k-th neighbors minus the expected precision level: + # + # (1 - rtol) * dist_k - atol + # + # Where dist_k is defined as the maximum distance to the kth-neighbor + # among the two result sets. This way of defining the threshold is + # stricter than taking the minimum of the two. + threshold = (1 - rtol) * np.maximum( + np.max(dist_row_a), np.max(dist_row_b) + ) - atol + assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, + ) + + +def _non_trivial_radius( + *, + X=None, + Y=None, + metric=None, + precomputed_dists=None, + expected_n_neighbors=10, + n_subsampled_queries=10, + **metric_kwargs, +): + # Find a non-trivial radius using a small subsample of the pairwise + # distances between X and Y: we want to return around expected_n_neighbors + # on average. Yielding too many results would make the test slow (because + # checking the results is expensive for large result sets), yielding 0 most + # of the time would make the test useless. + assert precomputed_dists is not None or metric is not None, ( + "Either metric or precomputed_dists must be provided." + ) + + if precomputed_dists is None: + assert X is not None + assert Y is not None + sampled_dists = pairwise_distances(X, Y, metric=metric, **metric_kwargs) + else: + sampled_dists = precomputed_dists[:n_subsampled_queries].copy() + sampled_dists.sort(axis=1) + return sampled_dists[:, expected_n_neighbors].mean() + + +def assert_compatible_radius_results( + neighbors_dists_a, + neighbors_dists_b, + neighbors_indices_a, + neighbors_indices_b, + radius, + check_sorted=True, + rtol=1e-5, + atol=1e-6, +): + """Assert that radius neighborhood results are valid up to: + + - relative and absolute tolerance on computed distance values + - permutations of indices for distances values that differ up to + a precision level + - missing or extra last elements if their distance is + close to the radius + + To be used for testing neighbors queries on float32 datasets: we + accept neighbors rank swaps only if they are caused by small + rounding errors on the distance computations. + + Input arrays must be sorted w.r.t distances. 
+ """ + is_sorted = lambda a: np.all(a[:-1] <= a[1:]) + + assert ( + len(neighbors_dists_a) + == len(neighbors_dists_b) + == len(neighbors_indices_a) + == len(neighbors_indices_b) + ) + + n_queries = len(neighbors_dists_a) + + # Asserting equality of results one vector at a time + for query_idx in range(n_queries): + dist_row_a = neighbors_dists_a[query_idx] + dist_row_b = neighbors_dists_b[query_idx] + indices_row_a = neighbors_indices_a[query_idx] + indices_row_b = neighbors_indices_b[query_idx] + + if check_sorted: + assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}" + assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}" + + assert len(dist_row_a) == len(indices_row_a) + assert len(dist_row_b) == len(indices_row_b) + + # Check that all distances are within the requested radius + if len(dist_row_a) > 0: + max_dist_a = np.max(dist_row_a) + assert max_dist_a <= radius, ( + f"Largest returned distance {max_dist_a} not within requested" + f" radius {radius} on row {query_idx}" + ) + if len(dist_row_b) > 0: + max_dist_b = np.max(dist_row_b) + assert max_dist_b <= radius, ( + f"Largest returned distance {max_dist_b} not within requested" + f" radius {radius} on row {query_idx}" + ) + + assert_same_distances_for_common_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + rtol, + atol, + ) + + threshold = (1 - rtol) * radius - atol + assert_no_missing_neighbors( + query_idx, + dist_row_a, + dist_row_b, + indices_row_a, + indices_row_b, + threshold, + ) + + +FLOAT32_TOLS = { + "atol": 1e-7, + "rtol": 1e-5, +} +FLOAT64_TOLS = { + "atol": 1e-9, + "rtol": 1e-7, +} +ASSERT_RESULT = { + (ArgKmin, np.float64): partial(assert_compatible_argkmin_results, **FLOAT64_TOLS), + (ArgKmin, np.float32): partial(assert_compatible_argkmin_results, **FLOAT32_TOLS), + ( + RadiusNeighbors, + np.float64, + ): partial(assert_compatible_radius_results, **FLOAT64_TOLS), + ( + RadiusNeighbors, + np.float32, + ): partial(assert_compatible_radius_results, **FLOAT32_TOLS), +} + + +def test_assert_compatible_argkmin_results(): + atol = 1e-7 + rtol = 0.0 + tols = dict(atol=atol, rtol=rtol) + + eps = atol / 3 + _1m = 1.0 - eps + _1p = 1.0 + eps + + _6_1m = 6.1 - eps + _6_1p = 6.1 + eps + + ref_dist = np.array( + [ + [1.2, 2.5, _6_1m, 6.1, _6_1p], + [_1m, _1m, 1, _1p, _1p], + ] + ) + ref_indices = np.array( + [ + [1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + ] + ) + + # Sanity check: compare the reference results to themselves. + assert_compatible_argkmin_results( + ref_dist, ref_dist, ref_indices, ref_indices, rtol + ) + + # Apply valid permutation on indices: the last 3 points are all very close + # to one another so we accept any permutation on their rankings. + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[1, 2, 5, 4, 3]]), + **tols, + ) + + # The last few indices do not necessarily have to match because of the rounding + # errors on the distances: there could be tied results at the boundary. + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]), + np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[1, 2, 3, 6, 7]]), + **tols, + ) + + # All points have close distances so any ranking permutation + # is valid for this query result. 
+ assert_compatible_argkmin_results( + np.array([[_1m, 1, _1p, _1p, _1p]]), + np.array([[1, 1, 1, 1, _1p]]), + np.array([[7, 6, 8, 10, 9]]), + np.array([[6, 9, 7, 8, 10]]), + **tols, + ) + + # They could also be nearly truncation of very large nearly tied result + # sets hence all indices can also be distinct in this case: + assert_compatible_argkmin_results( + np.array([[_1m, 1, _1p, _1p, _1p]]), + np.array([[_1m, 1, 1, 1, _1p]]), + np.array([[34, 30, 8, 12, 24]]), + np.array([[42, 1, 21, 13, 3]]), + **tols, + ) + + # Apply invalid permutation on indices: permuting the ranks of the 2 + # nearest neighbors is invalid because the distance values are too + # different. + msg = re.escape( + "Query vector with index 0 lead to different distances for common neighbor with" + " index 1: dist_a=1.2 vs dist_b=2.5" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[2, 1, 3, 4, 5]]), + **tols, + ) + + # Detect missing indices within the expected precision level, even when the + # distances match exactly. + msg = re.escape( + "neighbors in b missing from a: [12]\nneighbors in a missing from b: [1]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[12, 2, 4, 11, 3]]), + **tols, + ) + + # Detect missing indices outside the expected precision level. + msg = re.escape( + "neighbors in b missing from a: []\nneighbors in a missing from b: [3]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[_1m, 1.0, _6_1m, 6.1, _6_1p]]), + np.array([[1.0, 1.0, _6_1m, 6.1, 7]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[2, 1, 4, 5, 12]]), + **tols, + ) + + # Detect missing indices outside the expected precision level, in the other + # direction: + msg = re.escape( + "neighbors in b missing from a: [5]\nneighbors in a missing from b: []" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[_1m, 1.0, _6_1m, 6.1, 7]]), + np.array([[1.0, 1.0, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 12]]), + np.array([[2, 1, 5, 3, 4]]), + **tols, + ) + + # Distances aren't properly sorted + msg = "Distances aren't sorted on row 0" + with pytest.raises(AssertionError, match=msg): + assert_compatible_argkmin_results( + np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]), + np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]), + np.array([[1, 2, 3, 4, 5]]), + np.array([[2, 1, 4, 5, 3]]), + **tols, + ) + + +@pytest.mark.parametrize("check_sorted", [True, False]) +def test_assert_compatible_radius_results(check_sorted): + atol = 1e-7 + rtol = 0.0 + tols = dict(atol=atol, rtol=rtol) + + eps = atol / 3 + _1m = 1.0 - eps + _1p = 1.0 + eps + _6_1m = 6.1 - eps + _6_1p = 6.1 + eps + + ref_dist = [ + np.array([1.2, 2.5, _6_1m, 6.1, _6_1p]), + np.array([_1m, 1, _1p, _1p]), + ] + + ref_indices = [ + np.array([1, 2, 3, 4, 5]), + np.array([6, 7, 8, 9]), + ] + + # Sanity check: compare the reference results to themselves. 
+ assert_compatible_radius_results( + ref_dist, + ref_dist, + ref_indices, + ref_indices, + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + + # Apply valid permutation on indices + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([1, 2, 4, 5, 3])]), + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + assert_compatible_radius_results( + np.array([np.array([_1m, _1m, 1, _1p, _1p])]), + np.array([np.array([_1m, _1m, 1, _1p, _1p])]), + np.array([np.array([6, 7, 8, 9, 10])]), + np.array([np.array([6, 9, 7, 8, 10])]), + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + + # Apply invalid permutation on indices + msg = re.escape( + "Query vector with index 0 lead to different distances for common neighbor with" + " index 1: dist_a=1.2 vs dist_b=2.5" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 3, 4, 5])]), + radius=7.0, + check_sorted=check_sorted, + **tols, + ) + + # Having extra last or missing elements is valid if they are in the + # tolerated rounding error range: [(1 - rtol) * radius - atol, radius] + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1])]), + np.array([np.array([1, 2, 3, 4, 5, 7])]), + np.array([np.array([1, 2, 3, 6])]), + radius=_6_1p, + check_sorted=check_sorted, + **tols, + ) + + # Any discrepancy outside the tolerated rounding error range is invalid and + # indicates a missing neighbor in one of the result sets. 
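# A worked instance (using the same atol/rtol as this test) of the tolerated
# band at the radius boundary: with rtol=0 and atol=1e-7 the threshold is
# (1 - rtol) * radius - atol, so only neighbors whose distance falls inside the
# tiny interval [threshold, radius] may legitimately differ between result sets.
_radius, _rtol, _atol = 6.1, 0.0, 1e-7
_threshold = (1 - _rtol) * _radius - _atol
assert 0 < _radius - _threshold <= 1.5e-7  # the band is roughly 1e-7 wide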
+ msg = re.escape( + "Query vector with index 0 lead to mismatched result indices:\nneighbors in b" + " missing from a: []\nneighbors in a missing from b: [3]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, 6])]), + np.array([np.array([1.2, 2.5])]), + np.array([np.array([1, 2, 3])]), + np.array([np.array([1, 2])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + msg = re.escape( + "Query vector with index 0 lead to mismatched result indices:\nneighbors in b" + " missing from a: [4]\nneighbors in a missing from b: [2]" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.1, 2.5])]), + np.array([np.array([1.2, 2, 2.5])]), + np.array([np.array([1, 2, 3])]), + np.array([np.array([1, 4, 3])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + + # Radius upper bound is strictly checked + msg = re.escape( + "Largest returned distance 6.100000033333333 not within requested radius 6.1 on" + " row 0" + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]), + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=6.1, + check_sorted=check_sorted, + **tols, + ) + + if check_sorted: + # Distances aren't properly sorted + msg = "Distances aren't sorted on row 0" + with pytest.raises(AssertionError, match=msg): + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=_6_1p, + check_sorted=True, + **tols, + ) + else: + assert_compatible_radius_results( + np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]), + np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]), + np.array([np.array([1, 2, 3, 4, 5])]), + np.array([np.array([2, 1, 4, 5, 3])]), + radius=_6_1p, + check_sorted=False, + **tols, + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_reduction_is_usable_for(csr_container): + rng = np.random.RandomState(0) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + X_csr = csr_container(X) + Y_csr = csr_container(Y) + metric = "manhattan" + + # Must be usable for all possible pair of {dense, sparse} datasets + assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric) + assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric) + + assert BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.float64), Y.astype(np.float64), metric + ) + + assert BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.float32), Y.astype(np.float32), metric + ) + + assert not BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.int64), Y.astype(np.int64), metric + ) + + assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc") + assert not 
BaseDistancesReductionDispatcher.is_usable_for( + X.astype(np.float32), Y, metric + ) + assert not BaseDistancesReductionDispatcher.is_usable_for( + X, Y.astype(np.int32), metric + ) + + # F-ordered arrays are not supported + assert not BaseDistancesReductionDispatcher.is_usable_for( + np.asfortranarray(X), Y, metric + ) + + assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric="euclidean") + assert BaseDistancesReductionDispatcher.is_usable_for( + X, Y_csr, metric="sqeuclidean" + ) + + # FIXME: the current Cython implementation is too slow for a large number of + # features. We temporarily disable it to fallback on SciPy's implementation. + # See: https://github.com/scikit-learn/scikit-learn/issues/28191 + assert not BaseDistancesReductionDispatcher.is_usable_for( + X_csr, Y_csr, metric="sqeuclidean" + ) + assert not BaseDistancesReductionDispatcher.is_usable_for( + X_csr, Y_csr, metric="euclidean" + ) + + # CSR matrices without non-zeros elements aren't currently supported + # TODO: support CSR matrices without non-zeros elements + X_csr_0_nnz = csr_container(X * 0) + assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric) + + # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features) + # aren't supported as of now. + # See: https://github.com/scikit-learn/scikit-learn/issues/23653 + # TODO: support CSR matrices with int64 indices and indptr + X_csr_int64 = csr_container(X) + X_csr_int64.indices = X_csr_int64.indices.astype(np.int64) + assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric) + + +def test_argkmin_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "euclidean" + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): + ArgKmin.compute(X=X.astype(np.float32), Y=Y, k=k, metric=metric) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): + ArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + ArgKmin.compute(X=X, Y=Y, k=-1, metric=metric) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + ArgKmin.compute(X=X, Y=Y, k=0, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + ArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + ArgKmin.compute(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric) + + # A UserWarning must be raised in this case. + unused_metric_kwargs = {"p": 3} + + message = r"Some metric_kwargs have been passed \({'p': 3}\) but" + + with pytest.warns(UserWarning, match=message): + ArgKmin.compute( + X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs + ) + + # A UserWarning must be raised in this case. 
+ metric_kwargs = { + "p": 3, # unused + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + + message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'" + + with pytest.warns(UserWarning, match=message): + ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs) + + # No user warning must be raised in this case. + metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs) + + # No user warning must be raised in this case. + metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs) + + +def test_argkmin_classmode_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "manhattan" + + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): + ArgKminClassMode.compute( + X=X.astype(np.float32), + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): + ArgKminClassMode.compute( + X=X, + Y=Y.astype(np.int32), + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=-1, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=0, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="Unrecognized metric"): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric="wrong metric", + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + ArgKminClassMode.compute( + X=np.array([1.0, 2.0]), + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + ArgKminClassMode.compute( + X=np.asfortranarray(X), + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + non_existent_weights_strategy = "non_existent_weights_strategy" + message = ( + "Only the 'uniform' or 'distance' weights options are supported at this time. " + f"Got: weights='{non_existent_weights_strategy}'." 
+ ) + with pytest.raises(ValueError, match=message): + ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric=metric, + weights=non_existent_weights_strategy, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + ) + + # TODO: introduce assertions on UserWarnings once the Euclidean specialisation + # of ArgKminClassMode is supported. + + +def test_radius_neighbors_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "euclidean" + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises( + ValueError, + match=msg, + ): + RadiusNeighbors.compute( + X=X.astype(np.float32), Y=Y, radius=radius, metric=metric + ) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises( + ValueError, + match=msg, + ): + RadiusNeighbors.compute(X=X, Y=Y.astype(np.int32), radius=radius, metric=metric) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + RadiusNeighbors.compute(X=X, Y=Y, radius=-1, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + RadiusNeighbors.compute(X=X, Y=Y, radius=radius, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + RadiusNeighbors.compute( + X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + RadiusNeighbors.compute( + X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric + ) + + unused_metric_kwargs = {"p": 3} + + # A UserWarning must be raised in this case. + message = r"Some metric_kwargs have been passed \({'p': 3}\) but" + + with pytest.warns(UserWarning, match=message): + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs + ) + + # A UserWarning must be raised in this case. + metric_kwargs = { + "p": 3, # unused + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + + message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'" + + with pytest.warns(UserWarning, match=message): + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs + ) + + # No user warning must be raised in this case. + metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs + ) + + # No user warning must be raised in this case. 
+ metric_kwargs = { + "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2), + } + with warnings.catch_warnings(): + warnings.simplefilter("error", category=UserWarning) + RadiusNeighbors.compute( + X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs + ) + + +def test_radius_neighbors_classmode_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "manhattan" + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float32 and Y.dtype=float64" + ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X.astype(np.float32), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + msg = ( + "Only float64 or float32 datasets pairs are supported at this time, " + "got: X.dtype=float64 and Y.dtype=int32" + ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y.astype(np.int32), + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=-1, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="Unrecognized metric"): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=-1, + metric="wrong_metric", + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + RadiusNeighborsClassMode.compute( + X=np.array([1.0, 2.0]), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + RadiusNeighborsClassMode.compute( + X=np.asfortranarray(X), + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + non_existent_weights_strategy = "non_existent_weights_strategy" + msg = ( + "Only the 'uniform' or 'distance' weights options are supported at this time. " + f"Got: weights='{non_existent_weights_strategy}'." 
+ ) + with pytest.raises(ValueError, match=msg): + RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric="wrong_metric", + weights=non_existent_weights_strategy, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=None, + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_chunk_size_agnosticism( + global_random_seed, + Dispatcher, + dtype, + n_features=100, +): + """Check that results do not depend on the chunk size.""" + rng = np.random.RandomState(global_random_seed) + spread = 100 + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) + X = rng.rand(n_samples_X, n_features).astype(dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + ref_dist, ref_indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=256, # default + metric="manhattan", + return_distance=True, + **compute_parameters, + ) + + dist, indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=41, + metric="manhattan", + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, dtype)]( + ref_dist, dist, ref_indices, indices, **check_parameters + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_n_threads_agnosticism( + global_random_seed, + Dispatcher, + dtype, + n_features=100, +): + """Check that results do not depend on the number of threads.""" + rng = np.random.RandomState(global_random_seed) + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) + spread = 100 + X = rng.rand(n_samples_X, n_features).astype(dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + ref_dist, ref_indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=25, # make sure we use multiple threads + return_distance=True, + **compute_parameters, + ) + + with _get_threadpool_controller().limit(limits=1, user_api="openmp"): + dist, indices = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=25, + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, dtype)]( + ref_dist, dist, ref_indices, indices, **check_parameters + ) + + +@pytest.mark.parametrize( + "Dispatcher, dtype", + [ + (ArgKmin, np.float64), + (RadiusNeighbors, np.float32), + (ArgKmin, np.float32), + (RadiusNeighbors, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_format_agnosticism( + global_random_seed, + Dispatcher, + dtype, + csr_container, +): + """Check that results do not depend on the format (dense, sparse) of the input.""" + rng = np.random.RandomState(global_random_seed) + spread = 100 + n_samples, n_features = 100, 100 + + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + X_csr = csr_container(X) + Y_csr = 
csr_container(Y) + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + # Adjusting the radius to ensure that the expected results is neither + # trivially empty nor too large. + radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean") + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + dist_dense, indices_dense = Dispatcher.compute( + X, + Y, + parameter, + chunk_size=50, + return_distance=True, + **compute_parameters, + ) + + for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)): + if _X is X and _Y is Y: + continue + dist, indices = Dispatcher.compute( + _X, + _Y, + parameter, + chunk_size=50, + return_distance=True, + **compute_parameters, + ) + ASSERT_RESULT[(Dispatcher, dtype)]( + dist_dense, + dist, + indices_dense, + indices, + **check_parameters, + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +def test_strategies_consistency( + global_random_seed, + global_dtype, + Dispatcher, + n_features=10, +): + """Check that the results do not depend on the strategy used.""" + rng = np.random.RandomState(global_random_seed) + metric = rng.choice( + np.array( + [ + "euclidean", + "minkowski", + "manhattan", + "haversine", + ], + dtype=object, + ) + ) + n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False) + spread = 100 + X = rng.rand(n_samples_X, n_features).astype(global_dtype) * spread + Y = rng.rand(n_samples_Y, n_features).astype(global_dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + radius = _non_trivial_radius(X=X, Y=Y, metric=metric) + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + dist_par_X, indices_par_X = Dispatcher.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0], + # To be sure to use parallelization + chunk_size=n_samples_X // 4, + strategy="parallel_on_X", + return_distance=True, + **compute_parameters, + ) + + dist_par_Y, indices_par_Y = Dispatcher.compute( + X, + Y, + parameter, + metric=metric, + # Taking the first + metric_kwargs=_get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0], + # To be sure to use parallelization + chunk_size=n_samples_Y // 4, + strategy="parallel_on_Y", + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, global_dtype)]( + dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters + ) + + +# "Concrete Dispatchers"-specific tests + + +@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_pairwise_distances_argkmin( + global_random_seed, + metric, + strategy, + dtype, + csr_container, + n_queries=5, + n_samples=100, + k=10, +): + rng = np.random.RandomState(global_random_seed) + n_features = rng.choice([50, 500]) + translation = rng.choice([0, 1e6]) + spread = 1000 + X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread + Y = translation + rng.rand(n_samples, 
n_features).astype(dtype) * spread + + X_csr = csr_container(X) + Y_csr = csr_container(Y) + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + metric_kwargs = _get_metric_params_list(metric, n_features)[0] + + # Reference for argkmin results + if metric == "euclidean": + # Compare to scikit-learn GEMM optimized implementation + dist_matrix = euclidean_distances(X, Y) + else: + dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) + # Taking argkmin (indices of the k smallest values) + argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k] + # Getting the associated distances + argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64) + for row_idx in range(argkmin_indices_ref.shape[0]): + argkmin_distances_ref[row_idx] = dist_matrix[ + row_idx, argkmin_indices_ref[row_idx] + ] + + for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)): + argkmin_distances, argkmin_indices = ArgKmin.compute( + _X, + _Y, + k, + metric=metric, + metric_kwargs=metric_kwargs, + return_distance=True, + # So as to have more than a chunk, forcing parallelism. + chunk_size=n_samples // 4, + strategy=strategy, + ) + + ASSERT_RESULT[(ArgKmin, dtype)]( + argkmin_distances, + argkmin_distances_ref, + argkmin_indices, + argkmin_indices_ref, + ) + + +@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) +@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y")) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_pairwise_distances_radius_neighbors( + global_random_seed, + metric, + strategy, + dtype, + n_queries=5, + n_samples=100, +): + rng = np.random.RandomState(global_random_seed) + n_features = rng.choice([50, 500]) + translation = rng.choice([0, 1e6]) + spread = 1000 + X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread + Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + + metric_kwargs = _get_metric_params_list( + metric, n_features, seed=global_random_seed + )[0] + + # Reference for argkmin results + if metric == "euclidean": + # Compare to scikit-learn GEMM optimized implementation + dist_matrix = euclidean_distances(X, Y) + else: + dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs) + + radius = _non_trivial_radius(precomputed_dists=dist_matrix) + + # Getting the neighbors for a given radius + neigh_indices_ref = [] + neigh_distances_ref = [] + + for row in dist_matrix: + ind = np.arange(row.shape[0])[row <= radius] + dist = row[ind] + + sort = np.argsort(dist) + ind, dist = ind[sort], dist[sort] + + neigh_indices_ref.append(ind) + neigh_distances_ref.append(dist) + + neigh_distances, neigh_indices = RadiusNeighbors.compute( + X, + Y, + radius, + metric=metric, + metric_kwargs=metric_kwargs, + return_distance=True, + # So as to have more than a chunk, forcing parallelism. 
+ chunk_size=n_samples // 4, + strategy=strategy, + sort_results=True, + ) + + ASSERT_RESULT[(RadiusNeighbors, dtype)]( + neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref, radius + ) + + +@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors]) +@pytest.mark.parametrize("metric", ["manhattan", "euclidean"]) +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +def test_memmap_backed_data( + metric, + Dispatcher, + dtype, +): + """Check that the results do not depend on the datasets writability.""" + rng = np.random.RandomState(0) + spread = 100 + n_samples, n_features = 128, 10 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Create read only datasets + X_mm, Y_mm = create_memmap_backed_data([X, Y]) + + if Dispatcher is ArgKmin: + parameter = 10 + check_parameters = {} + compute_parameters = {} + else: + # Scaling the radius slightly with the numbers of dimensions + radius = 10 ** np.log(n_features) + parameter = radius + check_parameters = {"radius": radius} + compute_parameters = {"sort_results": True} + + ref_dist, ref_indices = Dispatcher.compute( + X, + Y, + parameter, + metric=metric, + return_distance=True, + **compute_parameters, + ) + + dist_mm, indices_mm = Dispatcher.compute( + X_mm, + Y_mm, + parameter, + metric=metric, + return_distance=True, + **compute_parameters, + ) + + ASSERT_RESULT[(Dispatcher, dtype)]( + ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters + ) + + +@pytest.mark.parametrize("dtype", [np.float64, np.float32]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sqeuclidean_row_norms( + global_random_seed, + dtype, + csr_container, +): + rng = np.random.RandomState(global_random_seed) + spread = 100 + n_samples = rng.choice([97, 100, 101, 1000]) + n_features = rng.choice([5, 10, 100]) + num_threads = rng.choice([1, 2, 8]) + X = rng.rand(n_samples, n_features).astype(dtype) * spread + + X_csr = csr_container(X) + + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 + sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads) + + sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads) + + assert_allclose(sq_row_norm_reference, sq_row_norm) + assert_allclose(sq_row_norm_reference, sq_row_norm_csr) + + with pytest.raises(ValueError): + X = np.asfortranarray(X) + sqeuclidean_row_norms(X, num_threads=num_threads) + + +def test_argkmin_classmode_strategy_consistent(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "manhattan" + + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + results_X = ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + strategy="parallel_on_X", + ) + results_Y = ArgKminClassMode.compute( + X=X, + Y=Y, + k=k, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + strategy="parallel_on_Y", + ) + assert_array_equal(results_X, results_Y) + + +@pytest.mark.parametrize("outlier_label", [None, 0, 3, 6, 9]) +def test_radius_neighbors_classmode_strategy_consistent(outlier_label): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "manhattan" + + weights = "uniform" + Y_labels = rng.randint(low=0, high=10, size=100) + unique_Y_labels = np.unique(Y_labels) + results_X = 
RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + strategy="parallel_on_X", + ) + results_Y = RadiusNeighborsClassMode.compute( + X=X, + Y=Y, + radius=radius, + metric=metric, + weights=weights, + Y_labels=Y_labels, + unique_Y_labels=unique_Y_labels, + outlier_label=outlier_label, + strategy="parallel_on_Y", + ) + assert_allclose(results_X, results_Y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_ranking.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_ranking.py new file mode 100644 index 0000000000000000000000000000000000000000..7d740249f8aba4d5a87ecd2d6a16087557335d9a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_ranking.py @@ -0,0 +1,2270 @@ +import math +import re + +import numpy as np +import pytest +from scipy import stats + +from sklearn import datasets, svm +from sklearn.datasets import make_multilabel_classification +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + accuracy_score, + auc, + average_precision_score, + coverage_error, + dcg_score, + det_curve, + label_ranking_average_precision_score, + label_ranking_loss, + ndcg_score, + precision_recall_curve, + roc_auc_score, + roc_curve, + top_k_accuracy_score, +) +from sklearn.metrics._ranking import _dcg_sample_scores, _ndcg_sample_scores +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import label_binarize +from sklearn.random_projection import _sparse_random_matrix +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import softmax +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import ( + check_array, + check_consistent_length, + check_random_state, +) + +############################################################################### +# Utilities for testing + +CURVE_FUNCS = [ + det_curve, + precision_recall_curve, + roc_curve, +] + + +def make_prediction(dataset=None, binary=False): + """Make some classification predictions on a toy dataset using a SVC + + If binary is True restrict to a binary classification problem instead of a + multiclass classification problem + """ + + if dataset is None: + # import some data to play with + dataset = datasets.load_iris() + + X = dataset.data + y = dataset.target + + if binary: + # restrict to a binary classification task + X, y = X[y < 2], y[y < 2] + + n_samples, n_features = X.shape + p = np.arange(n_samples) + + rng = check_random_state(37) + rng.shuffle(p) + X, y = X[p], y[p] + half = int(n_samples / 2) + + # add noisy features to make the problem harder and avoid perfect results + rng = np.random.RandomState(0) + X = np.c_[X, rng.randn(n_samples, 200 * n_features)] + + # run classifier, get class probabilities and label predictions + clf = svm.SVC(kernel="linear", probability=True, random_state=0) + y_score = clf.fit(X[:half], y[:half]).predict_proba(X[half:]) + + if binary: + # only interested in probabilities of the positive case + # XXX: do we really want a special API for the binary case? 
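# A quick sketch (toy data, purely illustrative) of why column 1 is selected
# just below: predict_proba returns one column per entry of clf.classes_, so
# for a binary {0, 1} problem the positive-class probabilities sit in column 1.
import numpy as np
from sklearn.linear_model import LogisticRegression

_X_demo = np.array([[0.0], [1.0], [2.0], [3.0]])
_y_demo = np.array([0, 0, 1, 1])
_clf = LogisticRegression().fit(_X_demo, _y_demo)
assert _clf.predict_proba(_X_demo).shape == (4, 2)
assert list(_clf.classes_) == [0, 1]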
+ y_score = y_score[:, 1] + + y_pred = clf.predict(X[half:]) + y_true = y[half:] + return y_true, y_pred, y_score + + +############################################################################### +# Tests + + +def _auc(y_true, y_score): + """Alternative implementation to check for correctness of + `roc_auc_score`.""" + pos_label = np.unique(y_true)[1] + + # Count the number of times positive samples are correctly ranked above + # negative samples. + pos = y_score[y_true == pos_label] + neg = y_score[y_true != pos_label] + diff_matrix = pos.reshape(1, -1) - neg.reshape(-1, 1) + n_correct = np.sum(diff_matrix > 0) + + return n_correct / float(len(pos) * len(neg)) + + +def _average_precision(y_true, y_score): + """Alternative implementation to check for correctness of + `average_precision_score`. + + Note that this implementation fails on some edge cases. + For example, for constant predictions e.g. [0.5, 0.5, 0.5], + y_true = [1, 0, 0] returns an average precision of 0.33... + but y_true = [0, 0, 1] returns 1.0. + """ + pos_label = np.unique(y_true)[1] + n_pos = np.sum(y_true == pos_label) + order = np.argsort(y_score)[::-1] + y_score = y_score[order] + y_true = y_true[order] + + score = 0 + for i in range(len(y_score)): + if y_true[i] == pos_label: + # Compute precision up to document i + # i.e, percentage of relevant documents up to document i. + prec = 0 + for j in range(0, i + 1): + if y_true[j] == pos_label: + prec += 1.0 + prec /= i + 1.0 + score += prec + + return score / n_pos + + +def _average_precision_slow(y_true, y_score): + """A second alternative implementation of average precision that closely + follows the Wikipedia article's definition (see References). This should + give identical results as `average_precision_score` for all inputs. + + References + ---------- + .. [1] `Wikipedia entry for the Average precision + `_ + """ + precision, recall, threshold = precision_recall_curve(y_true, y_score) + precision = list(reversed(precision)) + recall = list(reversed(recall)) + average_precision = 0 + for i in range(1, len(precision)): + average_precision += precision[i] * (recall[i] - recall[i - 1]) + return average_precision + + +def _partial_roc_auc_score(y_true, y_predict, max_fpr): + """Alternative implementation to check for correctness of `roc_auc_score` + with `max_fpr` set. 
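+    The partial AUC is rescaled with formula (5) from McClish (1989) so that
+    a chance-level classifier maps to 0.5 and a perfect one to 1.0; for
+    example, with max_fpr=0.5 the chance-level area is 0.125 and the maximum
+    attainable area is 0.5.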
+ """ + + def _partial_roc(y_true, y_predict, max_fpr): + fpr, tpr, _ = roc_curve(y_true, y_predict) + new_fpr = fpr[fpr <= max_fpr] + new_fpr = np.append(new_fpr, max_fpr) + new_tpr = tpr[fpr <= max_fpr] + idx_out = np.argmax(fpr > max_fpr) + idx_in = idx_out - 1 + x_interp = [fpr[idx_in], fpr[idx_out]] + y_interp = [tpr[idx_in], tpr[idx_out]] + new_tpr = np.append(new_tpr, np.interp(max_fpr, x_interp, y_interp)) + return (new_fpr, new_tpr) + + new_fpr, new_tpr = _partial_roc(y_true, y_predict, max_fpr) + partial_auc = auc(new_fpr, new_tpr) + + # Formula (5) from McClish 1989 + fpr1 = 0 + fpr2 = max_fpr + min_area = 0.5 * (fpr2 - fpr1) * (fpr2 + fpr1) + max_area = fpr2 - fpr1 + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +@pytest.mark.parametrize("drop", [True, False]) +def test_roc_curve(drop): + # Test Area under Receiver Operating Characteristic (ROC) curve + y_true, _, y_score = make_prediction(binary=True) + expected_auc = _auc(y_true, y_score) + + fpr, tpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=drop) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, expected_auc, decimal=2) + assert_almost_equal(roc_auc, roc_auc_score(y_true, y_score)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_end_points(): + # Make sure that roc_curve returns a curve start at 0 and ending and + # 1 even in corner cases + rng = np.random.RandomState(0) + y_true = np.array([0] * 50 + [1] * 50) + y_pred = rng.randint(3, size=100) + fpr, tpr, thr = roc_curve(y_true, y_pred, drop_intermediate=True) + assert fpr[0] == 0 + assert fpr[-1] == 1 + assert fpr.shape == tpr.shape + assert fpr.shape == thr.shape + + +def test_roc_returns_consistency(): + # Test whether the returned threshold matches up with tpr + # make small toy dataset + y_true, _, y_score = make_prediction(binary=True) + fpr, tpr, thresholds = roc_curve(y_true, y_score) + + # use the given thresholds to determine the tpr + tpr_correct = [] + for t in thresholds: + tp = np.sum((y_score >= t) & y_true) + p = np.sum(y_true) + tpr_correct.append(1.0 * tp / p) + + # compare tpr and tpr_correct to see if the thresholds' order was correct + assert_array_almost_equal(tpr, tpr_correct, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_multi(): + # roc_curve not applicable for multi-class problems + y_true, _, y_score = make_prediction(binary=False) + + with pytest.raises(ValueError): + roc_curve(y_true, y_score) + + +def test_roc_curve_confidence(): + # roc_curve for confidence scores + y_true, _, y_score = make_prediction(binary=True) + + fpr, tpr, thresholds = roc_curve(y_true, y_score - 0.5) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.90, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_hard(): + # roc_curve for hard decisions + y_true, pred, y_score = make_prediction(binary=True) + + # always predict one + trivial_pred = np.ones(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # always predict zero + trivial_pred = np.zeros(y_true.shape) + fpr, tpr, thresholds = roc_curve(y_true, trivial_pred) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.50, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # 
hard decisions + fpr, tpr, thresholds = roc_curve(y_true, pred) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, 0.78, decimal=2) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_one_label(): + y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + y_pred = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + # assert there are warnings + expected_message = ( + "No negative samples in y_true, false positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + fpr, tpr, thresholds = roc_curve(y_true, y_pred) + + # all true labels, all fpr should be nan + assert_array_equal(fpr, np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + # assert there are warnings + expected_message = ( + "No positive samples in y_true, true positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + fpr, tpr, thresholds = roc_curve([1 - x for x in y_true], y_pred) + # all negative labels, all tpr should be nan + assert_array_equal(tpr, np.full(len(thresholds), np.nan)) + assert fpr.shape == tpr.shape + assert fpr.shape == thresholds.shape + + +def test_roc_curve_toydata(): + # Binary classification + y_true = [0, 1] + y_score = [0, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [0, 1] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1, 1]) + assert_array_almost_equal(fpr, [0, 0, 1]) + assert_almost_equal(roc_auc, 0.0) + + y_true = [1, 0] + y_score = [1, 1] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + y_true = [1, 0] + y_score = [1, 0] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 0, 1]) + assert_array_almost_equal(fpr, [0, 1, 1]) + assert_almost_equal(roc_auc, 1.0) + + y_true = [1, 0] + y_score = [0.5, 0.5] + tpr, fpr, _ = roc_curve(y_true, y_score) + roc_auc = roc_auc_score(y_true, y_score) + assert_array_almost_equal(tpr, [0, 1]) + assert_array_almost_equal(fpr, [0, 1]) + assert_almost_equal(roc_auc, 0.5) + + # case with no positive samples + y_true = [0, 0] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no positive sample in y_true + expected_message = ( + "No positive samples in y_true, true positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + assert_array_almost_equal(tpr, [0.0, 0.5, 1.0]) + assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan]) + expected_message = ( + "Only one class is present in y_true. " + "ROC AUC score is not defined in that case." 
+ ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + auc = roc_auc_score(y_true, y_score) + assert math.isnan(auc) + + # case with no negative samples + y_true = [1, 1] + y_score = [0.25, 0.75] + # assert UndefinedMetricWarning because of no negative sample in y_true + expected_message = ( + "No negative samples in y_true, false positive value should be meaningless" + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + tpr, fpr, _ = roc_curve(y_true, y_score) + assert_array_almost_equal(tpr, [np.nan, np.nan, np.nan]) + assert_array_almost_equal(fpr, [0.0, 0.5, 1.0]) + expected_message = ( + "Only one class is present in y_true. " + "ROC AUC score is not defined in that case." + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + auc = roc_auc_score(y_true, y_score) + assert math.isnan(auc) + + # Multi-label classification task + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [0, 1]]) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="macro") + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.0) + + y_true = np.array([[0, 1], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="macro") + with pytest.warns(UndefinedMetricWarning, match=expected_message): + roc_auc_score(y_true, y_score, average="weighted") + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0, 1], [1, 0]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0) + + y_true = np.array([[1, 0], [0, 1]]) + y_score = np.array([[0.5, 0.5], [0.5, 0.5]]) + assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) + assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) + + +def test_roc_curve_drop_intermediate(): + # Test that drop_intermediate drops the correct thresholds + y_true = [0, 0, 0, 0, 1, 1] + y_score = [0.0, 0.2, 0.5, 0.6, 0.7, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + assert_array_almost_equal(thresholds, [np.inf, 1.0, 0.7, 0.0]) + + # Test dropping thresholds with repeating scores + y_true = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + y_score = [0.0, 0.1, 0.6, 0.6, 0.7, 0.8, 0.9, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0] + tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True) + assert_array_almost_equal(thresholds, [np.inf, 1.0, 0.9, 0.7, 0.6, 0.0]) + + +def test_roc_curve_fpr_tpr_increasing(): + # Ensure that fpr and tpr returned by roc_curve are increasing. + # Construct an edge case with float y_score and sample_weight + # when some adjacent values of fpr and tpr are actually the same. 
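+    # With fractional sample weights the cumulative sums behind fpr/tpr are
+    # floats, so this guards against rounding making them locally decreasing.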
+ y_true = [0, 0, 1, 1, 1] + y_score = [0.1, 0.7, 0.3, 0.4, 0.5] + sample_weight = np.repeat(0.2, 5) + fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) + assert (np.diff(fpr) < 0).sum() == 0 + assert (np.diff(tpr) < 0).sum() == 0 + + +def test_auc(): + # Test Area Under Curve (AUC) computation + x = [0, 1] + y = [0, 1] + assert_array_almost_equal(auc(x, y), 0.5) + x = [1, 0] + y = [0, 1] + assert_array_almost_equal(auc(x, y), 0.5) + x = [1, 0, 0] + y = [0, 1, 1] + assert_array_almost_equal(auc(x, y), 0.5) + x = [0, 1] + y = [1, 1] + assert_array_almost_equal(auc(x, y), 1) + x = [0, 0.5, 1] + y = [0, 0.5, 1] + assert_array_almost_equal(auc(x, y), 0.5) + + +def test_auc_errors(): + # Incompatible shapes + with pytest.raises(ValueError): + auc([0.0, 0.5, 1.0], [0.1, 0.2]) + + # Too few x values + with pytest.raises(ValueError): + auc([0.0], [0.1]) + + # x is not in order + x = [2, 1, 3, 4] + y = [5, 6, 7, 8] + error_message = "x is neither increasing nor decreasing : {}".format(np.array(x)) + with pytest.raises(ValueError, match=re.escape(error_message)): + auc(x, y) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 0, 2]), [0, 1, 2]), + (np.array([0, 1, 0, 2]), None), + (["a", "b", "a", "c"], ["a", "b", "c"]), + (["a", "b", "a", "c"], None), + ], +) +def test_multiclass_ovo_roc_auc_toydata(y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2 + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2 + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2 + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + ovo_unweighted_score = (average_score_01 + average_score_02 + average_score_12) / 3 + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), + ovo_unweighted_score, + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_weighted_score, + ) + + # Check that average=None raises NotImplemented error + error_message = "average=None is not implemented for multi_class='ovo'." 
+ with pytest.raises(NotImplementedError, match=error_message): + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo", average=None) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 2, 0, 2]), [0, 1, 2]), + (np.array(["a", "d", "a", "d"]), ["a", "b", "d"]), + ], +) +def test_multiclass_ovo_roc_auc_toydata_binary(y_true, labels): + # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true + # + # on a small example, representative of an expected use case. + y_scores = np.array( + [[0.2, 0.0, 0.8], [0.6, 0.0, 0.4], [0.55, 0.0, 0.45], [0.4, 0.0, 0.6]] + ) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1, 0], [0.2, 0.6, 0.55, 0.4]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0, 1], [0.8, 0.4, 0.45, 0.6]) + ovo_score = (score_01 + score_10) / 2 + + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), ovo_score + ) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + assert_almost_equal( + roc_auc_score( + y_true, y_scores, labels=labels, multi_class="ovo", average="weighted" + ), + ovo_score, + ) + + +@pytest.mark.parametrize( + "y_true, labels", + [ + (np.array([0, 1, 2, 2]), None), + (["a", "b", "c", "c"], None), + ([0, 1, 2, 2], [0, 1, 2]), + (["a", "b", "c", "c"], ["a", "b", "c"]), + ], +) +def test_multiclass_ovr_roc_auc_toydata(y_true, labels): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]] + ) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. 
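+    # In the one-vs-rest scheme each class is taken as positive in turn:
+    # y_true is binarised against the remaining classes and scored with the
+    # corresponding column of y_scores.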
+ out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels, average=None), + [out_0, out_1, out_2], + ) + + # Compute unweighted results (default behaviour is average="macro") + result_unweighted = (out_0 + out_1 + out_2) / 3.0 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels), + result_unweighted, + ) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2000) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score( + y_true, y_scores, multi_class="ovr", labels=labels, average="weighted" + ), + result_weighted, + ) + + +@pytest.mark.parametrize( + "multi_class, average", + [ + ("ovr", "macro"), + ("ovr", "micro"), + ("ovo", "macro"), + ], +) +def test_perfect_imperfect_chance_multiclass_roc_auc(multi_class, average): + y_true = np.array([3, 1, 2, 0]) + + # Perfect classifier (from a ranking point of view) has roc_auc_score = 1.0 + y_perfect = [ + [0.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.75, 0.05, 0.05, 0.15], + ] + assert_almost_equal( + roc_auc_score(y_true, y_perfect, multi_class=multi_class, average=average), + 1.0, + ) + + # Imperfect classifier has roc_auc_score < 1.0 + y_imperfect = [ + [0.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + assert ( + roc_auc_score(y_true, y_imperfect, multi_class=multi_class, average=average) + < 1.0 + ) + + # Chance level classifier has roc_auc_score = 5.0 + y_chance = 0.25 * np.ones((4, 4)) + assert roc_auc_score( + y_true, y_chance, multi_class=multi_class, average=average + ) == pytest.approx(0.5) + + +def test_micro_averaged_ovr_roc_auc(global_random_seed): + seed = global_random_seed + # Let's generate a set of random predictions and matching true labels such + # that the predictions are not perfect. To make the problem more interesting, + # we use an imbalanced class distribution (by using different parameters + # in the Dirichlet prior (conjugate prior of the multinomial distribution). 
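+    # Dirichlet draws are probability vectors (each row sums to one), so they
+    # can be used directly as multiclass probability estimates.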
+ y_pred = stats.dirichlet.rvs([2.0, 1.0, 0.5], size=1000, random_state=seed) + y_true = np.asarray( + [ + stats.multinomial.rvs(n=1, p=y_pred_i, random_state=seed).argmax() + for y_pred_i in y_pred + ] + ) + y_onehot = label_binarize(y_true, classes=[0, 1, 2]) + fpr, tpr, _ = roc_curve(y_onehot.ravel(), y_pred.ravel()) + roc_auc_by_hand = auc(fpr, tpr) + roc_auc_auto = roc_auc_score(y_true, y_pred, multi_class="ovr", average="micro") + assert roc_auc_by_hand == pytest.approx(roc_auc_auto) + + +@pytest.mark.parametrize( + "msg, y_true, labels", + [ + ("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), + ( + "Parameter 'labels' must be unique", + np.array(["a", "b", "c", "c"]), + ["a", "a", "b"], + ), + ( + ( + "Number of classes in y_true not equal to the number of columns " + "in 'y_score'" + ), + np.array([0, 2, 0, 2]), + None, + ), + ( + "Parameter 'labels' must be ordered", + np.array(["a", "b", "c", "c"]), + ["a", "c", "b"], + ), + ( + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3" + ), + np.array([0, 1, 2, 2]), + [0, 1], + ), + ( + ( + "Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3" + ), + np.array(["a", "b", "c", "c"]), + ["a", "b"], + ), + ( + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3" + ), + np.array([0, 1, 2, 2]), + [0, 1, 2, 3], + ), + ( + ( + "Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3" + ), + np.array(["a", "b", "c", "c"]), + ["a", "b", "c", "d"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "e"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "d"]), + ["a", "b", "c"], + ), + ( + "'y_true' contains labels not in parameter 'labels'", + np.array([0, 1, 2, 3]), + [0, 1, 2], + ), + ], +) +@pytest.mark.parametrize("multi_class", ["ovo", "ovr"]) +def test_roc_auc_score_multiclass_labels_error(msg, y_true, labels, multi_class): + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]] + ) + + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_scores, labels=labels, multi_class=multi_class) + + +@pytest.mark.parametrize( + "msg, kwargs", + [ + ( + ( + r"average must be one of \('macro', 'weighted', None\) for " + r"multiclass problems" + ), + {"average": "samples", "multi_class": "ovo"}, + ), + ( + ( + r"average must be one of \('micro', 'macro', 'weighted', None\) for " + r"multiclass problems" + ), + {"average": "samples", "multi_class": "ovr"}, + ), + ( + ( + r"sample_weight is not supported for multiclass one-vs-one " + r"ROC AUC, 'sample_weight' must be None in this case" + ), + {"multi_class": "ovo", "sample_weight": []}, + ), + ( + ( + r"Partial AUC computation not available in multiclass setting, " + r"'max_fpr' must be set to `None`, received `max_fpr=0.5` " + r"instead" + ), + {"multi_class": "ovo", "max_fpr": 0.5}, + ), + (r"multi_class must be in \('ovo', 'ovr'\)", {}), + ], +) +def test_roc_auc_score_multiclass_error(msg, kwargs): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. 
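+    # softmax is applied below because the multiclass modes of roc_auc_score
+    # expect probability estimates whose rows sum to one.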
+ rng = check_random_state(404) + y_score = rng.rand(20, 3) + y_prob = softmax(y_score) + y_true = rng.randint(0, 3, size=20) + with pytest.raises(ValueError, match=msg): + roc_auc_score(y_true, y_prob, **kwargs) + + +def test_auc_score_non_binary_class(): + # Test that roc_auc_score function returns an error when trying + # to compute AUC for non-binary class values. + rng = check_random_state(404) + y_pred = rng.rand(10) + # y_true contains only one class value + y_true = np.zeros(10, dtype="int") + warn_message = ( + "Only one class is present in y_true. " + "ROC AUC score is not defined in that case." + ) + with pytest.warns(UndefinedMetricWarning, match=warn_message): + roc_auc_score(y_true, y_pred) + y_true = np.ones(10, dtype="int") + with pytest.warns(UndefinedMetricWarning, match=warn_message): + roc_auc_score(y_true, y_pred) + y_true = np.full(10, -1, dtype="int") + with pytest.warns(UndefinedMetricWarning, match=warn_message): + roc_auc_score(y_true, y_pred) + + +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) +def test_binary_clf_curve_multiclass_error(curve_func): + rng = check_random_state(404) + y_true = rng.randint(0, 3, size=10) + y_pred = rng.rand(10) + msg = "multiclass format is not supported" + with pytest.raises(ValueError, match=msg): + curve_func(y_true, y_pred) + + +@pytest.mark.parametrize("curve_func", CURVE_FUNCS) +def test_binary_clf_curve_implicit_pos_label(curve_func): + # Check that using string class labels raises an informative + # error for any supported string dtype: + msg = ( + "y_true takes value in {'a', 'b'} and pos_label is " + "not specified: either make y_true take " + "value in {0, 1} or {-1, 1} or pass pos_label " + "explicitly." + ) + with pytest.raises(ValueError, match=msg): + curve_func(np.array(["a", "b"], dtype="= 0 and y_score.max() <= 1 else 0 + y_pred = (y_score > threshold).astype(np.int64) if k == 1 else y_true + + score = top_k_accuracy_score(y_true, y_score, k=k) + score_acc = accuracy_score(y_true, y_pred) + + assert score == score_acc == pytest.approx(true_score) + + +@pytest.mark.parametrize( + "y_true, true_score, labels", + [ + (np.array([0, 1, 1, 2]), 0.75, [0, 1, 2, 3]), + (np.array([0, 1, 1, 1]), 0.5, [0, 1, 2, 3]), + (np.array([1, 1, 1, 1]), 0.5, [0, 1, 2, 3]), + (np.array(["a", "e", "e", "a"]), 0.75, ["a", "b", "d", "e"]), + ], +) +@pytest.mark.parametrize("labels_as_ndarray", [True, False]) +def test_top_k_accuracy_score_multiclass_with_labels( + y_true, true_score, labels, labels_as_ndarray +): + """Test when labels and y_score are multiclass.""" + if labels_as_ndarray: + labels = np.asarray(labels) + y_score = np.array( + [ + [0.4, 0.3, 0.2, 0.1], + [0.1, 0.3, 0.4, 0.2], + [0.4, 0.1, 0.2, 0.3], + [0.3, 0.2, 0.4, 0.1], + ] + ) + + score = top_k_accuracy_score(y_true, y_score, k=2, labels=labels) + assert score == pytest.approx(true_score) + + +def test_top_k_accuracy_score_increasing(): + # Make sure increasing k leads to a higher score + X, y = datasets.make_classification( + n_classes=10, n_samples=1000, n_informative=10, random_state=0 + ) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + clf = LogisticRegression(random_state=0) + clf.fit(X_train, y_train) + + for X, y in zip((X_train, X_test), (y_train, y_test)): + scores = [ + top_k_accuracy_score(y, clf.predict_proba(X), k=k) for k in range(2, 10) + ] + + assert np.all(np.diff(scores) > 0) + + +@pytest.mark.parametrize( + "y_true, k, true_score", + [ + ([0, 1, 2, 3], 1, 0.25), + ([0, 1, 2, 3], 2, 0.5), + ([0, 1, 2, 3], 3, 1), + 
], +) +def test_top_k_accuracy_score_ties(y_true, k, true_score): + # Make sure highest indices labels are chosen first in case of ties + y_score = np.array( + [ + [5, 5, 7, 0], + [1, 5, 5, 5], + [0, 0, 3, 3], + [1, 1, 1, 1], + ] + ) + assert top_k_accuracy_score(y_true, y_score, k=k) == pytest.approx(true_score) + + +@pytest.mark.parametrize( + "y_true, k", + [ + ([0, 1, 2, 3], 4), + ([0, 1, 2, 3], 5), + ], +) +def test_top_k_accuracy_score_warning(y_true, k): + y_score = np.array( + [ + [0.4, 0.3, 0.2, 0.1], + [0.1, 0.4, 0.3, 0.2], + [0.2, 0.1, 0.4, 0.3], + [0.3, 0.2, 0.1, 0.4], + ] + ) + expected_message = ( + r"'k' \(\d+\) greater than or equal to 'n_classes' \(\d+\) will result in a " + "perfect score and is therefore meaningless." + ) + with pytest.warns(UndefinedMetricWarning, match=expected_message): + score = top_k_accuracy_score(y_true, y_score, k=k) + assert score == 1 + + +@pytest.mark.parametrize( + "y_true, y_score, labels, msg", + [ + ( + [0, 0.57, 1, 2], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + None, + "y type must be 'binary' or 'multiclass', got 'continuous'", + ), + ( + [0, 1, 2, 3], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + None, + r"Number of classes in 'y_true' \(4\) not equal to the number of " + r"classes in 'y_score' \(3\).", + ), + ( + ["c", "c", "a", "b"], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + ["a", "b", "c", "c"], + "Parameter 'labels' must be unique.", + ), + ( + ["c", "c", "a", "b"], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + ["a", "c", "b"], + "Parameter 'labels' must be ordered.", + ), + ( + [0, 0, 1, 2], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + [0, 1, 2, 3], + r"Number of given labels \(4\) not equal to the number of classes in " + r"'y_score' \(3\).", + ), + ( + [0, 0, 1, 2], + [ + [0.2, 0.1, 0.7], + [0.4, 0.3, 0.3], + [0.3, 0.4, 0.3], + [0.4, 0.5, 0.1], + ], + [0, 1, 3], + "'y_true' contains labels not in parameter 'labels'.", + ), + ( + [0, 1], + [[0.5, 0.2, 0.2], [0.3, 0.4, 0.2]], + None, + ( + "`y_true` is binary while y_score is 2d with 3 classes. If" + " `y_true` does not contain all the labels, `labels` must be provided" + ), + ), + ], +) +def test_top_k_accuracy_score_error(y_true, y_score, labels, msg): + with pytest.raises(ValueError, match=msg): + top_k_accuracy_score(y_true, y_score, k=2, labels=labels) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_label_ranking_avg_precision_score_should_allow_csr_matrix_for_y_true_input( + csr_container, +): + # Test that label_ranking_avg_precision_score accept sparse y_true. + # Non-regression test for #22575 + y_true = csr_container([[1, 0, 0], [0, 0, 1]]) + y_score = np.array([[0.5, 0.9, 0.6], [0, 0, 1]]) + result = label_ranking_average_precision_score(y_true, y_score) + assert result == pytest.approx(2 / 3) + + +@pytest.mark.parametrize( + "metric", [average_precision_score, det_curve, precision_recall_curve, roc_curve] +) +@pytest.mark.parametrize( + "classes", [(False, True), (0, 1), (0.0, 1.0), ("zero", "one")] +) +def test_ranking_metric_pos_label_types(metric, classes): + """Check that the metric works with different types of `pos_label`. + + We can expect `pos_label` to be a bool, an integer, a float, a string. + No error should be raised for those types. 
+ """ + rng = np.random.RandomState(42) + n_samples, pos_label = 10, classes[-1] + y_true = rng.choice(classes, size=n_samples, replace=True) + y_proba = rng.rand(n_samples) + result = metric(y_true, y_proba, pos_label=pos_label) + if isinstance(result, float): + assert not np.isnan(result) + else: + metric_1, metric_2, thresholds = result + assert not np.isnan(metric_1).any() + assert not np.isnan(metric_2).any() + assert not np.isnan(thresholds).any() + + +def test_roc_curve_with_probablity_estimates(global_random_seed): + """Check that thresholds do not exceed 1.0 when `y_score` is a probability + estimate. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26193 + """ + rng = np.random.RandomState(global_random_seed) + y_true = rng.randint(0, 2, size=10) + y_score = rng.rand(10) + _, _, thresholds = roc_curve(y_true, y_score) + assert np.isinf(thresholds[0]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_regression.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..396ae5d0ffae143e333f14861dc839931326a030 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_regression.py @@ -0,0 +1,636 @@ +from itertools import product + +import numpy as np +import pytest +from numpy.testing import assert_allclose +from scipy import optimize +from scipy.special import factorial, xlogy + +from sklearn.dummy import DummyRegressor +from sklearn.exceptions import UndefinedMetricWarning +from sklearn.metrics import ( + d2_absolute_error_score, + d2_pinball_score, + d2_tweedie_score, + explained_variance_score, + make_scorer, + max_error, + mean_absolute_error, + mean_absolute_percentage_error, + mean_pinball_loss, + mean_squared_error, + mean_squared_log_error, + mean_tweedie_deviance, + median_absolute_error, + r2_score, + root_mean_squared_error, + root_mean_squared_log_error, +) +from sklearn.metrics._regression import _check_reg_targets +from sklearn.model_selection import GridSearchCV +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + + +def test_regression_metrics(n_samples=50): + y_true = np.arange(n_samples) + y_pred = y_true + 1 + y_pred_2 = y_true - 1 + + assert_almost_equal(mean_squared_error(y_true, y_pred), 1.0) + assert_almost_equal( + mean_squared_log_error(y_true, y_pred), + mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred)), + ) + assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.0) + assert_almost_equal(mean_pinball_loss(y_true, y_pred), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2), 0.5) + assert_almost_equal(mean_pinball_loss(y_true, y_pred, alpha=0.4), 0.6) + assert_almost_equal(mean_pinball_loss(y_true, y_pred_2, alpha=0.4), 0.4) + assert_almost_equal(median_absolute_error(y_true, y_pred), 1.0) + mape = mean_absolute_percentage_error(y_true, y_pred) + assert np.isfinite(mape) + assert mape > 1e6 + assert_almost_equal(max_error(y_true, y_pred), 1.0) + assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) + assert_almost_equal(r2_score(y_true, y_pred, force_finite=False), 0.995, 2) + assert_almost_equal(explained_variance_score(y_true, y_pred), 1.0) + assert_almost_equal( + explained_variance_score(y_true, y_pred, force_finite=False), 1.0 + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=0), + mean_squared_error(y_true, y_pred), + ) + assert_almost_equal( + 
d2_tweedie_score(y_true, y_pred, power=0), r2_score(y_true, y_pred) + ) + dev_median = np.abs(y_true - np.median(y_true)).sum() + assert_array_almost_equal( + d2_absolute_error_score(y_true, y_pred), + 1 - np.abs(y_true - y_pred).sum() / dev_median, + ) + alpha = 0.2 + pinball_loss = lambda y_true, y_pred, alpha: alpha * np.maximum( + y_true - y_pred, 0 + ) + (1 - alpha) * np.maximum(y_pred - y_true, 0) + y_quantile = np.percentile(y_true, q=alpha * 100) + assert_almost_equal( + d2_pinball_score(y_true, y_pred, alpha=alpha), + 1 + - pinball_loss(y_true, y_pred, alpha).sum() + / pinball_loss(y_true, y_quantile, alpha).sum(), + ) + assert_almost_equal( + d2_absolute_error_score(y_true, y_pred), + d2_pinball_score(y_true, y_pred, alpha=0.5), + ) + + # Tweedie deviance needs positive y_pred, except for p=0, + # p>=2 needs positive y_true + # results evaluated by sympy + y_true = np.arange(1, 1 + n_samples) + y_pred = 2 * y_true + n = n_samples + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=-1), + 5 / 12 * n * (n**2 + 2 * n + 1), + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=1), (n + 1) * (1 - np.log(2)) + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=2), 2 * np.log(2) - 1 + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=3 / 2), + ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum(), + ) + assert_almost_equal( + mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n) + ) + + dev_mean = 2 * np.mean(xlogy(y_true, 2 * y_true / (n + 1))) + assert_almost_equal( + d2_tweedie_score(y_true, y_pred, power=1), + 1 - (n + 1) * (1 - np.log(2)) / dev_mean, + ) + + dev_mean = 2 * np.log((n + 1) / 2) - 2 / n * np.log(factorial(n)) + assert_almost_equal( + d2_tweedie_score(y_true, y_pred, power=2), 1 - (2 * np.log(2) - 1) / dev_mean + ) + + +def test_root_mean_squared_error_multioutput_raw_value(): + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/pull/16323 + mse = mean_squared_error([[1]], [[10]], multioutput="raw_values") + rmse = root_mean_squared_error([[1]], [[10]], multioutput="raw_values") + assert np.sqrt(mse) == pytest.approx(rmse) + + +def test_multioutput_regression(): + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) + + error = mean_squared_error(y_true, y_pred) + assert_almost_equal(error, (1.0 / 3 + 2.0 / 3 + 2.0 / 3) / 4.0) + + error = root_mean_squared_error(y_true, y_pred) + assert_almost_equal(error, 0.454, decimal=2) + + error = mean_squared_log_error(y_true, y_pred) + assert_almost_equal(error, 0.200, decimal=2) + + error = root_mean_squared_log_error(y_true, y_pred) + assert_almost_equal(error, 0.315, decimal=2) + + # mean_absolute_error and mean_squared_error are equal because + # it is a binary problem. 
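+    # (with 0/1 targets every elementwise error is 0 or 1, so |e| == e**2)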
+ error = mean_absolute_error(y_true, y_pred) + assert_almost_equal(error, (1.0 + 2.0 / 3) / 4.0) + + error = mean_pinball_loss(y_true, y_pred) + assert_almost_equal(error, (1.0 + 2.0 / 3) / 8.0) + + error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) + assert np.isfinite(error) + assert error > 1e6 + error = median_absolute_error(y_true, y_pred) + assert_almost_equal(error, (1.0 + 1.0) / 4.0) + + error = r2_score(y_true, y_pred, multioutput="variance_weighted") + assert_almost_equal(error, 1.0 - 5.0 / 2) + error = r2_score(y_true, y_pred, multioutput="uniform_average") + assert_almost_equal(error, -0.875) + + score = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="raw_values") + raw_expected_score = [ + 1 + - np.abs(y_true[:, i] - y_pred[:, i]).sum() + / np.abs(y_true[:, i] - np.median(y_true[:, i])).sum() + for i in range(y_true.shape[1]) + ] + # in the last case, the denominator vanishes and hence we get nan, + # but since the numerator vanishes as well the expected score is 1.0 + raw_expected_score = np.where(np.isnan(raw_expected_score), 1, raw_expected_score) + assert_array_almost_equal(score, raw_expected_score) + + score = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="uniform_average") + assert_almost_equal(score, raw_expected_score.mean()) + # constant `y_true` with force_finite=True leads to 1. or 0. + yc = [5.0, 5.0] + error = r2_score(yc, [5.0, 5.0], multioutput="variance_weighted") + assert_almost_equal(error, 1.0) + error = r2_score(yc, [5.0, 5.1], multioutput="variance_weighted") + assert_almost_equal(error, 0.0) + + # Setting force_finite=False results in the nan for 4th output propagating + error = r2_score( + y_true, y_pred, multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, np.nan) + error = r2_score(y_true, y_pred, multioutput="uniform_average", force_finite=False) + assert_almost_equal(error, np.nan) + + # Dropping the 4th output to check `force_finite=False` for nominal + y_true = y_true[:, :-1] + y_pred = y_pred[:, :-1] + error = r2_score(y_true, y_pred, multioutput="variance_weighted") + error2 = r2_score( + y_true, y_pred, multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, error2) + error = r2_score(y_true, y_pred, multioutput="uniform_average") + error2 = r2_score(y_true, y_pred, multioutput="uniform_average", force_finite=False) + assert_almost_equal(error, error2) + + # constant `y_true` with force_finite=False leads to NaN or -Inf. 
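+    # With a constant y_true the total sum of squares is zero: R^2 becomes
+    # 0/0 (NaN) for a perfect constant prediction and -inf otherwise.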
+ error = r2_score( + yc, [5.0, 5.0], multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, np.nan) + error = r2_score( + yc, [5.0, 6.0], multioutput="variance_weighted", force_finite=False + ) + assert_almost_equal(error, -np.inf) + + +def test_regression_metrics_at_limits(): + # Single-sample case + # Note: for r2 and d2_tweedie see also test_regression_single_sample + assert_almost_equal(mean_squared_error([0.0], [0.0]), 0.0) + assert_almost_equal(root_mean_squared_error([0.0], [0.0]), 0.0) + assert_almost_equal(mean_squared_log_error([0.0], [0.0]), 0.0) + assert_almost_equal(mean_absolute_error([0.0], [0.0]), 0.0) + assert_almost_equal(mean_pinball_loss([0.0], [0.0]), 0.0) + assert_almost_equal(mean_absolute_percentage_error([0.0], [0.0]), 0.0) + assert_almost_equal(median_absolute_error([0.0], [0.0]), 0.0) + assert_almost_equal(max_error([0.0], [0.0]), 0.0) + assert_almost_equal(explained_variance_score([0.0], [0.0]), 1.0) + + # Perfect cases + assert_almost_equal(r2_score([0.0, 1], [0.0, 1]), 1.0) + assert_almost_equal(d2_pinball_score([0.0, 1], [0.0, 1]), 1.0) + + # Non-finite cases + # R² and explained variance have a fix by default for non-finite cases + for s in (r2_score, explained_variance_score): + assert_almost_equal(s([0, 0], [1, -1]), 0.0) + assert_almost_equal(s([0, 0], [1, -1], force_finite=False), -np.inf) + assert_almost_equal(s([1, 1], [1, 1]), 1.0) + assert_almost_equal(s([1, 1], [1, 1], force_finite=False), np.nan) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + mean_squared_log_error([-1.0], [-1.0]) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + mean_squared_log_error([1.0, 2.0, 3.0], [1.0, -2.0, 3.0]) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0]) + msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + with pytest.raises(ValueError, match=msg): + root_mean_squared_log_error([1.0, -2.0, 3.0], [1.0, 2.0, 3.0]) + msg = ( + "Root Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." + ) + + # Tweedie deviance error + power = -1.2 + assert_allclose( + mean_tweedie_deviance([0], [1.0], power=power), 2 / (2 - power), rtol=1e-3 + ) + msg = "can only be used on strictly positive y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + assert_almost_equal(mean_tweedie_deviance([0.0], [0.0], power=0), 0.0, 2) + + power = 1.0 + msg = "only be used on non-negative y and strictly positive y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + power = 1.5 + assert_allclose(mean_tweedie_deviance([0.0], [1.0], power=power), 2 / (2 - power)) + msg = "only be used on non-negative y and strictly positive y_pred." 
+ with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + power = 2.0 + assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8) + msg = "can only be used on strictly positive y and y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + power = 3.0 + assert_allclose(mean_tweedie_deviance([1.0], [1.0], power=power), 0.00, atol=1e-8) + msg = "can only be used on strictly positive y and y_pred." + with pytest.raises(ValueError, match=msg): + mean_tweedie_deviance([0.0], [0.0], power=power) + with pytest.raises(ValueError, match=msg): + d2_tweedie_score([0.0] * 2, [0.0] * 2, power=power) + + +def test__check_reg_targets(): + # All of length 3 + EXAMPLES = [ + ("continuous", [1, 2, 3], 1), + ("continuous", [[1], [2], [3]], 1), + ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2), + ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2), + ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3), + ] + + for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES, repeat=2): + if type1 == type2 and n_out1 == n_out2: + y_type, y_check1, y_check2, _, _ = _check_reg_targets( + y1, y2, sample_weight=None, multioutput=None + ) + assert type1 == y_type + if type1 == "continuous": + assert_array_equal(y_check1, np.reshape(y1, (-1, 1))) + assert_array_equal(y_check2, np.reshape(y2, (-1, 1))) + else: + assert_array_equal(y_check1, y1) + assert_array_equal(y_check2, y2) + else: + with pytest.raises(ValueError): + _check_reg_targets(y1, y2, sample_weight=None, multioutput=None) + + +def test__check_reg_targets_exception(): + invalid_multioutput = "this_value_is_not_valid" + expected_message = ( + "Allowed 'multioutput' string values are.+You provided multioutput={!r}".format( + invalid_multioutput + ) + ) + with pytest.raises(ValueError, match=expected_message): + _check_reg_targets([1, 2, 3], [[1], [2], [3]], None, invalid_multioutput) + + +def test_regression_multioutput_array(): + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + mse = mean_squared_error(y_true, y_pred, multioutput="raw_values") + mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values") + + pbl = mean_pinball_loss(y_true, y_pred, multioutput="raw_values") + mape = mean_absolute_percentage_error(y_true, y_pred, multioutput="raw_values") + r = r2_score(y_true, y_pred, multioutput="raw_values") + evs = explained_variance_score(y_true, y_pred, multioutput="raw_values") + d2ps = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="raw_values") + evs2 = explained_variance_score( + y_true, y_pred, multioutput="raw_values", force_finite=False + ) + + assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) + assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) + assert_array_almost_equal(pbl, [0.25 / 2, 0.625 / 2], decimal=2) + assert_array_almost_equal(mape, [0.0778, 0.2262], decimal=2) + assert_array_almost_equal(r, [0.95, 0.93], decimal=2) + assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) + assert_array_almost_equal(d2ps, [0.833, 0.722], decimal=2) + assert_array_almost_equal(evs2, [0.95, 0.93], decimal=2) + + # mean_absolute_error and mean_squared_error are equal because + # it is a binary problem. 
+ y_true = [[0, 0]] * 4 + y_pred = [[1, 1]] * 4 + mse = mean_squared_error(y_true, y_pred, multioutput="raw_values") + mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values") + pbl = mean_pinball_loss(y_true, y_pred, multioutput="raw_values") + r = r2_score(y_true, y_pred, multioutput="raw_values") + d2ps = d2_pinball_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(mse, [1.0, 1.0], decimal=2) + assert_array_almost_equal(mae, [1.0, 1.0], decimal=2) + assert_array_almost_equal(pbl, [0.5, 0.5], decimal=2) + assert_array_almost_equal(r, [0.0, 0.0], decimal=2) + assert_array_almost_equal(d2ps, [0.0, 0.0], decimal=2) + + r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values") + assert_array_almost_equal(r, [0, -3.5], decimal=2) + assert np.mean(r) == r2_score( + [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="uniform_average" + ) + evs = explained_variance_score( + [[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput="raw_values" + ) + assert_array_almost_equal(evs, [0, -1.25], decimal=2) + evs2 = explained_variance_score( + [[0, -1], [0, 1]], + [[2, 2], [1, 1]], + multioutput="raw_values", + force_finite=False, + ) + assert_array_almost_equal(evs2, [-np.inf, -1.25], decimal=2) + + # Checking for the condition in which both numerator and denominator is + # zero. + y_true = [[1, 3], [1, 2]] + y_pred = [[1, 4], [1, 1]] + r2 = r2_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(r2, [1.0, -3.0], decimal=2) + assert np.mean(r2) == r2_score(y_true, y_pred, multioutput="uniform_average") + r22 = r2_score(y_true, y_pred, multioutput="raw_values", force_finite=False) + assert_array_almost_equal(r22, [np.nan, -3.0], decimal=2) + assert_almost_equal( + np.mean(r22), + r2_score(y_true, y_pred, multioutput="uniform_average", force_finite=False), + ) + + evs = explained_variance_score(y_true, y_pred, multioutput="raw_values") + assert_array_almost_equal(evs, [1.0, -3.0], decimal=2) + assert np.mean(evs) == explained_variance_score(y_true, y_pred) + d2ps = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput="raw_values") + assert_array_almost_equal(d2ps, [1.0, -1.0], decimal=2) + evs2 = explained_variance_score( + y_true, y_pred, multioutput="raw_values", force_finite=False + ) + assert_array_almost_equal(evs2, [np.nan, -3.0], decimal=2) + assert_almost_equal( + np.mean(evs2), explained_variance_score(y_true, y_pred, force_finite=False) + ) + + # Handling msle separately as it does not accept negative inputs. 
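+    # mean_squared_log_error(y, y_pred) is the MSE of log(1 + y)-transformed
+    # targets, which is what the per-output comparison below verifies.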
+ y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) + y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) + msle = mean_squared_log_error(y_true, y_pred, multioutput="raw_values") + msle2 = mean_squared_error( + np.log(1 + y_true), np.log(1 + y_pred), multioutput="raw_values" + ) + assert_array_almost_equal(msle, msle2, decimal=2) + + +def test_regression_custom_weights(): + y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] + y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] + + msew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6]) + rmsew = root_mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6]) + maew = mean_absolute_error(y_true, y_pred, multioutput=[0.4, 0.6]) + mapew = mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.4, 0.6]) + rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6]) + evsw = explained_variance_score(y_true, y_pred, multioutput=[0.4, 0.6]) + d2psw = d2_pinball_score(y_true, y_pred, alpha=0.5, multioutput=[0.4, 0.6]) + evsw2 = explained_variance_score( + y_true, y_pred, multioutput=[0.4, 0.6], force_finite=False + ) + + assert_almost_equal(msew, 0.39, decimal=2) + assert_almost_equal(rmsew, 0.59, decimal=2) + assert_almost_equal(maew, 0.475, decimal=3) + assert_almost_equal(mapew, 0.1668, decimal=2) + assert_almost_equal(rw, 0.94, decimal=2) + assert_almost_equal(evsw, 0.94, decimal=2) + assert_almost_equal(d2psw, 0.766, decimal=2) + assert_almost_equal(evsw2, 0.94, decimal=2) + + # Handling msle separately as it does not accept negative inputs. + y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) + y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) + msle = mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7]) + msle2 = mean_squared_error( + np.log(1 + y_true), np.log(1 + y_pred), multioutput=[0.3, 0.7] + ) + assert_almost_equal(msle, msle2, decimal=2) + + +@pytest.mark.parametrize("metric", [r2_score, d2_tweedie_score, d2_pinball_score]) +def test_regression_single_sample(metric): + y_true = [0] + y_pred = [1] + warning_msg = "not well-defined with less than two samples." + + # Trigger the warning + with pytest.warns(UndefinedMetricWarning, match=warning_msg): + score = metric(y_true, y_pred) + assert np.isnan(score) + + +def test_tweedie_deviance_continuity(global_random_seed): + n_samples = 100 + + rng = np.random.RandomState(global_random_seed) + + y_true = rng.rand(n_samples) + 0.1 + y_pred = rng.rand(n_samples) + 0.1 + + assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=0 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=0), + ) + + # Ws we get closer to the limit, with 1e-12 difference the + # tolerance to pass the below check increases. There are likely + # numerical precision issues on the edges of different definition + # regions. 
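+    # The Tweedie deviance is defined piecewise, with special cases at
+    # power 0, 1 and 2; these checks confirm the formula is continuous in
+    # `power` at those special values.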
+ assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=1 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=1), + rtol=1e-5, + ) + + assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=2 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), + rtol=1e-5, + ) + + assert_allclose( + mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), + rtol=1e-5, + ) + + +def test_mean_absolute_percentage_error(global_random_seed): + random_number_generator = np.random.RandomState(global_random_seed) + y_true = random_number_generator.exponential(size=100) + y_pred = 1.2 * y_true + assert mean_absolute_percentage_error(y_true, y_pred) == pytest.approx(0.2) + + +@pytest.mark.parametrize( + "distribution", ["normal", "lognormal", "exponential", "uniform"] +) +@pytest.mark.parametrize("target_quantile", [0.05, 0.5, 0.75]) +def test_mean_pinball_loss_on_constant_predictions( + distribution, target_quantile, global_random_seed +): + if not hasattr(np, "quantile"): + pytest.skip( + "This test requires a more recent version of numpy " + "with support for np.quantile." + ) + + # Check that the pinball loss is minimized by the empirical quantile. + n_samples = 3000 + rng = np.random.RandomState(global_random_seed) + data = getattr(rng, distribution)(size=n_samples) + + # Compute the best possible pinball loss for any constant predictor: + best_pred = np.quantile(data, target_quantile) + best_constant_pred = np.full(n_samples, fill_value=best_pred) + best_pbl = mean_pinball_loss(data, best_constant_pred, alpha=target_quantile) + + # Evaluate the loss on a grid of quantiles + candidate_predictions = np.quantile(data, np.linspace(0, 1, 100)) + for pred in candidate_predictions: + # Compute the pinball loss of a constant predictor: + constant_pred = np.full(n_samples, fill_value=pred) + pbl = mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + # Check that the loss of this constant predictor is greater or equal + # than the loss of using the optimal quantile (up to machine + # precision): + assert pbl >= best_pbl - np.finfo(np.float64).eps + + # Check that the value of the pinball loss matches the analytical + # formula. + expected_pbl = (pred - data[data < pred]).sum() * (1 - target_quantile) + ( + data[data >= pred] - pred + ).sum() * target_quantile + expected_pbl /= n_samples + assert_almost_equal(expected_pbl, pbl) + + # Check that we can actually recover the target_quantile by minimizing the + # pinball loss w.r.t. the constant prediction quantile. + def objective_func(x): + constant_pred = np.full(n_samples, fill_value=x) + return mean_pinball_loss(data, constant_pred, alpha=target_quantile) + + result = optimize.minimize(objective_func, data.mean()) + assert result.success + # The minimum is not unique with limited data, hence the large tolerance. + # For the normal distribution and the 0.5 quantile, the expected result is close to + # 0, hence the additional use of absolute tolerance. + assert_allclose(result.x, best_pred, rtol=1e-1, atol=1e-3) + assert result.fun == pytest.approx(best_pbl) + + +def test_dummy_quantile_parameter_tuning(global_random_seed): + # Integration test to check that it is possible to use the pinball loss to + # tune the hyperparameter of a quantile regressor. This is conceptually + # similar to the previous test but using the scikit-learn estimator and + # scoring API instead. 
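+    # make_scorer(..., greater_is_better=False) flips the sign of the loss,
+    # so GridSearchCV, which maximises the score, ends up minimising the
+    # pinball loss for the requested quantile.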
+ n_samples = 1000 + rng = np.random.RandomState(global_random_seed) + X = rng.normal(size=(n_samples, 5)) # Ignored + y = rng.exponential(size=n_samples) + + all_quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95] + for alpha in all_quantiles: + neg_mean_pinball_loss = make_scorer( + mean_pinball_loss, + alpha=alpha, + greater_is_better=False, + ) + regressor = DummyRegressor(strategy="quantile", quantile=0.25) + grid_search = GridSearchCV( + regressor, + param_grid=dict(quantile=all_quantiles), + scoring=neg_mean_pinball_loss, + ).fit(X, y) + + assert grid_search.best_params_["quantile"] == pytest.approx(alpha) + + +def test_pinball_loss_relation_with_mae(global_random_seed): + # Test that mean_pinball loss with alpha=0.5 if half of mean absolute error + rng = np.random.RandomState(global_random_seed) + n = 100 + y_true = rng.normal(size=n) + y_pred = y_true.copy() + rng.uniform(n) + assert ( + mean_absolute_error(y_true, y_pred) + == mean_pinball_loss(y_true, y_pred, alpha=0.5) * 2 + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_score_objects.py b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_score_objects.py new file mode 100644 index 0000000000000000000000000000000000000000..672ed8ae7eecc593e0aa02e76e7158c9f01e67e4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/metrics/tests/test_score_objects.py @@ -0,0 +1,1665 @@ +import numbers +import pickle +import warnings +from copy import deepcopy +from functools import partial + +import joblib +import numpy as np +import pytest +from numpy.testing import assert_allclose + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.cluster import KMeans +from sklearn.datasets import ( + load_diabetes, + make_blobs, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.linear_model import LogisticRegression, Perceptron, Ridge +from sklearn.metrics import ( + accuracy_score, + average_precision_score, + balanced_accuracy_score, + brier_score_loss, + check_scoring, + f1_score, + fbeta_score, + get_scorer, + get_scorer_names, + jaccard_score, + log_loss, + make_scorer, + matthews_corrcoef, + precision_score, + r2_score, + recall_score, + roc_auc_score, + top_k_accuracy_score, +) +from sklearn.metrics import cluster as cluster_module +from sklearn.metrics._scorer import ( + _check_multimetric_scoring, + _CurveScorer, + _MultimetricScorer, + _PassthroughScorer, + _Scorer, +) +from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split +from sklearn.multiclass import OneVsRestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import make_pipeline +from sklearn.svm import LinearSVC +from sklearn.tests.metadata_routing_common import ( + assert_request_is_empty, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.metadata_routing import MetadataRouter, MethodMapping + +REGRESSION_SCORERS = [ + "d2_absolute_error_score", + "explained_variance", + "r2", + "neg_mean_absolute_error", + "neg_mean_squared_error", + "neg_mean_absolute_percentage_error", + "neg_mean_squared_log_error", + "neg_median_absolute_error", + "neg_root_mean_squared_error", + "neg_root_mean_squared_log_error", + "mean_absolute_error", + "mean_absolute_percentage_error", + "mean_squared_error", + "median_absolute_error", + 
"neg_max_error", + "neg_mean_poisson_deviance", + "neg_mean_gamma_deviance", +] + +CLF_SCORERS = [ + "accuracy", + "balanced_accuracy", + "top_k_accuracy", + "f1", + "f1_weighted", + "f1_macro", + "f1_micro", + "roc_auc", + "average_precision", + "precision", + "precision_weighted", + "precision_macro", + "precision_micro", + "recall", + "recall_weighted", + "recall_macro", + "recall_micro", + "neg_log_loss", + "neg_brier_score", + "jaccard", + "jaccard_weighted", + "jaccard_macro", + "jaccard_micro", + "roc_auc_ovr", + "roc_auc_ovo", + "roc_auc_ovr_weighted", + "roc_auc_ovo_weighted", + "matthews_corrcoef", + "positive_likelihood_ratio", + "neg_negative_likelihood_ratio", +] + +# All supervised cluster scorers (They behave like classification metric) +CLUSTER_SCORERS = [ + "adjusted_rand_score", + "rand_score", + "homogeneity_score", + "completeness_score", + "v_measure_score", + "mutual_info_score", + "adjusted_mutual_info_score", + "normalized_mutual_info_score", + "fowlkes_mallows_score", +] + +MULTILABEL_ONLY_SCORERS = [ + "precision_samples", + "recall_samples", + "f1_samples", + "jaccard_samples", +] + +REQUIRE_POSITIVE_Y_SCORERS = ["neg_mean_poisson_deviance", "neg_mean_gamma_deviance"] + + +def _require_positive_y(y): + """Make targets strictly positive""" + offset = abs(y.min()) + 1 + y = y + offset + return y + + +def _make_estimators(X_train, y_train, y_ml_train): + # Make estimators that make sense to test various scoring methods + sensible_regr = DecisionTreeRegressor(random_state=0) + # some of the regressions scorers require strictly positive input. + sensible_regr.fit(X_train, _require_positive_y(y_train)) + sensible_clf = DecisionTreeClassifier(random_state=0) + sensible_clf.fit(X_train, y_train) + sensible_ml_clf = DecisionTreeClassifier(random_state=0) + sensible_ml_clf.fit(X_train, y_ml_train) + return dict( + [(name, sensible_regr) for name in REGRESSION_SCORERS] + + [(name, sensible_clf) for name in CLF_SCORERS] + + [(name, sensible_clf) for name in CLUSTER_SCORERS] + + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS] + ) + + +@pytest.fixture(scope="module") +def memmap_data_and_estimators(tmp_path_factory): + temp_folder = tmp_path_factory.mktemp("sklearn_test_score_objects") + X, y = make_classification(n_samples=30, n_features=5, random_state=0) + _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0) + filename = temp_folder / "test_data.pkl" + joblib.dump((X, y, y_ml), filename) + X_mm, y_mm, y_ml_mm = joblib.load(filename, mmap_mode="r") + estimators = _make_estimators(X_mm, y_mm, y_ml_mm) + + yield X_mm, y_mm, y_ml_mm, estimators + + +class EstimatorWithFit(BaseEstimator): + """Dummy estimator to test scoring validators""" + + def fit(self, X, y): + return self + + +class EstimatorWithFitAndScore(BaseEstimator): + """Dummy estimator to test scoring validators""" + + def fit(self, X, y): + return self + + def score(self, X, y): + return 1.0 + + +class EstimatorWithFitAndPredict(BaseEstimator): + """Dummy estimator to test scoring validators""" + + def fit(self, X, y): + self.y = y + return self + + def predict(self, X): + return self.y + + +class DummyScorer: + """Dummy scorer that always returns 1.""" + + def __call__(self, est, X, y): + return 1 + + +def test_all_scorers_repr(): + # Test that all scorers have a working repr + for name in get_scorer_names(): + repr(get_scorer(name)) + + +def check_scoring_validator_for_single_metric_usecases(scoring_validator): + # Test all branches of single metric usecases + estimator = 
EstimatorWithFitAndScore() + estimator.fit([[1]], [1]) + scorer = scoring_validator(estimator) + assert isinstance(scorer, _PassthroughScorer) + assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) + + estimator = EstimatorWithFitAndPredict() + estimator.fit([[1]], [1]) + pattern = ( + r"If no scoring is specified, the estimator passed should have" + r" a 'score' method\. The estimator .* does not\." + ) + with pytest.raises(TypeError, match=pattern): + scoring_validator(estimator) + + scorer = scoring_validator(estimator, scoring="accuracy") + assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) + + estimator = EstimatorWithFit() + scorer = scoring_validator(estimator, scoring="accuracy") + assert isinstance(scorer, _Scorer) + assert scorer._response_method == "predict" + + # Test the allow_none parameter for check_scoring alone + if scoring_validator is check_scoring: + estimator = EstimatorWithFit() + scorer = scoring_validator(estimator, allow_none=True) + assert scorer is None + + +@pytest.mark.parametrize( + "scoring", + ( + ("accuracy",), + ["precision"], + {"acc": "accuracy", "precision": "precision"}, + ("accuracy", "precision"), + ["precision", "accuracy"], + { + "accuracy": make_scorer(accuracy_score), + "precision": make_scorer(precision_score), + }, + ), + ids=[ + "single_tuple", + "single_list", + "dict_str", + "multi_tuple", + "multi_list", + "dict_callable", + ], +) +def test_check_scoring_and_check_multimetric_scoring(scoring): + check_scoring_validator_for_single_metric_usecases(check_scoring) + # To make sure the check_scoring is correctly applied to the constituent + # scorers + + estimator = LinearSVC(random_state=0) + estimator.fit([[1], [2], [3]], [1, 1, 0]) + + scorers = _check_multimetric_scoring(estimator, scoring) + assert isinstance(scorers, dict) + assert sorted(scorers.keys()) == sorted(list(scoring)) + assert all([isinstance(scorer, _Scorer) for scorer in list(scorers.values())]) + assert all(scorer._response_method == "predict" for scorer in scorers.values()) + + if "acc" in scoring: + assert_almost_equal( + scorers["acc"](estimator, [[1], [2], [3]], [1, 0, 0]), 2.0 / 3.0 + ) + if "accuracy" in scoring: + assert_almost_equal( + scorers["accuracy"](estimator, [[1], [2], [3]], [1, 0, 0]), 2.0 / 3.0 + ) + if "precision" in scoring: + assert_almost_equal( + scorers["precision"](estimator, [[1], [2], [3]], [1, 0, 0]), 0.5 + ) + + +@pytest.mark.parametrize( + "scoring, msg", + [ + ( + (make_scorer(precision_score), make_scorer(accuracy_score)), + "One or more of the elements were callables", + ), + ([5], "Non-string types were found"), + ((make_scorer(precision_score),), "One or more of the elements were callables"), + ((), "Empty list was given"), + (("f1", "f1"), "Duplicate elements were found"), + ({4: "accuracy"}, "Non-string types were found in the keys"), + ({}, "An empty dict was passed"), + ], + ids=[ + "tuple of callables", + "list of int", + "tuple of one callable", + "empty tuple", + "non-unique str", + "non-string key dict", + "empty dict", + ], +) +def test_check_scoring_and_check_multimetric_scoring_errors(scoring, msg): + # Make sure it raises errors when scoring parameter is not valid. + # More weird corner cases are tested at test_validation.py + estimator = EstimatorWithFitAndPredict() + estimator.fit([[1]], [1]) + + with pytest.raises(ValueError, match=msg): + _check_multimetric_scoring(estimator, scoring=scoring) + + +def test_check_scoring_gridsearchcv(): + # test that check_scoring works on GridSearchCV and pipeline. 
+ # slightly redundant non-regression test. + + grid = GridSearchCV(LinearSVC(), param_grid={"C": [0.1, 1]}, cv=3) + scorer = check_scoring(grid, scoring="f1") + assert isinstance(scorer, _Scorer) + assert scorer._response_method == "predict" + + pipe = make_pipeline(LinearSVC()) + scorer = check_scoring(pipe, scoring="f1") + assert isinstance(scorer, _Scorer) + assert scorer._response_method == "predict" + + # check that cross_val_score definitely calls the scorer + # and doesn't make any assumptions about the estimator apart from having a + # fit. + scores = cross_val_score( + EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1], scoring=DummyScorer(), cv=3 + ) + assert_array_equal(scores, 1) + + +@pytest.mark.parametrize( + "scorer_name, metric", + [ + ("f1", f1_score), + ("f1_weighted", partial(f1_score, average="weighted")), + ("f1_macro", partial(f1_score, average="macro")), + ("f1_micro", partial(f1_score, average="micro")), + ("precision", precision_score), + ("precision_weighted", partial(precision_score, average="weighted")), + ("precision_macro", partial(precision_score, average="macro")), + ("precision_micro", partial(precision_score, average="micro")), + ("recall", recall_score), + ("recall_weighted", partial(recall_score, average="weighted")), + ("recall_macro", partial(recall_score, average="macro")), + ("recall_micro", partial(recall_score, average="micro")), + ("jaccard", jaccard_score), + ("jaccard_weighted", partial(jaccard_score, average="weighted")), + ("jaccard_macro", partial(jaccard_score, average="macro")), + ("jaccard_micro", partial(jaccard_score, average="micro")), + ("top_k_accuracy", top_k_accuracy_score), + ("matthews_corrcoef", matthews_corrcoef), + ], +) +def test_classification_binary_scores(scorer_name, metric): + # check consistency between score and scorer for scores supporting + # binary classification. + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + + score = get_scorer(scorer_name)(clf, X_test, y_test) + expected_score = metric(y_test, clf.predict(X_test)) + assert_almost_equal(score, expected_score) + + +@pytest.mark.parametrize( + "scorer_name, metric", + [ + ("accuracy", accuracy_score), + ("balanced_accuracy", balanced_accuracy_score), + ("f1_weighted", partial(f1_score, average="weighted")), + ("f1_macro", partial(f1_score, average="macro")), + ("f1_micro", partial(f1_score, average="micro")), + ("precision_weighted", partial(precision_score, average="weighted")), + ("precision_macro", partial(precision_score, average="macro")), + ("precision_micro", partial(precision_score, average="micro")), + ("recall_weighted", partial(recall_score, average="weighted")), + ("recall_macro", partial(recall_score, average="macro")), + ("recall_micro", partial(recall_score, average="micro")), + ("jaccard_weighted", partial(jaccard_score, average="weighted")), + ("jaccard_macro", partial(jaccard_score, average="macro")), + ("jaccard_micro", partial(jaccard_score, average="micro")), + ], +) +def test_classification_multiclass_scores(scorer_name, metric): + # check consistency between score and scorer for scores supporting + # multiclass classification. 
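The convention these consistency tests rely on, reduced to its simplest form (an illustrative sketch, not part of the vendored file): a metric compares two label arrays, while a scorer is called on the fitted estimator together with the raw data and computes the predictions itself.

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, get_scorer

    X, y = make_classification(random_state=0)
    clf = LogisticRegression().fit(X, y)
    # scorer(estimator, X, y) should agree with metric(y, estimator.predict(X))
    assert get_scorer("accuracy")(clf, X, y) == accuracy_score(y, clf.predict(X))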
+ X, y = make_classification( + n_classes=3, n_informative=3, n_samples=30, random_state=0 + ) + + # use `stratify` = y to ensure train and test sets capture all classes + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=0, stratify=y + ) + + clf = DecisionTreeClassifier(random_state=0) + clf.fit(X_train, y_train) + score = get_scorer(scorer_name)(clf, X_test, y_test) + expected_score = metric(y_test, clf.predict(X_test)) + assert score == pytest.approx(expected_score) + + +def test_custom_scorer_pickling(): + # test that custom scorer can be pickled + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LinearSVC(random_state=0) + clf.fit(X_train, y_train) + + scorer = make_scorer(fbeta_score, beta=2) + score1 = scorer(clf, X_test, y_test) + unpickled_scorer = pickle.loads(pickle.dumps(scorer)) + score2 = unpickled_scorer(clf, X_test, y_test) + assert score1 == pytest.approx(score2) + + # smoke test the repr: + repr(fbeta_score) + + +def test_regression_scorers(): + # Test regression scorers. + diabetes = load_diabetes() + X, y = diabetes.data, diabetes.target + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = Ridge() + clf.fit(X_train, y_train) + score1 = get_scorer("r2")(clf, X_test, y_test) + score2 = r2_score(y_test, clf.predict(X_test)) + assert_almost_equal(score1, score2) + + +def test_thresholded_scorers(): + # Test scorers that take thresholds. + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LogisticRegression(random_state=0) + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.decision_function(X_test)) + score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + assert_almost_equal(score1, score2) + assert_almost_equal(score1, score3) + + logscore = get_scorer("neg_log_loss")(clf, X_test, y_test) + logloss = log_loss(y_test, clf.predict_proba(X_test)) + assert_almost_equal(-logscore, logloss) + + # same for an estimator without decision_function + clf = DecisionTreeClassifier() + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + assert_almost_equal(score1, score2) + + # test with a regressor (no decision_function) + reg = DecisionTreeRegressor() + reg.fit(X_train, y_train) + err_msg = "DecisionTreeRegressor has none of the following attributes" + with pytest.raises(AttributeError, match=err_msg): + get_scorer("roc_auc")(reg, X_test, y_test) + + # Test that an exception is raised on more than two classes + X, y = make_blobs(random_state=0, centers=3) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf.fit(X_train, y_train) + with pytest.raises(ValueError, match="multi_class must be in \\('ovo', 'ovr'\\)"): + get_scorer("roc_auc")(clf, X_test, y_test) + + # test error is raised with a single class present in model + # (predict_proba shape is not suitable for binary auc) + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = DecisionTreeClassifier() + clf.fit(X_train, np.zeros_like(y_train)) + with pytest.raises(ValueError, match="need classifier with two classes"): + get_scorer("roc_auc")(clf, X_test, y_test) + + # for proba scorers + with pytest.raises(ValueError, match="need classifier with 
two classes"): + get_scorer("neg_log_loss")(clf, X_test, y_test) + + +def test_thresholded_scorers_multilabel_indicator_data(): + # Test that the scorer work with multilabel-indicator format + # for multilabel and multi-output multi-class classifier + X, y = make_multilabel_classification(allow_unlabeled=False, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + # Multi-output multi-class predict_proba + clf = DecisionTreeClassifier() + clf.fit(X_train, y_train) + y_proba = clf.predict_proba(X_test) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T) + assert_almost_equal(score1, score2) + + # Multilabel predict_proba + clf = OneVsRestClassifier(DecisionTreeClassifier()) + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.predict_proba(X_test)) + assert_almost_equal(score1, score2) + + # Multilabel decision function + clf = OneVsRestClassifier(LinearSVC(random_state=0)) + clf.fit(X_train, y_train) + score1 = get_scorer("roc_auc")(clf, X_test, y_test) + score2 = roc_auc_score(y_test, clf.decision_function(X_test)) + assert_almost_equal(score1, score2) + + +def test_supervised_cluster_scorers(): + # Test clustering scorers against gold standard labeling. + X, y = make_blobs(random_state=0, centers=2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + km = KMeans(n_clusters=3, n_init="auto") + km.fit(X_train) + for name in CLUSTER_SCORERS: + score1 = get_scorer(name)(km, X_test, y_test) + score2 = getattr(cluster_module, name)(y_test, km.predict(X_test)) + assert_almost_equal(score1, score2) + + +def test_raises_on_score_list(): + # Test that when a list of scores is returned, we raise proper errors. + X, y = make_blobs(random_state=0) + f1_scorer_no_average = make_scorer(f1_score, average=None) + clf = DecisionTreeClassifier() + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=f1_scorer_no_average) + grid_search = GridSearchCV( + clf, scoring=f1_scorer_no_average, param_grid={"max_depth": [1, 2]} + ) + with pytest.raises(ValueError): + grid_search.fit(X, y) + + +def test_classification_scorer_sample_weight(): + # Test that classification scorers support sample_weight or raise sensible + # errors + + # Unlike the metrics invariance test, in the scorer case it's harder + # to ensure that, on the classifier output, weighted and unweighted + # scores really should be unequal. + X, y = make_classification(random_state=0) + _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0) + split = train_test_split(X, y, y_ml, random_state=0) + X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split + + sample_weight = np.ones_like(y_test) + sample_weight[:10] = 0 + + # get sensible estimators for each metric + estimator = _make_estimators(X_train, y_train, y_ml_train) + + for name in get_scorer_names(): + scorer = get_scorer(name) + if name in REGRESSION_SCORERS: + # skip the regression scores + continue + if name == "top_k_accuracy": + # in the binary case k > 1 will always lead to a perfect score + scorer._kwargs = {"k": 1} + if name in MULTILABEL_ONLY_SCORERS: + target = y_ml_test + else: + target = y_test + try: + weighted = scorer( + estimator[name], X_test, target, sample_weight=sample_weight + ) + ignored = scorer(estimator[name], X_test[10:], target[10:]) + unweighted = scorer(estimator[name], X_test, target) + # this should not raise. 
sample_weight should be ignored if None. + _ = scorer(estimator[name], X_test[:10], target[:10], sample_weight=None) + assert weighted != unweighted, ( + f"scorer {name} behaves identically when called with " + f"sample weights: {weighted} vs {unweighted}" + ) + assert_almost_equal( + weighted, + ignored, + err_msg=( + f"scorer {name} behaves differently " + "when ignoring samples and setting " + f"sample_weight to 0: {weighted} vs {ignored}" + ), + ) + + except TypeError as e: + assert "sample_weight" in str(e), ( + f"scorer {name} raises unhelpful exception when called " + f"with sample weights: {e}" + ) + + +def test_regression_scorer_sample_weight(): + # Test that regression scorers support sample_weight or raise sensible + # errors + + # Odd number of test samples req for neg_median_absolute_error + X, y = make_regression(n_samples=101, n_features=20, random_state=0) + y = _require_positive_y(y) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + sample_weight = np.ones_like(y_test) + # Odd number req for neg_median_absolute_error + sample_weight[:11] = 0 + + reg = DecisionTreeRegressor(random_state=0) + reg.fit(X_train, y_train) + + for name in get_scorer_names(): + scorer = get_scorer(name) + if name not in REGRESSION_SCORERS: + # skip classification scorers + continue + try: + weighted = scorer(reg, X_test, y_test, sample_weight=sample_weight) + ignored = scorer(reg, X_test[11:], y_test[11:]) + unweighted = scorer(reg, X_test, y_test) + assert weighted != unweighted, ( + f"scorer {name} behaves identically when called with " + f"sample weights: {weighted} vs {unweighted}" + ) + assert_almost_equal( + weighted, + ignored, + err_msg=( + f"scorer {name} behaves differently " + "when ignoring samples and setting " + f"sample_weight to 0: {weighted} vs {ignored}" + ), + ) + + except TypeError as e: + assert "sample_weight" in str(e), ( + f"scorer {name} raises unhelpful exception when called " + f"with sample weights: {e}" + ) + + +@pytest.mark.parametrize("name", get_scorer_names()) +def test_scorer_memmap_input(name, memmap_data_and_estimators): + # Non-regression test for #6147: some score functions would + # return singleton memmap when computed on memmap data instead of scalar + # float values. 
+ X_mm, y_mm, y_ml_mm, estimators = memmap_data_and_estimators + + if name in REQUIRE_POSITIVE_Y_SCORERS: + y_mm_1 = _require_positive_y(y_mm) + y_ml_mm_1 = _require_positive_y(y_ml_mm) + else: + y_mm_1, y_ml_mm_1 = y_mm, y_ml_mm + + # UndefinedMetricWarning for P / R scores + with ignore_warnings(): + scorer, estimator = get_scorer(name), estimators[name] + if name in MULTILABEL_ONLY_SCORERS: + score = scorer(estimator, X_mm, y_ml_mm_1) + else: + score = scorer(estimator, X_mm, y_mm_1) + assert isinstance(score, numbers.Number), name + + +def test_scoring_is_not_metric(): + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(LogisticRegression(), scoring=f1_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(LogisticRegression(), scoring=roc_auc_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(Ridge(), scoring=r2_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score) + with pytest.raises(ValueError, match="make_scorer"): + check_scoring(KMeans(), scoring=cluster_module.rand_score) + + +def test_deprecated_scorer(): + X, y = make_regression(n_samples=10, n_features=1, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + reg = DecisionTreeRegressor() + reg.fit(X_train, y_train) + deprecated_scorer = get_scorer("max_error") + with pytest.warns(DeprecationWarning): + deprecated_scorer(reg, X_test, y_test) + + +@pytest.mark.parametrize( + ( + "scorers,expected_predict_count," + "expected_predict_proba_count,expected_decision_func_count" + ), + [ + ( + { + "a1": "accuracy", + "a2": "accuracy", + "ll1": "neg_log_loss", + "ll2": "neg_log_loss", + "ra1": "roc_auc", + "ra2": "roc_auc", + }, + 1, + 1, + 1, + ), + (["roc_auc", "accuracy"], 1, 0, 1), + (["neg_log_loss", "accuracy"], 1, 1, 0), + ], +) +def test_multimetric_scorer_calls_method_once( + scorers, + expected_predict_count, + expected_predict_proba_count, + expected_decision_func_count, +): + X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0]) + pos_proba = np.random.rand(X.shape[0]) + proba = np.c_[1 - pos_proba, pos_proba] + + class MyClassifier(ClassifierMixin, BaseEstimator): + def __init__(self): + self._expected_predict_count = 0 + self._expected_predict_proba_count = 0 + self._expected_decision_function_count = 0 + + def fit(self, X, y): + self.classes_ = np.unique(y) + return self + + def predict(self, X): + self._expected_predict_count += 1 + return y + + def predict_proba(self, X): + self._expected_predict_proba_count += 1 + return proba + + def decision_function(self, X): + self._expected_decision_function_count += 1 + return pos_proba + + mock_est = MyClassifier().fit(X, y) + scorer_dict = _check_multimetric_scoring(LogisticRegression(), scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + results = multi_scorer(mock_est, X, y) + + assert set(scorers) == set(results) # compare dict keys + + assert mock_est._expected_predict_count == expected_predict_count + assert mock_est._expected_predict_proba_count == expected_predict_proba_count + assert mock_est._expected_decision_function_count == expected_decision_func_count + + +@pytest.mark.parametrize( + "scorers", + [ + (["roc_auc", "neg_log_loss"]), + ( + { + "roc_auc": make_scorer( + roc_auc_score, + response_method=["predict_proba", "decision_function"], + ), + "neg_log_loss": make_scorer(log_loss, response_method="predict_proba"), + } + ), + ], +) +def 
test_multimetric_scorer_calls_method_once_classifier_no_decision(scorers): + predict_proba_call_cnt = 0 + + class MockKNeighborsClassifier(KNeighborsClassifier): + def predict_proba(self, X): + nonlocal predict_proba_call_cnt + predict_proba_call_cnt += 1 + return super().predict_proba(X) + + X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0]) + + # no decision function + clf = MockKNeighborsClassifier(n_neighbors=1) + clf.fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + scorer = _MultimetricScorer(scorers=scorer_dict) + scorer(clf, X, y) + + assert predict_proba_call_cnt == 1 + + +def test_multimetric_scorer_calls_method_once_regressor_threshold(): + predict_called_cnt = 0 + + class MockDecisionTreeRegressor(DecisionTreeRegressor): + def predict(self, X): + nonlocal predict_called_cnt + predict_called_cnt += 1 + return super().predict(X) + + X, y = np.array([[1], [1], [0], [0], [0]]), np.array([0, 1, 1, 1, 0]) + + # no decision function + clf = MockDecisionTreeRegressor() + clf.fit(X, y) + + scorers = {"neg_mse": "neg_mean_squared_error", "r2": "r2"} + scorer_dict = _check_multimetric_scoring(clf, scorers) + scorer = _MultimetricScorer(scorers=scorer_dict) + scorer(clf, X, y) + + assert predict_called_cnt == 1 + + +def test_multimetric_scorer_sanity_check(): + # scoring dictionary returned is the same as calling each scorer separately + scorers = { + "a1": "accuracy", + "a2": "accuracy", + "ll1": "neg_log_loss", + "ll2": "neg_log_loss", + "ra1": "roc_auc", + "ra2": "roc_auc", + } + + X, y = make_classification(random_state=0) + + clf = DecisionTreeClassifier() + clf.fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + + result = multi_scorer(clf, X, y) + + separate_scores = { + name: get_scorer(name)(clf, X, y) + for name in ["accuracy", "neg_log_loss", "roc_auc"] + } + + for key, value in result.items(): + score_name = scorers[key] + assert_allclose(value, separate_scores[score_name]) + + +@pytest.mark.parametrize("raise_exc", [True, False]) +def test_multimetric_scorer_exception_handling(raise_exc): + """Check that the calling of the `_MultimetricScorer` returns + exception messages in the result dict for the failing scorers + in case of `raise_exc` is `False` and if `raise_exc` is `True`, + then the proper exception is raised. + """ + scorers = { + "failing_1": "neg_mean_squared_log_error", + "non_failing": "neg_median_absolute_error", + "failing_2": "neg_mean_squared_log_error", + } + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + # neg_mean_squared_log_error fails if y contains values less than or equal to -1 + y *= -1 + + clf = DecisionTreeClassifier().fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict, raise_exc=raise_exc) + + error_msg = ( + "Mean Squared Logarithmic Error cannot be used when " + "targets contain values less than or equal to -1." 
+ ) + + if raise_exc: + with pytest.raises(ValueError, match=error_msg): + multi_scorer(clf, X, y) + else: + result = multi_scorer(clf, X, y) + + exception_message_1 = result["failing_1"] + score = result["non_failing"] + exception_message_2 = result["failing_2"] + + assert isinstance(exception_message_1, str) and error_msg in exception_message_1 + assert isinstance(score, float) + assert isinstance(exception_message_2, str) and error_msg in exception_message_2 + + +@pytest.mark.parametrize( + "scorer_name, metric", + [ + ("roc_auc_ovr", partial(roc_auc_score, multi_class="ovr")), + ("roc_auc_ovo", partial(roc_auc_score, multi_class="ovo")), + ( + "roc_auc_ovr_weighted", + partial(roc_auc_score, multi_class="ovr", average="weighted"), + ), + ( + "roc_auc_ovo_weighted", + partial(roc_auc_score, multi_class="ovo", average="weighted"), + ), + ], +) +def test_multiclass_roc_proba_scorer(scorer_name, metric): + scorer = get_scorer(scorer_name) + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + y_proba = lr.predict_proba(X) + expected_score = metric(y, y_proba) + + assert scorer(lr, X, y) == pytest.approx(expected_score) + + +def test_multiclass_roc_proba_scorer_label(): + scorer = make_scorer( + roc_auc_score, + multi_class="ovo", + labels=[0, 1, 2], + response_method="predict_proba", + ) + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + y_proba = lr.predict_proba(X) + + y_binary = y == 0 + expected_score = roc_auc_score( + y_binary, y_proba, multi_class="ovo", labels=[0, 1, 2] + ) + + assert scorer(lr, X, y_binary) == pytest.approx(expected_score) + + +@pytest.mark.parametrize( + "scorer_name", + ["roc_auc_ovr", "roc_auc_ovo", "roc_auc_ovr_weighted", "roc_auc_ovo_weighted"], +) +def test_multiclass_roc_no_proba_scorer_errors(scorer_name): + # Perceptron has no predict_proba + scorer = get_scorer(scorer_name) + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = Perceptron().fit(X, y) + msg = "Perceptron has none of the following attributes: predict_proba." + with pytest.raises(AttributeError, match=msg): + scorer(lr, X, y) + + +@pytest.fixture +def string_labeled_classification_problem(): + """Train a classifier on binary problem with string target. + + The classifier is trained on a binary classification problem where the + minority class of interest has a string label that is intentionally not the + greatest class label using the lexicographic order. In this case, "cancer" + is the positive label, and `classifier.classes_` is + `["cancer", "not cancer"]`. + + In addition, the dataset is imbalanced to better identify problems when + using non-symmetric performance metrics such as f1-score, average precision + and so on. + + Returns + ------- + classifier : estimator object + Trained classifier on the binary problem. + X_test : ndarray of shape (n_samples, n_features) + Data to be used as testing set in tests. + y_test : ndarray of shape (n_samples,), dtype=object + Binary target where labels are strings. + y_pred : ndarray of shape (n_samples,), dtype=object + Prediction of `classifier` when predicting for `X_test`. + y_pred_proba : ndarray of shape (n_samples, 2), dtype=np.float64 + Probabilities of `classifier` when predicting for `X_test`. + y_pred_decision : ndarray of shape (n_samples,), dtype=np.float64 + Decision function values of `classifier` when predicting on `X_test`. 
+ """ + from sklearn.datasets import load_breast_cancer + from sklearn.utils import shuffle + + X, y = load_breast_cancer(return_X_y=True) + # create an highly imbalanced classification task + idx_positive = np.flatnonzero(y == 1) + idx_negative = np.flatnonzero(y == 0) + idx_selected = np.hstack([idx_negative, idx_positive[:25]]) + X, y = X[idx_selected], y[idx_selected] + X, y = shuffle(X, y, random_state=42) + # only use 2 features to make the problem even harder + X = X[:, :2] + y = np.array(["cancer" if c == 1 else "not cancer" for c in y], dtype=object) + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + stratify=y, + random_state=0, + ) + classifier = LogisticRegression().fit(X_train, y_train) + y_pred = classifier.predict(X_test) + y_pred_proba = classifier.predict_proba(X_test) + y_pred_decision = classifier.decision_function(X_test) + + return classifier, X_test, y_test, y_pred, y_pred_proba, y_pred_decision + + +def test_average_precision_pos_label(string_labeled_classification_problem): + # check that _Scorer will lead to the right score when passing + # `pos_label`. Currently, only `average_precision_score` is defined to + # be such a scorer. + ( + clf, + X_test, + y_test, + _, + y_pred_proba, + y_pred_decision, + ) = string_labeled_classification_problem + + pos_label = "cancer" + # we need to select the positive column or reverse the decision values + y_pred_proba = y_pred_proba[:, 0] + y_pred_decision = y_pred_decision * -1 + assert clf.classes_[0] == pos_label + + # check that when calling the scoring function, probability estimates and + # decision values lead to the same results + ap_proba = average_precision_score(y_test, y_pred_proba, pos_label=pos_label) + ap_decision_function = average_precision_score( + y_test, y_pred_decision, pos_label=pos_label + ) + assert ap_proba == pytest.approx(ap_decision_function) + + # create a scorer which would require to pass a `pos_label` + # check that it fails if `pos_label` is not provided + average_precision_scorer = make_scorer( + average_precision_score, + response_method=("decision_function", "predict_proba"), + ) + err_msg = "pos_label=1 is not a valid label. It should be one of " + with pytest.raises(ValueError, match=err_msg): + average_precision_scorer(clf, X_test, y_test) + + # otherwise, the scorer should give the same results than calling the + # scoring function + average_precision_scorer = make_scorer( + average_precision_score, + response_method=("decision_function", "predict_proba"), + pos_label=pos_label, + ) + ap_scorer = average_precision_scorer(clf, X_test, y_test) + + assert ap_scorer == pytest.approx(ap_proba) + + # The above scorer call is using `clf.decision_function`. We will force + # it to use `clf.predict_proba`. + clf_without_predict_proba = deepcopy(clf) + + def _predict_proba(self, X): + raise NotImplementedError + + clf_without_predict_proba.predict_proba = partial( + _predict_proba, clf_without_predict_proba + ) + # sanity check + with pytest.raises(NotImplementedError): + clf_without_predict_proba.predict_proba(X_test) + + ap_scorer = average_precision_scorer(clf_without_predict_proba, X_test, y_test) + assert ap_scorer == pytest.approx(ap_proba) + + +def test_brier_score_loss_pos_label(string_labeled_classification_problem): + # check that _Scorer leads to the right score when `pos_label` is + # provided. Currently only the `brier_score_loss` is defined to be such + # a scorer. 
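For context on why column 0 of predict_proba (rather than the usual column 1) appears in these pos_label tests: with string targets the class of interest is not necessarily the last entry of classifier.classes_, so the probability column has to be looked up explicitly. A short sketch reusing the fixture's classifier and test data (names assumed from the fixture above, illustrative only):

    pos_label = "cancer"
    col = list(clf.classes_).index(pos_label)      # 0 here, since "cancer" < "not cancer"
    proba_pos = clf.predict_proba(X_test)[:, col]  # probabilities of the positive class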
+ clf, X_test, y_test, _, y_pred_proba, _ = string_labeled_classification_problem + + pos_label = "cancer" + assert clf.classes_[0] == pos_label + + # brier score loss is symmetric + brier_pos_cancer = brier_score_loss(y_test, y_pred_proba[:, 0], pos_label="cancer") + brier_pos_not_cancer = brier_score_loss( + y_test, y_pred_proba[:, 1], pos_label="not cancer" + ) + assert brier_pos_cancer == pytest.approx(brier_pos_not_cancer) + + brier_scorer = make_scorer( + brier_score_loss, + response_method="predict_proba", + pos_label=pos_label, + ) + assert brier_scorer(clf, X_test, y_test) == pytest.approx(brier_pos_cancer) + + +@pytest.mark.parametrize( + "score_func", [f1_score, precision_score, recall_score, jaccard_score] +) +def test_non_symmetric_metric_pos_label( + score_func, string_labeled_classification_problem +): + # check that _Scorer leads to the right score when `pos_label` is + # provided. We check for all possible metric supported. + # Note: At some point we may end up having "scorer tags". + clf, X_test, y_test, y_pred, _, _ = string_labeled_classification_problem + + pos_label = "cancer" + assert clf.classes_[0] == pos_label + + score_pos_cancer = score_func(y_test, y_pred, pos_label="cancer") + score_pos_not_cancer = score_func(y_test, y_pred, pos_label="not cancer") + + assert score_pos_cancer != pytest.approx(score_pos_not_cancer) + + scorer = make_scorer(score_func, pos_label=pos_label) + assert scorer(clf, X_test, y_test) == pytest.approx(score_pos_cancer) + + +@pytest.mark.parametrize( + "scorer", + [ + make_scorer( + average_precision_score, + response_method=("decision_function", "predict_proba"), + pos_label="xxx", + ), + make_scorer(brier_score_loss, response_method="predict_proba", pos_label="xxx"), + make_scorer(f1_score, pos_label="xxx"), + ], + ids=["non-thresholded scorer", "probability scorer", "thresholded scorer"], +) +def test_scorer_select_proba_error(scorer): + # check that we raise the proper error when passing an unknown + # pos_label + X, y = make_classification( + n_classes=2, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + assert scorer._kwargs["pos_label"] not in np.unique(y).tolist() + + err_msg = "is not a valid label" + with pytest.raises(ValueError, match=err_msg): + scorer(lr, X, y) + + +def test_get_scorer_return_copy(): + # test that get_scorer returns a copy + assert get_scorer("roc_auc") is not get_scorer("roc_auc") + + +def test_scorer_no_op_multiclass_select_proba(): + # check that calling a _Scorer on a multiclass problem do not raise + # even if `y_true` would be binary during the scoring. + # `_select_proba_binary` should not be called in this case. + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + + mask_last_class = y == lr.classes_[-1] + X_test, y_test = X[~mask_last_class], y[~mask_last_class] + assert_array_equal(np.unique(y_test), lr.classes_[:-1]) + + scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovo", + labels=lr.classes_, + ) + scorer(lr, X_test, y_test) + + +@pytest.mark.parametrize("name", get_scorer_names()) +def test_scorer_set_score_request_raises(name): + """Test that set_score_request is only available when feature flag is on.""" + # Make sure they expose the routing methods. 
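A compact illustration of the flag-gated behaviour covered by this test and the routing tests that follow (editorial sketch, not part of the diff): the same call that raises without the feature flag is permitted inside the config context.

    from sklearn import config_context
    from sklearn.metrics import get_scorer

    acc_scorer = get_scorer("accuracy")
    # acc_scorer.set_score_request(sample_weight=True) would raise RuntimeError here
    with config_context(enable_metadata_routing=True):
        acc_scorer.set_score_request(sample_weight=True)  # allowed when routing is enabled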
+ scorer = get_scorer(name) + with pytest.raises(RuntimeError, match="This method is only available"): + scorer.set_score_request() + + +@pytest.mark.parametrize("name", get_scorer_names(), ids=get_scorer_names()) +@config_context(enable_metadata_routing=True) +def test_scorer_metadata_request(name): + """Testing metadata requests for scorers. + + This test checks many small things in a large test, to reduce the + boilerplate required for each section. + """ + # Make sure they expose the routing methods. + scorer = get_scorer(name) + assert hasattr(scorer, "set_score_request") + assert hasattr(scorer, "get_metadata_routing") + + # Check that by default no metadata is requested. + assert_request_is_empty(scorer.get_metadata_routing()) + + weighted_scorer = scorer.set_score_request(sample_weight=True) + # set_score_request should mutate the instance, rather than returning a + # new instance + assert weighted_scorer is scorer + + # make sure the scorer doesn't request anything on methods other than + # `score`, and that the requested value on `score` is correct. + assert_request_is_empty(weighted_scorer.get_metadata_routing(), exclude="score") + assert ( + weighted_scorer.get_metadata_routing().score.requests["sample_weight"] is True + ) + + # make sure putting the scorer in a router doesn't request anything by + # default + router = MetadataRouter(owner="test").add( + scorer=get_scorer(name), + method_mapping=MethodMapping().add(caller="score", callee="score"), + ) + # make sure `sample_weight` is refused if passed. + with pytest.raises(TypeError, match="got unexpected argument"): + router.validate_metadata(params={"sample_weight": 1}, method="score") + # make sure `sample_weight` is not routed even if passed. + routed_params = router.route_params(params={"sample_weight": 1}, caller="score") + assert not routed_params.scorer.score + + # make sure putting weighted_scorer in a router requests sample_weight + router = MetadataRouter(owner="test").add( + scorer=weighted_scorer, + method_mapping=MethodMapping().add(caller="score", callee="score"), + ) + router.validate_metadata(params={"sample_weight": 1}, method="score") + routed_params = router.route_params(params={"sample_weight": 1}, caller="score") + assert list(routed_params.scorer.score.keys()) == ["sample_weight"] + + +@config_context(enable_metadata_routing=True) +def test_metadata_kwarg_conflict(): + """This test makes sure the right warning is raised if the user passes + some metadata both as a constructor to make_scorer, and during __call__. 
+ """ + X, y = make_classification( + n_classes=3, n_informative=3, n_samples=20, random_state=0 + ) + lr = LogisticRegression().fit(X, y) + + scorer = make_scorer( + roc_auc_score, + response_method="predict_proba", + multi_class="ovo", + labels=lr.classes_, + ) + with pytest.warns(UserWarning, match="already set as kwargs"): + scorer.set_score_request(labels=True) + + with pytest.warns(UserWarning, match="There is an overlap"): + scorer(lr, X, y, labels=lr.classes_) + + +@config_context(enable_metadata_routing=True) +def test_PassthroughScorer_set_score_request(): + """Test that _PassthroughScorer.set_score_request adds the correct metadata request + on itself and doesn't change its estimator's routing.""" + est = LogisticRegression().set_score_request(sample_weight="estimator_weights") + # make a `_PassthroughScorer` with `check_scoring`: + scorer = check_scoring(est, None) + assert ( + scorer.get_metadata_routing().score.requests["sample_weight"] + == "estimator_weights" + ) + + scorer.set_score_request(sample_weight="scorer_weights") + assert ( + scorer.get_metadata_routing().score.requests["sample_weight"] + == "scorer_weights" + ) + + # making sure changing the passthrough object doesn't affect the estimator. + assert ( + est.get_metadata_routing().score.requests["sample_weight"] + == "estimator_weights" + ) + + +def test_PassthroughScorer_set_score_request_raises_without_routing_enabled(): + """Test that _PassthroughScorer.set_score_request raises if metadata routing is + disabled.""" + scorer = check_scoring(LogisticRegression(), None) + msg = "This method is only available when metadata routing is enabled." + + with pytest.raises(RuntimeError, match=msg): + scorer.set_score_request(sample_weight="my_weights") + + +@config_context(enable_metadata_routing=True) +def test_multimetric_scoring_metadata_routing(): + # Test that _MultimetricScorer properly routes metadata. + def score1(y_true, y_pred): + return 1 + + def score2(y_true, y_pred, sample_weight="test"): + # make sure sample_weight is not passed + assert sample_weight == "test" + return 1 + + def score3(y_true, y_pred, sample_weight=None): + # make sure sample_weight is passed + assert sample_weight is not None + return 1 + + scorers = { + "score1": make_scorer(score1), + "score2": make_scorer(score2).set_score_request(sample_weight=False), + "score3": make_scorer(score3).set_score_request(sample_weight=True), + } + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + + clf = DecisionTreeClassifier().fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + # This passes since routing is done. + multi_scorer(clf, X, y, sample_weight=1) + + +@config_context(enable_metadata_routing=False) +def test_multimetric_scoring_kwargs(): + # Test that _MultimetricScorer correctly forwards kwargs + # to the scorers when metadata routing is disabled. + # `sample_weight` is only forwarded to the scorers that accept it. + # Other arguments are forwarded to all scorers. 
+ def score1(y_true, y_pred, common_arg=None): + # make sure common_arg is passed + assert common_arg is not None + return 1 + + def score2(y_true, y_pred, common_arg=None, sample_weight=None): + # make sure common_arg is passed + assert common_arg is not None + # make sure sample_weight is passed + assert sample_weight is not None + return 1 + + scorers = { + "score1": make_scorer(score1), + "score2": make_scorer(score2), + } + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + + clf = DecisionTreeClassifier().fit(X, y) + + scorer_dict = _check_multimetric_scoring(clf, scorers) + multi_scorer = _MultimetricScorer(scorers=scorer_dict) + multi_scorer(clf, X, y, common_arg=1, sample_weight=1) + + +def test_kwargs_without_metadata_routing_error(): + # Test that kwargs are not supported in scorers if metadata routing is not + # enabled. + # TODO: remove when enable_metadata_routing is deprecated + def score(y_true, y_pred, param=None): + return 1 # pragma: no cover + + X, y = make_classification( + n_samples=50, n_features=2, n_redundant=0, random_state=0 + ) + + clf = DecisionTreeClassifier().fit(X, y) + scorer = make_scorer(score) + with config_context(enable_metadata_routing=False): + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + scorer(clf, X, y, param="blah") + + +def test_get_scorer_multilabel_indicator(): + """Check that our scorer deal with multi-label indicator matrices. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/26817 + """ + X, Y = make_multilabel_classification(n_samples=72, n_classes=3, random_state=0) + X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0) + + estimator = KNeighborsClassifier().fit(X_train, Y_train) + + score = get_scorer("average_precision")(estimator, X_test, Y_test) + assert score > 0.8 + + +@pytest.mark.parametrize( + "scorer, expected_repr", + [ + ( + get_scorer("accuracy"), + "make_scorer(accuracy_score, response_method='predict')", + ), + ( + get_scorer("neg_log_loss"), + ( + "make_scorer(log_loss, greater_is_better=False," + " response_method='predict_proba')" + ), + ), + ( + get_scorer("roc_auc"), + ( + "make_scorer(roc_auc_score, response_method=" + "('decision_function', 'predict_proba'))" + ), + ), + ( + make_scorer(fbeta_score, beta=2), + "make_scorer(fbeta_score, response_method='predict', beta=2)", + ), + ], +) +def test_make_scorer_repr(scorer, expected_repr): + """Check the representation of the scorer.""" + assert repr(scorer) == expected_repr + + +@pytest.mark.parametrize("pass_estimator", [True, False]) +def test_get_scorer_multimetric(pass_estimator): + """Check that check_scoring is compatible with multi-metric configurations.""" + X, y = make_classification(n_samples=150, n_features=10, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LogisticRegression(random_state=0) + + if pass_estimator: + check_scoring_ = check_scoring + else: + check_scoring_ = partial(check_scoring, clf) + + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + y_proba = clf.predict_proba(X_test) + + expected_results = { + "r2": r2_score(y_test, y_pred), + "roc_auc": roc_auc_score(y_test, y_proba[:, 1]), + "accuracy": accuracy_score(y_test, y_pred), + } + + for container in [set, list, tuple]: + scoring = check_scoring_(scoring=container(["r2", "roc_auc", "accuracy"])) + result = scoring(clf, X_test, y_test) + + assert result.keys() == 
expected_results.keys() + for name in result: + assert result[name] == pytest.approx(expected_results[name]) + + def double_accuracy(y_true, y_pred): + return 2 * accuracy_score(y_true, y_pred) + + custom_scorer = make_scorer(double_accuracy, response_method="predict") + + # dict with different names + dict_scoring = check_scoring_( + scoring={ + "my_r2": "r2", + "my_roc_auc": "roc_auc", + "double_accuracy": custom_scorer, + } + ) + dict_result = dict_scoring(clf, X_test, y_test) + assert len(dict_result) == 3 + assert dict_result["my_r2"] == pytest.approx(expected_results["r2"]) + assert dict_result["my_roc_auc"] == pytest.approx(expected_results["roc_auc"]) + assert dict_result["double_accuracy"] == pytest.approx( + 2 * expected_results["accuracy"] + ) + + +def test_multimetric_scorer_repr(): + """Check repr for multimetric scorer""" + multi_metric_scorer = check_scoring(scoring=["accuracy", "r2"]) + + assert str(multi_metric_scorer) == 'MultiMetricScorer("accuracy", "r2")' + + +def test_check_scoring_multimetric_raise_exc(): + """Test that check_scoring returns error code for a subset of scorers in + multimetric scoring if raise_exc=False and raises otherwise.""" + + def raising_scorer(estimator, X, y): + raise ValueError("That doesn't work.") + + X, y = make_classification(n_samples=150, n_features=10, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf = LogisticRegression().fit(X_train, y_train) + + # "raising_scorer" is raising ValueError and should return an string representation + # of the error of the last scorer: + scoring = { + "accuracy": make_scorer(accuracy_score), + "raising_scorer": raising_scorer, + } + scoring_call = check_scoring(estimator=clf, scoring=scoring, raise_exc=False) + scores = scoring_call(clf, X_test, y_test) + assert "That doesn't work." in scores["raising_scorer"] + + # should raise an error + scoring_call = check_scoring(estimator=clf, scoring=scoring, raise_exc=True) + err_msg = "That doesn't work." + with pytest.raises(ValueError, match=err_msg): + scores = scoring_call(clf, X_test, y_test) + + +@pytest.mark.parametrize("enable_metadata_routing", [True, False]) +def test_metadata_routing_multimetric_metadata_routing(enable_metadata_routing): + """Test multimetric scorer works with and without metadata routing enabled when + there is no actual metadata to pass. + + Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/28256 + """ + X, y = make_classification(n_samples=50, n_features=10, random_state=0) + estimator = EstimatorWithFitAndPredict().fit(X, y) + + multimetric_scorer = _MultimetricScorer(scorers={"acc": get_scorer("accuracy")}) + with config_context(enable_metadata_routing=enable_metadata_routing): + multimetric_scorer(estimator, X, y) + + +def test_curve_scorer(): + """Check the behaviour of the `_CurveScorer` class.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression().fit(X, y) + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + assert thresholds.shape == scores.shape + # check that the thresholds are probabilities with extreme values close to 0 and 1. + # they are not exactly 0 and 1 because they are the extremum of the + # `estimator.predict_proba(X)` values. 
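A rough sketch of what the comment above describes, assuming the threshold grid is an evenly spaced span of the observed positive-class probabilities (the exact construction is internal to `_CurveScorer` and may differ):

    import numpy as np
    proba_pos = estimator.predict_proba(X)[:, 1]                  # names as in this test
    grid = np.linspace(proba_pos.min(), proba_pos.max(), num=10)  # 10 candidate thresholds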
+ assert 0 <= thresholds.min() <= 0.01 + assert 0.99 <= thresholds.max() <= 1 + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0.5 <= scores.min() <= 1 + + # check that passing kwargs to the scorer works + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"adjusted": True}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + # balanced accuracy should be between 0.5 and 1 when it is not adjusted + assert 0 <= scores.min() <= 0.5 + + # check that we can inverse the sign of the score when dealing with `neg_*` scorer + curve_scorer = _CurveScorer( + balanced_accuracy_score, + sign=-1, + response_method="predict_proba", + thresholds=10, + kwargs={"adjusted": True}, + ) + scores, thresholds = curve_scorer(estimator, X, y) + + assert all(scores <= 0) + + +def test_curve_scorer_pos_label(global_random_seed): + """Check that we propagate properly the `pos_label` parameter to the scorer.""" + n_samples = 30 + X, y = make_classification( + n_samples=n_samples, weights=[0.9, 0.1], random_state=global_random_seed + ) + estimator = LogisticRegression().fit(X, y) + + curve_scorer = _CurveScorer( + recall_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"pos_label": 1}, + ) + scores_pos_label_1, thresholds_pos_label_1 = curve_scorer(estimator, X, y) + + curve_scorer = _CurveScorer( + recall_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={"pos_label": 0}, + ) + scores_pos_label_0, thresholds_pos_label_0 = curve_scorer(estimator, X, y) + + # Since `pos_label` is forwarded to the curve_scorer, the thresholds are not equal. + assert not (thresholds_pos_label_1 == thresholds_pos_label_0).all() + # The min-max range for the thresholds is defined by the probabilities of the + # `pos_label` class (the column of `predict_proba`). + y_pred = estimator.predict_proba(X) + assert thresholds_pos_label_0.min() == pytest.approx(y_pred.min(axis=0)[0]) + assert thresholds_pos_label_0.max() == pytest.approx(y_pred.max(axis=0)[0]) + assert thresholds_pos_label_1.min() == pytest.approx(y_pred.min(axis=0)[1]) + assert thresholds_pos_label_1.max() == pytest.approx(y_pred.max(axis=0)[1]) + + # The recall cannot be negative and `pos_label=1` should have a higher recall + # since there is less samples to be considered. + assert 0.0 < scores_pos_label_0.min() < scores_pos_label_1.min() + assert scores_pos_label_0.max() == pytest.approx(1.0) + assert scores_pos_label_1.max() == pytest.approx(1.0) + + +# TODO(1.8): remove +def test_make_scorer_reponse_method_default_warning(): + with pytest.warns(FutureWarning, match="response_method=None is deprecated"): + make_scorer(accuracy_score, response_method=None) + + # No warning is raised if response_method is left to its default value + # because the future default value has the same effect as the current one. 
+ with warnings.catch_warnings(): + warnings.simplefilter("error", FutureWarning) + make_scorer(accuracy_score) diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c27263a0ed74381a7c8dad4d6488eba570eb49b8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/__init__.py @@ -0,0 +1,9 @@ +"""Mixture modeling algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bayesian_mixture import BayesianGaussianMixture +from ._gaussian_mixture import GaussianMixture + +__all__ = ["BayesianGaussianMixture", "GaussianMixture"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/_base.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..f66344a2847533629f52ddb10a4e819144cc8cfe --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/_base.py @@ -0,0 +1,571 @@ +"""Base class for mixture models.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real +from time import time + +import numpy as np +from scipy.special import logsumexp + +from .. import cluster +from ..base import BaseEstimator, DensityMixin, _fit_context +from ..cluster import kmeans_plusplus +from ..exceptions import ConvergenceWarning +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.validation import check_is_fitted, validate_data + + +def _check_shape(param, param_shape, name): + """Validate the shape of the input parameter 'param'. + + Parameters + ---------- + param : array + + param_shape : tuple + + name : str + """ + param = np.array(param) + if param.shape != param_shape: + raise ValueError( + "The parameter '%s' should have the shape of %s, but got %s" + % (name, param_shape, param.shape) + ) + + +class BaseMixture(DensityMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for mixture models. + + This abstract class specifies an interface for all mixture classes and + provides basic common methods for mixture models. + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0.0, None, closed="left")], + "reg_covar": [Interval(Real, 0.0, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left")], + "n_init": [Interval(Integral, 1, None, closed="left")], + "init_params": [ + StrOptions({"kmeans", "random", "random_from_data", "k-means++"}) + ], + "random_state": ["random_state"], + "warm_start": ["boolean"], + "verbose": ["verbose"], + "verbose_interval": [Interval(Integral, 1, None, closed="left")], + } + + def __init__( + self, + n_components, + tol, + reg_covar, + max_iter, + n_init, + init_params, + random_state, + warm_start, + verbose, + verbose_interval, + ): + self.n_components = n_components + self.tol = tol + self.reg_covar = reg_covar + self.max_iter = max_iter + self.n_init = n_init + self.init_params = init_params + self.random_state = random_state + self.warm_start = warm_start + self.verbose = verbose + self.verbose_interval = verbose_interval + + @abstractmethod + def _check_parameters(self, X): + """Check initial parameters of the derived class. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + pass + + def _initialize_parameters(self, X, random_state): + """Initialize the model parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + random_state : RandomState + A random number generator instance that controls the random seed + used for the method chosen to initialize the parameters. + """ + n_samples, _ = X.shape + + if self.init_params == "kmeans": + resp = np.zeros((n_samples, self.n_components), dtype=X.dtype) + label = ( + cluster.KMeans( + n_clusters=self.n_components, n_init=1, random_state=random_state + ) + .fit(X) + .labels_ + ) + resp[np.arange(n_samples), label] = 1 + elif self.init_params == "random": + resp = np.asarray( + random_state.uniform(size=(n_samples, self.n_components)), dtype=X.dtype + ) + resp /= resp.sum(axis=1)[:, np.newaxis] + elif self.init_params == "random_from_data": + resp = np.zeros((n_samples, self.n_components), dtype=X.dtype) + indices = random_state.choice( + n_samples, size=self.n_components, replace=False + ) + resp[indices, np.arange(self.n_components)] = 1 + elif self.init_params == "k-means++": + resp = np.zeros((n_samples, self.n_components), dtype=X.dtype) + _, indices = kmeans_plusplus( + X, + self.n_components, + random_state=random_state, + ) + resp[indices, np.arange(self.n_components)] = 1 + + self._initialize(X, resp) + + @abstractmethod + def _initialize(self, X, resp): + """Initialize the model parameters of the derived class. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + resp : array-like of shape (n_samples, n_components) + """ + pass + + def fit(self, X, y=None): + """Estimate model parameters with the EM algorithm. + + The method fits the model ``n_init`` times and sets the parameters with + which the model has the largest likelihood or lower bound. Within each + trial, the method iterates between E-step and M-step for ``max_iter`` + times until the change of likelihood or lower bound is less than + ``tol``, otherwise, a ``ConvergenceWarning`` is raised. + If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single + initialization is performed upon the first call. Upon consecutive + calls, training starts where it left off. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + The fitted mixture. + """ + # parameters are validated in fit_predict + self.fit_predict(X, y) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_predict(self, X, y=None): + """Estimate model parameters using X and predict the labels for X. + + The method fits the model n_init times and sets the parameters with + which the model has the largest likelihood or lower bound. Within each + trial, the method iterates between E-step and M-step for `max_iter` + times until the change of likelihood or lower bound is less than + `tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is + raised. After fitting, it predicts the most probable label for the + input data points. + + .. versionadded:: 0.20 + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. 
+ + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + labels : array, shape (n_samples,) + Component labels. + """ + X = validate_data(self, X, dtype=[np.float64, np.float32], ensure_min_samples=2) + if X.shape[0] < self.n_components: + raise ValueError( + "Expected n_samples >= n_components " + f"but got n_components = {self.n_components}, " + f"n_samples = {X.shape[0]}" + ) + self._check_parameters(X) + + # if we enable warm_start, we will have a unique initialisation + do_init = not (self.warm_start and hasattr(self, "converged_")) + n_init = self.n_init if do_init else 1 + + max_lower_bound = -np.inf + best_lower_bounds = [] + self.converged_ = False + + random_state = check_random_state(self.random_state) + + n_samples, _ = X.shape + for init in range(n_init): + self._print_verbose_msg_init_beg(init) + + if do_init: + self._initialize_parameters(X, random_state) + + lower_bound = -np.inf if do_init else self.lower_bound_ + current_lower_bounds = [] + + if self.max_iter == 0: + best_params = self._get_parameters() + best_n_iter = 0 + else: + converged = False + for n_iter in range(1, self.max_iter + 1): + prev_lower_bound = lower_bound + + log_prob_norm, log_resp = self._e_step(X) + self._m_step(X, log_resp) + lower_bound = self._compute_lower_bound(log_resp, log_prob_norm) + current_lower_bounds.append(lower_bound) + + change = lower_bound - prev_lower_bound + self._print_verbose_msg_iter_end(n_iter, change) + + if abs(change) < self.tol: + converged = True + break + + self._print_verbose_msg_init_end(lower_bound, converged) + + if lower_bound > max_lower_bound or max_lower_bound == -np.inf: + max_lower_bound = lower_bound + best_params = self._get_parameters() + best_n_iter = n_iter + best_lower_bounds = current_lower_bounds + self.converged_ = converged + + # Should only warn about convergence if max_iter > 0, otherwise + # the user is assumed to have used 0-iters initialization + # to get the initial means. + if not self.converged_ and self.max_iter > 0: + warnings.warn( + ( + "Best performing initialization did not converge. " + "Try different init parameters, or increase max_iter, " + "tol, or check for degenerate data." + ), + ConvergenceWarning, + ) + + self._set_parameters(best_params) + self.n_iter_ = best_n_iter + self.lower_bound_ = max_lower_bound + self.lower_bounds_ = best_lower_bounds + + # Always do a final e-step to guarantee that the labels returned by + # fit_predict(X) are always consistent with fit(X).predict(X) + # for any value of max_iter and tol (and any random_state). + _, log_resp = self._e_step(X) + + return log_resp.argmax(axis=1) + + def _e_step(self, X): + """E step. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + log_prob_norm : float + Mean of the logarithms of the probabilities of each sample in X + + log_responsibility : array, shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + """ + log_prob_norm, log_resp = self._estimate_log_prob_resp(X) + return np.mean(log_prob_norm), log_resp + + @abstractmethod + def _m_step(self, X, log_resp): + """M step. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array-like of shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. 
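# Hedged check (synthetic data assumed) of the guarantee implemented by the
# final E-step in ``fit_predict`` above: with identical settings,
# ``fit_predict(X)`` agrees with ``fit(X).predict(X)``.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=200, centers=2, random_state=1)
labels_a = GaussianMixture(n_components=2, random_state=0).fit_predict(X)
labels_b = GaussianMixture(n_components=2, random_state=0).fit(X).predict(X)
assert np.array_equal(labels_a, labels_b)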
+ """ + pass + + @abstractmethod + def _get_parameters(self): + pass + + @abstractmethod + def _set_parameters(self, params): + pass + + def score_samples(self, X): + """Compute the log-likelihood of each sample. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + Returns + ------- + log_prob : array, shape (n_samples,) + Log-likelihood of each sample in `X` under the current model. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + + return logsumexp(self._estimate_weighted_log_prob(X), axis=1) + + def score(self, X, y=None): + """Compute the per-sample average log-likelihood of the given data X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_dimensions) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + log_likelihood : float + Log-likelihood of `X` under the Gaussian mixture model. + """ + return self.score_samples(X).mean() + + def predict(self, X): + """Predict the labels for the data samples in X using trained model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + Returns + ------- + labels : array, shape (n_samples,) + Component labels. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + return self._estimate_weighted_log_prob(X).argmax(axis=1) + + def predict_proba(self, X): + """Evaluate the components' density for each sample. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + Returns + ------- + resp : array, shape (n_samples, n_components) + Density of each Gaussian component for each sample in X. + """ + check_is_fitted(self) + X = validate_data(self, X, reset=False) + _, log_resp = self._estimate_log_prob_resp(X) + return np.exp(log_resp) + + def sample(self, n_samples=1): + """Generate random samples from the fitted Gaussian distribution. + + Parameters + ---------- + n_samples : int, default=1 + Number of samples to generate. + + Returns + ------- + X : array, shape (n_samples, n_features) + Randomly generated sample. + + y : array, shape (nsamples,) + Component labels. + """ + check_is_fitted(self) + + if n_samples < 1: + raise ValueError( + "Invalid value for 'n_samples': %d . The sampling requires at " + "least one sample." 
% (self.n_components) + ) + + _, n_features = self.means_.shape + rng = check_random_state(self.random_state) + n_samples_comp = rng.multinomial(n_samples, self.weights_) + + if self.covariance_type == "full": + X = np.vstack( + [ + rng.multivariate_normal(mean, covariance, int(sample)) + for (mean, covariance, sample) in zip( + self.means_, self.covariances_, n_samples_comp + ) + ] + ) + elif self.covariance_type == "tied": + X = np.vstack( + [ + rng.multivariate_normal(mean, self.covariances_, int(sample)) + for (mean, sample) in zip(self.means_, n_samples_comp) + ] + ) + else: + X = np.vstack( + [ + mean + + rng.standard_normal(size=(sample, n_features)) + * np.sqrt(covariance) + for (mean, covariance, sample) in zip( + self.means_, self.covariances_, n_samples_comp + ) + ] + ) + + y = np.concatenate( + [np.full(sample, j, dtype=int) for j, sample in enumerate(n_samples_comp)] + ) + + return (X, y) + + def _estimate_weighted_log_prob(self, X): + """Estimate the weighted log-probabilities, log P(X | Z) + log weights. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + weighted_log_prob : array, shape (n_samples, n_component) + """ + return self._estimate_log_prob(X) + self._estimate_log_weights() + + @abstractmethod + def _estimate_log_weights(self): + """Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm. + + Returns + ------- + log_weight : array, shape (n_components, ) + """ + pass + + @abstractmethod + def _estimate_log_prob(self, X): + """Estimate the log-probabilities log P(X | Z). + + Compute the log-probabilities per each component for each sample. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + log_prob : array, shape (n_samples, n_component) + """ + pass + + def _estimate_log_prob_resp(self, X): + """Estimate log probabilities and responsibilities for each sample. + + Compute the log probabilities, weighted log probabilities per + component and responsibilities for each sample in X with respect to + the current state of the model. 
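# Illustrative sketch (assumed data) tying together the scoring, posterior
# and sampling methods defined above.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=200, centers=2, random_state=2)
gm = GaussianMixture(n_components=2, random_state=0).fit(X)

resp = gm.predict_proba(X)                        # (n_samples, n_components)
assert np.allclose(resp.sum(axis=1), 1.0)         # responsibilities sum to 1
assert np.isclose(gm.score(X), gm.score_samples(X).mean())

X_new, comp = gm.sample(n_samples=10)             # multinomial split over weights_
print(X_new.shape, np.bincount(comp, minlength=2))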
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + log_prob_norm : array, shape (n_samples,) + log p(X) + + log_responsibilities : array, shape (n_samples, n_components) + logarithm of the responsibilities + """ + weighted_log_prob = self._estimate_weighted_log_prob(X) + log_prob_norm = logsumexp(weighted_log_prob, axis=1) + with np.errstate(under="ignore"): + # ignore underflow + log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis] + return log_prob_norm, log_resp + + def _print_verbose_msg_init_beg(self, n_init): + """Print verbose message on initialization.""" + if self.verbose == 1: + print("Initialization %d" % n_init) + elif self.verbose >= 2: + print("Initialization %d" % n_init) + self._init_prev_time = time() + self._iter_prev_time = self._init_prev_time + + def _print_verbose_msg_iter_end(self, n_iter, diff_ll): + """Print verbose message on initialization.""" + if n_iter % self.verbose_interval == 0: + if self.verbose == 1: + print(" Iteration %d" % n_iter) + elif self.verbose >= 2: + cur_time = time() + print( + " Iteration %d\t time lapse %.5fs\t ll change %.5f" + % (n_iter, cur_time - self._iter_prev_time, diff_ll) + ) + self._iter_prev_time = cur_time + + def _print_verbose_msg_init_end(self, lb, init_has_converged): + """Print verbose message on the end of iteration.""" + converged_msg = "converged" if init_has_converged else "did not converge" + if self.verbose == 1: + print(f"Initialization {converged_msg}.") + elif self.verbose >= 2: + t = time() - self._init_prev_time + print( + f"Initialization {converged_msg}. time lapse {t:.5f}s\t lower bound" + f" {lb:.5f}." + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/_bayesian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/_bayesian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..57220186faf61694f0945a276bc60254ba861bd5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/_bayesian_mixture.py @@ -0,0 +1,891 @@ +"""Bayesian Gaussian Mixture Model.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import math +from numbers import Real + +import numpy as np +from scipy.special import betaln, digamma, gammaln + +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ._base import BaseMixture, _check_shape +from ._gaussian_mixture import ( + _check_precision_matrix, + _check_precision_positivity, + _compute_log_det_cholesky, + _compute_precision_cholesky, + _estimate_gaussian_parameters, + _estimate_log_gaussian_prob, +) + + +def _log_dirichlet_norm(dirichlet_concentration): + """Compute the log of the Dirichlet distribution normalization term. + + Parameters + ---------- + dirichlet_concentration : array-like of shape (n_samples,) + The parameters values of the Dirichlet distribution. + + Returns + ------- + log_dirichlet_norm : float + The log normalization of the Dirichlet distribution. + """ + return gammaln(np.sum(dirichlet_concentration)) - np.sum( + gammaln(dirichlet_concentration) + ) + + +def _log_wishart_norm(degrees_of_freedom, log_det_precisions_chol, n_features): + """Compute the log of the Wishart distribution normalization term. + + Parameters + ---------- + degrees_of_freedom : array-like of shape (n_components,) + The number of degrees of freedom on the covariance Wishart + distributions. 
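# Hedged numeric sketch of the normalisation in ``_estimate_log_prob_resp``
# above: responsibilities are the weighted log-probabilities normalised per
# sample with ``logsumexp``.  Toy values assumed.
import numpy as np
from scipy.special import logsumexp

weighted_log_prob = np.log(np.array([[0.2, 0.6], [0.1, 0.1]]))
log_prob_norm = logsumexp(weighted_log_prob, axis=1)
log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
assert np.allclose(np.exp(log_resp).sum(axis=1), 1.0)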
+ + log_det_precision_chol : array-like of shape (n_components,) + The determinant of the precision matrix for each component. + + n_features : int + The number of features. + + Return + ------ + log_wishart_norm : array-like of shape (n_components,) + The log normalization of the Wishart distribution. + """ + # To simplify the computation we have removed the np.log(np.pi) term + return -( + degrees_of_freedom * log_det_precisions_chol + + degrees_of_freedom * n_features * 0.5 * math.log(2.0) + + np.sum( + gammaln(0.5 * (degrees_of_freedom - np.arange(n_features)[:, np.newaxis])), + 0, + ) + ) + + +class BayesianGaussianMixture(BaseMixture): + """Variational Bayesian estimation of a Gaussian mixture. + + This class allows to infer an approximate posterior distribution over the + parameters of a Gaussian mixture distribution. The effective number of + components can be inferred from the data. + + This class implements two types of prior for the weights distribution: a + finite mixture model with Dirichlet distribution and an infinite mixture + model with the Dirichlet Process. In practice Dirichlet Process inference + algorithm is approximated and uses a truncated distribution with a fixed + maximum number of components (called the Stick-breaking representation). + The number of components actually used almost always depends on the data. + + .. versionadded:: 0.18 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=1 + The number of mixture components. Depending on the data and the value + of the `weight_concentration_prior` the model can decide to not use + all the components by setting some component `weights_` to values very + close to zero. The number of effective components is therefore smaller + than n_components. + + covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full' + String describing the type of covariance parameters to use. + Must be one of: + + - 'full' (each component has its own general covariance matrix), + - 'tied' (all components share the same general covariance matrix), + - 'diag' (each component has its own diagonal covariance matrix), + - 'spherical' (each component has its own single variance). + + tol : float, default=1e-3 + The convergence threshold. EM iterations will stop when the + lower bound average gain on the likelihood (of the training data with + respect to the model) is below this threshold. + + reg_covar : float, default=1e-6 + Non-negative regularization added to the diagonal of covariance. + Allows to assure that the covariance matrices are all positive. + + max_iter : int, default=100 + The number of EM iterations to perform. + + n_init : int, default=1 + The number of initializations to perform. The result with the highest + lower bound value on the likelihood is kept. + + init_params : {'kmeans', 'k-means++', 'random', 'random_from_data'}, \ + default='kmeans' + The method used to initialize the weights, the means and the + covariances. String must be one of: + + - 'kmeans': responsibilities are initialized using kmeans. + - 'k-means++': use the k-means++ method to initialize. + - 'random': responsibilities are initialized randomly. + - 'random_from_data': initial means are randomly selected data points. + + .. versionchanged:: v1.1 + `init_params` now accepts 'random_from_data' and 'k-means++' as + initialization methods. 
+ + weight_concentration_prior_type : {'dirichlet_process', 'dirichlet_distribution'}, \ + default='dirichlet_process' + String describing the type of the weight concentration prior. + + weight_concentration_prior : float or None, default=None + The dirichlet concentration of each component on the weight + distribution (Dirichlet). This is commonly called gamma in the + literature. The higher concentration puts more mass in + the center and will lead to more components being active, while a lower + concentration parameter will lead to more mass at the edge of the + mixture weights simplex. The value of the parameter must be greater + than 0. If it is None, it's set to ``1. / n_components``. + + mean_precision_prior : float or None, default=None + The precision prior on the mean distribution (Gaussian). + Controls the extent of where means can be placed. Larger + values concentrate the cluster means around `mean_prior`. + The value of the parameter must be greater than 0. + If it is None, it is set to 1. + + mean_prior : array-like, shape (n_features,), default=None + The prior on the mean distribution (Gaussian). + If it is None, it is set to the mean of X. + + degrees_of_freedom_prior : float or None, default=None + The prior of the number of degrees of freedom on the covariance + distributions (Wishart). If it is None, it's set to `n_features`. + + covariance_prior : float or array-like, default=None + The prior on the covariance distribution (Wishart). + If it is None, the emiprical covariance prior is initialized using the + covariance of X. The shape depends on `covariance_type`:: + + (n_features, n_features) if 'full', + (n_features, n_features) if 'tied', + (n_features) if 'diag', + float if 'spherical' + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to the method chosen to initialize the + parameters (see `init_params`). + In addition, it controls the generation of random samples from the + fitted distribution (see the method `sample`). + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + If 'warm_start' is True, the solution of the last fitting is used as + initialization for the next call of fit(). This can speed up + convergence when fit is called several times on similar problems. + See :term:`the Glossary `. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints the current + initialization and each iteration step. If greater than 1 then + it prints also the log probability and the time needed + for each step. + + verbose_interval : int, default=10 + Number of iteration done before the next print. + + Attributes + ---------- + weights_ : array-like of shape (n_components,) + The weights of each mixture components. + + means_ : array-like of shape (n_components, n_features) + The mean of each mixture component. + + covariances_ : array-like + The covariance of each mixture component. + The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + precisions_ : array-like + The precision matrices for each component in the mixture. A precision + matrix is the inverse of a covariance matrix. A covariance matrix is + symmetric positive definite so the mixture of Gaussian can be + equivalently parameterized by the precision matrices. 
Storing the + precision matrices instead of the covariance matrices makes it more + efficient to compute the log-likelihood of new samples at test time. + The shape depends on ``covariance_type``:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + precisions_cholesky_ : array-like + The cholesky decomposition of the precision matrices of each mixture + component. A precision matrix is the inverse of a covariance matrix. + A covariance matrix is symmetric positive definite so the mixture of + Gaussian can be equivalently parameterized by the precision matrices. + Storing the precision matrices instead of the covariance matrices makes + it more efficient to compute the log-likelihood of new samples at test + time. The shape depends on ``covariance_type``:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + converged_ : bool + True when convergence of the best fit of inference was reached, False otherwise. + + n_iter_ : int + Number of step used by the best fit of inference to reach the + convergence. + + lower_bound_ : float + Lower bound value on the model evidence (of the training data) of the + best fit of inference. + + lower_bounds_ : array-like of shape (`n_iter_`,) + The list of lower bound values on the model evidence from each iteration + of the best fit of inference. + + weight_concentration_prior_ : tuple or float + The dirichlet concentration of each component on the weight + distribution (Dirichlet). The type depends on + ``weight_concentration_prior_type``:: + + (float, float) if 'dirichlet_process' (Beta parameters), + float if 'dirichlet_distribution' (Dirichlet parameters). + + The higher concentration puts more mass in + the center and will lead to more components being active, while a lower + concentration parameter will lead to more mass at the edge of the + simplex. + + weight_concentration_ : array-like of shape (n_components,) + The dirichlet concentration of each component on the weight + distribution (Dirichlet). + + mean_precision_prior_ : float + The precision prior on the mean distribution (Gaussian). + Controls the extent of where means can be placed. + Larger values concentrate the cluster means around `mean_prior`. + If mean_precision_prior is set to None, `mean_precision_prior_` is set + to 1. + + mean_precision_ : array-like of shape (n_components,) + The precision of each components on the mean distribution (Gaussian). + + mean_prior_ : array-like of shape (n_features,) + The prior on the mean distribution (Gaussian). + + degrees_of_freedom_prior_ : float + The prior of the number of degrees of freedom on the covariance + distributions (Wishart). + + degrees_of_freedom_ : array-like of shape (n_components,) + The number of degrees of freedom of each components in the model. + + covariance_prior_ : float or array-like + The prior on the covariance distribution (Wishart). + The shape depends on `covariance_type`:: + + (n_features, n_features) if 'full', + (n_features, n_features) if 'tied', + (n_features) if 'diag', + float if 'spherical' + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. 
versionadded:: 1.0 + + See Also + -------- + GaussianMixture : Finite Gaussian mixture fit with EM. + + References + ---------- + + .. [1] `Bishop, Christopher M. (2006). "Pattern recognition and machine + learning". Vol. 4 No. 4. New York: Springer. + `_ + + .. [2] `Hagai Attias. (2000). "A Variational Bayesian Framework for + Graphical Models". In Advances in Neural Information Processing + Systems 12. + `_ + + .. [3] `Blei, David M. and Michael I. Jordan. (2006). "Variational + inference for Dirichlet process mixtures". Bayesian analysis 1.1 + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.mixture import BayesianGaussianMixture + >>> X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [12, 4], [10, 7]]) + >>> bgm = BayesianGaussianMixture(n_components=2, random_state=42).fit(X) + >>> bgm.means_ + array([[2.49 , 2.29], + [8.45, 4.52 ]]) + >>> bgm.predict([[0, 0], [9, 3]]) + array([0, 1]) + """ + + _parameter_constraints: dict = { + **BaseMixture._parameter_constraints, + "covariance_type": [StrOptions({"spherical", "tied", "diag", "full"})], + "weight_concentration_prior_type": [ + StrOptions({"dirichlet_process", "dirichlet_distribution"}) + ], + "weight_concentration_prior": [ + None, + Interval(Real, 0.0, None, closed="neither"), + ], + "mean_precision_prior": [None, Interval(Real, 0.0, None, closed="neither")], + "mean_prior": [None, "array-like"], + "degrees_of_freedom_prior": [None, Interval(Real, 0.0, None, closed="neither")], + "covariance_prior": [ + None, + "array-like", + Interval(Real, 0.0, None, closed="neither"), + ], + } + + def __init__( + self, + *, + n_components=1, + covariance_type="full", + tol=1e-3, + reg_covar=1e-6, + max_iter=100, + n_init=1, + init_params="kmeans", + weight_concentration_prior_type="dirichlet_process", + weight_concentration_prior=None, + mean_precision_prior=None, + mean_prior=None, + degrees_of_freedom_prior=None, + covariance_prior=None, + random_state=None, + warm_start=False, + verbose=0, + verbose_interval=10, + ): + super().__init__( + n_components=n_components, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + n_init=n_init, + init_params=init_params, + random_state=random_state, + warm_start=warm_start, + verbose=verbose, + verbose_interval=verbose_interval, + ) + + self.covariance_type = covariance_type + self.weight_concentration_prior_type = weight_concentration_prior_type + self.weight_concentration_prior = weight_concentration_prior + self.mean_precision_prior = mean_precision_prior + self.mean_prior = mean_prior + self.degrees_of_freedom_prior = degrees_of_freedom_prior + self.covariance_prior = covariance_prior + + def _check_parameters(self, X): + """Check that the parameters are well defined. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + self._check_weights_parameters() + self._check_means_parameters(X) + self._check_precision_parameters(X) + self._checkcovariance_prior_parameter(X) + + def _check_weights_parameters(self): + """Check the parameter of the Dirichlet distribution.""" + if self.weight_concentration_prior is None: + self.weight_concentration_prior_ = 1.0 / self.n_components + else: + self.weight_concentration_prior_ = self.weight_concentration_prior + + def _check_means_parameters(self, X): + """Check the parameters of the Gaussian distribution. 
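# Hedged sketch (synthetic data and threshold assumed) of the behaviour
# described in the docstring above: with a Dirichlet-process prior the model
# can drive the weights of surplus components toward zero.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture

X, _ = make_blobs(n_samples=400, centers=2, random_state=4)
bgm = BayesianGaussianMixture(
    n_components=6,
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=1e-2,
    max_iter=500,
    random_state=0,
).fit(X)
print(np.round(bgm.weights_, 3))
print("effective components:", int((bgm.weights_ > 1e-2).sum()))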
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + _, n_features = X.shape + + if self.mean_precision_prior is None: + self.mean_precision_prior_ = 1.0 + else: + self.mean_precision_prior_ = self.mean_precision_prior + + if self.mean_prior is None: + self.mean_prior_ = X.mean(axis=0) + else: + self.mean_prior_ = check_array( + self.mean_prior, dtype=[np.float64, np.float32], ensure_2d=False + ) + _check_shape(self.mean_prior_, (n_features,), "means") + + def _check_precision_parameters(self, X): + """Check the prior parameters of the precision distribution. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + _, n_features = X.shape + + if self.degrees_of_freedom_prior is None: + self.degrees_of_freedom_prior_ = n_features + elif self.degrees_of_freedom_prior > n_features - 1.0: + self.degrees_of_freedom_prior_ = self.degrees_of_freedom_prior + else: + raise ValueError( + "The parameter 'degrees_of_freedom_prior' " + "should be greater than %d, but got %.3f." + % (n_features - 1, self.degrees_of_freedom_prior) + ) + + def _checkcovariance_prior_parameter(self, X): + """Check the `covariance_prior_`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + """ + _, n_features = X.shape + + if self.covariance_prior is None: + self.covariance_prior_ = { + "full": np.atleast_2d(np.cov(X.T)), + "tied": np.atleast_2d(np.cov(X.T)), + "diag": np.var(X, axis=0, ddof=1), + "spherical": np.var(X, axis=0, ddof=1).mean(), + }[self.covariance_type] + + elif self.covariance_type in ["full", "tied"]: + self.covariance_prior_ = check_array( + self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False + ) + _check_shape( + self.covariance_prior_, + (n_features, n_features), + "%s covariance_prior" % self.covariance_type, + ) + _check_precision_matrix(self.covariance_prior_, self.covariance_type) + elif self.covariance_type == "diag": + self.covariance_prior_ = check_array( + self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False + ) + _check_shape( + self.covariance_prior_, + (n_features,), + "%s covariance_prior" % self.covariance_type, + ) + _check_precision_positivity(self.covariance_prior_, self.covariance_type) + # spherical case + else: + self.covariance_prior_ = self.covariance_prior + + def _initialize(self, X, resp): + """Initialization of the mixture parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + resp : array-like of shape (n_samples, n_components) + """ + nk, xk, sk = _estimate_gaussian_parameters( + X, resp, self.reg_covar, self.covariance_type + ) + + self._estimate_weights(nk) + self._estimate_means(nk, xk) + self._estimate_precisions(nk, xk, sk) + + def _estimate_weights(self, nk): + """Estimate the parameters of the Dirichlet distribution. + + Parameters + ---------- + nk : array-like of shape (n_components,) + """ + if self.weight_concentration_prior_type == "dirichlet_process": + # For dirichlet process weight_concentration will be a tuple + # containing the two parameters of the beta distribution + self.weight_concentration_ = ( + 1.0 + nk, + ( + self.weight_concentration_prior_ + + np.hstack((np.cumsum(nk[::-1])[-2::-1], 0)) + ), + ) + else: + # case Variational Gaussian mixture with dirichlet distribution + self.weight_concentration_ = self.weight_concentration_prior_ + nk + + def _estimate_means(self, nk, xk): + """Estimate the parameters of the Gaussian distribution. 
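# Hedged sketch of the stick-breaking construction used above for the
# 'dirichlet_process' prior: expected mixture weights are recovered from the
# two beta parameters produced by ``_estimate_weights`` (the same formula is
# applied later in ``_set_parameters``).  Toy numbers assumed.
import numpy as np

a = np.array([5.0, 3.0, 1.0])                 # first beta parameters
b = np.array([4.0, 1.0, 0.5])                 # second beta parameters
tmp = b / (a + b)
weights = a / (a + b) * np.hstack((1, np.cumprod(tmp[:-1])))
weights /= weights.sum()
print(np.round(weights, 3))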
+ + Parameters + ---------- + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + """ + self.mean_precision_ = self.mean_precision_prior_ + nk + self.means_ = ( + self.mean_precision_prior_ * self.mean_prior_ + nk[:, np.newaxis] * xk + ) / self.mean_precision_[:, np.newaxis] + + def _estimate_precisions(self, nk, xk, sk): + """Estimate the precisions parameters of the precision distribution. + + Parameters + ---------- + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like + The shape depends of `covariance_type`: + 'full' : (n_components, n_features, n_features) + 'tied' : (n_features, n_features) + 'diag' : (n_components, n_features) + 'spherical' : (n_components,) + """ + { + "full": self._estimate_wishart_full, + "tied": self._estimate_wishart_tied, + "diag": self._estimate_wishart_diag, + "spherical": self._estimate_wishart_spherical, + }[self.covariance_type](nk, xk, sk) + + self.precisions_cholesky_ = _compute_precision_cholesky( + self.covariances_, self.covariance_type + ) + + def _estimate_wishart_full(self, nk, xk, sk): + """Estimate the full Wishart distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_components, n_features, n_features) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` is + # the correct formula + self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk + + self.covariances_ = np.empty((self.n_components, n_features, n_features)) + + for k in range(self.n_components): + diff = xk[k] - self.mean_prior_ + self.covariances_[k] = ( + self.covariance_prior_ + + nk[k] * sk[k] + + nk[k] + * self.mean_precision_prior_ + / self.mean_precision_[k] + * np.outer(diff, diff) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis, np.newaxis] + + def _estimate_wishart_tied(self, nk, xk, sk): + """Estimate the tied Wishart distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_features, n_features) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` + # is the correct formula + self.degrees_of_freedom_ = ( + self.degrees_of_freedom_prior_ + nk.sum() / self.n_components + ) + + diff = xk - self.mean_prior_ + self.covariances_ = ( + self.covariance_prior_ + + sk * nk.sum() / self.n_components + + self.mean_precision_prior_ + / self.n_components + * np.dot((nk / self.mean_precision_) * diff.T, diff) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_ + + def _estimate_wishart_diag(self, nk, xk, sk): + """Estimate the diag Wishart distribution parameters. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_components, n_features) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` + # is the correct formula + self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk + + diff = xk - self.mean_prior_ + self.covariances_ = self.covariance_prior_ + nk[:, np.newaxis] * ( + sk + + (self.mean_precision_prior_ / self.mean_precision_)[:, np.newaxis] + * np.square(diff) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis] + + def _estimate_wishart_spherical(self, nk, xk, sk): + """Estimate the spherical Wishart distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + xk : array-like of shape (n_components, n_features) + + sk : array-like of shape (n_components,) + """ + _, n_features = xk.shape + + # Warning : in some Bishop book, there is a typo on the formula 10.63 + # `degrees_of_freedom_k = degrees_of_freedom_0 + Nk` + # is the correct formula + self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk + + diff = xk - self.mean_prior_ + self.covariances_ = self.covariance_prior_ + nk * ( + sk + + self.mean_precision_prior_ + / self.mean_precision_ + * np.mean(np.square(diff), 1) + ) + + # Contrary to the original bishop book, we normalize the covariances + self.covariances_ /= self.degrees_of_freedom_ + + def _m_step(self, X, log_resp): + """M step. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array-like of shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + """ + n_samples, _ = X.shape + + nk, xk, sk = _estimate_gaussian_parameters( + X, np.exp(log_resp), self.reg_covar, self.covariance_type + ) + self._estimate_weights(nk) + self._estimate_means(nk, xk) + self._estimate_precisions(nk, xk, sk) + + def _estimate_log_weights(self): + if self.weight_concentration_prior_type == "dirichlet_process": + digamma_sum = digamma( + self.weight_concentration_[0] + self.weight_concentration_[1] + ) + digamma_a = digamma(self.weight_concentration_[0]) + digamma_b = digamma(self.weight_concentration_[1]) + return ( + digamma_a + - digamma_sum + + np.hstack((0, np.cumsum(digamma_b - digamma_sum)[:-1])) + ) + else: + # case Variational Gaussian mixture with dirichlet distribution + return digamma(self.weight_concentration_) - digamma( + np.sum(self.weight_concentration_) + ) + + def _estimate_log_prob(self, X): + _, n_features = X.shape + # We remove `n_features * np.log(self.degrees_of_freedom_)` because + # the precision matrix is normalized + log_gauss = _estimate_log_gaussian_prob( + X, self.means_, self.precisions_cholesky_, self.covariance_type + ) - 0.5 * n_features * np.log(self.degrees_of_freedom_) + + log_lambda = n_features * np.log(2.0) + np.sum( + digamma( + 0.5 + * (self.degrees_of_freedom_ - np.arange(0, n_features)[:, np.newaxis]) + ), + 0, + ) + + return log_gauss + 0.5 * (log_lambda - n_features / self.mean_precision_) + + def _compute_lower_bound(self, log_resp, log_prob_norm): + """Estimate the lower bound of the model. 
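# Hedged numeric sketch of the 'dirichlet_distribution' branch of
# ``_estimate_log_weights`` above: E[log pi_k] under a Dirichlet is
# digamma(alpha_k) - digamma(sum(alpha)).  Toy concentrations assumed.
import numpy as np
from scipy.special import digamma

alpha = np.array([4.0, 2.0, 1.0])             # weight_concentration_
log_w = digamma(alpha) - digamma(alpha.sum())
print(np.round(np.exp(log_w), 3))             # geometric-mean weights, sum < 1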
+ + The lower bound on the likelihood (of the training data with respect to + the model) is used to detect the convergence and has to increase at + each iteration. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array, shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + + log_prob_norm : float + Logarithm of the probability of each sample in X. + + Returns + ------- + lower_bound : float + """ + # Contrary to the original formula, we have done some simplification + # and removed all the constant terms. + (n_features,) = self.mean_prior_.shape + + # We removed `.5 * n_features * np.log(self.degrees_of_freedom_)` + # because the precision matrix is normalized. + log_det_precisions_chol = _compute_log_det_cholesky( + self.precisions_cholesky_, self.covariance_type, n_features + ) - 0.5 * n_features * np.log(self.degrees_of_freedom_) + + if self.covariance_type == "tied": + log_wishart = self.n_components * np.float64( + _log_wishart_norm( + self.degrees_of_freedom_, log_det_precisions_chol, n_features + ) + ) + else: + log_wishart = np.sum( + _log_wishart_norm( + self.degrees_of_freedom_, log_det_precisions_chol, n_features + ) + ) + + if self.weight_concentration_prior_type == "dirichlet_process": + log_norm_weight = -np.sum( + betaln(self.weight_concentration_[0], self.weight_concentration_[1]) + ) + else: + log_norm_weight = _log_dirichlet_norm(self.weight_concentration_) + + return ( + -np.sum(np.exp(log_resp) * log_resp) + - log_wishart + - log_norm_weight + - 0.5 * n_features * np.sum(np.log(self.mean_precision_)) + ) + + def _get_parameters(self): + return ( + self.weight_concentration_, + self.mean_precision_, + self.means_, + self.degrees_of_freedom_, + self.covariances_, + self.precisions_cholesky_, + ) + + def _set_parameters(self, params): + ( + self.weight_concentration_, + self.mean_precision_, + self.means_, + self.degrees_of_freedom_, + self.covariances_, + self.precisions_cholesky_, + ) = params + + # Weights computation + if self.weight_concentration_prior_type == "dirichlet_process": + weight_dirichlet_sum = ( + self.weight_concentration_[0] + self.weight_concentration_[1] + ) + tmp = self.weight_concentration_[1] / weight_dirichlet_sum + self.weights_ = ( + self.weight_concentration_[0] + / weight_dirichlet_sum + * np.hstack((1, np.cumprod(tmp[:-1]))) + ) + self.weights_ /= np.sum(self.weights_) + else: + self.weights_ = self.weight_concentration_ / np.sum( + self.weight_concentration_ + ) + + # Precisions matrices computation + if self.covariance_type == "full": + self.precisions_ = np.array( + [ + np.dot(prec_chol, prec_chol.T) + for prec_chol in self.precisions_cholesky_ + ] + ) + + elif self.covariance_type == "tied": + self.precisions_ = np.dot( + self.precisions_cholesky_, self.precisions_cholesky_.T + ) + else: + self.precisions_ = self.precisions_cholesky_**2 diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/_gaussian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/_gaussian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..c4bdd3a0d68c81c73bcf6d606cf09bdd52aff66c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/_gaussian_mixture.py @@ -0,0 +1,934 @@ +"""Gaussian Mixture Model.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy import linalg + +from ..utils import check_array +from 
..utils._param_validation import StrOptions +from ..utils.extmath import row_norms +from ._base import BaseMixture, _check_shape + +############################################################################### +# Gaussian mixture shape checkers used by the GaussianMixture class + + +def _check_weights(weights, n_components): + """Check the user provided 'weights'. + + Parameters + ---------- + weights : array-like of shape (n_components,) + The proportions of components of each mixture. + + n_components : int + Number of components. + + Returns + ------- + weights : array, shape (n_components,) + """ + weights = check_array(weights, dtype=[np.float64, np.float32], ensure_2d=False) + _check_shape(weights, (n_components,), "weights") + + # check range + if any(np.less(weights, 0.0)) or any(np.greater(weights, 1.0)): + raise ValueError( + "The parameter 'weights' should be in the range " + "[0, 1], but got max value %.5f, min value %.5f" + % (np.min(weights), np.max(weights)) + ) + + # check normalization + atol = 1e-6 if weights.dtype == np.float32 else 1e-8 + if not np.allclose(np.abs(1.0 - np.sum(weights)), 0.0, atol=atol): + raise ValueError( + "The parameter 'weights' should be normalized, but got sum(weights) = %.5f" + % np.sum(weights) + ) + return weights + + +def _check_means(means, n_components, n_features): + """Validate the provided 'means'. + + Parameters + ---------- + means : array-like of shape (n_components, n_features) + The centers of the current components. + + n_components : int + Number of components. + + n_features : int + Number of features. + + Returns + ------- + means : array, (n_components, n_features) + """ + means = check_array(means, dtype=[np.float64, np.float32], ensure_2d=False) + _check_shape(means, (n_components, n_features), "means") + return means + + +def _check_precision_positivity(precision, covariance_type): + """Check a precision vector is positive-definite.""" + if np.any(np.less_equal(precision, 0.0)): + raise ValueError("'%s precision' should be positive" % covariance_type) + + +def _check_precision_matrix(precision, covariance_type): + """Check a precision matrix is symmetric and positive-definite.""" + if not ( + np.allclose(precision, precision.T) and np.all(linalg.eigvalsh(precision) > 0.0) + ): + raise ValueError( + "'%s precision' should be symmetric, positive-definite" % covariance_type + ) + + +def _check_precisions_full(precisions, covariance_type): + """Check the precision matrices are symmetric and positive-definite.""" + for prec in precisions: + _check_precision_matrix(prec, covariance_type) + + +def _check_precisions(precisions, covariance_type, n_components, n_features): + """Validate user provided precisions. + + Parameters + ---------- + precisions : array-like + 'full' : shape of (n_components, n_features, n_features) + 'tied' : shape of (n_features, n_features) + 'diag' : shape of (n_components, n_features) + 'spherical' : shape of (n_components,) + + covariance_type : str + + n_components : int + Number of components. + + n_features : int + Number of features. 
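# Hedged numeric sketch of the validation above: a 'tied' precision matrix
# must be symmetric with strictly positive eigenvalues.  Toy matrices assumed.
import numpy as np
from scipy import linalg

good = np.array([[2.0, 0.3], [0.3, 1.0]])
bad = np.array([[1.0, 2.0], [2.0, 1.0]])      # symmetric but indefinite

for prec in (good, bad):
    ok = np.allclose(prec, prec.T) and np.all(linalg.eigvalsh(prec) > 0.0)
    print(ok)                                 # True, then False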
+ + Returns + ------- + precisions : array + """ + precisions = check_array( + precisions, + dtype=[np.float64, np.float32], + ensure_2d=False, + allow_nd=covariance_type == "full", + ) + + precisions_shape = { + "full": (n_components, n_features, n_features), + "tied": (n_features, n_features), + "diag": (n_components, n_features), + "spherical": (n_components,), + } + _check_shape( + precisions, precisions_shape[covariance_type], "%s precision" % covariance_type + ) + + _check_precisions = { + "full": _check_precisions_full, + "tied": _check_precision_matrix, + "diag": _check_precision_positivity, + "spherical": _check_precision_positivity, + } + _check_precisions[covariance_type](precisions, covariance_type) + return precisions + + +############################################################################### +# Gaussian mixture parameters estimators (used by the M-Step) + + +def _estimate_gaussian_covariances_full(resp, X, nk, means, reg_covar): + """Estimate the full covariance matrices. + + Parameters + ---------- + resp : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + covariances : array, shape (n_components, n_features, n_features) + The covariance matrix of the current components. + """ + n_components, n_features = means.shape + covariances = np.empty((n_components, n_features, n_features), dtype=X.dtype) + for k in range(n_components): + diff = X - means[k] + covariances[k] = np.dot(resp[:, k] * diff.T, diff) / nk[k] + covariances[k].flat[:: n_features + 1] += reg_covar + return covariances + + +def _estimate_gaussian_covariances_tied(resp, X, nk, means, reg_covar): + """Estimate the tied covariance matrix. + + Parameters + ---------- + resp : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + covariance : array, shape (n_features, n_features) + The tied covariance matrix of the components. + """ + avg_X2 = np.dot(X.T, X) + avg_means2 = np.dot(nk * means.T, means) + covariance = avg_X2 - avg_means2 + covariance /= nk.sum() + covariance.flat[:: len(covariance) + 1] += reg_covar + return covariance + + +def _estimate_gaussian_covariances_diag(resp, X, nk, means, reg_covar): + """Estimate the diagonal covariance vectors. + + Parameters + ---------- + responsibilities : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + covariances : array, shape (n_components, n_features) + The covariance vector of the current components. + """ + avg_X2 = np.dot(resp.T, X * X) / nk[:, np.newaxis] + avg_means2 = means**2 + return avg_X2 - avg_means2 + reg_covar + + +def _estimate_gaussian_covariances_spherical(resp, X, nk, means, reg_covar): + """Estimate the spherical variance values. 
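# Hedged sketch mirroring ``_estimate_gaussian_covariances_full`` above for a
# single component: a responsibility-weighted scatter matrix with
# ``reg_covar`` added to the diagonal.  Toy responsibilities assumed.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 2))
resp = rng.dirichlet(alpha=[1.0, 1.0], size=50)     # (n_samples, 2 components)
reg_covar = 1e-6

nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps
means = resp.T @ X / nk[:, np.newaxis]
diff = X - means[0]
cov_0 = (resp[:, 0] * diff.T) @ diff / nk[0]
cov_0[np.diag_indices_from(cov_0)] += reg_covar
print(cov_0)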
+ + Parameters + ---------- + responsibilities : array-like of shape (n_samples, n_components) + + X : array-like of shape (n_samples, n_features) + + nk : array-like of shape (n_components,) + + means : array-like of shape (n_components, n_features) + + reg_covar : float + + Returns + ------- + variances : array, shape (n_components,) + The variance values of each components. + """ + return _estimate_gaussian_covariances_diag(resp, X, nk, means, reg_covar).mean(1) + + +def _estimate_gaussian_parameters(X, resp, reg_covar, covariance_type): + """Estimate the Gaussian distribution parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data array. + + resp : array-like of shape (n_samples, n_components) + The responsibilities for each data sample in X. + + reg_covar : float + The regularization added to the diagonal of the covariance matrices. + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + The type of precision matrices. + + Returns + ------- + nk : array-like of shape (n_components,) + The numbers of data samples in the current components. + + means : array-like of shape (n_components, n_features) + The centers of the current components. + + covariances : array-like + The covariance matrix of the current components. + The shape depends of the covariance_type. + """ + nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps + means = np.dot(resp.T, X) / nk[:, np.newaxis] + covariances = { + "full": _estimate_gaussian_covariances_full, + "tied": _estimate_gaussian_covariances_tied, + "diag": _estimate_gaussian_covariances_diag, + "spherical": _estimate_gaussian_covariances_spherical, + }[covariance_type](resp, X, nk, means, reg_covar) + return nk, means, covariances + + +def _compute_precision_cholesky(covariances, covariance_type): + """Compute the Cholesky decomposition of the precisions. + + Parameters + ---------- + covariances : array-like + The covariance matrix of the current components. + The shape depends of the covariance_type. + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + The type of precision matrices. + + Returns + ------- + precisions_cholesky : array-like + The cholesky decomposition of sample precisions of the current + components. The shape depends of the covariance_type. + """ + estimate_precision_error_message = ( + "Fitting the mixture model failed because some components have " + "ill-defined empirical covariance (for instance caused by singleton " + "or collapsed samples). Try to decrease the number of components, " + "increase reg_covar, or scale the input data." + ) + dtype = covariances.dtype + if dtype == np.float32: + estimate_precision_error_message += ( + " The numerical accuracy can also be improved by passing float64" + " data instead of float32." 
+ ) + + if covariance_type == "full": + n_components, n_features, _ = covariances.shape + precisions_chol = np.empty((n_components, n_features, n_features), dtype=dtype) + for k, covariance in enumerate(covariances): + try: + cov_chol = linalg.cholesky(covariance, lower=True) + except linalg.LinAlgError: + raise ValueError(estimate_precision_error_message) + precisions_chol[k] = linalg.solve_triangular( + cov_chol, np.eye(n_features, dtype=dtype), lower=True + ).T + elif covariance_type == "tied": + _, n_features = covariances.shape + try: + cov_chol = linalg.cholesky(covariances, lower=True) + except linalg.LinAlgError: + raise ValueError(estimate_precision_error_message) + precisions_chol = linalg.solve_triangular( + cov_chol, np.eye(n_features, dtype=dtype), lower=True + ).T + else: + if np.any(np.less_equal(covariances, 0.0)): + raise ValueError(estimate_precision_error_message) + precisions_chol = 1.0 / np.sqrt(covariances) + return precisions_chol + + +def _flipudlr(array): + """Reverse the rows and columns of an array.""" + return np.flipud(np.fliplr(array)) + + +def _compute_precision_cholesky_from_precisions(precisions, covariance_type): + r"""Compute the Cholesky decomposition of precisions using precisions themselves. + + As implemented in :func:`_compute_precision_cholesky`, the `precisions_cholesky_` is + an upper-triangular matrix for each Gaussian component, which can be expressed as + the $UU^T$ factorization of the precision matrix for each Gaussian component, where + $U$ is an upper-triangular matrix. + + In order to use the Cholesky decomposition to get $UU^T$, the precision matrix + $\Lambda$ needs to be permutated such that its rows and columns are reversed, which + can be done by applying a similarity transformation with an exchange matrix $J$, + where the 1 elements reside on the anti-diagonal and all other elements are 0. In + particular, the Cholesky decomposition of the transformed precision matrix is + $J\Lambda J=LL^T$, where $L$ is a lower-triangular matrix. Because $\Lambda=UU^T$ + and $J=J^{-1}=J^T$, the `precisions_cholesky_` for each Gaussian component can be + expressed as $JLJ$. + + Refer to #26415 for details. + + Parameters + ---------- + precisions : array-like + The precision matrix of the current components. + The shape depends on the covariance_type. + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + The type of precision matrices. + + Returns + ------- + precisions_cholesky : array-like + The cholesky decomposition of sample precisions of the current + components. The shape depends on the covariance_type. + """ + if covariance_type == "full": + precisions_cholesky = np.array( + [ + _flipudlr(linalg.cholesky(_flipudlr(precision), lower=True)) + for precision in precisions + ] + ) + elif covariance_type == "tied": + precisions_cholesky = _flipudlr( + linalg.cholesky(_flipudlr(precisions), lower=True) + ) + else: + precisions_cholesky = np.sqrt(precisions) + return precisions_cholesky + + +############################################################################### +# Gaussian mixture probability estimators +def _compute_log_det_cholesky(matrix_chol, covariance_type, n_features): + """Compute the log-det of the cholesky decomposition of matrices. + + Parameters + ---------- + matrix_chol : array-like + Cholesky decompositions of the matrices. 
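# Hedged numeric check of the convention implemented above for 'full'
# covariances: the stored Cholesky factor U satisfies
# precision = U @ U.T = inv(covariance).  Toy covariance assumed.
import numpy as np
from scipy import linalg

cov = np.array([[2.0, 0.4], [0.4, 1.0]])
cov_chol = linalg.cholesky(cov, lower=True)
prec_chol = linalg.solve_triangular(cov_chol, np.eye(2), lower=True).T
assert np.allclose(prec_chol @ prec_chol.T, np.linalg.inv(cov))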
+ 'full' : shape of (n_components, n_features, n_features) + 'tied' : shape of (n_features, n_features) + 'diag' : shape of (n_components, n_features) + 'spherical' : shape of (n_components,) + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + + n_features : int + Number of features. + + Returns + ------- + log_det_precision_chol : array-like of shape (n_components,) + The determinant of the precision matrix for each component. + """ + if covariance_type == "full": + n_components, _, _ = matrix_chol.shape + log_det_chol = np.sum( + np.log(matrix_chol.reshape(n_components, -1)[:, :: n_features + 1]), axis=1 + ) + + elif covariance_type == "tied": + log_det_chol = np.sum(np.log(np.diag(matrix_chol))) + + elif covariance_type == "diag": + log_det_chol = np.sum(np.log(matrix_chol), axis=1) + + else: + log_det_chol = n_features * np.log(matrix_chol) + + return log_det_chol + + +def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type): + """Estimate the log Gaussian probability. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + means : array-like of shape (n_components, n_features) + + precisions_chol : array-like + Cholesky decompositions of the precision matrices. + 'full' : shape of (n_components, n_features, n_features) + 'tied' : shape of (n_features, n_features) + 'diag' : shape of (n_components, n_features) + 'spherical' : shape of (n_components,) + + covariance_type : {'full', 'tied', 'diag', 'spherical'} + + Returns + ------- + log_prob : array, shape (n_samples, n_components) + """ + n_samples, n_features = X.shape + n_components, _ = means.shape + # The determinant of the precision matrix from the Cholesky decomposition + # corresponds to the negative half of the determinant of the full precision + # matrix. + # In short: det(precision_chol) = - det(precision) / 2 + log_det = _compute_log_det_cholesky(precisions_chol, covariance_type, n_features) + + if covariance_type == "full": + log_prob = np.empty((n_samples, n_components), dtype=X.dtype) + for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)): + y = np.dot(X, prec_chol) - np.dot(mu, prec_chol) + log_prob[:, k] = np.sum(np.square(y), axis=1) + + elif covariance_type == "tied": + log_prob = np.empty((n_samples, n_components), dtype=X.dtype) + for k, mu in enumerate(means): + y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol) + log_prob[:, k] = np.sum(np.square(y), axis=1) + + elif covariance_type == "diag": + precisions = precisions_chol**2 + log_prob = ( + np.sum((means**2 * precisions), 1) + - 2.0 * np.dot(X, (means * precisions).T) + + np.dot(X**2, precisions.T) + ) + + elif covariance_type == "spherical": + precisions = precisions_chol**2 + log_prob = ( + np.sum(means**2, 1) * precisions + - 2 * np.dot(X, means.T * precisions) + + np.outer(row_norms(X, squared=True), precisions) + ) + # Since we are using the precision of the Cholesky decomposition, + # `- 0.5 * log_det_precision` becomes `+ log_det_precision_chol` + return -0.5 * (n_features * np.log(2 * np.pi).astype(X.dtype) + log_prob) + log_det + + +class GaussianMixture(BaseMixture): + """Gaussian Mixture. + + Representation of a Gaussian mixture model probability distribution. + This class allows to estimate the parameters of a Gaussian mixture + distribution. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.18 + + Parameters + ---------- + n_components : int, default=1 + The number of mixture components. 
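# Hedged cross-check of ``_estimate_log_gaussian_prob`` above against scipy's
# reference density.  These are private helpers, imported here purely for
# illustration; toy parameters assumed.
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture._gaussian_mixture import (
    _compute_precision_cholesky,
    _estimate_log_gaussian_prob,
)

X = np.array([[0.0, 0.0], [1.0, 2.0]])
means = np.array([[0.5, 0.5]])
covs = np.array([[[1.5, 0.2], [0.2, 0.8]]])

prec_chol = _compute_precision_cholesky(covs, "full")
log_prob = _estimate_log_gaussian_prob(X, means, prec_chol, "full")
assert np.allclose(log_prob[:, 0], multivariate_normal(means[0], covs[0]).logpdf(X))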
+ + covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full' + String describing the type of covariance parameters to use. + Must be one of: + + - 'full': each component has its own general covariance matrix. + - 'tied': all components share the same general covariance matrix. + - 'diag': each component has its own diagonal covariance matrix. + - 'spherical': each component has its own single variance. + + For an example of using `covariance_type`, refer to + :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`. + + tol : float, default=1e-3 + The convergence threshold. EM iterations will stop when the + lower bound average gain is below this threshold. + + reg_covar : float, default=1e-6 + Non-negative regularization added to the diagonal of covariance. + Allows to assure that the covariance matrices are all positive. + + max_iter : int, default=100 + The number of EM iterations to perform. + + n_init : int, default=1 + The number of initializations to perform. The best results are kept. + + init_params : {'kmeans', 'k-means++', 'random', 'random_from_data'}, \ + default='kmeans' + The method used to initialize the weights, the means and the + precisions. + String must be one of: + + - 'kmeans' : responsibilities are initialized using kmeans. + - 'k-means++' : use the k-means++ method to initialize. + - 'random' : responsibilities are initialized randomly. + - 'random_from_data' : initial means are randomly selected data points. + + .. versionchanged:: v1.1 + `init_params` now accepts 'random_from_data' and 'k-means++' as + initialization methods. + + weights_init : array-like of shape (n_components, ), default=None + The user-provided initial weights. + If it is None, weights are initialized using the `init_params` method. + + means_init : array-like of shape (n_components, n_features), default=None + The user-provided initial means, + If it is None, means are initialized using the `init_params` method. + + precisions_init : array-like, default=None + The user-provided initial precisions (inverse of the covariance + matrices). + If it is None, precisions are initialized using the 'init_params' + method. + The shape depends on 'covariance_type':: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + random_state : int, RandomState instance or None, default=None + Controls the random seed given to the method chosen to initialize the + parameters (see `init_params`). + In addition, it controls the generation of random samples from the + fitted distribution (see the method `sample`). + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + warm_start : bool, default=False + If 'warm_start' is True, the solution of the last fitting is used as + initialization for the next call of fit(). This can speed up + convergence when fit is called several times on similar problems. + In that case, 'n_init' is ignored and only a single initialization + occurs upon the first call. + See :term:`the Glossary `. + + verbose : int, default=0 + Enable verbose output. If 1 then it prints the current + initialization and each iteration step. If greater than 1 then + it prints also the log probability and the time needed + for each step. + + verbose_interval : int, default=10 + Number of iteration done before the next print. 
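# Hedged sketch (synthetic data assumed) of the ``warm_start`` behaviour
# documented above: the second ``fit`` resumes from the previous solution
# instead of re-initialising.
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=300, centers=3, random_state=5)
gm = GaussianMixture(n_components=3, warm_start=True, max_iter=5, random_state=0)
gm.fit(X)                      # first call performs the initialisation
gm.fit(X)                      # later calls continue from the fitted parameters
print(gm.converged_, gm.n_iter_)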
+ + Attributes + ---------- + weights_ : array-like of shape (n_components,) + The weights of each mixture components. + + means_ : array-like of shape (n_components, n_features) + The mean of each mixture component. + + covariances_ : array-like + The covariance of each mixture component. + The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + For an example of using covariances, refer to + :ref:`sphx_glr_auto_examples_mixture_plot_gmm_covariances.py`. + + precisions_ : array-like + The precision matrices for each component in the mixture. A precision + matrix is the inverse of a covariance matrix. A covariance matrix is + symmetric positive definite so the mixture of Gaussian can be + equivalently parameterized by the precision matrices. Storing the + precision matrices instead of the covariance matrices makes it more + efficient to compute the log-likelihood of new samples at test time. + The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + precisions_cholesky_ : array-like + The cholesky decomposition of the precision matrices of each mixture + component. A precision matrix is the inverse of a covariance matrix. + A covariance matrix is symmetric positive definite so the mixture of + Gaussian can be equivalently parameterized by the precision matrices. + Storing the precision matrices instead of the covariance matrices makes + it more efficient to compute the log-likelihood of new samples at test + time. The shape depends on `covariance_type`:: + + (n_components,) if 'spherical', + (n_features, n_features) if 'tied', + (n_components, n_features) if 'diag', + (n_components, n_features, n_features) if 'full' + + converged_ : bool + True when convergence of the best fit of EM was reached, False otherwise. + + n_iter_ : int + Number of step used by the best fit of EM to reach the convergence. + + lower_bound_ : float + Lower bound value on the log-likelihood (of the training data with + respect to the model) of the best fit of EM. + + lower_bounds_ : array-like of shape (`n_iter_`,) + The list of lower bound values on the log-likelihood from each + iteration of the best fit of EM. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + BayesianGaussianMixture : Gaussian mixture model fit with a variational + inference. 
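+        Unlike this estimator, it can infer the effective number of
+        components from the data by shrinking the weights of superfluous
+        components towards zero.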
+ + Examples + -------- + >>> import numpy as np + >>> from sklearn.mixture import GaussianMixture + >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) + >>> gm = GaussianMixture(n_components=2, random_state=0).fit(X) + >>> gm.means_ + array([[10., 2.], + [ 1., 2.]]) + >>> gm.predict([[0, 0], [12, 3]]) + array([1, 0]) + + For a comparison of Gaussian Mixture with other clustering algorithms, see + :ref:`sphx_glr_auto_examples_cluster_plot_cluster_comparison.py` + """ + + _parameter_constraints: dict = { + **BaseMixture._parameter_constraints, + "covariance_type": [StrOptions({"full", "tied", "diag", "spherical"})], + "weights_init": ["array-like", None], + "means_init": ["array-like", None], + "precisions_init": ["array-like", None], + } + + def __init__( + self, + n_components=1, + *, + covariance_type="full", + tol=1e-3, + reg_covar=1e-6, + max_iter=100, + n_init=1, + init_params="kmeans", + weights_init=None, + means_init=None, + precisions_init=None, + random_state=None, + warm_start=False, + verbose=0, + verbose_interval=10, + ): + super().__init__( + n_components=n_components, + tol=tol, + reg_covar=reg_covar, + max_iter=max_iter, + n_init=n_init, + init_params=init_params, + random_state=random_state, + warm_start=warm_start, + verbose=verbose, + verbose_interval=verbose_interval, + ) + + self.covariance_type = covariance_type + self.weights_init = weights_init + self.means_init = means_init + self.precisions_init = precisions_init + + def _check_parameters(self, X): + """Check the Gaussian mixture parameters are well defined.""" + _, n_features = X.shape + + if self.weights_init is not None: + self.weights_init = _check_weights(self.weights_init, self.n_components) + + if self.means_init is not None: + self.means_init = _check_means( + self.means_init, self.n_components, n_features + ) + + if self.precisions_init is not None: + self.precisions_init = _check_precisions( + self.precisions_init, + self.covariance_type, + self.n_components, + n_features, + ) + + def _initialize_parameters(self, X, random_state): + # If all the initial parameters are all provided, then there is no need to run + # the initialization. + compute_resp = ( + self.weights_init is None + or self.means_init is None + or self.precisions_init is None + ) + if compute_resp: + super()._initialize_parameters(X, random_state) + else: + self._initialize(X, None) + + def _initialize(self, X, resp): + """Initialization of the Gaussian mixture parameters. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + resp : array-like of shape (n_samples, n_components) + """ + n_samples, _ = X.shape + weights, means, covariances = None, None, None + if resp is not None: + weights, means, covariances = _estimate_gaussian_parameters( + X, resp, self.reg_covar, self.covariance_type + ) + if self.weights_init is None: + weights /= n_samples + + self.weights_ = weights if self.weights_init is None else self.weights_init + self.means_ = means if self.means_init is None else self.means_init + + if self.precisions_init is None: + self.covariances_ = covariances + self.precisions_cholesky_ = _compute_precision_cholesky( + covariances, self.covariance_type + ) + else: + self.precisions_cholesky_ = _compute_precision_cholesky_from_precisions( + self.precisions_init, self.covariance_type + ) + + def _m_step(self, X, log_resp): + """M step. 
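+
+        Re-estimates the weights, the means and the covariances from the
+        responsibilities computed during the E step, then refreshes the
+        Cholesky factors of the precision matrices accordingly.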
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + log_resp : array-like of shape (n_samples, n_components) + Logarithm of the posterior probabilities (or responsibilities) of + the point of each sample in X. + """ + self.weights_, self.means_, self.covariances_ = _estimate_gaussian_parameters( + X, np.exp(log_resp), self.reg_covar, self.covariance_type + ) + self.weights_ /= self.weights_.sum() + self.precisions_cholesky_ = _compute_precision_cholesky( + self.covariances_, self.covariance_type + ) + + def _estimate_log_prob(self, X): + return _estimate_log_gaussian_prob( + X, self.means_, self.precisions_cholesky_, self.covariance_type + ) + + def _estimate_log_weights(self): + return np.log(self.weights_) + + def _compute_lower_bound(self, _, log_prob_norm): + return log_prob_norm + + def _get_parameters(self): + return ( + self.weights_, + self.means_, + self.covariances_, + self.precisions_cholesky_, + ) + + def _set_parameters(self, params): + ( + self.weights_, + self.means_, + self.covariances_, + self.precisions_cholesky_, + ) = params + + # Attributes computation + _, n_features = self.means_.shape + + dtype = self.precisions_cholesky_.dtype + if self.covariance_type == "full": + self.precisions_ = np.empty_like(self.precisions_cholesky_) + for k, prec_chol in enumerate(self.precisions_cholesky_): + self.precisions_[k] = np.dot(prec_chol, prec_chol.T) + + elif self.covariance_type == "tied": + self.precisions_ = np.dot( + self.precisions_cholesky_, self.precisions_cholesky_.T + ) + else: + self.precisions_ = self.precisions_cholesky_**2 + + def _n_parameters(self): + """Return the number of free parameters in the model.""" + _, n_features = self.means_.shape + if self.covariance_type == "full": + cov_params = self.n_components * n_features * (n_features + 1) / 2.0 + elif self.covariance_type == "diag": + cov_params = self.n_components * n_features + elif self.covariance_type == "tied": + cov_params = n_features * (n_features + 1) / 2.0 + elif self.covariance_type == "spherical": + cov_params = self.n_components + mean_params = n_features * self.n_components + return int(cov_params + mean_params + self.n_components - 1) + + def bic(self, X): + """Bayesian information criterion for the current model on the input X. + + You can refer to this :ref:`mathematical section ` for more + details regarding the formulation of the BIC used. + + For an example of GMM selection using `bic` information criterion, + refer to :ref:`sphx_glr_auto_examples_mixture_plot_gmm_selection.py`. + + Parameters + ---------- + X : array of shape (n_samples, n_dimensions) + The input samples. + + Returns + ------- + bic : float + The lower the better. + """ + return -2 * self.score(X) * X.shape[0] + self._n_parameters() * np.log( + X.shape[0] + ) + + def aic(self, X): + """Akaike information criterion for the current model on the input X. + + You can refer to this :ref:`mathematical section ` for more + details regarding the formulation of the AIC used. + + Parameters + ---------- + X : array of shape (n_samples, n_dimensions) + The input samples. + + Returns + ------- + aic : float + The lower the better. 
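+            Computed as ``-2 * log_likelihood + 2 * n_parameters``, where the
+            log-likelihood is the total log-likelihood of `X` under the fitted
+            model, i.e. ``self.score(X) * X.shape[0]``.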
+ """ + return -2 * self.score(X) * X.shape[0] + 2 * self._n_parameters() diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..d36543903cb87b07ea1a1c75b9a69aa63bd7dbff --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_bayesian_mixture.py @@ -0,0 +1,464 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy + +import numpy as np +import pytest +from scipy.special import gammaln + +from sklearn.exceptions import NotFittedError +from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.mixture import BayesianGaussianMixture +from sklearn.mixture._bayesian_mixture import _log_dirichlet_norm, _log_wishart_norm +from sklearn.mixture.tests.test_gaussian_mixture import RandomData +from sklearn.utils._testing import ( + assert_almost_equal, + assert_array_equal, +) + +COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"] +PRIOR_TYPE = ["dirichlet_process", "dirichlet_distribution"] + + +def test_log_dirichlet_norm(): + rng = np.random.RandomState(0) + + weight_concentration = rng.rand(2) + expected_norm = gammaln(np.sum(weight_concentration)) - np.sum( + gammaln(weight_concentration) + ) + predected_norm = _log_dirichlet_norm(weight_concentration) + + assert_almost_equal(expected_norm, predected_norm) + + +def test_log_wishart_norm(): + rng = np.random.RandomState(0) + + n_components, n_features = 5, 2 + degrees_of_freedom = np.abs(rng.rand(n_components)) + 1.0 + log_det_precisions_chol = n_features * np.log(range(2, 2 + n_components)) + + expected_norm = np.empty(5) + for k, (degrees_of_freedom_k, log_det_k) in enumerate( + zip(degrees_of_freedom, log_det_precisions_chol) + ): + expected_norm[k] = -( + degrees_of_freedom_k * (log_det_k + 0.5 * n_features * np.log(2.0)) + + np.sum( + gammaln( + 0.5 + * (degrees_of_freedom_k - np.arange(0, n_features)[:, np.newaxis]) + ), + 0, + ) + ).item() + predected_norm = _log_wishart_norm( + degrees_of_freedom, log_det_precisions_chol, n_features + ) + + assert_almost_equal(expected_norm, predected_norm) + + +def test_bayesian_mixture_weights_prior_initialisation(): + rng = np.random.RandomState(0) + n_samples, n_components, n_features = 10, 5, 2 + X = rng.rand(n_samples, n_features) + + # Check correct init for a given value of weight_concentration_prior + weight_concentration_prior = rng.rand() + bgmm = BayesianGaussianMixture( + weight_concentration_prior=weight_concentration_prior, random_state=rng + ).fit(X) + assert_almost_equal(weight_concentration_prior, bgmm.weight_concentration_prior_) + + # Check correct init for the default value of weight_concentration_prior + bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X) + assert_almost_equal(1.0 / n_components, bgmm.weight_concentration_prior_) + + +def test_bayesian_mixture_mean_prior_initialisation(): + rng = np.random.RandomState(0) + n_samples, n_components, n_features = 10, 3, 2 + X = rng.rand(n_samples, n_features) + + # Check correct init for a given value of mean_precision_prior + mean_precision_prior = rng.rand() + bgmm = 
BayesianGaussianMixture( + mean_precision_prior=mean_precision_prior, random_state=rng + ).fit(X) + assert_almost_equal(mean_precision_prior, bgmm.mean_precision_prior_) + + # Check correct init for the default value of mean_precision_prior + bgmm = BayesianGaussianMixture(random_state=rng).fit(X) + assert_almost_equal(1.0, bgmm.mean_precision_prior_) + + # Check correct init for a given value of mean_prior + mean_prior = rng.rand(n_features) + bgmm = BayesianGaussianMixture( + n_components=n_components, mean_prior=mean_prior, random_state=rng + ).fit(X) + assert_almost_equal(mean_prior, bgmm.mean_prior_) + + # Check correct init for the default value of bemean_priorta + bgmm = BayesianGaussianMixture(n_components=n_components, random_state=rng).fit(X) + assert_almost_equal(X.mean(axis=0), bgmm.mean_prior_) + + +def test_bayesian_mixture_precisions_prior_initialisation(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + X = rng.rand(n_samples, n_features) + + # Check raise message for a bad value of degrees_of_freedom_prior + bad_degrees_of_freedom_prior_ = n_features - 1.0 + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng + ) + msg = ( + "The parameter 'degrees_of_freedom_prior' should be greater than" + f" {n_features - 1}, but got {bad_degrees_of_freedom_prior_:.3f}." + ) + with pytest.raises(ValueError, match=msg): + bgmm.fit(X) + + # Check correct init for a given value of degrees_of_freedom_prior + degrees_of_freedom_prior = rng.rand() + n_features - 1.0 + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng + ).fit(X) + assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_) + + # Check correct init for the default value of degrees_of_freedom_prior + degrees_of_freedom_prior_default = n_features + bgmm = BayesianGaussianMixture( + degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng + ).fit(X) + assert_almost_equal( + degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_ + ) + + # Check correct init for a given value of covariance_prior + covariance_prior = { + "full": np.cov(X.T, bias=1) + 10, + "tied": np.cov(X.T, bias=1) + 5, + "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3, + "spherical": rng.rand(), + } + + bgmm = BayesianGaussianMixture(random_state=rng) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.covariance_prior = covariance_prior[cov_type] + bgmm.fit(X) + assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_) + + # Check correct init for the default value of covariance_prior + covariance_prior_default = { + "full": np.atleast_2d(np.cov(X.T)), + "tied": np.atleast_2d(np.cov(X.T)), + "diag": np.var(X, axis=0, ddof=1), + "spherical": np.var(X, axis=0, ddof=1).mean(), + } + + bgmm = BayesianGaussianMixture(random_state=0) + for cov_type in ["full", "tied", "diag", "spherical"]: + bgmm.covariance_type = cov_type + bgmm.fit(X) + assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_) + + +def test_bayesian_mixture_check_is_fitted(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + + # Check raise message + bgmm = BayesianGaussianMixture(random_state=rng) + X = rng.rand(n_samples, n_features) + + msg = "This BayesianGaussianMixture instance is not fitted yet." 
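+    # `score` must raise before `fit` has been called.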
+ with pytest.raises(ValueError, match=msg): + bgmm.score(X) + + +def test_bayesian_mixture_weights(): + rng = np.random.RandomState(0) + n_samples, n_features = 10, 2 + + X = rng.rand(n_samples, n_features) + + # Case Dirichlet distribution for the weight concentration prior type + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_distribution", + n_components=3, + random_state=rng, + ).fit(X) + + expected_weights = bgmm.weight_concentration_ / np.sum(bgmm.weight_concentration_) + assert_almost_equal(expected_weights, bgmm.weights_) + assert_almost_equal(np.sum(bgmm.weights_), 1.0) + + # Case Dirichlet process for the weight concentration prior type + dpgmm = BayesianGaussianMixture( + weight_concentration_prior_type="dirichlet_process", + n_components=3, + random_state=rng, + ).fit(X) + weight_dirichlet_sum = ( + dpgmm.weight_concentration_[0] + dpgmm.weight_concentration_[1] + ) + tmp = dpgmm.weight_concentration_[1] / weight_dirichlet_sum + expected_weights = ( + dpgmm.weight_concentration_[0] + / weight_dirichlet_sum + * np.hstack((1, np.cumprod(tmp[:-1]))) + ) + expected_weights /= np.sum(expected_weights) + assert_almost_equal(expected_weights, dpgmm.weights_) + assert_almost_equal(np.sum(dpgmm.weights_), 1.0) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_monotonic_likelihood(): + # We check that each step of the each step of variational inference without + # regularization improve monotonically the training set of the bound + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=20) + n_components = rand_data.n_components + + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type=covar_type, + warm_start=True, + max_iter=1, + random_state=rng, + tol=1e-3, + ) + current_lower_bound = -np.inf + # Do one training iteration at a time so we can make sure that the + # training log likelihood increases after each iteration. + for _ in range(600): + prev_lower_bound = current_lower_bound + current_lower_bound = bgmm.fit(X).lower_bound_ + assert current_lower_bound >= prev_lower_bound + + if bgmm.converged_: + break + assert bgmm.converged_ + + +def test_compare_covar_type(): + # We can compare the 'full' precision with the other cov_type if we apply + # 1 iter of the M-step (done during _initialize_parameters). 
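+    # The expected covariance of a component is `covariances_` scaled by
+    # `degrees_of_freedom_`; each restricted covariance type is compared
+    # against the corresponding reduction of the 'full' estimate (mean over
+    # components for 'tied', diagonal for 'diag', mean of the diagonal for
+    # 'spherical').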
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + X = rand_data.X["full"] + n_components = rand_data.n_components + + for prior_type in PRIOR_TYPE: + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="full", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + full_covariances = ( + bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis] + ) + + # Check tied_covariance = mean(full_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="tied", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) + + # Check diag_covariance = diag(full_covariances) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="diag", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + diag_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis] + assert_almost_equal( + diag_covariances, np.array([np.diag(cov) for cov in full_covariances]) + ) + + # Check spherical_covariance = np.mean(diag_covariances, 0) + bgmm = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=2 * n_components, + covariance_type="spherical", + max_iter=1, + random_state=0, + tol=1e-7, + ) + bgmm._check_parameters(X) + bgmm._initialize_parameters(X, np.random.RandomState(0)) + + spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ + assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1)) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_check_covariance_precision(): + # We check that the dot product of the covariance and the precision + # matrices is identity. 
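+    # For the 'diag' and 'spherical' parameterizations the check reduces to
+    # an element-wise product equal to one.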
+ rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components, n_features = 2 * rand_data.n_components, 2 + + # Computation of the full_covariance + bgmm = BayesianGaussianMixture( + n_components=n_components, max_iter=100, random_state=rng, tol=1e-3, reg_covar=0 + ) + for covar_type in COVARIANCE_TYPE: + bgmm.covariance_type = covar_type + bgmm.fit(rand_data.X[covar_type]) + + if covar_type == "full": + for covar, precision in zip(bgmm.covariances_, bgmm.precisions_): + assert_almost_equal(np.dot(covar, precision), np.eye(n_features)) + elif covar_type == "tied": + assert_almost_equal( + np.dot(bgmm.covariances_, bgmm.precisions_), np.eye(n_features) + ) + + elif covar_type == "diag": + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, + np.ones((n_components, n_features)), + ) + + else: + assert_almost_equal( + bgmm.covariances_ * bgmm.precisions_, np.ones(n_components) + ) + + +def test_invariant_translation(): + # We check here that adding a constant in the data change correctly the + # parameters of the mixture + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=100) + n_components = 2 * rand_data.n_components + + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + bgmm1 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X) + bgmm2 = BayesianGaussianMixture( + weight_concentration_prior_type=prior_type, + n_components=n_components, + max_iter=100, + random_state=0, + tol=1e-3, + reg_covar=0, + ).fit(X + 100) + + assert_almost_equal(bgmm1.means_, bgmm2.means_ - 100) + assert_almost_equal(bgmm1.weights_, bgmm2.weights_) + assert_almost_equal(bgmm1.covariances_, bgmm2.covariances_) + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_bayesian_mixture_fit_predict(seed, max_iter, tol): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng, n_samples=50, scale=7) + n_components = 2 * rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + bgmm1 = BayesianGaussianMixture( + n_components=n_components, + max_iter=max_iter, + random_state=rng, + tol=tol, + reg_covar=0, + ) + bgmm1.covariance_type = covar_type + bgmm2 = copy.deepcopy(bgmm1) + X = rand_data.X[covar_type] + + Y_pred1 = bgmm1.fit(X).predict(X) + Y_pred2 = bgmm2.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + + +def test_bayesian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(50, 5) + gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_bayesian_mixture_predict_predict_proba(): + # this is the same test as test_gaussian_mixture_predict_predict_proba() + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + for prior_type in PRIOR_TYPE: + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + bgmm = BayesianGaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weight_concentration_prior_type=prior_type, + covariance_type=covar_type, + ) + + # Check a warning message arrive 
if we don't do fit + msg = ( + "This BayesianGaussianMixture instance is not fitted yet. " + "Call 'fit' with appropriate arguments before using this " + "estimator." + ) + with pytest.raises(NotFittedError, match=msg): + bgmm.predict(X) + + bgmm.fit(X) + Y_pred = bgmm.predict(X) + Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) >= 0.95 diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..488a2ab147e8362eede842f64f4787fda47b9159 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_gaussian_mixture.py @@ -0,0 +1,1473 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import copy +import itertools +import re +import sys +import warnings +from io import StringIO +from unittest.mock import Mock + +import numpy as np +import pytest +from scipy import linalg, stats + +import sklearn +from sklearn.cluster import KMeans +from sklearn.covariance import EmpiricalCovariance +from sklearn.datasets import make_spd_matrix +from sklearn.exceptions import ConvergenceWarning, NotFittedError +from sklearn.metrics.cluster import adjusted_rand_score +from sklearn.mixture import GaussianMixture +from sklearn.mixture._gaussian_mixture import ( + _compute_log_det_cholesky, + _compute_precision_cholesky, + _estimate_gaussian_covariances_diag, + _estimate_gaussian_covariances_full, + _estimate_gaussian_covariances_spherical, + _estimate_gaussian_covariances_tied, + _estimate_gaussian_parameters, +) +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.extmath import fast_logdet + +COVARIANCE_TYPE = ["full", "tied", "diag", "spherical"] + + +def generate_data( + n_samples, n_features, weights, means, precisions, covariance_type, dtype=np.float64 +): + rng = np.random.RandomState(0) + + X = [] + if covariance_type == "spherical": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["spherical"])): + X.append( + rng.multivariate_normal( + m, c * np.eye(n_features), int(np.round(w * n_samples)) + ).astype(dtype) + ) + if covariance_type == "diag": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["diag"])): + X.append( + rng.multivariate_normal( + m, np.diag(c), int(np.round(w * n_samples)) + ).astype(dtype) + ) + if covariance_type == "tied": + for _, (w, m) in enumerate(zip(weights, means)): + X.append( + rng.multivariate_normal( + m, precisions["tied"], int(np.round(w * n_samples)) + ).astype(dtype) + ) + if covariance_type == "full": + for _, (w, m, c) in enumerate(zip(weights, means, precisions["full"])): + X.append( + rng.multivariate_normal(m, c, int(np.round(w * n_samples))).astype( + dtype + ) + ) + + X = np.vstack(X) + return X + + +class RandomData: + def __init__( + self, + rng, + n_samples=200, + n_components=2, + n_features=2, + scale=50, + dtype=np.float64, + ): + self.n_samples = n_samples + self.n_components = n_components + self.n_features = n_features + + self.weights = rng.rand(n_components).astype(dtype) + self.weights = self.weights.astype(dtype) / self.weights.sum() + self.means = rng.rand(n_components, n_features).astype(dtype) * scale + self.covariances = { + "spherical": 0.5 + rng.rand(n_components).astype(dtype), + "diag": (0.5 + 
rng.rand(n_components, n_features).astype(dtype)) ** 2, + "tied": make_spd_matrix(n_features, random_state=rng).astype(dtype), + "full": np.array( + [ + make_spd_matrix(n_features, random_state=rng).astype(dtype) * 0.5 + for _ in range(n_components) + ] + ), + } + self.precisions = { + "spherical": 1.0 / self.covariances["spherical"], + "diag": 1.0 / self.covariances["diag"], + "tied": linalg.inv(self.covariances["tied"]), + "full": np.array( + [linalg.inv(covariance) for covariance in self.covariances["full"]] + ), + } + + self.X = dict( + zip( + COVARIANCE_TYPE, + [ + generate_data( + n_samples, + n_features, + self.weights, + self.means, + self.covariances, + covar_type, + dtype=dtype, + ) + for covar_type in COVARIANCE_TYPE + ], + ) + ) + self.Y = np.hstack( + [ + np.full(int(np.round(w * n_samples)), k, dtype=int) + for k, w in enumerate(self.weights) + ] + ) + + +def test_gaussian_mixture_attributes(): + # test bad parameters + rng = np.random.RandomState(0) + X = rng.rand(10, 2) + + # test good parameters + n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1 + covariance_type, init_params = "full", "random" + gmm = GaussianMixture( + n_components=n_components, + tol=tol, + n_init=n_init, + max_iter=max_iter, + reg_covar=reg_covar, + covariance_type=covariance_type, + init_params=init_params, + ).fit(X) + + assert gmm.n_components == n_components + assert gmm.covariance_type == covariance_type + assert gmm.tol == tol + assert gmm.reg_covar == reg_covar + assert gmm.max_iter == max_iter + assert gmm.n_init == n_init + assert gmm.init_params == init_params + + +def test_check_weights(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components = rand_data.n_components + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check bad shape + weights_bad_shape = rng.rand(n_components, 1) + g.weights_init = weights_bad_shape + msg = re.escape( + "The parameter 'weights' should have the shape of " + f"({n_components},), but got {weights_bad_shape.shape}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad range + weights_bad_range = rng.rand(n_components) + 1 + g.weights_init = weights_bad_range + msg = re.escape( + "The parameter 'weights' should be in the range [0, 1], but got" + f" max value {np.min(weights_bad_range):.5f}, " + f"min value {np.max(weights_bad_range):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check bad normalization + weights_bad_norm = rng.rand(n_components) + weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1) + g.weights_init = weights_bad_norm + msg = re.escape( + "The parameter 'weights' should be normalized, " + f"but got sum(weights) = {np.sum(weights_bad_norm):.5f}" + ) + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good weights matrix + weights = rand_data.weights + g = GaussianMixture(weights_init=weights, n_components=n_components) + g.fit(X) + assert_array_equal(weights, g.weights_init) + + +def test_check_means(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + X = rand_data.X["full"] + + g = GaussianMixture(n_components=n_components) + + # Check means bad shape + means_bad_shape = rng.rand(n_components + 1, n_features) + g.means_init = means_bad_shape + msg = "The parameter 'means' should have the shape of " + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check good means matrix + means = 
rand_data.means + g.means_init = means + g.fit(X) + assert_array_equal(means, g.means_init) + + +def test_check_precisions(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + + n_components, n_features = rand_data.n_components, rand_data.n_features + + # Define the bad precisions for each covariance_type + precisions_bad_shape = { + "full": np.ones((n_components + 1, n_features, n_features)), + "tied": np.ones((n_features + 1, n_features + 1)), + "diag": np.ones((n_components + 1, n_features)), + "spherical": np.ones((n_components + 1)), + } + + # Define not positive-definite precisions + precisions_not_pos = np.ones((n_components, n_features, n_features)) + precisions_not_pos[0] = np.eye(n_features) + precisions_not_pos[0, 0, 0] = -1.0 + + precisions_not_positive = { + "full": precisions_not_pos, + "tied": precisions_not_pos[0], + "diag": np.full((n_components, n_features), -1.0), + "spherical": np.full(n_components, -1.0), + } + + not_positive_errors = { + "full": "symmetric, positive-definite", + "tied": "symmetric, positive-definite", + "diag": "positive", + "spherical": "positive", + } + + for covar_type in COVARIANCE_TYPE: + X = RandomData(rng).X[covar_type] + g = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + + # Check precisions with bad shapes + g.precisions_init = precisions_bad_shape[covar_type] + msg = f"The parameter '{covar_type} precision' should have the shape of" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check not positive precisions + g.precisions_init = precisions_not_positive[covar_type] + msg = f"'{covar_type} precision' should be {not_positive_errors[covar_type]}" + with pytest.raises(ValueError, match=msg): + g.fit(X) + + # Check the correct init of precisions_init + g.precisions_init = rand_data.precisions[covar_type] + g.fit(X) + assert_array_equal(rand_data.precisions[covar_type], g.precisions_init) + + +def test_suffstat_sk_full(): + # compare the precision matrix compute from the + # EmpiricalCovariance.covariance fitted on X*sqrt(resp) + # with _sufficient_sk_full, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + # special case 1, assuming data is "centered" + X = rng.rand(n_samples, n_features) + resp = rng.rand(n_samples, 1) + X_resp = np.sqrt(resp) * X + nk = np.array([n_samples]) + xk = np.zeros((1, n_features)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=True) + ecov.fit(X_resp) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + # special case 2, assuming resp are all ones + resp = np.ones((n_samples, 1)) + nk = np.array([n_samples]) + xk = X.mean(axis=0).reshape((1, -1)) + covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + ecov = EmpiricalCovariance(assume_centered=False) + ecov.fit(X) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred[0], norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred, "full") + 
precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) + precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_tied(): + # use equation Nk * Sk / N = S_tied + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_full = ( + np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full, 0) / n_samples + ) + + covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + ecov.covariance_ = covars_pred_full + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(covars_pred_tied, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, "tied") + precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T) + precs_est = linalg.inv(covars_pred_tied) + assert_array_almost_equal(precs_est, precs_pred) + + +def test_suffstat_sk_diag(): + # test against 'full' case + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 500, 2, 2 + + resp = rng.rand(n_samples, n_components) + resp = resp / resp.sum(axis=1)[:, np.newaxis] + X = rng.rand(n_samples, n_features) + nk = resp.sum(axis=0) + xk = np.dot(resp.T, X) / nk[:, np.newaxis] + covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) + covars_pred_diag = _estimate_gaussian_covariances_diag(resp, X, nk, xk, 0) + + ecov = EmpiricalCovariance() + for cov_full, cov_diag in zip(covars_pred_full, covars_pred_diag): + ecov.covariance_ = np.diag(np.diag(cov_full)) + cov_diag = np.diag(cov_diag) + assert_almost_equal(ecov.error_norm(cov_diag, norm="frobenius"), 0) + assert_almost_equal(ecov.error_norm(cov_diag, norm="spectral"), 0) + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_diag, "diag") + assert_almost_equal(covars_pred_diag, 1.0 / precs_chol_pred**2) + + +def test_gaussian_suffstat_sk_spherical(global_dtype): + # computing spherical covariance equals to the variance of one-dimension + # data after flattening, n_components=1 + rng = np.random.RandomState(0) + n_samples, n_features = 500, 2 + + X = rng.rand(n_samples, n_features).astype(global_dtype) + X = X - X.mean() + resp = np.ones((n_samples, 1), dtype=global_dtype) + nk = np.array([n_samples], dtype=global_dtype) + xk = X.mean() + covars_pred_spherical = _estimate_gaussian_covariances_spherical(resp, X, nk, xk, 0) + covars_pred_spherical2 = np.dot(X.flatten().T, X.flatten()) / ( + n_features * n_samples + ) + assert_almost_equal(covars_pred_spherical, covars_pred_spherical2) + assert covars_pred_spherical.dtype == global_dtype + + # check the precision computation + precs_chol_pred = _compute_precision_cholesky(covars_pred_spherical, "spherical") + assert_almost_equal(covars_pred_spherical, 1.0 / precs_chol_pred**2) + assert precs_chol_pred.dtype == global_dtype + + +def test_compute_log_det_cholesky(global_dtype): + n_features = 2 + rand_data = RandomData(np.random.RandomState(0), dtype=global_dtype) + + for covar_type in COVARIANCE_TYPE: + covariance = rand_data.covariances[covar_type] + + if covar_type == "full": + predected_det = 
np.array([linalg.det(cov) for cov in covariance]) + elif covar_type == "tied": + predected_det = linalg.det(covariance) + elif covar_type == "diag": + predected_det = np.array([np.prod(cov) for cov in covariance]) + elif covar_type == "spherical": + predected_det = covariance**n_features + + # We compute the cholesky decomposition of the covariance matrix + assert covariance.dtype == global_dtype + expected_det = _compute_log_det_cholesky( + _compute_precision_cholesky(covariance, covar_type), + covar_type, + n_features=n_features, + ) + assert_array_almost_equal(expected_det, -0.5 * np.log(predected_det)) + assert expected_det.dtype == global_dtype + + +def _naive_lmvnpdf_diag(X, means, covars): + resp = np.empty((len(X), len(means))) + stds = np.sqrt(covars) + for i, (mean, std) in enumerate(zip(means, stds)): + resp[:, i] = stats.norm.logpdf(X, mean, std).sum(axis=1) + return resp + + +def test_gaussian_mixture_log_probabilities(): + from sklearn.mixture._gaussian_mixture import _estimate_log_gaussian_prob + + # test against with _naive_lmvnpdf_diag + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_samples = 500 + n_features = rand_data.n_features + n_components = rand_data.n_components + + means = rand_data.means + covars_diag = rng.rand(n_components, n_features) + X = rng.rand(n_samples, n_features) + log_prob_naive = _naive_lmvnpdf_diag(X, means, covars_diag) + + # full covariances + precs_full = np.array([np.diag(1.0 / np.sqrt(x)) for x in covars_diag]) + + log_prob = _estimate_log_gaussian_prob(X, means, precs_full, "full") + assert_array_almost_equal(log_prob, log_prob_naive) + + # diag covariances + precs_chol_diag = 1.0 / np.sqrt(covars_diag) + log_prob = _estimate_log_gaussian_prob(X, means, precs_chol_diag, "diag") + assert_array_almost_equal(log_prob, log_prob_naive) + + # tied + covars_tied = np.array([x for x in covars_diag]).mean(axis=0) + precs_tied = np.diag(np.sqrt(1.0 / covars_tied)) + + log_prob_naive = _naive_lmvnpdf_diag(X, means, [covars_tied] * n_components) + log_prob = _estimate_log_gaussian_prob(X, means, precs_tied, "tied") + + assert_array_almost_equal(log_prob, log_prob_naive) + + # spherical + covars_spherical = covars_diag.mean(axis=1) + precs_spherical = 1.0 / np.sqrt(covars_diag.mean(axis=1)) + log_prob_naive = _naive_lmvnpdf_diag( + X, means, [[k] * n_features for k in covars_spherical] + ) + log_prob = _estimate_log_gaussian_prob(X, means, precs_spherical, "spherical") + assert_array_almost_equal(log_prob, log_prob_naive) + + +# skip tests on weighted_log_probabilities, log_weights + + +def test_gaussian_mixture_estimate_log_prob_resp(): + # test whether responsibilities are normalized + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_samples = rand_data.n_samples + n_features = rand_data.n_features + n_components = rand_data.n_components + + X = rng.rand(n_samples, n_features) + for covar_type in COVARIANCE_TYPE: + weights = rand_data.weights + means = rand_data.means + precisions = rand_data.precisions[covar_type] + g = GaussianMixture( + n_components=n_components, + random_state=rng, + weights_init=weights, + means_init=means, + precisions_init=precisions, + covariance_type=covar_type, + ) + g.fit(X) + resp = g.predict_proba(X) + assert_array_almost_equal(resp.sum(axis=1), np.ones(n_samples)) + assert_array_equal(g.weights_init, weights) + assert_array_equal(g.means_init, means) + assert_array_equal(g.precisions_init, precisions) + + +def test_gaussian_mixture_predict_predict_proba(): + rng = 
np.random.RandomState(0) + rand_data = RandomData(rng) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + ) + + # Check a warning message arrive if we don't do fit + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' " + "with appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + g.predict(X) + + g.fit(X) + Y_pred = g.predict(X) + Y_pred_proba = g.predict_proba(X).argmax(axis=1) + assert_array_equal(Y_pred, Y_pred_proba) + assert adjusted_rand_score(Y, Y_pred) > 0.95 + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize( + "seed, max_iter, tol", + [ + (0, 2, 1e-7), # strict non-convergence + (1, 2, 1e-1), # loose non-convergence + (3, 300, 1e-7), # strict convergence + (4, 300, 1e-1), # loose convergence + ], +) +def test_gaussian_mixture_fit_predict(seed, max_iter, tol, global_dtype): + rng = np.random.RandomState(seed) + rand_data = RandomData(rng, dtype=global_dtype) + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + Y = rand_data.Y + g = GaussianMixture( + n_components=rand_data.n_components, + random_state=rng, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions[covar_type], + covariance_type=covar_type, + max_iter=max_iter, + tol=tol, + ) + + # check if fit_predict(X) is equivalent to fit(X).predict(X) + f = copy.deepcopy(g) + Y_pred1 = f.fit(X).predict(X) + Y_pred2 = g.fit_predict(X) + assert_array_equal(Y_pred1, Y_pred2) + assert adjusted_rand_score(Y, Y_pred2) > 0.95 + assert g.means_.dtype == global_dtype + assert g.weights_.dtype == global_dtype + assert g.precisions_.dtype == global_dtype + + +def test_gaussian_mixture_fit_predict_n_init(): + # Check that fit_predict is equivalent to fit.predict, when n_init > 1 + X = np.random.RandomState(0).randn(1000, 5) + gm = GaussianMixture(n_components=5, n_init=5, random_state=0) + y_pred1 = gm.fit_predict(X) + y_pred2 = gm.predict(X) + assert_array_equal(y_pred1, y_pred2) + + +def test_gaussian_mixture_fit(global_dtype): + # recover the ground truth + rng = np.random.RandomState(0) + rand_data = RandomData(rng, dtype=global_dtype) + n_features = rand_data.n_features + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=20, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + g.fit(X) + + # needs more data to pass the test with rtol=1e-7 + assert_allclose( + np.sort(g.weights_), np.sort(rand_data.weights), rtol=0.1, atol=1e-2 + ) + + arg_idx1 = g.means_[:, 0].argsort() + arg_idx2 = rand_data.means[:, 0].argsort() + assert_allclose( + g.means_[arg_idx1], rand_data.means[arg_idx2], rtol=0.1, atol=1e-2 + ) + + if covar_type == "full": + prec_pred = g.precisions_ + prec_test = rand_data.precisions["full"] + elif covar_type == "tied": + prec_pred = np.array([g.precisions_] * n_components) + prec_test = np.array([rand_data.precisions["tied"]] * n_components) + elif covar_type == "spherical": + prec_pred = np.array([np.eye(n_features) * c for c in g.precisions_]) + prec_test = np.array( + [np.eye(n_features) * c for c in rand_data.precisions["spherical"]] + ) + elif covar_type 
== "diag": + prec_pred = np.array([np.diag(d) for d in g.precisions_]) + prec_test = np.array([np.diag(d) for d in rand_data.precisions["diag"]]) + + arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort() + arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort() + for k, h in zip(arg_idx1, arg_idx2): + ecov = EmpiricalCovariance() + ecov.covariance_ = prec_test[h] + # the accuracy depends on the number of data and randomness, rng + assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.15) + + assert g.means_.dtype == global_dtype + assert g.covariances_.dtype == global_dtype + assert g.precisions_.dtype == global_dtype + + +def test_gaussian_mixture_fit_best_params(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + n_init = 10 + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + ll = [] + for _ in range(n_init): + g.fit(X) + ll.append(g.score(X)) + ll = np.array(ll) + g_best = GaussianMixture( + n_components=n_components, + n_init=n_init, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + g_best.fit(X) + assert_almost_equal(ll.min(), g_best.score(X)) + + +def test_gaussian_mixture_fit_convergence_warning(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=1) + n_components = rand_data.n_components + max_iter = 1 + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=max_iter, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "Best performing initialization did not converge. " + "Try different init parameters, or increase max_iter, " + "tol, or check for degenerate data." + ) + with pytest.warns(ConvergenceWarning, match=msg): + g.fit(X) + + +def test_multiple_init(): + # Test that multiple inits does not much worse than a single one + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 5, 2 + X = rng.randn(n_samples, n_features) + for cv_type in COVARIANCE_TYPE: + train1 = ( + GaussianMixture( + n_components=n_components, covariance_type=cv_type, random_state=0 + ) + .fit(X) + .score(X) + ) + train2 = ( + GaussianMixture( + n_components=n_components, + covariance_type=cv_type, + random_state=0, + n_init=5, + ) + .fit(X) + .score(X) + ) + assert train2 >= train1 + + +def test_gaussian_mixture_n_parameters(): + # Test that the right number of parameters is estimated + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 5, 2 + X = rng.randn(n_samples, n_features) + n_params = {"spherical": 13, "diag": 21, "tied": 26, "full": 41} + for cv_type in COVARIANCE_TYPE: + g = GaussianMixture( + n_components=n_components, covariance_type=cv_type, random_state=rng + ).fit(X) + assert g._n_parameters() == n_params[cv_type] + + +def test_bic_1d_1component(): + # Test all of the covariance_types return the same BIC score for + # 1-dimensional, 1 component fits. 
+ rng = np.random.RandomState(0) + n_samples, n_dim, n_components = 100, 1, 1 + X = rng.randn(n_samples, n_dim) + bic_full = ( + GaussianMixture( + n_components=n_components, covariance_type="full", random_state=rng + ) + .fit(X) + .bic(X) + ) + for covariance_type in ["tied", "diag", "spherical"]: + bic = ( + GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + random_state=rng, + ) + .fit(X) + .bic(X) + ) + assert_almost_equal(bic_full, bic) + + +def test_gaussian_mixture_aic_bic(): + # Test the aic and bic criteria + rng = np.random.RandomState(0) + n_samples, n_features, n_components = 50, 3, 2 + X = rng.randn(n_samples, n_features) + # standard gaussian entropy + sgh = 0.5 * ( + fast_logdet(np.cov(X.T, bias=1)) + n_features * (1 + np.log(2 * np.pi)) + ) + for cv_type in COVARIANCE_TYPE: + g = GaussianMixture( + n_components=n_components, + covariance_type=cv_type, + random_state=rng, + max_iter=200, + ) + g.fit(X) + aic = 2 * n_samples * sgh + 2 * g._n_parameters() + bic = 2 * n_samples * sgh + np.log(n_samples) * g._n_parameters() + bound = n_features / np.sqrt(n_samples) + assert (g.aic(X) - aic) / n_samples < bound + assert (g.bic(X) - bic) / n_samples < bound + + +def test_gaussian_mixture_verbose(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + g = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=1, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + verbose=2, + ) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + g.fit(X) + h.fit(X) + finally: + sys.stdout = old_stdout + + +@pytest.mark.filterwarnings("ignore:.*did not converge.*") +@pytest.mark.parametrize("seed", (0, 1, 2)) +def test_warm_start(seed): + random_state = seed + rng = np.random.RandomState(random_state) + n_samples, n_features, n_components = 500, 2, 2 + X = rng.rand(n_samples, n_features) + + # Assert the warm_start give the same result for the same number of iter + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=2, + reg_covar=0, + random_state=random_state, + warm_start=False, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=random_state, + warm_start=True, + ) + + g.fit(X) + score1 = h.fit(X).score(X) + score2 = h.fit(X).score(X) + + assert_almost_equal(g.weights_, h.weights_) + assert_almost_equal(g.means_, h.means_) + assert_almost_equal(g.precisions_, h.precisions_) + assert score2 > score1 + + # Assert that by using warm_start we can converge to a good solution + g = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=False, + tol=1e-6, + ) + h = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=5, + reg_covar=0, + random_state=random_state, + warm_start=True, + tol=1e-6, + ) + + g.fit(X) + assert not g.converged_ + + h.fit(X) + # depending on the data there is large variability in the number of + # refit necessary to converge due to the complete randomness of the + # data + for _ in range(1000): + h.fit(X) + if h.converged_: + break + assert h.converged_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_convergence_detected_with_warm_start(): + # We check 
that convergence is detected when warm_start=True + rng = np.random.RandomState(0) + rand_data = RandomData(rng) + n_components = rand_data.n_components + X = rand_data.X["full"] + + for max_iter in (1, 2, 50): + gmm = GaussianMixture( + n_components=n_components, + warm_start=True, + max_iter=max_iter, + random_state=rng, + ) + for _ in range(100): + gmm.fit(X) + if gmm.converged_: + break + assert gmm.converged_ + assert max_iter >= gmm.n_iter_ + + +def test_score(global_dtype): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, dtype=global_dtype) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + assert X.dtype == global_dtype + + # Check the error message if we don't call fit + gmm1 = GaussianMixture( + n_components=n_components, + n_init=1, + max_iter=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm1.score(X) + + # Check score value + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + gmm1.fit(X) + + assert gmm1.means_.dtype == global_dtype + assert gmm1.covariances_.dtype == global_dtype + + gmm_score = gmm1.score(X) + gmm_score_proba = gmm1.score_samples(X).mean() + assert_almost_equal(gmm_score, gmm_score_proba) + assert gmm_score_proba.dtype == global_dtype + + # Check if the score increase + gmm2 = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ).fit(X) + assert gmm2.score(X) > gmm1.score(X) + + +def test_score_samples(): + covar_type = "full" + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + X = rand_data.X[covar_type] + + # Check the error message if we don't call fit + gmm = GaussianMixture( + n_components=n_components, + n_init=1, + reg_covar=0, + random_state=rng, + covariance_type=covar_type, + ) + msg = ( + "This GaussianMixture instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) + with pytest.raises(NotFittedError, match=msg): + gmm.score_samples(X) + + gmm_score_samples = gmm.fit(X).score_samples(X) + assert gmm_score_samples.shape[0] == rand_data.n_samples + + +def test_monotonic_likelihood(): + # We check that each step of the EM without regularization improve + # monotonically the training set likelihood + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7) + n_components = rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + reg_covar=0, + warm_start=True, + max_iter=1, + random_state=rng, + tol=1e-7, + ) + current_log_likelihood = -np.inf + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + # Do one training iteration at a time so we can make sure that the + # training log likelihood increases after each iteration. + for _ in range(600): + prev_log_likelihood = current_log_likelihood + current_log_likelihood = gmm.fit(X).score(X) + assert current_log_likelihood >= prev_log_likelihood + + if gmm.converged_: + break + + assert gmm.converged_ + + +def test_regularisation(): + # We train the GaussianMixture on degenerate data by defining two clusters + # of a 0 covariance. 
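+    # With reg_covar=0 the empirical covariances are singular, so `fit`
+    # must raise the "ill-defined empirical covariance" error; a small
+    # positive reg_covar makes the same fit succeed.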
+ rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = np.vstack( + (np.ones((n_samples // 2, n_features)), np.zeros((n_samples // 2, n_features))) + ) + + for covar_type in COVARIANCE_TYPE: + gmm = GaussianMixture( + n_components=n_samples, + reg_covar=0, + covariance_type=covar_type, + random_state=rng, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + msg = re.escape( + "Fitting the mixture model failed because some components have" + " ill-defined empirical covariance (for instance caused by " + "singleton or collapsed samples). Try to decrease the number " + "of components, increase reg_covar, or scale the input data." + ) + with pytest.raises(ValueError, match=msg): + gmm.fit(X) + + gmm.set_params(reg_covar=1e-6).fit(X) + + +@pytest.mark.parametrize("covar_type", COVARIANCE_TYPE) +def test_fitted_precision_covariance_concistency(covar_type, global_dtype): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, dtype=global_dtype) + n_components = rand_data.n_components + + X = rand_data.X[covar_type] + gmm = GaussianMixture( + n_components=n_components, + covariance_type=covar_type, + random_state=rng, + n_init=5, + ) + gmm.fit(X) + assert gmm.precisions_.dtype == global_dtype + assert gmm.covariances_.dtype == global_dtype + if covar_type == "full": + for prec, covar in zip(gmm.precisions_, gmm.covariances_): + assert_array_almost_equal(linalg.inv(prec), covar) + elif covar_type == "tied": + assert_array_almost_equal(linalg.inv(gmm.precisions_), gmm.covariances_) + else: + assert_array_almost_equal(gmm.precisions_, 1.0 / gmm.covariances_) + + +def test_sample(): + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=7, n_components=3) + n_features, n_components = rand_data.n_features, rand_data.n_components + + for covar_type in COVARIANCE_TYPE: + X = rand_data.X[covar_type] + + gmm = GaussianMixture( + n_components=n_components, covariance_type=covar_type, random_state=rng + ) + # To sample we need that GaussianMixture is fitted + msg = "This GaussianMixture instance is not fitted" + with pytest.raises(NotFittedError, match=msg): + gmm.sample(0) + gmm.fit(X) + + msg = "Invalid value for 'n_samples'" + with pytest.raises(ValueError, match=msg): + gmm.sample(0) + + # Just to make sure the class samples correctly + n_samples = 20000 + X_s, y_s = gmm.sample(n_samples) + + for k in range(n_components): + if covar_type == "full": + assert_array_almost_equal( + gmm.covariances_[k], np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "tied": + assert_array_almost_equal( + gmm.covariances_, np.cov(X_s[y_s == k].T), decimal=1 + ) + elif covar_type == "diag": + assert_array_almost_equal( + gmm.covariances_[k], np.diag(np.cov(X_s[y_s == k].T)), decimal=1 + ) + else: + assert_array_almost_equal( + gmm.covariances_[k], + np.var(X_s[y_s == k] - gmm.means_[k]), + decimal=1, + ) + + means_s = np.array([np.mean(X_s[y_s == k], 0) for k in range(n_components)]) + assert_array_almost_equal(gmm.means_, means_s, decimal=1) + + # Check shapes of sampled data, see + # https://github.com/scikit-learn/scikit-learn/issues/7701 + assert X_s.shape == (n_samples, n_features) + + for sample_size in range(1, 100): + X_s, _ = gmm.sample(sample_size) + assert X_s.shape == (sample_size, n_features) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_init(): + # We check that by increasing the n_init number we have a better solution + for random_state in range(15): + rand_data = 
RandomData( + np.random.RandomState(random_state), n_samples=50, scale=1 + ) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm1 = GaussianMixture( + n_components=n_components, n_init=1, max_iter=1, random_state=random_state + ).fit(X) + gmm2 = GaussianMixture( + n_components=n_components, n_init=10, max_iter=1, random_state=random_state + ).fit(X) + + assert gmm2.lower_bound_ >= gmm1.lower_bound_ + + +def test_gaussian_mixture_setting_best_params(): + """`GaussianMixture`'s best_parameters, `n_iter_` and `lower_bound_` + must be set appropriately in the case of divergence. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/18216 + """ + rnd = np.random.RandomState(0) + n_samples = 30 + X = rnd.uniform(size=(n_samples, 3)) + + # following initialization parameters were found to lead to divergence + means_init = np.array( + [ + [0.670637869618158, 0.21038256107384043, 0.12892629765485303], + [0.09394051075844147, 0.5759464955561779, 0.929296197576212], + [0.5033230372781258, 0.9569852381759425, 0.08654043447295741], + [0.18578301420435747, 0.5531158970919143, 0.19388943970532435], + [0.4548589928173794, 0.35182513658825276, 0.568146063202464], + [0.609279894978321, 0.7929063819678847, 0.9620097270828052], + ] + ) + precisions_init = np.array( + [ + 999999.999604483, + 999999.9990869573, + 553.7603944542167, + 204.78596008931834, + 15.867423501783637, + 85.4595728389735, + ] + ) + weights_init = [ + 0.03333333333333341, + 0.03333333333333341, + 0.06666666666666674, + 0.06666666666666674, + 0.7000000000000001, + 0.10000000000000007, + ] + + gmm = GaussianMixture( + covariance_type="spherical", + reg_covar=0, + means_init=means_init, + weights_init=weights_init, + random_state=rnd, + n_components=len(weights_init), + precisions_init=precisions_init, + max_iter=1, + ) + # ensure that no error is thrown during fit + gmm.fit(X) + + # check that the fit did not converge + assert not gmm.converged_ + + # check that parameters are set for gmm + for attr in [ + "weights_", + "means_", + "covariances_", + "precisions_cholesky_", + "n_iter_", + "lower_bound_", + "lower_bounds_", + ]: + assert hasattr(gmm, attr) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_init_means_not_duplicated(init_params, global_random_seed): + # Check that all initialisations provide not duplicated starting means + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng, max_iter=0 + ) + gmm.fit(X) + + means = gmm.means_ + for i_mean, j_mean in itertools.combinations(means, r=2): + assert not np.allclose(i_mean, j_mean) + + +@pytest.mark.parametrize( + "init_params", ["random", "random_from_data", "k-means++", "kmeans"] +) +def test_means_for_all_inits(init_params, global_random_seed, global_dtype): + # Check fitted means properties for all initializations + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng, scale=5, dtype=global_dtype) + n_components = rand_data.n_components + X = rand_data.X["full"] + + gmm = GaussianMixture( + n_components=n_components, init_params=init_params, random_state=rng + ) + gmm.fit(X) + + assert gmm.means_.shape == (n_components, X.shape[1]) + assert np.all(X.min(axis=0) <= gmm.means_) + assert np.all(gmm.means_ <= X.max(axis=0)) + assert gmm.converged_ + 
assert gmm.means_.dtype == global_dtype + assert gmm.covariances_.dtype == global_dtype + assert gmm.weights_.dtype == global_dtype + + +def test_max_iter_zero(): + # Check that max_iter=0 returns initialisation as expected + # Pick arbitrary initial means and check equal to max_iter=0 + rng = np.random.RandomState(0) + rand_data = RandomData(rng, scale=5) + n_components = rand_data.n_components + X = rand_data.X["full"] + means_init = [[20, 30], [30, 25]] + gmm = GaussianMixture( + n_components=n_components, + random_state=rng, + means_init=means_init, + tol=1e-06, + max_iter=0, + ) + gmm.fit(X) + + assert_allclose(gmm.means_, means_init) + + +def test_gaussian_mixture_precisions_init_diag(global_dtype): + """Check that we properly initialize `precision_cholesky_` when we manually + provide the precision matrix. + + In this regard, we check the consistency between estimating the precision + matrix and providing the same precision matrix as initialization. It should + lead to the same results with the same number of iterations. + + If the initialization is wrong then the number of iterations will increase. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/16944 + """ + # generate a toy dataset + n_samples = 300 + rng = np.random.RandomState(0) + shifted_gaussian = rng.randn(n_samples, 2) + np.array([20, 20]) + C = np.array([[0.0, -0.7], [3.5, 0.7]]) + stretched_gaussian = np.dot(rng.randn(n_samples, 2), C) + X = np.vstack([shifted_gaussian, stretched_gaussian]).astype(global_dtype) + + # common parameters to check the consistency of precision initialization + n_components, covariance_type, reg_covar, random_state = 2, "diag", 1e-6, 0 + + # execute the manual initialization to compute the precision matrix: + # - run KMeans to have an initial guess + # - estimate the covariance + # - compute the precision matrix from the estimated covariance + resp = np.zeros((X.shape[0], n_components)).astype(global_dtype) + label = ( + KMeans(n_clusters=n_components, n_init=1, random_state=random_state) + .fit(X) + .labels_ + ) + resp[np.arange(X.shape[0]), label] = 1 + _, _, covariance = _estimate_gaussian_parameters( + X, resp, reg_covar=reg_covar, covariance_type=covariance_type + ) + assert covariance.dtype == global_dtype + precisions_init = 1 / covariance + + gm_with_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + precisions_init=precisions_init, + random_state=random_state, + ).fit(X) + assert gm_with_init.means_.dtype == global_dtype + assert gm_with_init.covariances_.dtype == global_dtype + assert gm_with_init.precisions_cholesky_.dtype == global_dtype + + gm_without_init = GaussianMixture( + n_components=n_components, + covariance_type=covariance_type, + reg_covar=reg_covar, + random_state=random_state, + ).fit(X) + assert gm_without_init.means_.dtype == global_dtype + assert gm_without_init.covariances_.dtype == global_dtype + assert gm_without_init.precisions_cholesky_.dtype == global_dtype + + assert gm_without_init.n_iter_ == gm_with_init.n_iter_ + assert_allclose( + gm_with_init.precisions_cholesky_, gm_without_init.precisions_cholesky_ + ) + + +def _generate_data(seed, n_samples, n_features, n_components, dtype=np.float64): + """Randomly generate samples and responsibilities.""" + rs = np.random.RandomState(seed) + X = rs.random_sample((n_samples, n_features)).astype(dtype) + resp = rs.random_sample((n_samples, n_components)).astype(dtype) + resp /= resp.sum(axis=1)[:, np.newaxis] + return X, 
resp + + +def _calculate_precisions(X, resp, covariance_type): + """Calculate precision matrix of X and its Cholesky decomposition + for the given covariance type. + """ + reg_covar = 1e-6 + weights, means, covariances = _estimate_gaussian_parameters( + X, resp, reg_covar, covariance_type + ) + precisions_cholesky = _compute_precision_cholesky(covariances, covariance_type) + + _, n_components = resp.shape + # Instantiate a `GaussianMixture` model in order to use its + # `_set_parameters` method to return the `precisions_` and + # `precisions_cholesky_` from matching the `covariance_type` + # provided. + gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type) + params = (weights, means, covariances, precisions_cholesky) + gmm._set_parameters(params) + return gmm.precisions_, gmm.precisions_cholesky_ + + +@pytest.mark.parametrize("covariance_type", COVARIANCE_TYPE) +def test_gaussian_mixture_precisions_init( + covariance_type, global_random_seed, global_dtype +): + """Non-regression test for #26415.""" + + X, resp = _generate_data( + seed=global_random_seed, + n_samples=100, + n_features=3, + n_components=4, + dtype=global_dtype, + ) + + precisions_init, desired_precisions_cholesky = _calculate_precisions( + X, resp, covariance_type + ) + assert precisions_init.dtype == global_dtype + assert desired_precisions_cholesky.dtype == global_dtype + + gmm = GaussianMixture( + covariance_type=covariance_type, precisions_init=precisions_init + ) + gmm._initialize(X, resp) + actual_precisions_cholesky = gmm.precisions_cholesky_ + assert_allclose(actual_precisions_cholesky, desired_precisions_cholesky) + + +def test_gaussian_mixture_single_component_stable(): + """ + Non-regression test for #23032 ensuring 1-component GM works on only a + few samples. + """ + rng = np.random.RandomState(0) + X = rng.multivariate_normal(np.zeros(2), np.identity(2), size=3) + gm = GaussianMixture(n_components=1) + gm.fit(X).sample() + + +def test_gaussian_mixture_all_init_does_not_estimate_gaussian_parameters( + monkeypatch, + global_random_seed, +): + """When all init parameters are provided, the Gaussian parameters + are not estimated. + + Non-regression test for gh-26015. + """ + + mock = Mock(side_effect=_estimate_gaussian_parameters) + monkeypatch.setattr( + sklearn.mixture._gaussian_mixture, "_estimate_gaussian_parameters", mock + ) + + rng = np.random.RandomState(global_random_seed) + rand_data = RandomData(rng) + + gm = GaussianMixture( + n_components=rand_data.n_components, + weights_init=rand_data.weights, + means_init=rand_data.means, + precisions_init=rand_data.precisions["full"], + random_state=rng, + ) + gm.fit(rand_data.X["full"]) + # The initial gaussian parameters are not estimated. They are estimated for every + # m_step. 
+ assert mock.call_count == gm.n_iter_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_mixture.py b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_mixture.py new file mode 100644 index 0000000000000000000000000000000000000000..9c98d150f06a8c7685d24c083e2ed2866f17c8ca --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/mixture/tests/test_mixture.py @@ -0,0 +1,30 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import pytest + +from sklearn.mixture import BayesianGaussianMixture, GaussianMixture + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_gaussian_mixture_n_iter(estimator): + # check that n_iter_ is the number of iterations performed. + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + max_iter = 1 + estimator.set_params(max_iter=max_iter) + estimator.fit(X) + assert estimator.n_iter_ == max_iter + + +@pytest.mark.parametrize("estimator", [GaussianMixture(), BayesianGaussianMixture()]) +def test_mixture_n_components_greater_than_n_samples_error(estimator): + """Check error when n_components > n_samples""" + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + estimator.set_params(n_components=12) + + msg = "Expected n_samples >= n_components" + with pytest.raises(ValueError, match=msg): + estimator.fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8eb0ef772c552fc6e2171acc13c1e98966a1cfb4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/__init__.py @@ -0,0 +1,99 @@ +"""Tools for model selection, such as cross validation and hyper-parameter tuning.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import typing + +from ._classification_threshold import ( + FixedThresholdClassifier, + TunedThresholdClassifierCV, +) +from ._plot import LearningCurveDisplay, ValidationCurveDisplay +from ._search import GridSearchCV, ParameterGrid, ParameterSampler, RandomizedSearchCV +from ._split import ( + BaseCrossValidator, + BaseShuffleSplit, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + train_test_split, +) +from ._validation import ( + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) + +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental.
+ from ._search_successive_halving import ( # noqa: F401 + HalvingGridSearchCV, + HalvingRandomSearchCV, + ) + + +__all__ = [ + "BaseCrossValidator", + "BaseShuffleSplit", + "FixedThresholdClassifier", + "GridSearchCV", + "GroupKFold", + "GroupShuffleSplit", + "KFold", + "LearningCurveDisplay", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "ParameterGrid", + "ParameterSampler", + "PredefinedSplit", + "RandomizedSearchCV", + "RepeatedKFold", + "RepeatedStratifiedKFold", + "ShuffleSplit", + "StratifiedGroupKFold", + "StratifiedKFold", + "StratifiedShuffleSplit", + "TimeSeriesSplit", + "TunedThresholdClassifierCV", + "ValidationCurveDisplay", + "check_cv", + "cross_val_predict", + "cross_val_score", + "cross_validate", + "learning_curve", + "permutation_test_score", + "train_test_split", + "validation_curve", +] + + +# TODO: remove this check once the estimator is no longer experimental. +def __getattr__(name): + if name in {"HalvingGridSearchCV", "HalvingRandomSearchCV"}: + raise ImportError( + f"{name} is experimental and the API might change without any " + "deprecation cycle. To use it, you need to explicitly import " + "enable_halving_search_cv:\n" + "from sklearn.experimental import enable_halving_search_cv" + ) + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_classification_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..c68ed38b8819d989d0ec838840b5b5406eec7e57 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_classification_threshold.py @@ -0,0 +1,889 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from collections.abc import MutableMapping +from numbers import Integral, Real + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) +from ..exceptions import NotFittedError +from ..metrics import ( + check_scoring, + get_scorer_names, +) +from ..metrics._scorer import ( + _CurveScorer, + _threshold_scores_to_class_labels, +) +from ..utils import _safe_indexing, get_tags +from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions +from ..utils._response import _get_response_values_binary +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.multiclass import type_of_target +from ..utils.parallel import Parallel, delayed +from ..utils.validation import ( + _check_method_params, + _estimator_has, + _num_samples, + check_is_fitted, + indexable, +) +from ._split import StratifiedShuffleSplit, check_cv + + +def _check_is_fitted(estimator): + try: + check_is_fitted(estimator.estimator) + except NotFittedError: + check_is_fitted(estimator, "estimator_") + + +class BaseThresholdClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Base class for binary classifiers that set a non-default decision threshold. + + In this base class, we define the following interface: + + - the validation of common parameters in `fit`; + - the different prediction methods that can be used with the classifier. + + .. 
versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + """ + + _parameter_constraints: dict = { + "estimator": [ + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + } + + def __init__(self, estimator, *, response_method="auto"): + self.estimator = estimator + self.response_method = response_method + + def _get_response_method(self): + """Define the response method.""" + if self.response_method == "auto": + response_method = ["predict_proba", "decision_function"] + else: + response_method = self.response_method + return response_method + + @_fit_context( + # *ThresholdClassifier*.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + _raise_for_params(params, self, None) + + X, y = indexable(X, y) + + y_type = type_of_target(y, input_name="y") + if y_type != "binary": + raise ValueError( + f"Only binary classification is supported. Unknown label type: {y_type}" + ) + + self._fit(X, y, **params) + + if hasattr(self.estimator_, "n_features_in_"): + self.n_features_in_ = self.estimator_.n_features_in_ + if hasattr(self.estimator_, "feature_names_in_"): + self.feature_names_in_ = self.estimator_.feature_names_in_ + + return self + + @property + def classes_(self): + """Classes labels.""" + return self.estimator_.classes_ + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X): + """Predict class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + The class probabilities of the input samples. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_proba(X) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Predict logarithm class probabilities for `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + log_probabilities : ndarray of shape (n_samples, n_classes) + The logarithm class probabilities of the input samples. 
+ """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.predict_log_proba(X) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X): + """Decision function for samples in `X` using the fitted estimator. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. + + Returns + ------- + decisions : ndarray of shape (n_samples,) + The decision function computed the fitted estimator. + """ + _check_is_fitted(self) + estimator = getattr(self, "estimator_", self.estimator) + return estimator.decision_function(X) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_class = False + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags + + +class FixedThresholdClassifier(BaseThresholdClassifier): + """Binary classifier that manually sets the decision threshold. + + This classifier allows to change the default decision threshold used for + converting posterior probability estimates (i.e. output of `predict_proba`) or + decision scores (i.e. output of `decision_function`) into a class label. + + Here, the threshold is not optimized and is set to a constant value. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The binary classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + threshold : {"auto"} or float, default="auto" + The decision threshold to use when converting posterior probability estimates + (i.e. output of `predict_proba`) or decision scores (i.e. output of + `decision_function`) into a class label. When `"auto"`, the threshold is set + to 0.5 if `predict_proba` is used as `response_method`, otherwise it is set to + 0 (i.e. the default threshold for `decision_function`). + + pos_label : int, float, bool or str, default=None + The label of the positive class. Used to process the output of the + `response_method` method. When `pos_label=None`, if `y_true` is in `{-1, 1}` or + `{0, 1}`, `pos_label` is set to 1, otherwise an error will be raised. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke `"predict_proba"` or `"decision_function"` + in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.TunedThresholdClassifierCV : Classifier that post-tunes + the decision threshold based on some metrics and using cross-validation. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. 
+ + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.model_selection import FixedThresholdClassifier, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = LogisticRegression(random_state=0).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier.predict(X_test))) + [[217 7] + [ 19 7]] + >>> classifier_other_threshold = FixedThresholdClassifier( + ... classifier, threshold=0.1, response_method="predict_proba" + ... ).fit(X_train, y_train) + >>> print(confusion_matrix(y_test, classifier_other_threshold.predict(X_test))) + [[184 40] + [ 6 20]] + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "threshold": [StrOptions({"auto"}), Real], + "pos_label": [Real, str, "boolean", None], + } + + def __init__( + self, + estimator, + *, + threshold="auto", + pos_label=None, + response_method="auto", + ): + super().__init__(estimator=estimator, response_method=response_method) + self.pos_label = pos_label + self.threshold = threshold + + @property + def classes_(self): + if estimator := getattr(self, "estimator_", None): + return estimator.classes_ + try: + check_is_fitted(self.estimator) + return self.estimator.classes_ + except NotFittedError: + raise AttributeError( + "The underlying estimator is not fitted yet." + ) from NotFittedError + + def _fit(self, X, y, **params): + """Fit the classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier. + + Returns + ------- + self : object + Returns an instance of self. + """ + routed_params = process_routing(self, "fit", **params) + self.estimator_ = clone(self.estimator).fit(X, y, **routed_params.estimator.fit) + return self + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + _check_is_fitted(self) + + estimator = getattr(self, "estimator_", self.estimator) + + y_score, _, response_method_used = _get_response_values_binary( + estimator, + X, + self._get_response_method(), + pos_label=self.pos_label, + return_response_method_used=True, + ) + + if self.threshold == "auto": + decision_threshold = 0.5 if response_method_used == "predict_proba" else 0.0 + else: + decision_threshold = self.threshold + + return _threshold_scores_to_class_labels( + y_score, decision_threshold, self.classes_, self.pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + return router + + +def _fit_and_score_over_thresholds( + classifier, + X, + y, + *, + fit_params, + train_idx, + val_idx, + curve_scorer, + score_params, +): + """Fit a classifier and compute the scores for different decision thresholds. + + Parameters + ---------- + classifier : estimator instance + The classifier to fit and use for scoring. If `classifier` is already fitted, + it will be used as is. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The entire dataset. + + y : array-like of shape (n_samples,) + The entire target vector. + + fit_params : dict + Parameters to pass to the `fit` method of the underlying classifier. + + train_idx : ndarray of shape (n_train_samples,) or None + The indices of the training set. If `None`, `classifier` is expected to be + already fitted. + + val_idx : ndarray of shape (n_val_samples,) + The indices of the validation set used to score `classifier`. If `train_idx`, + the entire set will be used. + + curve_scorer : scorer instance + The scorer taking `classifier` and the validation set as input and outputting + decision thresholds and scores as a curve. Note that this is different from + the usual scorer that outputs a single score value as `curve_scorer` + outputs a single score value for each threshold. + + score_params : dict + Parameters to pass to the `score` method of the underlying scorer. + + Returns + ------- + scores : ndarray of shape (thresholds,) or tuple of such arrays + The scores computed for each decision threshold. When TPR/TNR or precision/ + recall are computed, `scores` is a tuple of two arrays. + + potential_thresholds : ndarray of shape (thresholds,) + The decision thresholds used to compute the scores. They are returned in + ascending order. + """ + + if train_idx is not None: + X_train, X_val = _safe_indexing(X, train_idx), _safe_indexing(X, val_idx) + y_train, y_val = _safe_indexing(y, train_idx), _safe_indexing(y, val_idx) + fit_params_train = _check_method_params(X, fit_params, indices=train_idx) + score_params_val = _check_method_params(X, score_params, indices=val_idx) + classifier.fit(X_train, y_train, **fit_params_train) + else: # prefit estimator, only a validation set is provided + X_val, y_val, score_params_val = X, y, score_params + + return curve_scorer(classifier, X_val, y_val, **score_params_val) + + +def _mean_interpolated_score(target_thresholds, cv_thresholds, cv_scores): + """Compute the mean interpolated score across folds by defining common thresholds. + + Parameters + ---------- + target_thresholds : ndarray of shape (thresholds,) + The thresholds to use to compute the mean score. + + cv_thresholds : ndarray of shape (n_folds, thresholds_fold) + The thresholds used to compute the scores for each fold. + + cv_scores : ndarray of shape (n_folds, thresholds_fold) + The scores computed for each threshold for each fold. + + Returns + ------- + mean_score : ndarray of shape (thresholds,) + The mean score across all folds for each target threshold. + """ + return np.mean( + [ + np.interp(target_thresholds, split_thresholds, split_score) + for split_thresholds, split_score in zip(cv_thresholds, cv_scores) + ], + axis=0, + ) + + +class TunedThresholdClassifierCV(BaseThresholdClassifier): + """Classifier that post-tunes the decision threshold using cross-validation. 
+ + This estimator post-tunes the decision threshold (cut-off point) that is + used for converting posterior probability estimates (i.e. output of + `predict_proba`) or decision scores (i.e. output of `decision_function`) + into a class label. The tuning is done by optimizing a binary metric, + potentially constrained by another metric. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.5 + + Parameters + ---------- + estimator : estimator instance + The classifier, fitted or not, for which we want to optimize + the decision threshold used during `predict`. + + scoring : str or callable, default="balanced_accuracy" + The objective metric to be optimized. Can be one of: + + - str: string associated to a scoring function for binary classification, + see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + + response_method : {"auto", "decision_function", "predict_proba"}, default="auto" + Methods by the classifier `estimator` corresponding to the + decision function for which we want to find a threshold. It can be: + + * if `"auto"`, it will try to invoke, for each classifier, + `"predict_proba"` or `"decision_function"` in that order. + * otherwise, one of `"predict_proba"` or `"decision_function"`. + If the method is not implemented by the classifier, it will raise an + error. + + thresholds : int or array-like, default=100 + The number of decision thresholds to use when discretizing the output of the + classifier `method`. Pass an array-like to manually specify the thresholds + to use. + + cv : int, float, cross-validation generator, iterable or "prefit", default=None + Determines the cross-validation splitting strategy to train the classifier. + Possible inputs for cv are: + + * `None`, to use the default 5-fold stratified K-fold cross validation; + * An integer number, to specify the number of folds in a stratified k-fold; + * A float number, to specify a single shuffle split. The floating number should + be in (0, 1) and represent the size of the validation set; + * An object to be used as a cross-validation generator; + * An iterable yielding train, test splits; + * `"prefit"`, to bypass the cross-validation. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. warning:: + Using `cv="prefit"` and passing the same dataset for fitting `estimator` + and tuning the cut-off point is subject to undesired overfitting. You can + refer to :ref:`TunedThresholdClassifierCV_no_cv` for an example. + + This option should only be used when the set used to fit `estimator` is + different from the one used to tune the cut-off point (by calling + :meth:`TunedThresholdClassifierCV.fit`). + + refit : bool, default=True + Whether or not to refit the classifier on the entire training set once + the decision threshold has been found. + Note that forcing `refit=False` on cross-validation having more + than a single split will raise an error. Similarly, `refit=True` in + conjunction with `cv="prefit"` will raise an error. + + n_jobs : int, default=None + The number of jobs to run in parallel. When `cv` represents a + cross-validation strategy, the fitting and scoring on each data split + is done in parallel. ``None`` means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. See :term:`Glossary ` for more details.
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of cross-validation when `cv` is a float. + See :term:`Glossary `. + + store_cv_results : bool, default=False + Whether to store all scores and thresholds computed during the cross-validation + process. + + Attributes + ---------- + estimator_ : estimator instance + The fitted classifier used when predicting. + + best_threshold_ : float + The new decision threshold. + + best_score_ : float or None + The optimal score of the objective metric, evaluated at `best_threshold_`. + + cv_results_ : dict or None + A dictionary containing the scores and thresholds computed during the + cross-validation process. Only exist if `store_cv_results=True`. The + keys are `"thresholds"` and `"scores"`. + + classes_ : ndarray of shape (n_classes,) + The class labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if the + underlying estimator exposes such an attribute when fit. + + See Also + -------- + sklearn.model_selection.FixedThresholdClassifier : Classifier that uses a + constant threshold. + sklearn.calibration.CalibratedClassifierCV : Estimator that calibrates + probabilities. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.metrics import classification_report + >>> from sklearn.model_selection import TunedThresholdClassifierCV, train_test_split + >>> X, y = make_classification( + ... n_samples=1_000, weights=[0.9, 0.1], class_sep=0.8, random_state=42 + ... ) + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, stratify=y, random_state=42 + ... ) + >>> classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train) + >>> print(classification_report(y_test, classifier.predict(X_test))) + precision recall f1-score support + + 0 0.94 0.99 0.96 224 + 1 0.80 0.46 0.59 26 + + accuracy 0.93 250 + macro avg 0.87 0.72 0.77 250 + weighted avg 0.93 0.93 0.92 250 + + >>> classifier_tuned = TunedThresholdClassifierCV( + ... classifier, scoring="balanced_accuracy" + ... ).fit(X_train, y_train) + >>> print( + ... f"Cut-off point found at {classifier_tuned.best_threshold_:.3f}" + ... 
) + Cut-off point found at 0.342 + >>> print(classification_report(y_test, classifier_tuned.predict(X_test))) + precision recall f1-score support + + 0 0.96 0.95 0.96 224 + 1 0.61 0.65 0.63 26 + + accuracy 0.92 250 + macro avg 0.78 0.80 0.79 250 + weighted avg 0.92 0.92 0.92 250 + + """ + + _parameter_constraints: dict = { + **BaseThresholdClassifier._parameter_constraints, + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + MutableMapping, + ], + "thresholds": [Interval(Integral, 1, None, closed="left"), "array-like"], + "cv": [ + "cv_object", + StrOptions({"prefit"}), + Interval(RealNotInt, 0.0, 1.0, closed="neither"), + ], + "refit": ["boolean"], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "store_cv_results": ["boolean"], + } + + def __init__( + self, + estimator, + *, + scoring="balanced_accuracy", + response_method="auto", + thresholds=100, + cv=None, + refit=True, + n_jobs=None, + random_state=None, + store_cv_results=False, + ): + super().__init__(estimator=estimator, response_method=response_method) + self.scoring = scoring + self.thresholds = thresholds + self.cv = cv + self.refit = refit + self.n_jobs = n_jobs + self.random_state = random_state + self.store_cv_results = store_cv_results + + def _fit(self, X, y, **params): + """Fit the classifier and post-tune the decision threshold. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) + Target values. + + **params : dict + Parameters to pass to the `fit` method of the underlying + classifier and to the `scoring` scorer. + + Returns + ------- + self : object + Returns an instance of self. + """ + if isinstance(self.cv, Real) and 0 < self.cv < 1: + cv = StratifiedShuffleSplit( + n_splits=1, test_size=self.cv, random_state=self.random_state + ) + elif self.cv == "prefit": + if self.refit is True: + raise ValueError("When cv='prefit', refit cannot be True.") + try: + check_is_fitted(self.estimator, "classes_") + except NotFittedError as exc: + raise NotFittedError( + """When cv='prefit', `estimator` must be fitted.""" + ) from exc + cv = self.cv + else: + cv = check_cv(self.cv, y=y, classifier=True) + if self.refit is False and cv.get_n_splits() > 1: + raise ValueError("When cv has several folds, refit cannot be False.") + + routed_params = process_routing(self, "fit", **params) + self._curve_scorer = self._get_curve_scorer() + + # in the following block, we: + # - define the final classifier `self.estimator_` and train it if necessary + # - define `classifier` to be used to post-tune the decision threshold + # - define `split` to be used to fit/score `classifier` + if cv == "prefit": + self.estimator_ = self.estimator + classifier = self.estimator_ + splits = [(None, range(_num_samples(X)))] + else: + self.estimator_ = clone(self.estimator) + classifier = clone(self.estimator) + splits = cv.split(X, y, **routed_params.splitter.split) + + if self.refit: + # train on the whole dataset + X_train, y_train, fit_params_train = X, y, routed_params.estimator.fit + else: + # single split cross-validation + train_idx, _ = next(cv.split(X, y, **routed_params.splitter.split)) + X_train = _safe_indexing(X, train_idx) + y_train = _safe_indexing(y, train_idx) + fit_params_train = _check_method_params( + X, routed_params.estimator.fit, indices=train_idx + ) + + self.estimator_.fit(X_train, y_train, **fit_params_train) + + cv_scores, cv_thresholds = zip( + *Parallel(n_jobs=self.n_jobs)( + 
delayed(_fit_and_score_over_thresholds)( + clone(classifier) if cv != "prefit" else classifier, + X, + y, + fit_params=routed_params.estimator.fit, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=self._curve_scorer, + score_params=routed_params.scorer.score, + ) + for train_idx, val_idx in splits + ) + ) + + if any(np.isclose(th[0], th[-1]) for th in cv_thresholds): + raise ValueError( + "The provided estimator makes constant predictions. Therefore, it is " + "impossible to optimize the decision threshold." + ) + + # find the global min and max thresholds across all folds + min_threshold = min( + split_thresholds.min() for split_thresholds in cv_thresholds + ) + max_threshold = max( + split_thresholds.max() for split_thresholds in cv_thresholds + ) + if isinstance(self.thresholds, Integral): + decision_thresholds = np.linspace( + min_threshold, max_threshold, num=self.thresholds + ) + else: + decision_thresholds = np.asarray(self.thresholds) + + objective_scores = _mean_interpolated_score( + decision_thresholds, cv_thresholds, cv_scores + ) + best_idx = objective_scores.argmax() + self.best_score_ = objective_scores[best_idx] + self.best_threshold_ = decision_thresholds[best_idx] + if self.store_cv_results: + self.cv_results_ = { + "thresholds": decision_thresholds, + "scores": objective_scores, + } + + return self + + def predict(self, X): + """Predict the target of new samples. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The samples, as accepted by `estimator.predict`. + + Returns + ------- + class_labels : ndarray of shape (n_samples,) + The predicted class. + """ + check_is_fitted(self, "estimator_") + pos_label = self._curve_scorer._get_pos_label() + y_score, _ = _get_response_values_binary( + self.estimator_, + X, + self._get_response_method(), + pos_label=pos_label, + ) + + return _threshold_scores_to_class_labels( + y_score, self.best_threshold_, self.classes_, pos_label + ) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(callee="fit", caller="fit"), + ) + .add( + splitter=self.cv, + method_mapping=MethodMapping().add(callee="split", caller="fit"), + ) + .add( + scorer=self._get_curve_scorer(), + method_mapping=MethodMapping().add(callee="score", caller="fit"), + ) + ) + return router + + def _get_curve_scorer(self): + """Get the curve scorer based on the objective metric used.""" + scoring = check_scoring(self.estimator, scoring=self.scoring) + curve_scorer = _CurveScorer.from_scorer( + scoring, self._get_response_method(), self.thresholds + ) + return curve_scorer diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_plot.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..a69c8f455bd417b97c716c473304bfdc041d85c5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_plot.py @@ -0,0 +1,885 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + +from ..utils._optional_dependencies import check_matplotlib_support +from ..utils._plotting import _interval_max_min_ratio, _validate_score_name +from ._validation import learning_curve, validation_curve + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." 
+ ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") + + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): + """Learning Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.LearningCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. + + .. versionadded:: 1.2 + + Parameters + ---------- + train_sizes : ndarray of shape (n_unique_ticks,) + Numbers of training examples that has been used to generate the + learning curve. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the learning curve. + + figure_ : matplotlib Figure + Figure containing the learning curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. 
If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.learning_curve : Compute the learning curve. + + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay, learning_curve + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> train_sizes, train_scores, test_scores = learning_curve( + ... tree, X, y) + >>> display = LearningCurveDisplay(train_sizes=train_sizes, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score") + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__(self, *, train_sizes, train_scores, test_scores, score_name=None): + self.train_sizes = train_sizes + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
+ """ + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a learning curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used + to generate the learning curve. If the dtype is float, it is + regarded as a fraction of the maximum size of the training set + (that is determined by the selected validation method), i.e. it has + to be within (0, 1]. Otherwise it is interpreted as absolute sizes + of the training sets. Note that for classification the number of + samples usually have to be big enough to contain at least one + sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + The scoring method to use when calculating the learning curve. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. 
+ - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + shuffle : bool, default=False + Whether to shuffle training data before taking prefixes of it + based on`train_sizes`. + + random_state : int, RandomState instance or None, default=None + Used when `shuffle` is True. Pass an int for reproducible + output across multiple function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.learning_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.LearningCurveDisplay` + Object that stores computed values. 
+ + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import LearningCurveDisplay + >>> from sklearn.tree import DecisionTreeClassifier + >>> X, y = load_iris(return_X_y=True) + >>> tree = DecisionTreeClassifier(random_state=0) + >>> LearningCurveDisplay.from_estimator(tree, X, y) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + groups=groups, + train_sizes=train_sizes, + cv=cv, + scoring=scoring, + exploit_incremental_learning=exploit_incremental_learning, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + shuffle=shuffle, + random_state=random_state, + error_score=error_score, + return_times=False, + fit_params=fit_params, + ) + + viz = cls( + train_sizes=train_sizes, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : array-like of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. 
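# A minimal sketch of the fitted attributes documented above (`ax_`, `figure_`,
# `lines_`, `fill_between_`, `errorbar_`): they expose the Matplotlib artists so
# the rendered curve can be tweaked after plotting. The estimator, dataset and
# parameter range are arbitrary illustrative choices.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ValidationCurveDisplay

X, y = make_classification(n_samples=500, random_state=0)
display = ValidationCurveDisplay.from_estimator(
    LogisticRegression(max_iter=1_000),
    X,
    y,
    param_name="C",
    param_range=np.logspace(-3, 3, 7),
    std_display_style="fill_between",
)
# With "fill_between", the mean curves live in `lines_` and the shaded bands in
# `fill_between_`; `errorbar_` is None for this style.
for line in display.lines_:
    line.set_linewidth(2)
display.ax_.set_xscale("log")
display.figure_.suptitle("Validation curve for LogisticRegression")
plt.show()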
+ + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
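# A minimal sketch of the `plot` options documented above, reusing scores that
# were already computed with `validation_curve`. The classifier, dataset and
# parameter range are arbitrary illustrative choices.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import ValidationCurveDisplay, validation_curve
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, random_state=0)
param_name, param_range = "max_depth", np.arange(1, 11)
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(random_state=0),
    X,
    y,
    param_name=param_name,
    param_range=param_range,
)
display = ValidationCurveDisplay(
    param_name=param_name,
    param_range=param_range,
    train_scores=train_scores,
    test_scores=test_scores,
    score_name="Accuracy",
)
# Show only the test curve, with error bars instead of a shaded band.
display.plot(score_type="test", std_display_style="errorbar")
plt.show()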
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selection.KFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + Scoring method to use when computing the validation curve. Options: + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. 
The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=np.asarray(param_range), + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py new file mode 100644 index 0000000000000000000000000000000000000000..5bd3f81195631da3fd21b8c3db95dcfc3df258fc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search.py @@ -0,0 +1,1996 @@ +""" +The :mod:`sklearn.model_selection._search` includes utilities to fine-tune the +parameters of an estimator. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import operator +import time +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable, Mapping, Sequence +from copy import deepcopy +from functools import partial, reduce +from inspect import signature +from itertools import product + +import numpy as np +from numpy.ma import MaskedArray +from scipy.stats import rankdata + +from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier +from ..exceptions import NotFittedError +from ..metrics import check_scoring +from ..metrics._scorer import ( + _check_multimetric_scoring, + _MultimetricScorer, + get_scorer_names, +) +from ..utils import Bunch, check_random_state +from ..utils._param_validation import HasMethods, Interval, StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils._tags import get_tags +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.parallel import Parallel, delayed +from ..utils.random import sample_without_replacement +from ..utils.validation import _check_method_params, check_is_fitted, indexable +from ._split import check_cv +from ._validation import ( + _aggregate_score_dicts, + _fit_and_score, + _insert_error_scores, + _normalize_score_results, + _warn_or_raise_about_fit_failures, +) + +__all__ = ["GridSearchCV", "ParameterGrid", "ParameterSampler", "RandomizedSearchCV"] + + +class ParameterGrid: + """Grid of parameters with a discrete number of values for each. + + Can be used to iterate over parameter value combinations with the + Python built-in function iter. + The order of the generated parameter combinations is deterministic. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + param_grid : dict of str to sequence, or sequence of such + The parameter grid to explore, as a dictionary mapping estimator + parameters to sequences of allowed values. + + An empty dict signifies default parameters. 
+ + A sequence of dicts signifies a sequence of grids to search, and is + useful to avoid exploring parameter combinations that make no sense + or have no effect. See the examples below. + + Examples + -------- + >>> from sklearn.model_selection import ParameterGrid + >>> param_grid = {'a': [1, 2], 'b': [True, False]} + >>> list(ParameterGrid(param_grid)) == ( + ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, + ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) + True + + >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] + >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, + ... {'kernel': 'rbf', 'gamma': 1}, + ... {'kernel': 'rbf', 'gamma': 10}] + True + >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1} + True + + See Also + -------- + GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized + parameter search. + """ + + def __init__(self, param_grid): + if not isinstance(param_grid, (Mapping, Iterable)): + raise TypeError( + f"Parameter grid should be a dict or a list, got: {param_grid!r} of" + f" type {type(param_grid).__name__}" + ) + + if isinstance(param_grid, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_grid = [param_grid] + + # check if all entries are dictionaries of lists + for grid in param_grid: + if not isinstance(grid, dict): + raise TypeError(f"Parameter grid is not a dict ({grid!r})") + for key, value in grid.items(): + if isinstance(value, np.ndarray) and value.ndim > 1: + raise ValueError( + f"Parameter array for {key!r} should be one-dimensional, got:" + f" {value!r} with shape {value.shape}" + ) + if isinstance(value, str) or not isinstance( + value, (np.ndarray, Sequence) + ): + raise TypeError( + f"Parameter grid for parameter {key!r} needs to be a list or a" + f" numpy array, but got {value!r} (of type " + f"{type(value).__name__}) instead. Single values " + "need to be wrapped in a list with one element." + ) + if len(value) == 0: + raise ValueError( + f"Parameter grid for parameter {key!r} need " + f"to be a non-empty sequence, got: {value!r}" + ) + + self.param_grid = param_grid + + def __iter__(self): + """Iterate over the points in the grid. + + Returns + ------- + params : iterator over dict of str to any + Yields dictionaries mapping each estimator parameter to one of its + allowed values. + """ + for p in self.param_grid: + # Always sort the keys of a dictionary, for reproducibility + items = sorted(p.items()) + if not items: + yield {} + else: + keys, values = zip(*items) + for v in product(*values): + params = dict(zip(keys, v)) + yield params + + def __len__(self): + """Number of points on the grid.""" + # Product function that can handle iterables (np.prod can't). + product = partial(reduce, operator.mul) + return sum( + product(len(v) for v in p.values()) if p else 1 for p in self.param_grid + ) + + def __getitem__(self, ind): + """Get the parameters that would be ``ind``th in iteration + + Parameters + ---------- + ind : int + The iteration index + + Returns + ------- + params : dict of str to any + Equal to list(self)[ind] + """ + # This is used to make discrete sampling without replacement memory + # efficient. 
+ for sub_grid in self.param_grid: + # XXX: could memoize information used here + if not sub_grid: + if ind == 0: + return {} + else: + ind -= 1 + continue + + # Reverse so most frequent cycling parameter comes first + keys, values_lists = zip(*sorted(sub_grid.items())[::-1]) + sizes = [len(v_list) for v_list in values_lists] + total = np.prod(sizes) + + if ind >= total: + # Try the next grid + ind -= total + else: + out = {} + for key, v_list, n in zip(keys, values_lists, sizes): + ind, offset = divmod(ind, n) + out[key] = v_list[offset] + return out + + raise IndexError("ParameterGrid index out of range") + + +class ParameterSampler: + """Generator on parameters sampled from given distributions. + + Non-deterministic iterable over random candidate combinations for hyper- + parameter search. If all parameters are presented as a list, + sampling without replacement is performed. If at least one parameter + is given as a distribution, sampling with replacement is used. + It is highly recommended to use continuous distributions for continuous + parameters. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + param_distributions : dict + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_iter : int + Number of parameter settings that are produced. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + Returns + ------- + params : dict of str to any + **Yields** dictionaries mapping each estimator parameter to + as sampled value. + + Examples + -------- + >>> from sklearn.model_selection import ParameterSampler + >>> from scipy.stats.distributions import expon + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> param_grid = {'a':[1, 2], 'b': expon()} + >>> param_list = list(ParameterSampler(param_grid, n_iter=4, + ... random_state=rng)) + >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) + ... for d in param_list] + >>> rounded_list == [{'b': 0.89856, 'a': 1}, + ... {'b': 0.923223, 'a': 1}, + ... {'b': 1.878964, 'a': 2}, + ... 
{'b': 1.038159, 'a': 2}] + True + """ + + def __init__(self, param_distributions, n_iter, *, random_state=None): + if not isinstance(param_distributions, (Mapping, Iterable)): + raise TypeError( + "Parameter distribution is not a dict or a list," + f" got: {param_distributions!r} of type " + f"{type(param_distributions).__name__}" + ) + + if isinstance(param_distributions, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_distributions = [param_distributions] + + for dist in param_distributions: + if not isinstance(dist, dict): + raise TypeError( + "Parameter distribution is not a dict ({!r})".format(dist) + ) + for key in dist: + if not isinstance(dist[key], Iterable) and not hasattr( + dist[key], "rvs" + ): + raise TypeError( + f"Parameter grid for parameter {key!r} is not iterable " + f"or a distribution (value={dist[key]})" + ) + self.n_iter = n_iter + self.random_state = random_state + self.param_distributions = param_distributions + + def _is_all_lists(self): + return all( + all(not hasattr(v, "rvs") for v in dist.values()) + for dist in self.param_distributions + ) + + def __iter__(self): + rng = check_random_state(self.random_state) + + # if all distributions are given as lists, we want to sample without + # replacement + if self._is_all_lists(): + # look up sampled parameter settings in parameter grid + param_grid = ParameterGrid(self.param_distributions) + grid_size = len(param_grid) + n_iter = self.n_iter + + if grid_size < n_iter: + warnings.warn( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For exhaustive " + "searches, use GridSearchCV." % (grid_size, self.n_iter, grid_size), + UserWarning, + ) + n_iter = grid_size + for i in sample_without_replacement(grid_size, n_iter, random_state=rng): + yield param_grid[i] + + else: + for _ in range(self.n_iter): + dist = rng.choice(self.param_distributions) + # Always sort the keys of a dictionary, for reproducibility + items = sorted(dist.items()) + params = dict() + for k, v in items: + if hasattr(v, "rvs"): + params[k] = v.rvs(random_state=rng) + else: + params[k] = v[rng.randint(len(v))] + yield params + + def __len__(self): + """Number of points that will be sampled.""" + if self._is_all_lists(): + grid_size = len(ParameterGrid(self.param_distributions)) + return min(self.n_iter, grid_size) + else: + return self.n_iter + + +def _check_refit(search_cv, attr): + if not search_cv.refit: + raise AttributeError( + f"This {type(search_cv).__name__} instance was initialized with " + f"`refit=False`. {attr} is available only after refitting on the best " + "parameters. You can refit an estimator manually using the " + "`best_params_` attribute" + ) + + +def _search_estimator_has(attr): + """Check if we can delegate a method to the underlying estimator. + + Calling a prediction method will only be available if `refit=True`. In + such case, we check first the fitted best estimator. If it is not + fitted, we check the unfitted estimator. + + Checking the unfitted estimator allows to use `hasattr` on the `SearchCV` + instance even before calling `fit`. 
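# A minimal sketch of the `ParameterSampler` sampling behaviour described above
# (the parameter values and distributions are arbitrary illustrative choices).
# With lists only, the grid is sampled without replacement; with at least one
# distribution, sampling is with replacement.
import numpy as np
from scipy.stats import loguniform
from sklearn.model_selection import ParameterSampler

rng = np.random.RandomState(0)

# All lists: at most len(grid) distinct settings, no duplicates.
all_lists = {"kernel": ["linear", "rbf"], "C": [1, 10]}
print(list(ParameterSampler(all_lists, n_iter=3, random_state=rng)))

# One continuous distribution: n_iter settings drawn with replacement.
with_dist = {"kernel": ["rbf"], "C": loguniform(1e-2, 1e2)}
print(len(list(ParameterSampler(with_dist, n_iter=5, random_state=rng))))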
+ """ + + def check(self): + _check_refit(self, attr) + if hasattr(self, "best_estimator_"): + # raise an AttributeError if `attr` does not exist + getattr(self.best_estimator_, attr) + return True + # raise an AttributeError if `attr` does not exist + getattr(self.estimator, attr) + return True + + return check + + +def _yield_masked_array_for_each_param(candidate_params): + """ + Yield a masked array for each candidate param. + + `candidate_params` is a sequence of params which were used in + a `GridSearchCV`. We use masked arrays for the results, as not + all params are necessarily present in each element of + `candidate_params`. For example, if using `GridSearchCV` with + a `SVC` model, then one might search over params like: + + - kernel=["rbf"], gamma=[0.1, 1] + - kernel=["poly"], degree=[1, 2] + + and then param `'gamma'` would not be present in entries of + `candidate_params` corresponding to `kernel='poly'`. + """ + n_candidates = len(candidate_params) + param_results = defaultdict(dict) + + for cand_idx, params in enumerate(candidate_params): + for name, value in params.items(): + param_results["param_%s" % name][cand_idx] = value + + for key, param_result in param_results.items(): + param_list = list(param_result.values()) + try: + arr = np.array(param_list) + except ValueError: + # This can happen when param_list contains lists of different + # lengths, for example: + # param_list=[[1], [2, 3]] + arr_dtype = np.dtype(object) + else: + # There are two cases when we don't use the automatically inferred + # dtype when creating the array and we use object instead: + # - string dtype + # - when array.ndim > 1, that means that param_list was something + # like a list of same-size sequences, which gets turned into a + # multi-dimensional array but we want a 1d array + arr_dtype = arr.dtype if arr.dtype.kind != "U" and arr.ndim == 1 else object + + # Use one MaskedArray and mask all the places where the param is not + # applicable for that candidate (which may not contain all the params). 
+ ma = MaskedArray(np.empty(n_candidates, dtype=arr_dtype), mask=True) + for index, value in param_result.items(): + # Setting the value at an index unmasks that index + ma[index] = value + yield (key, ma) + + +class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): + """Abstract base class for hyper parameter search with cross-validation.""" + + _parameter_constraints: dict = { + "estimator": [HasMethods(["fit"])], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "n_jobs": [numbers.Integral, None], + "refit": ["boolean", str, callable], + "cv": ["cv_object"], + "verbose": ["verbose"], + "pre_dispatch": [numbers.Integral, str], + "error_score": [StrOptions({"raise"}), numbers.Real], + "return_train_score": ["boolean"], + } + + @abstractmethod + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=True, + ): + self.scoring = scoring + self.estimator = estimator + self.n_jobs = n_jobs + self.refit = refit + self.cv = cv + self.verbose = verbose + self.pre_dispatch = pre_dispatch + self.error_score = error_score + self.return_train_score = return_train_score + + @property + # TODO(1.8) remove this property + def _estimator_type(self): + return self.estimator._estimator_type + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + sub_estimator_tags = get_tags(self.estimator) + tags.estimator_type = sub_estimator_tags.estimator_type + tags.classifier_tags = deepcopy(sub_estimator_tags.classifier_tags) + tags.regressor_tags = deepcopy(sub_estimator_tags.regressor_tags) + # allows cross-validation to see 'precomputed' metrics + tags.input_tags.pairwise = sub_estimator_tags.input_tags.pairwise + tags.input_tags.sparse = sub_estimator_tags.input_tags.sparse + tags.array_api_support = sub_estimator_tags.array_api_support + return tags + + def score(self, X, y=None, **params): + """Return the score on the given data, if the estimator has been refit. + + This uses the score defined by ``scoring`` where provided, and the + ``best_estimator_.score`` method otherwise. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict + Parameters to be passed to the underlying scorer(s). + + .. versionadded:: 1.4 + Only available if `enable_metadata_routing=True`. See + :ref:`Metadata Routing User Guide ` for more + details. + + Returns + ------- + score : float + The score defined by ``scoring`` if provided, and the + ``best_estimator_.score`` method otherwise. 
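# A minimal sketch of `score` on a refitted search, using the concrete
# `GridSearchCV` subclass defined later in this module (the estimator, data and
# grid are arbitrary illustrative choices).
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

search = GridSearchCV(SVC(), {"C": [0.1, 1, 10]}, scoring="accuracy", refit=True)
search.fit(X_train, y_train)
# Evaluates `best_estimator_` with the `scoring` callable; with multi-metric
# scoring, the metric named by `refit` would be used instead.
print(search.score(X_test, y_test))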
+ """ + _check_refit(self, "score") + check_is_fitted(self) + + _raise_for_params(params, self, "score") + + if _routing_enabled(): + score_params = process_routing(self, "score", **params).scorer["score"] + else: + score_params = dict() + + if self.scorer_ is None: + raise ValueError( + "No score function explicitly defined, " + "and the estimator doesn't provide one %s" % self.best_estimator_ + ) + if isinstance(self.scorer_, dict): + if self.multimetric_: + scorer = self.scorer_[self.refit] + else: + scorer = self.scorer_ + return scorer(self.best_estimator_, X, y, **score_params) + + # callable + score = self.scorer_(self.best_estimator_, X, y, **score_params) + if self.multimetric_: + score = score[self.refit] + return score + + @available_if(_search_estimator_has("score_samples")) + def score_samples(self, X): + """Call score_samples on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``score_samples``. + + .. versionadded:: 0.24 + + Parameters + ---------- + X : iterable + Data to predict on. Must fulfill input requirements + of the underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) + The ``best_estimator_.score_samples`` method. + """ + check_is_fitted(self) + return self.best_estimator_.score_samples(X) + + @available_if(_search_estimator_has("predict")) + def predict(self, X): + """Call predict on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted labels or values for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.predict(X) + + @available_if(_search_estimator_has("predict_proba")) + def predict_proba(self, X): + """Call predict_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class probabilities for `X` based on the estimator with + the best found parameters. The order of the classes corresponds + to that in the fitted attribute :term:`classes_`. + """ + check_is_fitted(self) + return self.best_estimator_.predict_proba(X) + + @available_if(_search_estimator_has("predict_log_proba")) + def predict_log_proba(self, X): + """Call predict_log_proba on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``predict_log_proba``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) + Predicted class log-probabilities for `X` based on the estimator + with the best found parameters. The order of the classes + corresponds to that in the fitted attribute :term:`classes_`. 
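# A minimal sketch of the prediction-method delegation documented above: the
# methods are exposed on the search object only when the underlying estimator
# has them and `refit` is enabled (estimator and data are arbitrary
# illustrative choices).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)

search = GridSearchCV(LogisticRegression(max_iter=1_000), {"C": [0.1, 1.0]})
print(hasattr(search, "predict_proba"))  # True: LogisticRegression supports it

no_refit = GridSearchCV(LogisticRegression(), {"C": [0.1, 1.0]}, refit=False)
print(hasattr(no_refit, "predict_proba"))  # False: refit is disabled

search.fit(X, y)
print(search.predict_proba(X[:3]).shape)  # delegated to best_estimator_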
+ """ + check_is_fitted(self) + return self.best_estimator_.predict_log_proba(X) + + @available_if(_search_estimator_has("decision_function")) + def decision_function(self, X): + """Call decision_function on the estimator with the best found parameters. + + Only available if ``refit=True`` and the underlying estimator supports + ``decision_function``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) \ + or (n_samples, n_classes * (n_classes-1) / 2) + Result of the decision function for `X` based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.decision_function(X) + + @available_if(_search_estimator_has("transform")) + def transform(self, X): + """Call transform on the estimator with the best found parameters. + + Only available if the underlying estimator supports ``transform`` and + ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + `X` transformed in the new space based on the estimator with + the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.transform(X) + + @available_if(_search_estimator_has("inverse_transform")) + def inverse_transform(self, X): + """Call inverse_transform on the estimator with the best found params. + + Only available if the underlying estimator implements + ``inverse_transform`` and ``refit=True``. + + Parameters + ---------- + X : indexable, length n_samples + Must fulfill the input assumptions of the + underlying estimator. + + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Result of the `inverse_transform` function for `X` based on the + estimator with the best found parameters. + """ + check_is_fitted(self) + return self.best_estimator_.inverse_transform(X) + + @property + def n_features_in_(self): + """Number of features seen during :term:`fit`. + + Only available when `refit=True`. + """ + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute.".format( + self.__class__.__name__ + ) + ) from nfe + + return self.best_estimator_.n_features_in_ + + @property + def classes_(self): + """Class labels. + + Only available when `refit=True` and the estimator is a classifier. + """ + _search_estimator_has("classes_")(self) + return self.best_estimator_.classes_ + + def _run_search(self, evaluate_candidates): + """Repeatedly calls `evaluate_candidates` to conduct a search. + + This method, implemented in sub-classes, makes it possible to + customize the scheduling of evaluations: GridSearchCV and + RandomizedSearchCV schedule evaluations for their whole parameter + search space at once but other more sequential approaches are also + possible: for instance is possible to iteratively schedule evaluations + for new regions of the parameter search space based on previously + collected evaluation results. This makes it possible to implement + Bayesian optimization or more generally sequential model-based + optimization by deriving from the BaseSearchCV abstract base class. 
+ For example, Successive Halving is implemented by calling + `evaluate_candidates` multiples times (once per iteration of the SH + process), each time passing a different set of candidates with `X` + and `y` of increasing sizes. + + Parameters + ---------- + evaluate_candidates : callable + This callback accepts: + - a list of candidates, where each candidate is a dict of + parameter settings. + - an optional `cv` parameter which can be used to e.g. + evaluate candidates on different dataset splits, or + evaluate candidates on subsampled data (as done in the + Successive Halving estimators). By default, the original + `cv` parameter is used, and it is available as a private + `_checked_cv_orig` attribute. + - an optional `more_results` dict. Each key will be added to + the `cv_results_` attribute. Values should be lists of + length `n_candidates` + + It returns a dict of all results so far, formatted like + ``cv_results_``. + + Important note (relevant whether the default cv is used or not): + in randomized splitters, and unless the random_state parameter of + cv was set to an int, calling cv.split() multiple times will + yield different splits. Since cv.split() is called in + evaluate_candidates, this means that candidates will be evaluated + on different splits each time evaluate_candidates is called. This + might be a methodological issue depending on the search strategy + that you're implementing. To prevent randomized splitters from + being used, you may use _split._yields_constant_splits() + + Examples + -------- + + :: + + def _run_search(self, evaluate_candidates): + 'Try C=0.1 only if C=1 is better than C=10' + all_results = evaluate_candidates([{'C': 1}, {'C': 10}]) + score = all_results['mean_test_score'] + if score[0] < score[1]: + evaluate_candidates([{'C': 0.1}]) + """ + raise NotImplementedError("_run_search not implemented.") + + def _check_refit_for_multimetric(self, scores): + """Check `refit` is compatible with `scores` is valid""" + multimetric_refit_msg = ( + "For multi-metric scoring, the parameter refit must be set to a " + "scorer key or a callable to refit an estimator with the best " + "parameter setting on the whole data and make the best_* " + "attributes available for that metric. If this is not needed, " + f"refit should be set to False explicitly. {self.refit!r} was " + "passed." + ) + + valid_refit_dict = isinstance(self.refit, str) and self.refit in scores + + if ( + self.refit is not False + and not valid_refit_dict + and not callable(self.refit) + ): + raise ValueError(multimetric_refit_msg) + + @staticmethod + def _select_best_index(refit, refit_metric, results): + """Select index of the best combination of hyperparemeters.""" + if callable(refit): + # If callable, refit is expected to return the index of the best + # parameter set. + best_index = refit(results) + if not isinstance(best_index, numbers.Integral): + raise TypeError("best_index_ returned is not an integer") + if best_index < 0 or best_index >= len(results["params"]): + raise IndexError("best_index_ index out of range") + else: + best_index = results[f"rank_test_{refit_metric}"].argmin() + return best_index + + def _get_scorers(self): + """Get the scorer(s) to be used. + + This is used in ``fit`` and ``get_metadata_routing``. 
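# A minimal sketch of a callable `refit`, matching the contract enforced by
# `_select_best_index` above: it receives `cv_results_` and must return the
# integer index of the chosen candidate. The "smallest C within one standard
# deviation of the best score" rule is an arbitrary illustrative choice.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


def simplest_within_one_std(cv_results_):
    mean = np.asarray(cv_results_["mean_test_score"])
    std = np.asarray(cv_results_["std_test_score"])
    good_enough = np.flatnonzero(mean >= mean.max() - std[mean.argmax()])
    # Among good-enough candidates, prefer the smallest (most regularized) C.
    cs = np.asarray([p["C"] for p in cv_results_["params"]])
    return int(good_enough[np.argmin(cs[good_enough])])


X, y = make_classification(n_samples=300, random_state=0)
search = GridSearchCV(
    LogisticRegression(max_iter=1_000),
    {"C": [0.01, 0.1, 1.0, 10.0]},
    refit=simplest_within_one_std,
)
search.fit(X, y)
# With a callable refit, `best_index_` and `best_params_` are set but
# `best_score_` is not.
print(search.best_index_, search.best_params_)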
+ + Returns + ------- + scorers, refit_metric + """ + refit_metric = "score" + + if callable(self.scoring): + scorers = self.scoring + elif self.scoring is None or isinstance(self.scoring, str): + scorers = check_scoring(self.estimator, self.scoring) + else: + scorers = _check_multimetric_scoring(self.estimator, self.scoring) + self._check_refit_for_multimetric(scorers) + refit_metric = self.refit + scorers = _MultimetricScorer( + scorers=scorers, raise_exc=(self.error_score == "raise") + ) + + return scorers, refit_metric + + def _check_scorers_accept_sample_weight(self): + # TODO(slep006): remove when metadata routing is the only way + scorers, _ = self._get_scorers() + # In the multimetric case, warn the user for each scorer separately + if isinstance(scorers, _MultimetricScorer): + for name, scorer in scorers._scorers.items(): + if not scorer._accept_sample_weight(): + warnings.warn( + f"The scoring {name}={scorer} does not support sample_weight, " + "which may lead to statistically incorrect results when " + f"fitting {self} with sample_weight. " + ) + return scorers._accept_sample_weight() + # In most cases, scorers is a Scorer object + # But it's a function when user passes scoring=function + if hasattr(scorers, "_accept_sample_weight"): + accept = scorers._accept_sample_weight() + else: + accept = "sample_weight" in signature(scorers).parameters + if not accept: + warnings.warn( + f"The scoring {scorers} does not support sample_weight, " + "which may lead to statistically incorrect results when " + f"fitting {self} with sample_weight. " + ) + return accept + + def _get_routed_params_for_fit(self, params): + """Get the parameters to be used for routing. + + This is a method instead of a snippet in ``fit`` since it's used twice, + here in ``fit``, and in ``HalvingRandomSearchCV.fit``. + """ + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + params = params.copy() + groups = params.pop("groups", None) + routed_params = Bunch( + estimator=Bunch(fit=params), + splitter=Bunch(split={"groups": groups}), + scorer=Bunch(score={}), + ) + # NOTE: sample_weight is forwarded to the scorer if sample_weight + # is not None and scorers accept sample_weight. For _MultimetricScorer, + # sample_weight is forwarded if any scorer accepts sample_weight + if ( + params.get("sample_weight") is not None + and self._check_scorers_accept_sample_weight() + ): + routed_params.scorer.score["sample_weight"] = params["sample_weight"] + return routed_params + + @_fit_context( + # *SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like of shape (n_samples, n_features) or (n_samples, n_samples) + Training vectors, where `n_samples` is the number of samples and + `n_features` is the number of features. For precomputed kernel or + distance matrix, the expected shape of X is (n_samples, n_samples). + + y : array-like of shape (n_samples, n_output) \ + or (n_samples,), default=None + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict of str -> object + Parameters passed to the ``fit`` method of the estimator, the scorer, + and the CV splitter. + + If a fit parameter is an array-like whose length is equal to + `num_samples` then it will be split by cross-validation along with + `X` and `y`. 
For example, the :term:`sample_weight` parameter is + split because `len(sample_weights) = len(X)`. However, this behavior + does not apply to `groups` which is passed to the splitter configured + via the `cv` parameter of the constructor. Thus, `groups` is used + *to perform the split* and determines which samples are + assigned to the each side of the a split. + + Returns + ------- + self : object + Instance of fitted estimator. + """ + estimator = self.estimator + scorers, refit_metric = self._get_scorers() + + X, y = indexable(X, y) + params = _check_method_params(X, params=params) + + routed_params = self._get_routed_params_for_fit(params) + + cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) + n_splits = cv_orig.get_n_splits(X, y, **routed_params.splitter.split) + + base_estimator = clone(self.estimator) + + parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch) + + fit_and_score_kwargs = dict( + scorer=scorers, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=self.return_train_score, + return_n_test_samples=True, + return_times=True, + return_parameters=False, + error_score=self.error_score, + verbose=self.verbose, + ) + results = {} + with parallel: + all_candidate_params = [] + all_out = [] + all_more_results = defaultdict(list) + + def evaluate_candidates(candidate_params, cv=None, more_results=None): + cv = cv or cv_orig + candidate_params = list(candidate_params) + n_candidates = len(candidate_params) + + if self.verbose > 0: + print( + "Fitting {0} folds for each of {1} candidates," + " totalling {2} fits".format( + n_splits, n_candidates, n_candidates * n_splits + ) + ) + + out = parallel( + delayed(_fit_and_score)( + clone(base_estimator), + X, + y, + train=train, + test=test, + parameters=parameters, + split_progress=(split_idx, n_splits), + candidate_progress=(cand_idx, n_candidates), + **fit_and_score_kwargs, + ) + for (cand_idx, parameters), (split_idx, (train, test)) in product( + enumerate(candidate_params), + enumerate(cv.split(X, y, **routed_params.splitter.split)), + ) + ) + + if len(out) < 1: + raise ValueError( + "No fits were performed. " + "Was the CV iterator empty? " + "Were there no candidates?" + ) + elif len(out) != n_candidates * n_splits: + raise ValueError( + "cv.split and cv.get_n_splits returned " + "inconsistent results. Expected {} " + "splits, got {}".format(n_splits, len(out) // n_candidates) + ) + + _warn_or_raise_about_fit_failures(out, self.error_score) + + # For callable self.scoring, the return type is only know after + # calling. If the return type is a dictionary, the error scores + # can now be inserted with the correct key. The type checking + # of out will be done in `_insert_error_scores`. 
+ if callable(self.scoring): + _insert_error_scores(out, self.error_score) + + all_candidate_params.extend(candidate_params) + all_out.extend(out) + + if more_results is not None: + for key, value in more_results.items(): + all_more_results[key].extend(value) + + nonlocal results + results = self._format_results( + all_candidate_params, n_splits, all_out, all_more_results + ) + + return results + + self._run_search(evaluate_candidates) + + # multimetric is determined here because in the case of a callable + # self.scoring the return type is only known after calling + first_test_score = all_out[0]["test_scores"] + self.multimetric_ = isinstance(first_test_score, dict) + + # check refit_metric now for a callable scorer that is multimetric + if callable(self.scoring) and self.multimetric_: + self._check_refit_for_multimetric(first_test_score) + refit_metric = self.refit + + # For multi-metric evaluation, store the best_index_, best_params_ and + # best_score_ iff refit is one of the scorer names + # In single metric evaluation, refit_metric is "score" + if self.refit or not self.multimetric_: + self.best_index_ = self._select_best_index( + self.refit, refit_metric, results + ) + if not callable(self.refit): + # With a non-custom callable, we can select the best score + # based on the best index + self.best_score_ = results[f"mean_test_{refit_metric}"][ + self.best_index_ + ] + self.best_params_ = results["params"][self.best_index_] + + if self.refit: + # here we clone the estimator as well as the parameters, since + # sometimes the parameters themselves might be estimators, e.g. + # when we search over different estimators in a pipeline. + # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + self.best_estimator_ = clone(base_estimator).set_params( + **clone(self.best_params_, safe=False) + ) + + refit_start_time = time.time() + if y is not None: + self.best_estimator_.fit(X, y, **routed_params.estimator.fit) + else: + self.best_estimator_.fit(X, **routed_params.estimator.fit) + refit_end_time = time.time() + self.refit_time_ = refit_end_time - refit_start_time + + if hasattr(self.best_estimator_, "feature_names_in_"): + self.feature_names_in_ = self.best_estimator_.feature_names_in_ + + # Store the only scorer not as a dict for single metric evaluation + if isinstance(scorers, _MultimetricScorer): + self.scorer_ = scorers._scorers + else: + self.scorer_ = scorers + + self.cv_results_ = results + self.n_splits_ = n_splits + + return self + + def _format_results(self, candidate_params, n_splits, out, more_results=None): + n_candidates = len(candidate_params) + out = _aggregate_score_dicts(out) + + results = dict(more_results or {}) + for key, val in results.items(): + # each value is a list (as per evaluate_candidate's convention) + # we convert it to an array for consistency with the other keys + results[key] = np.asarray(val) + + def _store(key_name, array, weights=None, splits=False, rank=False): + """A small helper to store the scores/times to the cv_results_""" + # When iterated first by splits, then by parameters + # We want `array` to have `n_candidates` rows and `n_splits` cols. 
+ array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits) + if splits: + for split_idx in range(n_splits): + # Uses closure to alter the results + results["split%d_%s" % (split_idx, key_name)] = array[:, split_idx] + + array_means = np.average(array, axis=1, weights=weights) + results["mean_%s" % key_name] = array_means + + if key_name.startswith(("train_", "test_")) and np.any( + ~np.isfinite(array_means) + ): + warnings.warn( + ( + f"One or more of the {key_name.split('_')[0]} scores " + f"are non-finite: {array_means}" + ), + category=UserWarning, + ) + + # Weighted std is not directly available in numpy + array_stds = np.sqrt( + np.average( + (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights + ) + ) + results["std_%s" % key_name] = array_stds + + if rank: + # When the fit/scoring fails `array_means` contains NaNs, we + # will exclude them from the ranking process and consider them + # as tied with the worst performers. + if np.isnan(array_means).all(): + # All fit/scoring routines failed. + rank_result = np.ones_like(array_means, dtype=np.int32) + else: + min_array_means = np.nanmin(array_means) - 1 + array_means = np.nan_to_num(array_means, nan=min_array_means) + rank_result = rankdata(-array_means, method="min").astype( + np.int32, copy=False + ) + results["rank_%s" % key_name] = rank_result + + _store("fit_time", out["fit_time"]) + _store("score_time", out["score_time"]) + # Store a list of param dicts at the key 'params' + for param, ma in _yield_masked_array_for_each_param(candidate_params): + results[param] = ma + results["params"] = candidate_params + + test_scores_dict = _normalize_score_results(out["test_scores"]) + if self.return_train_score: + train_scores_dict = _normalize_score_results(out["train_scores"]) + + for scorer_name in test_scores_dict: + # Computed the (weighted) mean and std for test scores alone + _store( + "test_%s" % scorer_name, + test_scores_dict[scorer_name], + splits=True, + rank=True, + weights=None, + ) + if self.return_train_score: + _store( + "train_%s" % scorer_name, + train_scores_dict[scorer_name], + splits=True, + ) + + return results + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.4 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. + """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + + scorer, _ = self._get_scorers() + router.add( + scorer=scorer, + method_mapping=MethodMapping() + .add(caller="score", callee="score") + .add(caller="fit", callee="score"), + ) + router.add( + splitter=self.cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + return router + + def _sk_visual_block_(self): + if hasattr(self, "best_estimator_"): + key, estimator = "best_estimator_", self.best_estimator_ + else: + key, estimator = "estimator", self.estimator + + return _VisualBlock( + "parallel", + [estimator], + names=[f"{key}: {estimator.__class__.__name__}"], + name_details=[str(estimator)], + ) + + +class GridSearchCV(BaseSearchCV): + """Exhaustive search over specified parameter values for an estimator. + + Important members are fit, predict. + + GridSearchCV implements a "fit" and a "score" method. 
+ It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated grid-search over a parameter grid. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (`str`) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value; + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + n_jobs : int, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``GridSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` + to see how to design a custom selection strategy using a callable + via `refit`. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. 
+ Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. 
+ + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.80 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.70 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.80 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.93 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'split0_test_score' : [0.80, 0.70, 0.80, 0.93], + 'split1_test_score' : [0.82, 0.50, 0.70, 0.78], + 'mean_test_score' : [0.81, 0.60, 0.75, 0.85], + 'std_test_score' : [0.01, 0.10, 0.05, 0.08], + 'rank_test_score' : [2, 4, 3, 1], + 'split0_train_score' : [0.80, 0.92, 0.70, 0.93], + 'split1_train_score' : [0.82, 0.55, 0.70, 0.87], + 'mean_train_score' : [0.81, 0.74, 0.70, 0.90], + 'std_train_score' : [0.01, 0.19, 0.00, 0.03], + 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], + 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00, 0.01], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. 
+ + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + ParameterGrid : Generates all the combinations of a hyperparameter grid. + train_test_split : Utility function to split the data into a development + set usable for fitting a GridSearchCV instance and an evaluation set + for its final evaluation. + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Notes + ----- + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + If `n_jobs` was set to a value higher than one, the data is copied for each + point in the grid (and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn import svm, datasets + >>> from sklearn.model_selection import GridSearchCV + >>> iris = datasets.load_iris() + >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} + >>> svc = svm.SVC() + >>> clf = GridSearchCV(svc, parameters) + >>> clf.fit(iris.data, iris.target) + GridSearchCV(estimator=SVC(), + param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')}) + >>> sorted(clf.cv_results_.keys()) + ['mean_fit_time', 'mean_score_time', 'mean_test_score',... + 'param_C', 'param_kernel', 'params',... + 'rank_test_score', 'split0_test_score',... + 'split2_test_score', ... 
+ 'std_fit_time', 'std_score_time', 'std_test_score'] + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + error_score=np.nan, + return_train_score=False, + ): + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + self.param_grid = param_grid + + def _run_search(self, evaluate_candidates): + """Search all candidates in param_grid""" + evaluate_candidates(ParameterGrid(self.param_grid)) + + +class RandomizedSearchCV(BaseSearchCV): + """Randomized search on hyper parameters. + + RandomizedSearchCV implements a "fit" and a "score" method. + It also implements "score_samples", "predict", "predict_proba", + "decision_function", "transform" and "inverse_transform" if they are + implemented in the estimator used. + + The parameters of the estimator used to apply these methods are optimized + by cross-validated search over parameter settings. + + In contrast to GridSearchCV, not all parameter values are tried out, but + rather a fixed number of parameter settings is sampled from the specified + distributions. The number of parameter settings that are tried is + given by n_iter. + + If all parameters are presented as a list, + sampling without replacement is performed. If at least one parameter + is given as a distribution, sampling with replacement is used. + It is highly recommended to use continuous distributions for continuous + parameters. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.14 + + Parameters + ---------- + estimator : estimator object + An object of that type is instantiated for each grid point. + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_distributions : dict or list of dicts + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_iter : int, default=10 + Number of parameter settings that are sampled. n_iter trades + off runtime vs quality of the solution. + + scoring : str, callable, list, tuple or dict, default=None + Strategy to evaluate the performance of the cross-validated model on + the test set. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value; + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables as values. + + See :ref:`multimetric_grid_search` for an example. + + If None, the estimator's score method is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. 
+ ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + + refit : bool, str, or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a `str` denoting the + scorer that would be used to find the best parameters for refitting + the estimator at the end. + + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given the ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``RandomizedSearchCV`` instance. + + Also for multiple metric evaluation, the attributes ``best_index_``, + ``best_score_`` and ``best_params_`` will only be available if + ``refit`` is set and all of them will be determined w.r.t this specific + scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + .. versionchanged:: 0.20 + Support for callable added. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + verbose : int + Controls the verbosity: the higher, the more messages. + + - >1 : the computation time for each fold and parameter candidate is + displayed; + - >2 : the score is also displayed; + - >3 : the fold and candidate parameter indexes are also displayed + together with the starting time of the computation. + + pre_dispatch : int, or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. 
+ Pass an int for reproducible output across multiple + function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. + + For instance the below given table + + +--------------+-------------+-------------------+---+---------------+ + | param_kernel | param_gamma | split0_test_score |...|rank_test_score| + +==============+=============+===================+===+===============+ + | 'rbf' | 0.1 | 0.80 |...| 1 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.2 | 0.84 |...| 3 | + +--------------+-------------+-------------------+---+---------------+ + | 'rbf' | 0.3 | 0.70 |...| 2 | + +--------------+-------------+-------------------+---+---------------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'], + mask = False), + 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False), + 'split0_test_score' : [0.80, 0.84, 0.70], + 'split1_test_score' : [0.82, 0.50, 0.70], + 'mean_test_score' : [0.81, 0.67, 0.70], + 'std_test_score' : [0.01, 0.24, 0.00], + 'rank_test_score' : [1, 3, 2], + 'split0_train_score' : [0.80, 0.92, 0.70], + 'split1_train_score' : [0.82, 0.55, 0.70], + 'mean_train_score' : [0.81, 0.74, 0.70], + 'std_train_score' : [0.01, 0.19, 0.00], + 'mean_fit_time' : [0.73, 0.63, 0.43], + 'std_fit_time' : [0.01, 0.02, 0.01], + 'mean_score_time' : [0.01, 0.06, 0.04], + 'std_score_time' : [0.00, 0.00, 0.00], + 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...], + } + + NOTE + + The key ``'params'`` is used to store a list of parameter + settings dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. + + For multi-metric evaluation, the scores for all the scorers are + available in the ``cv_results_`` dict at the keys ending with that + scorer's name (``'_'``) instead of ``'_score'`` shown + above. ('split0_test_precision', 'mean_train_precision' etc.) + + best_estimator_ : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + For multi-metric evaluation, this attribute is present only if + ``refit`` is specified. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator. 
+ + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + This attribute is not available if ``refit`` is a function. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + For multi-metric evaluation, this is not available if ``refit`` is + ``False``. See ``refit`` parameter for more information. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + .. versionadded:: 0.20 + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + GridSearchCV : Does exhaustive search over a grid of parameters. + ParameterSampler : A generator over parameter settings, constructed from + param_distributions. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + If `n_jobs` was set to a value higher than one, the data is copied for each + parameter setting(and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import RandomizedSearchCV + >>> from scipy.stats import uniform + >>> iris = load_iris() + >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, + ... random_state=0) + >>> distributions = dict(C=uniform(loc=0, scale=4), + ... 
penalty=['l2', 'l1']) + >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0) + >>> search = clf.fit(iris.data, iris.target) + >>> search.best_params_ + {'C': np.float64(2.195...), 'penalty': 'l1'} + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + "param_distributions": [dict, list], + "n_iter": [Interval(numbers.Integral, 1, None, closed="left")], + "random_state": ["random_state"], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_iter=10, + scoring=None, + n_jobs=None, + refit=True, + cv=None, + verbose=0, + pre_dispatch="2*n_jobs", + random_state=None, + error_score=np.nan, + return_train_score=False, + ): + self.param_distributions = param_distributions + self.n_iter = n_iter + self.random_state = random_state + super().__init__( + estimator=estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + pre_dispatch=pre_dispatch, + error_score=error_score, + return_train_score=return_train_score, + ) + + def _run_search(self, evaluate_candidates): + """Search n_iter candidates from param_distributions""" + evaluate_candidates( + ParameterSampler( + self.param_distributions, self.n_iter, random_state=self.random_state + ) + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search_successive_halving.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd9a83e6dc4394c1ab75713a4373dd0709e90cf --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_search_successive_halving.py @@ -0,0 +1,1095 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from abc import abstractmethod +from math import ceil, floor, log +from numbers import Integral, Real + +import numpy as np + +from ..base import _fit_context, is_classifier +from ..metrics._scorer import get_scorer_names +from ..utils import resample +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import _num_samples, validate_data +from . 
import ParameterGrid, ParameterSampler +from ._search import BaseSearchCV +from ._split import _yields_constant_splits, check_cv + +__all__ = ["HalvingGridSearchCV", "HalvingRandomSearchCV"] + + +class _SubsampleMetaSplitter: + """Splitter that subsamples a given fraction of the dataset""" + + def __init__(self, *, base_cv, fraction, subsample_test, random_state): + self.base_cv = base_cv + self.fraction = fraction + self.subsample_test = subsample_test + self.random_state = random_state + + def split(self, X, y, **kwargs): + for train_idx, test_idx in self.base_cv.split(X, y, **kwargs): + train_idx = resample( + train_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(train_idx)), + ) + if self.subsample_test: + test_idx = resample( + test_idx, + replace=False, + random_state=self.random_state, + n_samples=int(self.fraction * len(test_idx)), + ) + yield train_idx, test_idx + + +def _top_k(results, k, itr): + # Return the best candidates of a given iteration + iteration, mean_test_score, params = ( + np.asarray(a) + for a in (results["iter"], results["mean_test_score"], results["params"]) + ) + iter_indices = np.flatnonzero(iteration == itr) + scores = mean_test_score[iter_indices] + # argsort() places NaNs at the end of the array so we move NaNs to the + # front of the array so the last `k` items are the those with the + # highest scores. + sorted_indices = np.roll(np.argsort(scores), np.count_nonzero(np.isnan(scores))) + return np.array(params[iter_indices][sorted_indices[-k:]]) + + +class BaseSuccessiveHalving(BaseSearchCV): + """Implements successive halving. + + Ref: + Almost optimal exploration in multi-armed bandits, ICML 13 + Zohar Karnin, Tomer Koren, Oren Somekh + """ + + _parameter_constraints: dict = { + **BaseSearchCV._parameter_constraints, + # overwrite `scoring` since multi-metrics are not supported + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "random_state": ["random_state"], + "max_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"auto"}), + ], + "min_resources": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust", "smallest"}), + ], + "resource": [str], + "factor": [Interval(Real, 0, None, closed="neither")], + "aggressive_elimination": ["boolean"], + } + _parameter_constraints.pop("pre_dispatch") # not used in this class + + def __init__( + self, + estimator, + *, + scoring=None, + n_jobs=None, + refit=True, + cv=5, + verbose=0, + random_state=None, + error_score=np.nan, + return_train_score=True, + max_resources="auto", + min_resources="exhaust", + resource="n_samples", + factor=3, + aggressive_elimination=False, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + cv=cv, + verbose=verbose, + error_score=error_score, + return_train_score=return_train_score, + ) + + self.random_state = random_state + self.max_resources = max_resources + self.resource = resource + self.factor = factor + self.min_resources = min_resources + self.aggressive_elimination = aggressive_elimination + + def _check_input_parameters(self, X, y, split_params): + # We need to enforce that successive calls to cv.split() yield the same + # splits: see https://github.com/scikit-learn/scikit-learn/issues/15149 + if not _yields_constant_splits(self._checked_cv_orig): + raise ValueError( + "The cv parameter must yield consistent folds across " + "calls to split(). Set its random_state to an int, or set " + "shuffle=False." 
+ ) + + if ( + self.resource != "n_samples" + and self.resource not in self.estimator.get_params() + ): + raise ValueError( + f"Cannot use resource={self.resource} which is not supported " + f"by estimator {self.estimator.__class__.__name__}" + ) + + if isinstance(self, HalvingRandomSearchCV): + if self.min_resources == self.n_candidates == "exhaust": + # for n_candidates=exhaust to work, we need to know what + # min_resources is. Similarly min_resources=exhaust needs to + # know the actual number of candidates. + raise ValueError( + "n_candidates and min_resources cannot be both set to 'exhaust'." + ) + + self.min_resources_ = self.min_resources + if self.min_resources_ in ("smallest", "exhaust"): + if self.resource == "n_samples": + n_splits = self._checked_cv_orig.get_n_splits(X, y, **split_params) + # please see https://gph.is/1KjihQe for a justification + magic_factor = 2 + self.min_resources_ = n_splits * magic_factor + if is_classifier(self.estimator): + y = validate_data(self, X="no_validation", y=y) + check_classification_targets(y) + n_classes = np.unique(y).shape[0] + self.min_resources_ *= n_classes + else: + self.min_resources_ = 1 + # if 'exhaust', min_resources_ might be set to a higher value later + # in _run_search + + self.max_resources_ = self.max_resources + if self.max_resources_ == "auto": + if not self.resource == "n_samples": + raise ValueError( + "resource can only be 'n_samples' when max_resources='auto'" + ) + self.max_resources_ = _num_samples(X) + + if self.min_resources_ > self.max_resources_: + raise ValueError( + f"min_resources_={self.min_resources_} is greater " + f"than max_resources_={self.max_resources_}." + ) + + if self.min_resources_ == 0: + raise ValueError( + f"min_resources_={self.min_resources_}: you might have passed " + "an empty dataset X." + ) + + @staticmethod + def _select_best_index(refit, refit_metric, results): + """Custom refit callable to return the index of the best candidate. + + We want the best candidate out of the last iteration. By default + BaseSearchCV would return the best candidate out of all iterations. + + Currently, we only support for a single metric thus `refit` and + `refit_metric` are not required. + """ + last_iter = np.max(results["iter"]) + last_iter_indices = np.flatnonzero(results["iter"] == last_iter) + + test_scores = results["mean_test_score"][last_iter_indices] + # If all scores are NaNs there is no way to pick between them, + # so we (arbitrarily) declare the zero'th entry the best one + if np.isnan(test_scores).all(): + best_idx = 0 + else: + best_idx = np.nanargmax(test_scores) + + return last_iter_indices[best_idx] + + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, **params): + """Run fit with all sets of parameters. + + Parameters + ---------- + + X : array-like, shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_output), optional + Target relative to X for classification or regression; + None for unsupervised learning. + + **params : dict of string -> object + Parameters passed to the ``fit`` method of the estimator. + + Returns + ------- + self : object + Instance of fitted estimator. 
+ """ + self._checked_cv_orig = check_cv( + self.cv, y, classifier=is_classifier(self.estimator) + ) + + routed_params = self._get_routed_params_for_fit(params) + self._check_input_parameters( + X=X, y=y, split_params=routed_params.splitter.split + ) + + self._n_samples_orig = _num_samples(X) + + super().fit(X, y=y, **params) + + # Set best_score_: BaseSearchCV does not set it, as refit is a callable + self.best_score_ = self.cv_results_["mean_test_score"][self.best_index_] + + return self + + def _run_search(self, evaluate_candidates): + candidate_params = self._generate_candidate_params() + + if self.resource != "n_samples" and any( + self.resource in candidate for candidate in candidate_params + ): + # Can only check this now since we need the candidates list + raise ValueError( + f"Cannot use parameter {self.resource} as the resource since " + "it is part of the searched parameters." + ) + + # n_required_iterations is the number of iterations needed so that the + # last iterations evaluates less than `factor` candidates. + n_required_iterations = 1 + floor(log(len(candidate_params), self.factor)) + + if self.min_resources == "exhaust": + # To exhaust the resources, we want to start with the biggest + # min_resources possible so that the last (required) iteration + # uses as many resources as possible + last_iteration = n_required_iterations - 1 + self.min_resources_ = max( + self.min_resources_, + self.max_resources_ // self.factor**last_iteration, + ) + + # n_possible_iterations is the number of iterations that we can + # actually do starting from min_resources and without exceeding + # max_resources. Depending on max_resources and the number of + # candidates, this may be higher or smaller than + # n_required_iterations. + n_possible_iterations = 1 + floor( + log(self.max_resources_ // self.min_resources_, self.factor) + ) + + if self.aggressive_elimination: + n_iterations = n_required_iterations + else: + n_iterations = min(n_possible_iterations, n_required_iterations) + + if self.verbose: + print(f"n_iterations: {n_iterations}") + print(f"n_required_iterations: {n_required_iterations}") + print(f"n_possible_iterations: {n_possible_iterations}") + print(f"min_resources_: {self.min_resources_}") + print(f"max_resources_: {self.max_resources_}") + print(f"aggressive_elimination: {self.aggressive_elimination}") + print(f"factor: {self.factor}") + + self.n_resources_ = [] + self.n_candidates_ = [] + + for itr in range(n_iterations): + power = itr # default + if self.aggressive_elimination: + # this will set n_resources to the initial value (i.e. the + # value of n_resources at the first iteration) for as many + # iterations as needed (while candidates are being + # eliminated), and then go on as usual. 
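+ #
+ # Illustrative worked example (comment added for clarity, not part of
+ # upstream scikit-learn): assume factor=3, 20 candidates, and
+ # n_required_iterations = 1 + floor(log(20, 3)) = 3, but only
+ # n_possible_iterations = 2 because max_resources_ is small. With
+ # aggressive_elimination, iterations 0 and 1 both get
+ # power = max(0, itr - 3 + 2) = 0, i.e. they re-run at min_resources_
+ # while the candidate pool shrinks 20 -> 7 -> 3; the final iteration
+ # gets power = 1 and evaluates the remaining 3 (<= factor) candidates
+ # with factor * min_resources_ resources, staying within max_resources_.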
+ power = max(0, itr - n_required_iterations + n_possible_iterations) + + n_resources = int(self.factor**power * self.min_resources_) + # guard, probably not needed + n_resources = min(n_resources, self.max_resources_) + self.n_resources_.append(n_resources) + + n_candidates = len(candidate_params) + self.n_candidates_.append(n_candidates) + + if self.verbose: + print("-" * 10) + print(f"iter: {itr}") + print(f"n_candidates: {n_candidates}") + print(f"n_resources: {n_resources}") + + if self.resource == "n_samples": + # subsampling will be done in cv.split() + cv = _SubsampleMetaSplitter( + base_cv=self._checked_cv_orig, + fraction=n_resources / self._n_samples_orig, + subsample_test=True, + random_state=self.random_state, + ) + + else: + # Need copy so that the n_resources of next iteration does + # not overwrite + candidate_params = [c.copy() for c in candidate_params] + for candidate in candidate_params: + candidate[self.resource] = n_resources + cv = self._checked_cv_orig + + more_results = { + "iter": [itr] * n_candidates, + "n_resources": [n_resources] * n_candidates, + } + + results = evaluate_candidates( + candidate_params, cv, more_results=more_results + ) + + n_candidates_to_keep = ceil(n_candidates / self.factor) + candidate_params = _top_k(results, n_candidates_to_keep, itr) + + self.n_remaining_candidates_ = len(candidate_params) + self.n_required_iterations_ = n_required_iterations + self.n_possible_iterations_ = n_possible_iterations + self.n_iterations_ = n_iterations + + @abstractmethod + def _generate_candidate_params(self): + pass + + +class HalvingGridSearchCV(BaseSuccessiveHalving): + """Search over specified parameter values with successive halving. + + The search strategy starts evaluating all the candidates with a small + amount of resources and iteratively selects the best candidates, using + more and more resources. + + Read more in the :ref:`User guide `. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_halving_search_cv``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingGridSearchCV + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + factor : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``factor=3`` means that only one third of the candidates are selected. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 'n_iterations' or 'n_estimators' for a gradient + boosting estimator. 
In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. + + max_resources : int, default='auto' + The maximum amount of resource that any candidate is allowed to use + for a given iteration. By default, this is set to ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. + + min_resources : {'exhaust', 'smallest'} or int, default='exhaust' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. + + - 'smallest' is a heuristic that sets `r0` to a small value: + + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``factor``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str or callable, default=None + Scoring method to use to evaluate the predictions on the test set. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + refit : bool or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. 
+ + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingGridSearchCV`` instance. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan``. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Ignored otherwise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + n_resources_ : list of int + The amount of resources used at each iteration. + + n_candidates_ : list of int + The number of candidate parameters that were evaluated at each + iteration. + + n_remaining_candidates_ : int + The number of candidate parameters that are left after the last + iteration. It corresponds to `ceil(n_candidates[-1] / factor)` + + max_resources_ : int + The maximum number of resources that any candidate is allowed to use + for a given iteration. Note that since the number of resources used + at each iteration must be a multiple of ``min_resources_``, the + actual number of resources used at the last iteration may be smaller + than ``max_resources_``. + + min_resources_ : int + The amount of resources that are allocated for each candidate at the + first iteration. + + n_iterations_ : int + The actual number of iterations that were run. This is equal to + ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``. + Else, this is equal to ``min(n_possible_iterations_, + n_required_iterations_)``. + + n_possible_iterations_ : int + The number of iterations that are possible starting with + ``min_resources_`` resources and without exceeding + ``max_resources_``. + + n_required_iterations_ : int + The number of iterations that are required to end up with less than + ``factor`` candidates at the last iteration, starting with + ``min_resources_`` resources. 
This will be smaller than + ``n_possible_iterations_`` when there isn't enough resources. + + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. It contains lots of information + for analysing the results of a search. + Please refer to the :ref:`User guide` + for details. + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + best_score_ : float + Mean cross-validated score of the best_estimator. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingRandomSearchCV`: + Random search over a set of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingGridSearchCV + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + ... + >>> param_grid = {"max_depth": [3, None], + ... "min_samples_split": [5, 10]} + >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators', + ... max_resources=10, + ... 
random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_grid": [dict, list], + } + + def __init__( + self, + estimator, + param_grid, + *, + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="exhaust", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_grid = param_grid + + def _generate_candidate_params(self): + return ParameterGrid(self.param_grid) + + +class HalvingRandomSearchCV(BaseSuccessiveHalving): + """Randomized search on hyper parameters. + + The search strategy starts evaluating all the candidates with a small + amount of resources and iteratively selects the best candidates, using more + and more resources. + + The candidates are sampled at random from the parameter space and the + number of sampled candidates is determined by ``n_candidates``. + + Read more in the :ref:`User guide`. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_halving_search_cv``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> # now you can import normally from model_selection + >>> from sklearn.model_selection import HalvingRandomSearchCV + + Parameters + ---------- + estimator : estimator object + This is assumed to implement the scikit-learn estimator interface. + Either estimator needs to provide a ``score`` function, + or ``scoring`` must be passed. + + param_distributions : dict or list of dicts + Dictionary with parameters names (`str`) as keys and distributions + or lists of parameters to try. Distributions must provide a ``rvs`` + method for sampling (such as those from scipy.stats.distributions). + If a list is given, it is sampled uniformly. + If a list of dicts is given, first a dict is sampled uniformly, and + then a parameter is sampled using that dict as above. + + n_candidates : "exhaust" or int, default="exhaust" + The number of candidate parameters to sample, at the first + iteration. Using 'exhaust' will sample enough candidates so that the + last iteration uses as many resources as possible, based on + `min_resources`, `max_resources` and `factor`. In this case, + `min_resources` cannot be 'exhaust'. + + factor : int or float, default=3 + The 'halving' parameter, which determines the proportion of candidates + that are selected for each subsequent iteration. For example, + ``factor=3`` means that only one third of the candidates are selected. + + resource : ``'n_samples'`` or str, default='n_samples' + Defines the resource that increases with each iteration. By default, + the resource is the number of samples. It can also be set to any + parameter of the base estimator that accepts positive integer + values, e.g. 
'n_iterations' or 'n_estimators' for a gradient + boosting estimator. In this case ``max_resources`` cannot be 'auto' + and must be set explicitly. + + max_resources : int, default='auto' + The maximum number of resources that any candidate is allowed to use + for a given iteration. By default, this is set ``n_samples`` when + ``resource='n_samples'`` (default), else an error is raised. + + min_resources : {'exhaust', 'smallest'} or int, default='smallest' + The minimum amount of resource that any candidate is allowed to use + for a given iteration. Equivalently, this defines the amount of + resources `r0` that are allocated for each candidate at the first + iteration. + + - 'smallest' is a heuristic that sets `r0` to a small value: + + - ``n_splits * 2`` when ``resource='n_samples'`` for a regression problem + - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a + classification problem + - ``1`` when ``resource != 'n_samples'`` + + - 'exhaust' will set `r0` such that the **last** iteration uses as + much resources as possible. Namely, the last iteration will use the + highest value smaller than ``max_resources`` that is a multiple of + both ``min_resources`` and ``factor``. In general, using 'exhaust' + leads to a more accurate estimator, but is slightly more time + consuming. 'exhaust' isn't available when `n_candidates='exhaust'`. + + Note that the amount of resources used at each iteration is always a + multiple of ``min_resources``. + + aggressive_elimination : bool, default=False + This is only relevant in cases where there isn't enough resources to + reduce the remaining candidates to at most `factor` after the last + iteration. If ``True``, then the search process will 'replay' the + first iteration for as long as needed until the number of candidates + is small enough. This is ``False`` by default, which means that the + last iteration may evaluate more than ``factor`` candidates. See + :ref:`aggressive_elimination` for more details. + + cv : int, cross-validation generator or an iterable, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - integer, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. note:: + Due to implementation details, the folds produced by `cv` must be + the same across multiple calls to `cv.split()`. For + built-in `scikit-learn` iterators, this can be achieved by + deactivating shuffling (`shuffle=False`), or by setting the + `cv`'s `random_state` parameter to an integer. + + scoring : str or callable, default=None + Scoring method to use to evaluate the predictions on the test set. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + refit : bool or callable, default=True + Refit an estimator using the best found parameters on the whole + dataset. 
+ + Where there are considerations other than maximum score in + choosing a best estimator, ``refit`` can be set to a function which + returns the selected ``best_index_`` given ``cv_results_``. In that + case, the ``best_estimator_`` and ``best_params_`` will be set + according to the returned ``best_index_`` while the ``best_score_`` + attribute will not be available. + + The refitted estimator is made available at the ``best_estimator_`` + attribute and permits using ``predict`` directly on this + ``HalvingRandomSearchCV`` instance. + + See :ref:`this example + ` + for an example of how to use ``refit=callable`` to balance model + complexity and cross-validated score. + + error_score : 'raise' or numeric + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. If a numeric value is given, + FitFailedWarning is raised. This parameter does not affect the refit + step, which will always raise the error. Default is ``np.nan``. + + return_train_score : bool, default=False + If ``False``, the ``cv_results_`` attribute will not include training + scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + random_state : int, RandomState instance or None, default=None + Pseudo random number generator state used for subsampling the dataset + when `resources != 'n_samples'`. Also used for random uniform + sampling from lists of possible values instead of scipy.stats + distributions. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + n_jobs : int or None, default=None + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int + Controls the verbosity: the higher, the more messages. + + Attributes + ---------- + n_resources_ : list of int + The amount of resources used at each iteration. + + n_candidates_ : list of int + The number of candidate parameters that were evaluated at each + iteration. + + n_remaining_candidates_ : int + The number of candidate parameters that are left after the last + iteration. It corresponds to `ceil(n_candidates[-1] / factor)` + + max_resources_ : int + The maximum number of resources that any candidate is allowed to use + for a given iteration. Note that since the number of resources used at + each iteration must be a multiple of ``min_resources_``, the actual + number of resources used at the last iteration may be smaller than + ``max_resources_``. + + min_resources_ : int + The amount of resources that are allocated for each candidate at the + first iteration. + + n_iterations_ : int + The actual number of iterations that were run. This is equal to + ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``. + Else, this is equal to ``min(n_possible_iterations_, + n_required_iterations_)``. + + n_possible_iterations_ : int + The number of iterations that are possible starting with + ``min_resources_`` resources and without exceeding + ``max_resources_``. 
+ + n_required_iterations_ : int + The number of iterations that are required to end up with less than + ``factor`` candidates at the last iteration, starting with + ``min_resources_`` resources. This will be smaller than + ``n_possible_iterations_`` when there isn't enough resources. + + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas ``DataFrame``. It contains lots of information + for analysing the results of a search. + Please refer to the :ref:`User guide` + for details. + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. Not available if ``refit=False``. + + best_score_ : float + Mean cross-validated score of the best_estimator. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest + mean score (``search.best_score_``). + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best + parameters for the model. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + refit_time_ : float + Seconds used for refitting the best model on the whole dataset. + + This is present only if ``refit`` is not False. + + multimetric_ : bool + Whether or not the scorers compute several metrics. + + classes_ : ndarray of shape (n_classes,) + The classes labels. This is present only if ``refit`` is specified and + the underlying estimator is a classifier. + + n_features_in_ : int + Number of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `n_features_in_` when fit. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Only defined if + `best_estimator_` is defined (see the documentation for the `refit` + parameter for more details) and that `best_estimator_` exposes + `feature_names_in_` when fit. + + .. versionadded:: 1.0 + + See Also + -------- + :class:`HalvingGridSearchCV`: + Search over a grid of parameters using successive halving. + + Notes + ----- + The parameters selected are those that maximize the score of the held-out + data, according to the scoring parameter. + + All parameter combinations scored with a NaN will share the lowest rank. + + Examples + -------- + + >>> from sklearn.datasets import load_iris + >>> from sklearn.ensemble import RandomForestClassifier + >>> from sklearn.experimental import enable_halving_search_cv # noqa + >>> from sklearn.model_selection import HalvingRandomSearchCV + >>> from scipy.stats import randint + >>> import numpy as np + ... + >>> X, y = load_iris(return_X_y=True) + >>> clf = RandomForestClassifier(random_state=0) + >>> np.random.seed(0) + ... + >>> param_distributions = {"max_depth": [3, None], + ... "min_samples_split": randint(2, 11)} + >>> search = HalvingRandomSearchCV(clf, param_distributions, + ... resource='n_estimators', + ... max_resources=10, + ... 
random_state=0).fit(X, y) + >>> search.best_params_ # doctest: +SKIP + {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9} + """ + + _parameter_constraints: dict = { + **BaseSuccessiveHalving._parameter_constraints, + "param_distributions": [dict, list], + "n_candidates": [ + Interval(Integral, 0, None, closed="neither"), + StrOptions({"exhaust"}), + ], + } + + def __init__( + self, + estimator, + param_distributions, + *, + n_candidates="exhaust", + factor=3, + resource="n_samples", + max_resources="auto", + min_resources="smallest", + aggressive_elimination=False, + cv=5, + scoring=None, + refit=True, + error_score=np.nan, + return_train_score=True, + random_state=None, + n_jobs=None, + verbose=0, + ): + super().__init__( + estimator, + scoring=scoring, + n_jobs=n_jobs, + refit=refit, + verbose=verbose, + cv=cv, + random_state=random_state, + error_score=error_score, + return_train_score=return_train_score, + max_resources=max_resources, + resource=resource, + factor=factor, + min_resources=min_resources, + aggressive_elimination=aggressive_elimination, + ) + self.param_distributions = param_distributions + self.n_candidates = n_candidates + + def _generate_candidate_params(self): + n_candidates_first_iter = self.n_candidates + if n_candidates_first_iter == "exhaust": + # This will generate enough candidate so that the last iteration + # uses as much resources as possible + n_candidates_first_iter = self.max_resources_ // self.min_resources_ + return ParameterSampler( + self.param_distributions, + n_candidates_first_iter, + random_state=self.random_state, + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_split.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_split.py new file mode 100644 index 0000000000000000000000000000000000000000..640b7f6eee2f02c0f7f22d89b8d9523d36ddc27f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_split.py @@ -0,0 +1,3055 @@ +""" +The :mod:`sklearn.model_selection._split` module includes classes and +functions to split the data based on a preset strategy. 
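# Illustrative usage sketch (toy setup assumed): after fitting, the bookkeeping
# attributes documented above show how many candidates were kept and how much
# of the budgeted resource each halving iteration consumed.
from scipy.stats import randint
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV

X, y = load_iris(return_X_y=True)
param_distributions = {"max_depth": [3, None], "min_samples_split": randint(2, 11)}
search = HalvingRandomSearchCV(
    RandomForestClassifier(random_state=0),
    param_distributions,
    resource="n_estimators",
    max_resources=30,          # must be explicit when resource != 'n_samples'
    random_state=0,
).fit(X, y)

print(search.n_iterations_)    # number of halving iterations that actually ran
print(search.n_candidates_)    # candidates evaluated at each iteration
print(search.n_resources_)     # resource budget used at each iteration
print(search.best_params_)     # includes the tuned 'n_estimators' value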
+""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from collections import defaultdict +from collections.abc import Iterable +from inspect import signature +from itertools import chain, combinations +from math import ceil, floor + +import numpy as np +from scipy.special import comb + +from ..utils import ( + _safe_indexing, + check_random_state, + indexable, + metadata_routing, +) +from ..utils._array_api import ( + _convert_to_numpy, + ensure_common_namespace_device, + get_namespace, +) +from ..utils._param_validation import Interval, RealNotInt, validate_params +from ..utils.extmath import _approximate_mode +from ..utils.metadata_routing import _MetadataRequester +from ..utils.multiclass import type_of_target +from ..utils.validation import _num_samples, check_array, column_or_1d + +__all__ = [ + "BaseCrossValidator", + "GroupKFold", + "GroupShuffleSplit", + "KFold", + "LeaveOneGroupOut", + "LeaveOneOut", + "LeavePGroupsOut", + "LeavePOut", + "PredefinedSplit", + "RepeatedKFold", + "RepeatedStratifiedKFold", + "ShuffleSplit", + "StratifiedGroupKFold", + "StratifiedKFold", + "StratifiedShuffleSplit", + "check_cv", + "train_test_split", +] + + +class _UnsupportedGroupCVMixin: + """Mixin for splitters that do not support Groups.""" + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return super().split(X, y, groups=groups) + + +class GroupsConsumerMixin(_MetadataRequester): + """A Mixin to ``groups`` by default. + + This Mixin makes the object to request ``groups`` by default as ``True``. + + .. versionadded:: 1.3 + """ + + __metadata_request__split = {"groups": True} + + +class BaseCrossValidator(_MetadataRequester, metaclass=ABCMeta): + """Base class for all cross-validators. + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. 
+ + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + indices = np.arange(_num_samples(X)) + for test_index in self._iter_test_masks(X, y, groups): + train_index = indices[np.logical_not(test_index)] + test_index = indices[test_index] + yield train_index, test_index + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, X=None, y=None, groups=None): + """Generates boolean masks corresponding to test sets. + + By default, delegates to _iter_test_indices(X, y, groups) + """ + for test_index in self._iter_test_indices(X, y, groups): + test_mask = np.zeros(_num_samples(X), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, X=None, y=None, groups=None): + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + @abstractmethod + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator.""" + + def __repr__(self): + return _build_repr(self) + + +class LeaveOneOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-One-Out cross-validator. + + Provides train/test indices to split data in train/test sets. Each + sample is used once as a test set (singleton) while the remaining + samples form the training set. + + Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and + ``LeavePOut(p=1)`` where ``n`` is the number of samples. + + Due to the high number of test sets (which is the same as the + number of samples) this cross-validation method can be very costly. + For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit` + or :class:`StratifiedKFold`. + + Read more in the :ref:`User Guide `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneOut + >>> X = np.array([[1, 2], [3, 4]]) + >>> y = np.array([1, 2]) + >>> loo = LeaveOneOut() + >>> loo.get_n_splits(X) + 2 + >>> print(loo) + LeaveOneOut() + >>> for i, (train_index, test_index) in enumerate(loo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1] + Test: index=[0] + Fold 1: + Train: index=[0] + Test: index=[1] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit, + domain-specific stratification of the dataset. + GroupKFold : K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= 1: + raise ValueError( + "Cannot perform LeaveOneOut with n_samples={}.".format(n_samples) + ) + return range(n_samples) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return _num_samples(X) + + +class LeavePOut(_UnsupportedGroupCVMixin, BaseCrossValidator): + """Leave-P-Out cross-validator. 
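# Illustrative sketch (iris assumed as toy data): LeaveOneOut yields one split
# per sample, so the number of model fits equals n_samples, which is why the
# docstring above recommends KFold or ShuffleSplit for large datasets.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_score

X, y = load_iris(return_X_y=True)
loo = LeaveOneOut()
print(loo.get_n_splits(X))     # 150, one fit per sample
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=loo)
print(scores.mean())           # each split scores a single held-out sample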
+ + Provides train/test indices to split data in train/test sets. This results + in testing on all distinct samples of size p, while the remaining n - p + samples form the training set in each iteration. + + Note: ``LeavePOut(p)`` is NOT equivalent to + ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. + + Due to the high number of iterations which grows combinatorically with the + number of samples this cross-validation method can be very costly. For + large datasets one should favor :class:`KFold`, :class:`StratifiedKFold` + or :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + p : int + Size of the test sets. Must be strictly less than the number of + samples. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> lpo = LeavePOut(2) + >>> lpo.get_n_splits(X) + 6 + >>> print(lpo) + LeavePOut(p=2) + >>> for i, (train_index, test_index) in enumerate(lpo.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[1 3] + Test: index=[0 2] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + Fold 4: + Train: index=[0 2] + Test: index=[1 3] + Fold 5: + Train: index=[0 1] + Test: index=[2 3] + """ + + def __init__(self, p): + self.p = p + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + if n_samples <= self.p: + raise ValueError( + "p={} must be strictly less than the number of samples={}".format( + self.p, n_samples + ) + ) + for combination in combinations(range(n_samples), self.p): + yield np.array(combination) + + def get_n_splits(self, X, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + """ + if X is None: + raise ValueError("The 'X' parameter should not be None.") + return int(comb(_num_samples(X), self.p, exact=True)) + + +class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): + """Base class for K-Fold cross-validators and TimeSeriesSplit.""" + + @abstractmethod + def __init__(self, n_splits, *, shuffle, random_state): + if not isinstance(n_splits, numbers.Integral): + raise ValueError( + "The number of folds must be of Integral type. " + "%s of type %s was passed." % (n_splits, type(n_splits)) + ) + n_splits = int(n_splits) + + if n_splits <= 1: + raise ValueError( + "k-fold cross-validation requires at least one" + " train/test split by setting n_splits=2 or more," + " got n_splits={0}.".format(n_splits) + ) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False; got {0}".format(shuffle)) + + if not shuffle and random_state is not None: # None is the default + raise ValueError( + ( + "Setting a random_state has no effect since shuffle is " + "False. You should leave " + "random_state to its default (None), or set shuffle=True." 
+ ), + ) + + self.n_splits = n_splits + self.shuffle = shuffle + self.random_state = random_state + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + X, y, groups = indexable(X, y, groups) + n_samples = _num_samples(X) + if self.n_splits > n_samples: + raise ValueError( + ( + "Cannot have number of splits n_splits={0} greater" + " than the number of samples: n_samples={1}." + ).format(self.n_splits, n_samples) + ) + + for train, test in super().split(X, y, groups): + yield train, test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + +class KFold(_UnsupportedGroupCVMixin, _BaseKFold): + """K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. Split + dataset into k consecutive folds (without shuffling by default). + + Each fold is then used once as a validation while the k - 1 remaining + folds form the training set. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the data before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import KFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4]) + >>> kf = KFold(n_splits=2) + >>> kf.get_n_splits(X) + 2 + >>> print(kf) + KFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(kf.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[2 3] + Test: index=[0 1] + Fold 1: + Train: index=[0 1] + Test: index=[2 3] + + Notes + ----- + The first ``n_samples % n_splits`` folds have size + ``n_samples // n_splits + 1``, other folds have size + ``n_samples // n_splits``, where ``n_samples`` is the number of samples. + + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). + + GroupKFold : K-fold iterator variant with non-overlapping groups. + + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y=None, groups=None): + n_samples = _num_samples(X) + indices = np.arange(n_samples) + if self.shuffle: + check_random_state(self.random_state).shuffle(indices) + + n_splits = self.n_splits + fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) + fold_sizes[: n_samples % n_splits] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + yield indices[start:stop] + current = stop + + +class GroupKFold(GroupsConsumerMixin, _BaseKFold): + """K-fold iterator variant with non-overlapping groups. + + Each group will appear exactly once in the test set across all folds (the + number of distinct groups has to be at least equal to the number of folds). + + The folds are approximately balanced in the sense that the number of + samples is approximately the same in each test fold when `shuffle` is True. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle the groups before splitting into batches. + Note that the samples within each split will not be shuffled. + + .. versionadded:: 1.6 + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + .. versionadded:: 1.6 + + Notes + ----- + Groups appear in an arbitrary order throughout the folds. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupKFold + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> groups = np.array([0, 0, 2, 2, 3, 3]) + >>> group_kfold = GroupKFold(n_splits=2) + >>> group_kfold.get_n_splits(X, y, groups) + 2 + >>> print(group_kfold) + GroupKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... 
print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1 4 5], group=[0 0 3 3] + Fold 1: + Train: index=[0 1 4 5], group=[0 0 3 3] + Test: index=[2 3], group=[2 2] + + See Also + -------- + LeaveOneGroupOut : For splitting the data according to explicit + domain-specific stratification of the dataset. + + StratifiedKFold : Takes class information into account to avoid building + folds with imbalanced class proportions (for binary or multiclass + classification tasks). + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + + unique_groups, group_idx = np.unique(groups, return_inverse=True) + n_groups = len(unique_groups) + + if self.n_splits > n_groups: + raise ValueError( + "Cannot have number of splits n_splits=%d greater" + " than the number of groups: %d." % (self.n_splits, n_groups) + ) + + if self.shuffle: + # Split and shuffle unique groups across n_splits + rng = check_random_state(self.random_state) + unique_groups = rng.permutation(unique_groups) + split_groups = np.array_split(unique_groups, self.n_splits) + + for test_group_ids in split_groups: + test_mask = np.isin(groups, test_group_ids) + yield np.where(test_mask)[0] + + else: + # Weight groups by their number of occurrences + n_samples_per_group = np.bincount(group_idx) + + # Distribute the most frequent groups first + indices = np.argsort(n_samples_per_group)[::-1] + n_samples_per_group = n_samples_per_group[indices] + + # Total weight of each fold + n_samples_per_fold = np.zeros(self.n_splits) + + # Mapping from group index to fold index + group_to_fold = np.zeros(len(unique_groups)) + + # Distribute samples by adding the largest weight to the lightest fold + for group_index, weight in enumerate(n_samples_per_group): + lightest_fold = np.argmin(n_samples_per_fold) + n_samples_per_fold[lightest_fold] += weight + group_to_fold[indices[group_index]] = lightest_fold + + indices = group_to_fold[group_idx] + + for f in range(self.n_splits): + yield np.where(indices == f)[0] + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class StratifiedKFold(_BaseKFold): + """Class-wise stratified K-Fold cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a variation of KFold that returns + stratified folds. The folds are made by preserving the percentage of + samples for each class in `y` in a binary or multiclass classification + setting. + + Read more in the :ref:`User Guide `. 
+ + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> skf = StratifiedKFold(n_splits=2) + >>> skf.get_n_splits(X, y) + 2 + >>> print(skf) + StratifiedKFold(n_splits=2, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(skf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3] + Test: index=[0 2] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + The implementation is designed to: + + * Generate test sets such that all contain the same distribution of + classes, or as close as possible. + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Preserve order dependencies in the dataset ordering, when + ``shuffle=False``: all samples from class k in some test set were + contiguous in y, or separated in y by samples from classes other than k. + * Generate test sets where the smallest and largest differ by at most one + sample. + + .. versionchanged:: 0.22 + The previous implementation did not follow the last constraint. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _make_test_folds(self, X, y=None): + rng = check_random_state(self.random_state) + # XXX: as of now, cross-validation splitters only operate in NumPy-land + # without attempting to leverage array API namespace features. However + # they might be fed by array API inputs, e.g. in CV-enabled estimators so + # we need the following explicit conversion: + xp, is_array_api = get_namespace(y) + if is_array_api: + y = _convert_to_numpy(y, xp) + else: + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + + _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True) + # y_inv encodes y according to lexicographic order. 
We invert y_idx to + # map the classes so that they are encoded by order of appearance: + # 0 represents the first label appearing in y, 1 the second, etc. + _, class_perm = np.unique(y_idx, return_inverse=True) + y_encoded = class_perm[y_inv] + + n_classes = len(y_idx) + y_counts = np.bincount(y_encoded) + min_groups = np.min(y_counts) + if np.all(self.n_splits > y_counts): + raise ValueError( + "n_splits=%d cannot be greater than the" + " number of members in each class." % (self.n_splits) + ) + if self.n_splits > min_groups: + warnings.warn( + "The least populated class in y has only %d" + " members, which is less than n_splits=%d." + % (min_groups, self.n_splits), + UserWarning, + ) + + # Determine the optimal number of samples from each class in each fold, + # using round robin over the sorted y. (This can be done direct from + # counts, but that code is unreadable.) + y_order = np.sort(y_encoded) + allocation = np.asarray( + [ + np.bincount(y_order[i :: self.n_splits], minlength=n_classes) + for i in range(self.n_splits) + ] + ) + + # To maintain the data order dependencies as best as possible within + # the stratification constraint, we assign samples from each class in + # blocks (and then mess that up when shuffle=True). + test_folds = np.empty(len(y), dtype="i") + for k in range(n_classes): + # since the kth column of allocation stores the number of samples + # of class k in each test set, this generates blocks of fold + # indices corresponding to the allocation for class k. + folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k]) + if self.shuffle: + rng.shuffle(folds_for_class) + test_folds[y_encoded == k] = folds_for_class + return test_folds + + def _iter_test_masks(self, X, y=None, groups=None): + test_folds = self._make_test_folds(X, y) + for i in range(self.n_splits): + yield test_folds == i + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups) + + +class StratifiedGroupKFold(GroupsConsumerMixin, _BaseKFold): + """Class-wise stratified K-Fold iterator variant with non-overlapping groups. + + This cross-validation object is a variation of StratifiedKFold attempts to + return stratified folds with non-overlapping groups. The folds are made by + preserving the percentage of samples for each class in `y` in a binary or + multiclass classification setting. 
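# Illustrative sketch (imbalanced toy labels assumed): each StratifiedKFold
# test fold produced by the class above preserves the overall class
# proportions as closely as possible.
import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([0] * 80 + [1] * 20)          # 80% / 20% class balance
X = np.zeros((100, 2))

for _, test in StratifiedKFold(n_splits=5).split(X, y):
    print(np.bincount(y[test]))            # [16  4] in every fold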
+ + Each group will appear exactly once in the test set across all folds (the + number of distinct groups has to be at least equal to the number of folds). + + The difference between :class:`GroupKFold` + and `StratifiedGroupKFold` is that + the former attempts to create balanced folds such that the number of + distinct groups is approximately the same in each fold, whereas + `StratifiedGroupKFold` attempts to create folds which preserve the + percentage of samples for each class as much as possible given the + constraint of non-overlapping groups between splits. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + shuffle : bool, default=False + Whether to shuffle each class's samples before splitting into batches. + Note that the samples within each split will not be shuffled. + This implementation can only shuffle groups that have approximately the + same y distribution, no global shuffle will be performed. + + random_state : int or RandomState instance, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold for each class. + Otherwise, leave `random_state` as `None`. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedGroupKFold + >>> X = np.ones((17, 2)) + >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) + >>> sgkf = StratifiedGroupKFold(n_splits=3) + >>> sgkf.get_n_splits(X, y) + 3 + >>> print(sgkf) + StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False) + >>> for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" group={groups[train_index]}") + ... print(f" Test: index={test_index}") + ... print(f" group={groups[test_index]}") + Fold 0: + Train: index=[ 0 1 2 3 7 8 9 10 11 15 16] + group=[1 1 2 2 4 5 5 5 5 8 8] + Test: index=[ 4 5 6 12 13 14] + group=[3 3 3 6 6 7] + Fold 1: + Train: index=[ 4 5 6 7 8 9 10 11 12 13 14] + group=[3 3 3 4 5 5 5 5 6 6 7] + Test: index=[ 0 1 2 3 15 16] + group=[1 1 2 2 8 8] + Fold 2: + Train: index=[ 0 1 2 3 4 5 6 12 13 14 15 16] + group=[1 1 2 2 3 3 3 6 6 7 8 8] + Test: index=[ 7 8 9 10 11] + group=[4 5 5 5 5] + + Notes + ----- + The implementation is designed to: + + * Mimic the behavior of StratifiedKFold as much as possible for trivial + groups (e.g. when each group contains only one sample). + * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to + ``y = [1, 0]`` should not change the indices generated. + * Stratify based on samples as much as possible while keeping + non-overlapping groups constraint. That means that in some cases when + there is a small number of groups containing a large number of samples + the stratification will not be possible and the behavior will be close + to GroupKFold. 
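# Illustrative sketch reusing the toy data from the docstring above: with
# groups of mixed class make-up, StratifiedGroupKFold tries to keep the class
# counts of each test fold close to the overall 11:6 ratio, while GroupKFold
# only balances fold sizes; both keep every group within a single fold.
import numpy as np
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])
X = np.zeros((len(y), 2))

for name, cv in [("GroupKFold", GroupKFold(n_splits=3)),
                 ("StratifiedGroupKFold", StratifiedGroupKFold(n_splits=3))]:
    per_fold = [np.bincount(y[test], minlength=2)
                for _, test in cv.split(X, y, groups)]
    print(name, per_fold)      # [n_class0, n_class1] for each test fold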
+ + See also + -------- + StratifiedKFold: Takes class information into account to build folds which + retain class distributions (for binary or multiclass classification + tasks). + + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def __init__(self, n_splits=5, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + + def _iter_test_indices(self, X, y, groups): + # Implementation is based on this kaggle kernel: + # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation + # and is subject to the Apache 2.0 License. You may obtain a copy of the + # License at http://www.apache.org/licenses/LICENSE-2.0 + # Changelist: + # - Refactored function to a class following scikit-learn KFold + # interface. + # - Added heuristic for assigning group to the least populated fold in + # cases when all other criteria are equal + # - Switch from using python ``Counter`` to ``np.unique`` to get class + # distribution + # - Added scikit-learn checks for input: checking that target is binary + # or multiclass, checking passed random state, checking that number + # of splits is less than number of members in each class, checking + # that least populated class has more members than there are splits. + rng = check_random_state(self.random_state) + y = np.asarray(y) + type_of_target_y = type_of_target(y) + allowed_target_types = ("binary", "multiclass") + if type_of_target_y not in allowed_target_types: + raise ValueError( + "Supported target types are: {}. Got {!r} instead.".format( + allowed_target_types, type_of_target_y + ) + ) + + y = column_or_1d(y) + _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) + if np.all(self.n_splits > y_cnt): + raise ValueError( + "n_splits=%d cannot be greater than the" + " number of members in each class." % (self.n_splits) + ) + n_smallest_class = np.min(y_cnt) + if self.n_splits > n_smallest_class: + warnings.warn( + "The least populated class in y has only %d" + " members, which is less than n_splits=%d."
+ % (n_smallest_class, self.n_splits), + UserWarning, + ) + n_classes = len(y_cnt) + + _, groups_inv, groups_cnt = np.unique( + groups, return_inverse=True, return_counts=True + ) + y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) + for class_idx, group_idx in zip(y_inv, groups_inv): + y_counts_per_group[group_idx, class_idx] += 1 + + y_counts_per_fold = np.zeros((self.n_splits, n_classes)) + groups_per_fold = defaultdict(set) + + if self.shuffle: + rng.shuffle(y_counts_per_group) + + # Stable sort to keep shuffled order for groups with the same + # class distribution variance + sorted_groups_idx = np.argsort( + -np.std(y_counts_per_group, axis=1), kind="mergesort" + ) + + for group_idx in sorted_groups_idx: + group_y_counts = y_counts_per_group[group_idx] + best_fold = self._find_best_fold( + y_counts_per_fold=y_counts_per_fold, + y_cnt=y_cnt, + group_y_counts=group_y_counts, + ) + y_counts_per_fold[best_fold] += group_y_counts + groups_per_fold[best_fold].add(group_idx) + + for i in range(self.n_splits): + test_indices = [ + idx + for idx, group_idx in enumerate(groups_inv) + if group_idx in groups_per_fold[i] + ] + yield test_indices + + def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts): + best_fold = None + min_eval = np.inf + min_samples_in_fold = np.inf + for i in range(self.n_splits): + y_counts_per_fold[i] += group_y_counts + # Summarise the distribution over classes in each proposed fold + std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0) + y_counts_per_fold[i] -= group_y_counts + fold_eval = np.mean(std_per_class) + samples_in_fold = np.sum(y_counts_per_fold[i]) + is_current_fold_better = fold_eval < min_eval or ( + np.isclose(fold_eval, min_eval) + and samples_in_fold < min_samples_in_fold + ) + if is_current_fold_better: + min_eval = fold_eval + min_samples_in_fold = samples_in_fold + best_fold = i + return best_fold + + +class TimeSeriesSplit(_BaseKFold): + """Time Series cross-validator. + + Provides train/test indices to split time-ordered data, where other + cross-validation methods are inappropriate, as they would lead to training + on future data and evaluating on past data. + To ensure comparable metrics across folds, samples must be equally spaced. + Once this condition is met, each test set covers the same time duration, + while the train set size accumulates data from previous splits. + + This cross-validation object is a variation of :class:`KFold`. + In the k-th split, it returns the first k folds as the train set and the + (k+1)-th fold as the test set. + + Note that, unlike standard cross-validation methods, successive + training sets are supersets of those that come before them. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. versionadded:: 0.18 + + Parameters + ---------- + n_splits : int, default=5 + Number of splits. Must be at least 2. + + .. versionchanged:: 0.22 + ``n_splits`` default value changed from 3 to 5. + + max_train_size : int, default=None + Maximum size for a single training set. + + test_size : int, default=None + Used to limit the size of the test set. Defaults to + ``n_samples // (n_splits + 1)``, which is the maximum allowed value + with ``gap=0``. + + .. versionadded:: 0.24 + + gap : int, default=0 + Number of samples to exclude from the end of each train set before + the test set. + + .. 
versionadded:: 0.24 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import TimeSeriesSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> tscv = TimeSeriesSplit() + >>> print(tscv) + TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0] + Test: index=[1] + Fold 1: + Train: index=[0 1] + Test: index=[2] + Fold 2: + Train: index=[0 1 2] + Test: index=[3] + Fold 3: + Train: index=[0 1 2 3] + Test: index=[4] + Fold 4: + Train: index=[0 1 2 3 4] + Test: index=[5] + >>> # Fix test_size to 2 with 12 samples + >>> X = np.random.randn(12, 2) + >>> y = np.random.randint(0, 2, 12) + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3 4 5] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7 8 9] + Test: index=[10 11] + >>> # Add in a 2 period gap + >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) + >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[0 1 2 3] + Test: index=[6 7] + Fold 1: + Train: index=[0 1 2 3 4 5] + Test: index=[8 9] + Fold 2: + Train: index=[0 1 2 3 4 5 6 7] + Test: index=[10 11] + + For a more extended example see + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`. + + Notes + ----- + The training set has size ``i * n_samples // (n_splits + 1) + + n_samples % (n_splits + 1)`` in the ``i`` th split, + with a test set of size ``n_samples//(n_splits + 1)`` by default, + where ``n_samples`` is the number of samples. Note that this + formula is only valid when ``test_size`` and ``max_train_size`` are + left to their default values. + """ + + def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0): + super().__init__(n_splits, shuffle=False, random_state=None) + self.max_train_size = max_train_size + self.test_size = test_size + self.gap = gap + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split(X) + + def _split(self, X): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. 
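# Illustrative sketch (toy data assumed): ``max_train_size`` turns the
# expanding window shown in the examples above into a rolling window of at
# most that many samples, as described for the ``max_train_size`` parameter.
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.zeros((8, 1))
tscv = TimeSeriesSplit(n_splits=3, test_size=1, max_train_size=3)
for train, test in tscv.split(X):
    print(train, test)
# [2 3 4] [5]
# [3 4 5] [6]
# [4 5 6] [7]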
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + (X,) = indexable(X) + n_samples = _num_samples(X) + n_splits = self.n_splits + n_folds = n_splits + 1 + gap = self.gap + test_size = ( + self.test_size if self.test_size is not None else n_samples // n_folds + ) + + # Make sure we have enough samples for the given split parameters + if n_folds > n_samples: + raise ValueError( + f"Cannot have number of folds={n_folds} greater" + f" than the number of samples={n_samples}." + ) + if n_samples - gap - (test_size * n_splits) <= 0: + raise ValueError( + f"Too many splits={n_splits} for number of samples" + f"={n_samples} with test_size={test_size} and gap={gap}." + ) + + indices = np.arange(n_samples) + test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) + + for test_start in test_starts: + train_end = test_start - gap + if self.max_train_size and self.max_train_size < train_end: + yield ( + indices[train_end - self.max_train_size : train_end], + indices[test_start : test_start + test_size], + ) + else: + yield ( + indices[:train_end], + indices[test_start : test_start + test_size], + ) + + +class LeaveOneGroupOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave One Group Out cross-validator. + + Provides train/test indices to split data such that each training set is + comprised of all samples except ones belonging to one specific group. + Arbitrary domain specific group information is provided as an array of integers + that encodes the group of each sample. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + Read more in the :ref:`User Guide `. + + Notes + ----- + Splits are ordered according to the index of the group left out. The first + split has testing set consisting of the group whose index in `groups` is + lowest, and so on. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeaveOneGroupOut + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 1, 2]) + >>> groups = np.array([1, 1, 2, 2]) + >>> logo = LeaveOneGroupOut() + >>> logo.get_n_splits(X, y, groups) + 2 + >>> logo.get_n_splits(groups=groups) # 'groups' is always required + 2 + >>> print(logo) + LeaveOneGroupOut() + >>> for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3], group=[2 2] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1], group=[1 1] + Test: index=[2 3], group=[2 2] + + See also + -------- + GroupKFold: K-fold iterator variant with non-overlapping groups. + """ + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + # We make a copy of groups to avoid side-effects during iteration + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if len(unique_groups) <= 1: + raise ValueError( + "The groups parameter contains fewer than 2 unique groups " + "(%s). LeaveOneGroupOut expects at least 2." 
% unique_groups + ) + for i in unique_groups: + yield groups == i + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return len(np.unique(groups)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class LeavePGroupsOut(GroupsConsumerMixin, BaseCrossValidator): + """Leave P Group(s) Out cross-validator. + + Provides train/test indices to split data according to a third-party + provided group. This group information can be used to encode arbitrary + domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between LeavePGroupsOut and LeaveOneGroupOut is that + the former builds the test sets with all the samples assigned to + ``p`` different values of the groups while the latter uses samples + all assigned the same groups. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_groups : int + Number of groups (``p``) to leave out in the test split. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import LeavePGroupsOut + >>> X = np.array([[1, 2], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1]) + >>> groups = np.array([1, 2, 3]) + >>> lpgo = LeavePGroupsOut(n_groups=2) + >>> lpgo.get_n_splits(X, y, groups) + 3 + >>> lpgo.get_n_splits(groups=groups) # 'groups' is always required + 3 + >>> print(lpgo) + LeavePGroupsOut(n_groups=2) + >>> for i, (train_index, test_index) in enumerate(lpgo.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2], group=[3] + Test: index=[0 1], group=[1 2] + Fold 1: + Train: index=[1], group=[2] + Test: index=[0 2], group=[1 3] + Fold 2: + Train: index=[0], group=[1] + Test: index=[1 2], group=[2 3] + + See Also + -------- + GroupKFold : K-fold iterator variant with non-overlapping groups. 
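# Illustrative sketch (toy groups assumed): LeavePGroupsOut builds one split
# per combination of ``n_groups`` distinct group values, so the number of
# splits is "n_unique_groups choose n_groups", whereas LeaveOneGroupOut above
# yields exactly one split per group.
from math import comb
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut

groups = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
print(LeaveOneGroupOut().get_n_splits(groups=groups))           # 5
print(LeavePGroupsOut(n_groups=2).get_n_splits(groups=groups))  # 10
print(comb(5, 2))                                               # 10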
+ """ + + def __init__(self, n_groups): + self.n_groups = n_groups + + def _iter_test_masks(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array( + groups, input_name="groups", copy=True, ensure_2d=False, dtype=None + ) + unique_groups = np.unique(groups) + if self.n_groups >= len(unique_groups): + raise ValueError( + "The groups parameter contains fewer than (or equal to) " + "n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut " + "expects that at least n_groups + 1 (%d) unique groups be " + "present" % (self.n_groups, unique_groups, self.n_groups + 1) + ) + combi = combinations(range(len(unique_groups)), self.n_groups) + for indices in combi: + test_index = np.zeros(_num_samples(X), dtype=bool) + for l in unique_groups[np.array(indices)]: + test_index[groups == l] = True + yield test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. This 'groups' parameter must always be specified to + calculate the number of splits, though the other parameters can be + omitted. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + return int(comb(len(np.unique(groups)), self.n_groups, exact=True)) + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + return super().split(X, y, groups) + + +class _RepeatedSplits(_MetadataRequester, metaclass=ABCMeta): + """Repeated splits for an arbitrary randomized CV splitter. + + Repeats splits for cross-validators n times with different randomization + in each repetition. + + Parameters + ---------- + cv : callable + Cross-validator class. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Passes `random_state` to the arbitrary repeating cross validator. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + **cvargs : additional params + Constructor parameters for cv. Must not contain random_state + and shuffle. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. 
+ __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): + if not isinstance(n_repeats, numbers.Integral): + raise ValueError("Number of repetitions must be of Integral type.") + + if n_repeats <= 0: + raise ValueError("Number of repetitions must be greater than 0.") + + if any(key in cvargs for key in ("random_state", "shuffle")): + raise ValueError("cvargs must not contain random_state or shuffle.") + + self.cv = cv + self.n_repeats = n_repeats + self.random_state = random_state + self.cvargs = cvargs + + def split(self, X, y=None, groups=None): + """Generates indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + n_repeats = self.n_repeats + rng = check_random_state(self.random_state) + + for idx in range(n_repeats): + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + for train_index, test_index in cv.split(X, y, groups): + yield train_index, test_index + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + y : object + Always ignored, exists for compatibility. + ``np.zeros(n_samples)`` may be used as a placeholder. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + rng = check_random_state(self.random_state) + cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) + return cv.get_n_splits(X, y, groups) * self.n_repeats + + def __repr__(self): + return _build_repr(self) + + +class RepeatedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated K-Fold cross validator. + + Repeats K-Fold `n_repeats` times with different randomization in each repetition. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of each repeated cross-validation instance. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124) + >>> rkf.get_n_splits(X, y) + 4 + >>> print(rkf) + RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124) + >>> for i, (train_index, test_index) in enumerate(rkf.split(X)): + ... print(f"Fold {i}:") + ... 
print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[0 1] + Test: index=[2 3] + Fold 1: + Train: index=[2 3] + Test: index=[0 1] + Fold 2: + Train: index=[1 2] + Test: index=[0 3] + Fold 3: + Train: index=[0 3] + Test: index=[1 2] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + KFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits + ) + + +class RepeatedStratifiedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): + """Repeated class-wise stratified K-Fold cross validator. + + Repeats Stratified K-Fold n times with different randomization in each + repetition. + + Read more in the :ref:`User Guide `. + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + n_repeats : int, default=10 + Number of times cross-validator needs to be repeated. + + random_state : int, RandomState instance or None, default=None + Controls the generation of the random states for each repetition. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import RepeatedStratifiedKFold + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, + ... random_state=36851234) + >>> rskf.get_n_splits(X, y) + 4 + >>> print(rskf) + RepeatedStratifiedKFold(n_repeats=2, n_splits=2, random_state=36851234) + >>> for i, (train_index, test_index) in enumerate(rskf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + ... + Fold 0: + Train: index=[1 2] + Test: index=[0 3] + Fold 1: + Train: index=[0 3] + Test: index=[1 2] + Fold 2: + Train: index=[1 3] + Test: index=[0 2] + Fold 3: + Train: index=[0 2] + Test: index=[1 3] + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + + See Also + -------- + RepeatedKFold : Repeats K-Fold n times. + """ + + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): + super().__init__( + StratifiedKFold, + n_repeats=n_repeats, + random_state=random_state, + n_splits=n_splits, + ) + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. 
+ + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups=groups) + + +class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta): + """Base class for *ShuffleSplit. + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + """ + + # This indicates that by default CV splitters don't have a "groups" kwarg, + # unless indicated by inheriting from ``GroupsConsumerMixin``. + # This also prevents ``set_split_request`` to be generated for splitters + # which don't support ``groups``. + __metadata_request__split = {"groups": metadata_routing.UNUSED} + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + self.n_splits = n_splits + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self._default_test_size = 0.1 + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. 
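+
+        Within a single split the train and test indices never overlap: each
+        iteration of ``_iter_indices`` draws one random permutation and takes
+        disjoint slices of it for the test and train sets.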
+ """ + X, y, groups = indexable(X, y, groups) + for train, test in self._iter_indices(X, y, groups): + yield train, test + + def _iter_indices(self, X, y=None, groups=None): + """Generate (train, test) indices""" + n_samples = _num_samples(X) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + rng = check_random_state(self.random_state) + for i in range(self.n_splits): + # random partition + permutation = rng.permutation(n_samples) + ind_test = permutation[:n_test] + ind_train = permutation[n_test : (n_test + n_train)] + yield ind_train, ind_test + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return self.n_splits + + def __repr__(self): + return _build_repr(self) + + +class ShuffleSplit(_UnsupportedGroupCVMixin, BaseShuffleSplit): + """Random permutation cross-validator. + + Yields indices to split data into training and test sets. + + Note: contrary to other cross-validation strategies, random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import ShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]]) + >>> y = np.array([1, 2, 1, 2, 1, 2]) + >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0) + >>> rs.get_n_splits(X) + 5 + >>> print(rs) + ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0 4] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2 5] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4 0] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1 0] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1 0] + Test: index=[2 4] + >>> # Specify train and test size + >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, + ... random_state=0) + >>> for i, (train_index, test_index) in enumerate(rs.split(X)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 3 0] + Test: index=[5 2] + Fold 1: + Train: index=[4 0 2] + Test: index=[1 3] + Fold 2: + Train: index=[1 2 4] + Test: index=[3 5] + Fold 3: + Train: index=[3 4 1] + Test: index=[5 2] + Fold 4: + Train: index=[3 5 1] + Test: index=[2 4] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + +class GroupShuffleSplit(GroupsConsumerMixin, BaseShuffleSplit): + """Shuffle-Group(s)-Out cross-validation iterator. + + Provides randomized train/test indices to split data according to a + third-party provided group. This group information can be used to encode + arbitrary domain specific stratifications of the samples as integers. + + For instance the groups could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + The difference between :class:`LeavePGroupsOut` and ``GroupShuffleSplit`` is that + the former generates splits using all subsets of size ``p`` unique groups, + whereas ``GroupShuffleSplit`` generates a user-determined number of random + test splits, each with a user-determined fraction of unique groups. + + For example, a less computationally intensive alternative to + ``LeavePGroupsOut(p=10)`` would be + ``GroupShuffleSplit(test_size=10, n_splits=100)``. + + Contrary to other cross-validation strategies, the random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Note: The parameters ``test_size`` and ``train_size`` refer to groups, and + not to samples as in :class:`ShuffleSplit`. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + Parameters + ---------- + n_splits : int, default=5 + Number of re-shuffling & splitting iterations. + + test_size : float, int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of groups to include in the test split (rounded up). If int, + represents the absolute number of test groups. If None, the value is + set to the complement of the train size. If ``train_size`` is also None, + it will be set to 0.2. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the groups to include in the train split. If + int, represents the absolute number of train groups. If None, + the value is automatically set to the complement of the test size. 
+ + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupShuffleSplit + >>> X = np.ones(shape=(8, 2)) + >>> y = np.ones(shape=(8, 1)) + >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3]) + >>> print(groups.shape) + (8,) + >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42) + >>> gss.get_n_splits() + 2 + >>> print(gss) + GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7) + >>> for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}, group={groups[train_index]}") + ... print(f" Test: index={test_index}, group={groups[test_index]}") + Fold 0: + Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3] + Test: index=[0 1], group=[1 1] + Fold 1: + Train: index=[0 1 5 6 7], group=[1 1 3 3 3] + Test: index=[2 3 4], group=[2 2 2] + + See Also + -------- + ShuffleSplit : Shuffles samples to create independent test/train sets. + + LeavePGroupsOut : Train set leaves out all possible subsets of `p` groups. + """ + + def __init__( + self, n_splits=5, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.2 + + def _iter_indices(self, X, y, groups): + if groups is None: + raise ValueError("The 'groups' parameter should not be None.") + groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) + classes, group_indices = np.unique(groups, return_inverse=True) + for group_train, group_test in super()._iter_indices(X=classes): + # these are the indices of classes in the partition + # invert them into data indices + + train = np.flatnonzero(np.isin(group_indices, group_train)) + test = np.flatnonzero(np.isin(group_indices, group_test)) + + yield train, test + + def split(self, X, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,), default=None + The target variable for supervised learning problems. + + groups : array-like of shape (n_samples,) + Group labels for the samples used while splitting the dataset into + train/test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + return super().split(X, y, groups) + + +class StratifiedShuffleSplit(BaseShuffleSplit): + """Class-wise stratified ShuffleSplit cross-validator. + + Provides train/test indices to split data in train/test sets. + + This cross-validation object is a merge of :class:`StratifiedKFold` and + :class:`ShuffleSplit`, which returns stratified randomized folds. The folds + are made by preserving the percentage of samples for each class in `y` in a + binary or multiclass classification setting. 
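+    Because the per-class train and test counts must be integers, the class
+    proportions are preserved only approximately; each split allocates these
+    counts with ``_approximate_mode`` (see ``_iter_indices`` below).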
+ + Note: like the :class:`ShuffleSplit` strategy, stratified random splits + do not guarantee that test sets across all folds will be mutually exclusive, + and might include overlapping samples. However, this is still very likely for + sizeable datasets. + + Read more in the :ref:`User Guide `. + + For visualisation of cross-validation behaviour and + comparison between common scikit-learn split methods + refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` + + .. note:: + + Stratification on the class label solves an engineering problem rather + than a statistical one. See :ref:`stratification` for more details. + + Parameters + ---------- + n_splits : int, default=10 + Number of re-shuffling & splitting iterations. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.1. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the training and testing indices produced. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import StratifiedShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 0, 1, 1, 1]) + >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0) + >>> sss.get_n_splits(X, y) + 5 + >>> print(sss) + StratifiedShuffleSplit(n_splits=5, random_state=0, ...) + >>> for i, (train_index, test_index) in enumerate(sss.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... print(f" Test: index={test_index}") + Fold 0: + Train: index=[5 2 3] + Test: index=[4 1 0] + Fold 1: + Train: index=[5 1 4] + Test: index=[0 2 3] + Fold 2: + Train: index=[5 0 2] + Test: index=[4 3 1] + Fold 3: + Train: index=[4 1 0] + Test: index=[2 3 5] + Fold 4: + Train: index=[0 5 1] + Test: index=[3 4 2] + """ + + def __init__( + self, n_splits=10, *, test_size=None, train_size=None, random_state=None + ): + super().__init__( + n_splits=n_splits, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + self._default_test_size = 0.1 + + def _iter_indices(self, X, y, groups=None): + n_samples = _num_samples(X) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + n_train, n_test = _validate_shuffle_split( + n_samples, + self.test_size, + self.train_size, + default_test_size=self._default_test_size, + ) + + # Convert to numpy as not all operations are supported by the Array API. 
+ # `y` is probably never a very large array, which means that converting it + # should be cheap + xp, _ = get_namespace(y) + y = _convert_to_numpy(y, xp=xp) + + if y.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + y = np.array([" ".join(row.astype("str")) for row in y]) + + classes, y_indices = np.unique(y, return_inverse=True) + n_classes = classes.shape[0] + + class_counts = np.bincount(y_indices) + if np.min(class_counts) < 2: + raise ValueError( + "The least populated class in y has only 1" + " member, which is too few. The minimum" + " number of groups for any class cannot" + " be less than 2." + ) + + if n_train < n_classes: + raise ValueError( + "The train_size = %d should be greater or " + "equal to the number of classes = %d" % (n_train, n_classes) + ) + if n_test < n_classes: + raise ValueError( + "The test_size = %d should be greater or " + "equal to the number of classes = %d" % (n_test, n_classes) + ) + + # Find the sorted list of instances for each class: + # (np.unique above performs a sort, so code is O(n logn) already) + class_indices = np.split( + np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] + ) + + rng = check_random_state(self.random_state) + + for _ in range(self.n_splits): + # if there are ties in the class-counts, we want + # to make sure to break them anew in each iteration + n_i = _approximate_mode(class_counts, n_train, rng) + class_counts_remaining = class_counts - n_i + t_i = _approximate_mode(class_counts_remaining, n_test, rng) + + train = [] + test = [] + + for i in range(n_classes): + permutation = rng.permutation(class_counts[i]) + perm_indices_class_i = class_indices[i].take(permutation, mode="clip") + + train.extend(perm_indices_class_i[: n_i[i]]) + test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]) + + train = rng.permutation(train) + test = rng.permutation(test) + + yield train, test + + def split(self, X, y, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + Note that providing ``y`` is sufficient to generate the splits and + hence ``np.zeros(n_samples)`` may be used as a placeholder for + ``X`` instead of actual training data. + + y : array-like of shape (n_samples,) or (n_samples, n_labels) + The target variable for supervised learning problems. + Stratification is done based on the y labels. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + + Notes + ----- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting `random_state` + to an integer. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + y = check_array(y, input_name="y", ensure_2d=False, dtype=None) + return super().split(X, y, groups) + + +def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): + """ + Validation helper to check if the train/test sizes are meaningful w.r.t. the + size of the data (n_samples). 
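+
+    For example (illustrative values): with ``n_samples=10``, ``test_size=0.25``
+    and ``train_size=None``, this resolves to ``n_test = ceil(0.25 * 10) = 3``
+    and ``n_train = 10 - 3 = 7`` (the complement of the test size).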
+ """ + if test_size is None and train_size is None: + test_size = default_test_size + + test_size_type = np.asarray(test_size).dtype.kind + train_size_type = np.asarray(train_size).dtype.kind + + if (test_size_type == "i" and (test_size >= n_samples or test_size <= 0)) or ( + test_size_type == "f" and (test_size <= 0 or test_size >= 1) + ): + raise ValueError( + "test_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(test_size, n_samples) + ) + + if (train_size_type == "i" and (train_size >= n_samples or train_size <= 0)) or ( + train_size_type == "f" and (train_size <= 0 or train_size >= 1) + ): + raise ValueError( + "train_size={0} should be either positive and smaller" + " than the number of samples {1} or a float in the " + "(0, 1) range".format(train_size, n_samples) + ) + + if train_size is not None and train_size_type not in ("i", "f"): + raise ValueError("Invalid value for train_size: {}".format(train_size)) + if test_size is not None and test_size_type not in ("i", "f"): + raise ValueError("Invalid value for test_size: {}".format(test_size)) + + if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1: + raise ValueError( + "The sum of test_size and train_size = {}, should be in the (0, 1)" + " range. Reduce test_size and/or train_size.".format(train_size + test_size) + ) + + if test_size_type == "f": + n_test = ceil(test_size * n_samples) + elif test_size_type == "i": + n_test = float(test_size) + + if train_size_type == "f": + n_train = floor(train_size * n_samples) + elif train_size_type == "i": + n_train = float(train_size) + + if train_size is None: + n_train = n_samples - n_test + elif test_size is None: + n_test = n_samples - n_train + + if n_train + n_test > n_samples: + raise ValueError( + "The sum of train_size and test_size = %d, " + "should be smaller than the number of " + "samples %d. Reduce test_size and/or " + "train_size." % (n_train + n_test, n_samples) + ) + + n_train, n_test = int(n_train), int(n_test) + + if n_train == 0: + raise ValueError( + "With n_samples={}, test_size={} and train_size={}, the " + "resulting train set will be empty. Adjust any of the " + "aforementioned parameters.".format(n_samples, test_size, train_size) + ) + + return n_train, n_test + + +class PredefinedSplit(BaseCrossValidator): + """Predefined split cross-validator. + + Provides train/test indices to split data into train/test sets using a + predefined scheme specified by the user with the ``test_fold`` parameter. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + test_fold : array-like of shape (n_samples,) + The entry ``test_fold[i]`` represents the index of the test set that + sample ``i`` belongs to. It is possible to exclude sample ``i`` from + any test set (i.e. include sample ``i`` in every training set) by + setting ``test_fold[i]`` equal to -1. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import PredefinedSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> test_fold = [0, 1, -1, 1] + >>> ps = PredefinedSplit(test_fold) + >>> ps.get_n_splits() + 2 + >>> print(ps) + PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) + >>> for i, (train_index, test_index) in enumerate(ps.split()): + ... print(f"Fold {i}:") + ... print(f" Train: index={train_index}") + ... 
print(f" Test: index={test_index}") + Fold 0: + Train: index=[1 2 3] + Test: index=[0] + Fold 1: + Train: index=[0 2] + Test: index=[1 3] + """ + + def __init__(self, test_fold): + self.test_fold = np.array(test_fold, dtype=int) + self.test_fold = column_or_1d(self.test_fold) + self.unique_folds = np.unique(self.test_fold) + self.unique_folds = self.unique_folds[self.unique_folds != -1] + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + if groups is not None: + warnings.warn( + f"The groups parameter is ignored by {self.__class__.__name__}", + UserWarning, + ) + return self._split() + + def _split(self): + """Generate indices to split data into training and test set. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + ind = np.arange(len(self.test_fold)) + for test_index in self._iter_test_masks(): + train_index = ind[np.logical_not(test_index)] + test_index = ind[test_index] + yield train_index, test_index + + def _iter_test_masks(self): + """Generates boolean masks corresponding to test sets.""" + for f in self.unique_folds: + test_index = np.where(self.test_fold == f)[0] + test_mask = np.zeros(len(self.test_fold), dtype=bool) + test_mask[test_index] = True + yield test_mask + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.unique_folds) + + +class _CVIterableWrapper(BaseCrossValidator): + """Wrapper class for old style cv objects and iterables.""" + + def __init__(self, cv): + self.cv = list(cv) + + def get_n_splits(self, X=None, y=None, groups=None): + """Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits : int + Returns the number of splitting iterations in the cross-validator. + """ + return len(self.cv) + + def split(self, X=None, y=None, groups=None): + """Generate indices to split data into training and test set. + + Parameters + ---------- + X : object + Always ignored, exists for compatibility. + + y : object + Always ignored, exists for compatibility. + + groups : object + Always ignored, exists for compatibility. + + Yields + ------ + train : ndarray + The training set indices for that split. + + test : ndarray + The testing set indices for that split. + """ + for train, test in self.cv: + yield train, test + + +def check_cv(cv=5, y=None, *, classifier=False): + """Input checker utility for building a cross-validator. 
+ + Parameters + ---------- + cv : int, cross-validation generator, iterable or None, default=5 + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + - None, to use the default 5-fold cross validation, + - integer, to specify the number of folds. + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For integer/None inputs, if classifier is True and ``y`` is either + binary or multiclass, :class:`StratifiedKFold` is used. In all other + cases, :class:`KFold` is used. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value changed from 3-fold to 5-fold. + + y : array-like, default=None + The target variable for supervised learning problems. + + classifier : bool, default=False + Whether the task is a classification task, in which case + stratified KFold will be used. + + Returns + ------- + checked_cv : a cross-validator instance. + The return value is a cross-validator which generates the train/test + splits via the ``split`` method. + + Examples + -------- + >>> from sklearn.model_selection import check_cv + >>> check_cv(cv=5, y=None, classifier=False) + KFold(...) + >>> check_cv(cv=5, y=[1, 1, 0, 0, 0, 0], classifier=True) + StratifiedKFold(...) + """ + cv = 5 if cv is None else cv + if isinstance(cv, numbers.Integral): + if ( + classifier + and (y is not None) + and (type_of_target(y, input_name="y") in ("binary", "multiclass")) + ): + return StratifiedKFold(cv) + else: + return KFold(cv) + + if not hasattr(cv, "split") or isinstance(cv, str): + if not isinstance(cv, Iterable) or isinstance(cv, str): + raise ValueError( + "Expected cv as an integer, cross-validation " + "object (from sklearn.model_selection) " + "or an iterable. Got %s." % cv + ) + return _CVIterableWrapper(cv) + + return cv # New style cv objects are passed without any modification + + +@validate_params( + { + "test_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "train_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "stratify": ["array-like", None], + }, + prefer_skip_nested_validation=True, +) +def train_test_split( + *arrays, + test_size=None, + train_size=None, + random_state=None, + shuffle=True, + stratify=None, +): + """Split arrays or matrices into random train and test subsets. + + Quick utility that wraps input validation, + ``next(ShuffleSplit().split(X, y))``, and application to input data + into a single call for splitting (and optionally subsampling) data into a + one-liner. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + *arrays : sequence of indexables with same length / shape[0] + Allowed inputs are lists, numpy arrays, scipy-sparse + matrices or pandas dataframes. + + test_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the proportion + of the dataset to include in the test split. If int, represents the + absolute number of test samples. If None, the value is set to the + complement of the train size. If ``train_size`` is also None, it will + be set to 0.25. + + train_size : float or int, default=None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. 
If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int, RandomState instance or None, default=None + Controls the shuffling applied to the data before applying the split. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + shuffle : bool, default=True + Whether or not to shuffle the data before splitting. If shuffle=False + then stratify must be None. + + stratify : array-like, default=None + If not None, data is split in a stratified fashion, using this as + the class labels. + Read more in the :ref:`User Guide `. + + Returns + ------- + splitting : list, length=2 * len(arrays) + List containing train-test split of inputs. + + .. versionadded:: 0.16 + If the input is sparse, the output will be a + ``scipy.sparse.csr_matrix``. Else, output type is the same as the + input type. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import train_test_split + >>> X, y = np.arange(10).reshape((5, 2)), range(5) + >>> X + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(y) + [0, 1, 2, 3, 4] + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... + >>> X_train + array([[4, 5], + [0, 1], + [6, 7]]) + >>> y_train + [2, 0, 3] + >>> X_test + array([[2, 3], + [8, 9]]) + >>> y_test + [1, 4] + + >>> train_test_split(y, shuffle=False) + [[0, 1, 2], [3, 4]] + + >>> from sklearn import datasets + >>> iris = datasets.load_iris(as_frame=True) + >>> X, y = iris['data'], iris['target'] + >>> X.head() + sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) + 0 5.1 3.5 1.4 0.2 + 1 4.9 3.0 1.4 0.2 + 2 4.7 3.2 1.3 0.2 + 3 4.6 3.1 1.5 0.2 + 4 5.0 3.6 1.4 0.2 + >>> y.head() + 0 0 + 1 0 + 2 0 + 3 0 + 4 0 + ... + + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.33, random_state=42) + ... + >>> X_train.head() + sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) + 96 5.7 2.9 4.2 1.3 + 105 7.6 3.0 6.6 2.1 + 66 5.6 3.0 4.5 1.5 + 0 5.1 3.5 1.4 0.2 + 122 7.7 2.8 6.7 2.0 + >>> y_train.head() + 96 1 + 105 2 + 66 1 + 0 0 + 122 2 + ... + >>> X_test.head() + sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) + 73 6.1 2.8 4.7 1.2 + 18 5.7 3.8 1.7 0.3 + 118 7.7 2.6 6.9 2.3 + 78 6.0 2.9 4.5 1.5 + 76 6.8 2.8 4.8 1.4 + >>> y_test.head() + 73 1 + 18 0 + 118 2 + 78 1 + 76 1 + ... + """ + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + + arrays = indexable(*arrays) + + n_samples = _num_samples(arrays[0]) + n_train, n_test = _validate_shuffle_split( + n_samples, test_size, train_size, default_test_size=0.25 + ) + + if shuffle is False: + if stratify is not None: + raise ValueError( + "Stratified train/test split is not implemented for shuffle=False" + ) + + train = np.arange(n_train) + test = np.arange(n_train, n_train + n_test) + + else: + if stratify is not None: + CVClass = StratifiedShuffleSplit + else: + CVClass = ShuffleSplit + + cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state) + + train, test = next(cv.split(X=arrays[0], y=stratify)) + + train, test = ensure_common_namespace_device(arrays[0], train, test) + + return list( + chain.from_iterable( + (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays + ) + ) + + +# Tell nose that train_test_split is not a test. 
+# (Needed for external libraries that may use nose.) +# Use setattr to avoid mypy errors when monkeypatching. +setattr(train_test_split, "__test__", False) + + +def _pprint(params, offset=0, printer=repr): + """Pretty print the dictionary 'params' + + Parameters + ---------- + params : dict + The dictionary to pretty print + + offset : int, default=0 + The offset in characters to add at the begin of each line. + + printer : callable, default=repr + The function to convert entries to strings, typically + the builtin str or repr + + """ + # Do a multi-line justified repr: + options = np.get_printoptions() + np.set_printoptions(precision=5, threshold=64, edgeitems=2) + params_list = list() + this_line_length = offset + line_sep = ",\n" + (1 + offset // 2) * " " + for i, (k, v) in enumerate(sorted(params.items())): + if isinstance(v, float): + # use str for representing floating point numbers + # this way we get consistent representation across + # architectures and versions. + this_repr = "%s=%s" % (k, str(v)) + else: + # use repr of the rest + this_repr = "%s=%s" % (k, printer(v)) + if len(this_repr) > 500: + this_repr = this_repr[:300] + "..." + this_repr[-100:] + if i > 0: + if this_line_length + len(this_repr) >= 75 or "\n" in this_repr: + params_list.append(line_sep) + this_line_length = len(line_sep) + else: + params_list.append(", ") + this_line_length += 2 + params_list.append(this_repr) + this_line_length += len(this_repr) + + np.set_printoptions(**options) + lines = "".join(params_list) + # Strip trailing space to avoid nightmare in doctests + lines = "\n".join(l.rstrip(" ") for l in lines.split("\n")) + return lines + + +def _build_repr(self): + # XXX This is copied from BaseEstimator's get_params + cls = self.__class__ + init = getattr(cls.__init__, "deprecated_original", cls.__init__) + # Ignore varargs, kw and default values and pop self + init_signature = signature(init) + # Consider the constructor parameters excluding 'self' + if init is object.__init__: + args = [] + else: + args = sorted( + [ + p.name + for p in init_signature.parameters.values() + if p.name != "self" and p.kind != p.VAR_KEYWORD + ] + ) + class_name = self.__class__.__name__ + params = dict() + for key in args: + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. + warnings.simplefilter("always", FutureWarning) + try: + with warnings.catch_warnings(record=True) as w: + value = getattr(self, key, None) + if value is None and hasattr(self, "cvargs"): + value = self.cvargs.get(key, None) + if len(w) and w[0].category is FutureWarning: + # if the parameter is deprecated, don't show it + continue + finally: + warnings.filters.pop(0) + params[key] = value + + return "%s(%s)" % (class_name, _pprint(params, offset=len(class_name))) + + +def _yields_constant_splits(cv): + # Return True if calling cv.split() always returns the same splits + # We assume that if a cv doesn't have a shuffle parameter, it shuffles by + # default (e.g. ShuffleSplit). If it actually doesn't shuffle (e.g. 
+ # LeaveOneOut), then it won't have a random_state parameter anyway, in + # which case it will default to 0, leading to output=True + shuffle = getattr(cv, "shuffle", True) + random_state = getattr(cv, "random_state", 0) + return isinstance(random_state, numbers.Integral) or not shuffle diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..c5a1406e6c2a50e70e366be0fd199795eeb60417 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py @@ -0,0 +1,2530 @@ +""" +The :mod:`sklearn.model_selection._validation` module includes classes and +functions to validate the model. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import time +import warnings +from collections import Counter +from contextlib import suppress +from functools import partial +from numbers import Real +from traceback import format_exc + +import numpy as np +import scipy.sparse as sp +from joblib import logger + +from ..base import clone, is_classifier +from ..exceptions import FitFailedWarning, UnsetMetadataPassedError +from ..metrics import check_scoring, get_scorer_names +from ..metrics._scorer import _MultimetricScorer +from ..preprocessing import LabelEncoder +from ..utils import Bunch, _safe_indexing, check_random_state, indexable +from ..utils._array_api import device, get_namespace +from ..utils._param_validation import ( + HasMethods, + Integral, + Interval, + StrOptions, + validate_params, +) +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import _safe_split +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _check_method_params, _num_samples +from ._split import check_cv + +__all__ = [ + "cross_val_predict", + "cross_val_score", + "cross_validate", + "learning_curve", + "permutation_test_score", + "validation_curve", +] + + +def _check_params_groups_deprecation(fit_params, params, groups, version): + """A helper function to check deprecations on `groups` and `fit_params`. + + # TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not + # possible. + """ + if params is not None and fit_params is not None: + raise ValueError( + "`params` and `fit_params` cannot both be provided. Pass parameters " + "via `params`. `fit_params` is deprecated and will be removed in " + f"version {version}." + ) + elif fit_params is not None: + warnings.warn( + ( + "`fit_params` is deprecated and will be removed in version {version}. " + "Pass parameters via `params` instead." + ), + FutureWarning, + ) + params = fit_params + + params = {} if params is None else params + + _check_groups_routing_disabled(groups) + + return params + + +# TODO(SLEP6): To be removed when set_config(enable_metadata_routing=False) is not +# possible. +def _check_groups_routing_disabled(groups): + if groups is not None and _routing_enabled(): + raise ValueError( + "`groups` can only be passed if metadata routing is not enabled via" + " `sklearn.set_config(enable_metadata_routing=True)`. When routing is" + " enabled, pass `groups` alongside other metadata via the `params` argument" + " instead." 
+ ) + + +@validate_params( + { + "estimator": [HasMethods("fit")], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "scoring": [ + StrOptions(set(get_scorer_names())), + callable, + list, + tuple, + dict, + None, + ], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "params": [dict, None], + "pre_dispatch": [Integral, str], + "return_train_score": ["boolean"], + "return_estimator": ["boolean"], + "return_indices": ["boolean"], + "error_score": [StrOptions({"raise"}), Real], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def cross_validate( + estimator, + X, + y=None, + *, + groups=None, + scoring=None, + cv=None, + n_jobs=None, + verbose=0, + params=None, + pre_dispatch="2*n_jobs", + return_train_score=False, + return_estimator=False, + return_indices=False, + error_score=np.nan, +): + """Evaluate metric(s) by cross-validation and also record fit/score times. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to fit. Can be for example a list, or an array. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_validate(..., params={'groups': groups})``. + + scoring : str, callable, list, tuple, or dict, default=None + Strategy to evaluate the performance of the `estimator` across cross-validation + splits. + + If `scoring` represents a single score, one can use: + + - a single string (see :ref:`scoring_string_names`); + - a callable (see :ref:`scoring_callable`) that returns a single value. + - `None`, the `estimator`'s + :ref:`default evaluation criterion ` is used. + + If `scoring` represents multiple scores, one can use: + + - a list or tuple of unique strings; + - a callable returning a dictionary where the keys are the metric + names and the values are the metric scores; + - a dictionary with metric names as keys and callables a values. + + See :ref:`multimetric_grid_search` for an example. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. 
versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + return_train_score : bool, default=False + Whether to include train scores. + Computing training scores is used to get insights on how different + parameter settings impact the overfitting/underfitting trade-off. + However computing the scores on the training set can be computationally + expensive and is not strictly required to select the parameters that + yield the best generalization performance. + + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + + return_estimator : bool, default=False + Whether to return the estimators fitted on each split. + + .. versionadded:: 0.20 + + return_indices : bool, default=False + Whether to return the train-test indices selected for each split. + + .. versionadded:: 1.3 + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : dict of float arrays of shape (n_splits,) + Array of scores of the estimator for each run of the cross validation. + + A dict of arrays containing the score/time arrays for each scorer is + returned. The possible keys for this ``dict`` are: + + ``test_score`` + The score array for test scores on each cv split. + Suffix ``_score`` in ``test_score`` changes to a specific + metric like ``test_r2`` or ``test_auc`` if there are + multiple scoring metrics in the scoring parameter. + ``train_score`` + The score array for train scores on each cv split. + Suffix ``_score`` in ``train_score`` changes to a specific + metric like ``train_r2`` or ``train_auc`` if there are + multiple scoring metrics in the scoring parameter. + This is available only if ``return_train_score`` parameter + is ``True``. + ``fit_time`` + The time for fitting the estimator on the train + set for each cv split. + ``score_time`` + The time for scoring the estimator on the test set for each + cv split. (Note: time for scoring on the train set is not + included even if ``return_train_score`` is set to ``True``). + ``estimator`` + The estimator objects for each cv split. + This is available only if ``return_estimator`` parameter + is set to ``True``. + ``indices`` + The train/test positional indices for each cv split. 
A dictionary + is returned where the keys are either `"train"` or `"test"` + and the associated values are a list of integer-dtyped NumPy + arrays with the indices. Available only if `return_indices=True`. + + See Also + -------- + cross_val_score : Run cross-validation for single metric evaluation. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_validate + >>> from sklearn.metrics import make_scorer + >>> from sklearn.metrics import confusion_matrix + >>> from sklearn.svm import LinearSVC + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + + Single metric evaluation using ``cross_validate`` + + >>> cv_results = cross_validate(lasso, X, y, cv=3) + >>> sorted(cv_results.keys()) + ['fit_time', 'score_time', 'test_score'] + >>> cv_results['test_score'] + array([0.3315057 , 0.08022103, 0.03531816]) + + Multiple metric evaluation using ``cross_validate`` + (please refer the ``scoring`` parameter doc for more information) + + >>> scores = cross_validate(lasso, X, y, cv=3, + ... scoring=('r2', 'neg_mean_squared_error'), + ... return_train_score=True) + >>> print(scores['test_neg_mean_squared_error']) + [-3635.5 -3573.3 -6114.7] + >>> print(scores['train_r2']) + [0.28009951 0.3908844 0.22784907] + """ + _check_groups_routing_disabled(groups) + + X, y = indexable(X, y) + params = {} if params is None else params + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorers = check_scoring( + estimator, scoring=scoring, raise_exc=(error_score == "raise") + ) + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_validate") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + scorer=scorers, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=str(e).replace("cross_validate.fit", "cross_validate"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + routed_params.scorer = Bunch(score={}) + + indices = cv.split(X, y, **routed_params.splitter.split) + if return_indices: + # materialize the indices since we need to store them in the returned dict + indices = list(indices) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
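+    # Each (train, test) pair is dispatched to ``_fit_and_score`` on a fresh
+    # clone; the per-split result dicts are then merged by
+    # ``_aggregate_score_dicts`` and exposed as arrays keyed ``fit_time``,
+    # ``score_time`` and ``test_<metric>`` (plus ``train_<metric>`` when
+    # ``return_train_score=True``).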
+ parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorers, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=return_train_score, + return_times=True, + return_estimator=return_estimator, + error_score=error_score, + ) + for train, test in indices + ) + + _warn_or_raise_about_fit_failures(results, error_score) + + # For callable scoring, the return type is only know after calling. If the + # return type is a dictionary, the error scores can now be inserted with + # the correct key. + if callable(scoring): + _insert_error_scores(results, error_score) + + results = _aggregate_score_dicts(results) + + ret = {} + ret["fit_time"] = results["fit_time"] + ret["score_time"] = results["score_time"] + + if return_estimator: + ret["estimator"] = results["estimator"] + + if return_indices: + ret["indices"] = {} + ret["indices"]["train"], ret["indices"]["test"] = zip(*indices) + + test_scores_dict = _normalize_score_results(results["test_scores"]) + if return_train_score: + train_scores_dict = _normalize_score_results(results["train_scores"]) + + for name in test_scores_dict: + ret["test_%s" % name] = test_scores_dict[name] + if return_train_score: + key = "train_%s" % name + ret[key] = train_scores_dict[name] + + return ret + + +def _insert_error_scores(results, error_score): + """Insert error in `results` by replacing them inplace with `error_score`. + + This only applies to multimetric scores because `_fit_and_score` will + handle the single metric case. + """ + successful_score = None + failed_indices = [] + for i, result in enumerate(results): + if result["fit_error"] is not None: + failed_indices.append(i) + elif successful_score is None: + successful_score = result["test_scores"] + + if isinstance(successful_score, dict): + formatted_error = {name: error_score for name in successful_score} + for i in failed_indices: + results[i]["test_scores"] = formatted_error.copy() + if "train_scores" in results[i]: + results[i]["train_scores"] = formatted_error.copy() + + +def _normalize_score_results(scores, scaler_score_key="score"): + """Creates a scoring dictionary based on the type of `scores`""" + if isinstance(scores[0], dict): + # multimetric scoring + return _aggregate_score_dicts(scores) + # scaler + return {scaler_score_key: scores} + + +def _warn_or_raise_about_fit_failures(results, error_score): + fit_errors = [ + result["fit_error"] for result in results if result["fit_error"] is not None + ] + if fit_errors: + num_failed_fits = len(fit_errors) + num_fits = len(results) + fit_errors_counter = Counter(fit_errors) + delimiter = "-" * 80 + "\n" + fit_errors_summary = "\n".join( + f"{delimiter}{n} fits failed with the following error:\n{error}" + for error, n in fit_errors_counter.items() + ) + + if num_failed_fits == num_fits: + all_fits_failed_message = ( + f"\nAll the {num_fits} fits failed.\n" + "It is very likely that your model is misconfigured.\n" + "You can try to debug the error by setting error_score='raise'.\n\n" + f"Below are more details about the failures:\n{fit_errors_summary}" + ) + raise ValueError(all_fits_failed_message) + + else: + some_fits_failed_message = ( + f"\n{num_failed_fits} fits failed out of a total of {num_fits}.\n" + "The score on these train-test partitions for these parameters" + f" will be set to {error_score}.\n" + "If these 
failures are not expected, you can try to debug them " + "by setting error_score='raise'.\n\n" + f"Below are more details about the failures:\n{fit_errors_summary}" + ) + warnings.warn(some_fits_failed_message, FitFailedWarning) + + +@validate_params( + { + "estimator": [HasMethods("fit")], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "params": [dict, None], + "pre_dispatch": [Integral, str, None], + "error_score": [StrOptions({"raise"}), Real], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def cross_val_score( + estimator, + X, + y=None, + *, + groups=None, + scoring=None, + cv=None, + n_jobs=None, + verbose=0, + params=None, + pre_dispatch="2*n_jobs", + error_score=np.nan, +): + """Evaluate a score by cross-validation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to fit. Can be for example a list, or an array. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_score(..., params={'groups': groups})``. + + scoring : str or callable, default=None + Strategy to evaluate the performance of the `estimator` across cross-validation + splits. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``, which should return only a single value. + See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + Similar to the use of `scoring` in :func:`cross_validate` but only a + single metric is permitted. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - `None`, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For `int`/`None` inputs, if the estimator is a classifier and `y` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + `cv` default value if `None` changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. 
Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit``, the scorer, + and the CV splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - ``None``, in which case all the jobs are immediately created and spawned. Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + Returns + ------- + scores : ndarray of float of shape=(len(list(cv)),) + Array of scores of the estimator for each run of the cross validation. + + See Also + -------- + cross_validate : To run cross-validation on multiple metrics and also to + return train scores, fit times and score times. + + cross_val_predict : Get predictions from each split of cross-validation for + diagnostic purposes. + + sklearn.metrics.make_scorer : Make a scorer from a performance metric or + loss function. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_val_score + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + >>> print(cross_val_score(lasso, X, y, cv=3)) + [0.3315057 0.08022103 0.03531816] + """ + # To ensure multimetric format is not supported + scorer = check_scoring(estimator, scoring=scoring) + + cv_results = cross_validate( + estimator=estimator, + X=X, + y=y, + groups=groups, + scoring={"score": scorer}, + cv=cv, + n_jobs=n_jobs, + verbose=verbose, + params=params, + pre_dispatch=pre_dispatch, + error_score=error_score, + ) + return cv_results["test_score"] + + +def _fit_and_score( + estimator, + X, + y, + *, + scorer, + train, + test, + verbose, + parameters, + fit_params, + score_params, + return_train_score=False, + return_parameters=False, + return_n_test_samples=False, + return_times=False, + return_estimator=False, + split_progress=None, + candidate_progress=None, + error_score=np.nan, +): + """Fit estimator and compute scores for a given dataset split. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like of shape (n_samples, n_features) + The data to fit. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + scorer : A single callable or dict mapping scorer name to the callable + If it is a single callable, the return value for ``train_scores`` and + ``test_scores`` is a single float. 
+ + For a dict, it should be one mapping the scorer name to the scorer + callable object / function. + + The callable object / fn should have signature + ``scorer(estimator, X, y)``. + + train : array-like of shape (n_train_samples,) + Indices of training samples. + + test : array-like of shape (n_test_samples,) + Indices of test samples. + + verbose : int + The verbosity level. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + parameters : dict or None + Parameters to be set on the estimator. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + score_params : dict or None + Parameters that will be passed to the scorer. + + return_train_score : bool, default=False + Compute and return score on training set. + + return_parameters : bool, default=False + Return parameters that has been used for the estimator. + + split_progress : {list, tuple} of int, default=None + A list or tuple of format (, ). + + candidate_progress : {list, tuple} of int, default=None + A list or tuple of format + (, ). + + return_n_test_samples : bool, default=False + Whether to return the ``n_test_samples``. + + return_times : bool, default=False + Whether to return the fit/score times. + + return_estimator : bool, default=False + Whether to return the fitted estimator. + + Returns + ------- + result : dict with the following attributes + train_scores : dict of scorer name -> float + Score on training set (for all the scorers), + returned only if `return_train_score` is `True`. + test_scores : dict of scorer name -> float + Score on testing set (for all the scorers). + n_test_samples : int + Number of test samples. + fit_time : float + Time spent for fitting in seconds. + score_time : float + Time spent for scoring in seconds. + parameters : dict or None + The parameters that have been evaluated. + estimator : estimator object + The fitted estimator. + fit_error : str or None + Traceback str if the fit failed, None if the fit succeeded. + """ + xp, _ = get_namespace(X) + X_device = device(X) + + # Make sure that we can fancy index X even if train and test are provided + # as NumPy arrays by NumPy only cross-validation splitters. + train, test = xp.asarray(train, device=X_device), xp.asarray(test, device=X_device) + + if not isinstance(error_score, numbers.Number) and error_score != "raise": + raise ValueError( + "error_score must be the string 'raise' or a numeric value. 
" + "(Hint: if using 'raise', please make sure that it has been " + "spelled correctly.)" + ) + + progress_msg = "" + if verbose > 2: + if split_progress is not None: + progress_msg = f" {split_progress[0] + 1}/{split_progress[1]}" + if candidate_progress and verbose > 9: + progress_msg += f"; {candidate_progress[0] + 1}/{candidate_progress[1]}" + + if verbose > 1: + if parameters is None: + params_msg = "" + else: + sorted_keys = sorted(parameters) # Ensure deterministic o/p + params_msg = ", ".join(f"{k}={parameters[k]}" for k in sorted_keys) + if verbose > 9: + start_msg = f"[CV{progress_msg}] START {params_msg}" + print(f"{start_msg}{(80 - len(start_msg)) * '.'}") + + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + if parameters is not None: + # here we clone the parameters, since sometimes the parameters + # themselves might be estimators, e.g. when we search over different + # estimators in a pipeline. + # ref: https://github.com/scikit-learn/scikit-learn/pull/26786 + estimator = estimator.set_params(**clone(parameters, safe=False)) + + start_time = time.time() + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + + result = {} + try: + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + + except Exception: + # Note fit time as time until error + fit_time = time.time() - start_time + score_time = 0.0 + if error_score == "raise": + raise + elif isinstance(error_score, numbers.Number): + if isinstance(scorer, _MultimetricScorer): + test_scores = {name: error_score for name in scorer._scorers} + if return_train_score: + train_scores = test_scores.copy() + else: + test_scores = error_score + if return_train_score: + train_scores = error_score + result["fit_error"] = format_exc() + else: + result["fit_error"] = None + + fit_time = time.time() - start_time + test_scores = _score( + estimator, X_test, y_test, scorer, score_params_test, error_score + ) + score_time = time.time() - start_time - fit_time + if return_train_score: + train_scores = _score( + estimator, X_train, y_train, scorer, score_params_train, error_score + ) + + if verbose > 1: + total_time = score_time + fit_time + end_msg = f"[CV{progress_msg}] END " + result_msg = params_msg + (";" if params_msg else "") + if verbose > 2: + if isinstance(test_scores, dict): + for scorer_name in sorted(test_scores): + result_msg += f" {scorer_name}: (" + if return_train_score: + scorer_scores = train_scores[scorer_name] + result_msg += f"train={scorer_scores:.3f}, " + result_msg += f"test={test_scores[scorer_name]:.3f})" + else: + result_msg += ", score=" + if return_train_score: + result_msg += f"(train={train_scores:.3f}, test={test_scores:.3f})" + else: + result_msg += f"{test_scores:.3f}" + result_msg += f" total time={logger.short_format_time(total_time)}" + + # Right align the result_msg + end_msg += "." 
* (80 - len(end_msg) - len(result_msg)) + end_msg += result_msg + print(end_msg) + + result["test_scores"] = test_scores + if return_train_score: + result["train_scores"] = train_scores + if return_n_test_samples: + result["n_test_samples"] = _num_samples(X_test) + if return_times: + result["fit_time"] = fit_time + result["score_time"] = score_time + if return_parameters: + result["parameters"] = parameters + if return_estimator: + result["estimator"] = estimator + return result + + +def _score(estimator, X_test, y_test, scorer, score_params, error_score="raise"): + """Compute the score(s) of an estimator on a given test set. + + Will return a dict of floats if `scorer` is a _MultiMetricScorer, otherwise a single + float is returned. + """ + score_params = {} if score_params is None else score_params + + try: + if y_test is None: + scores = scorer(estimator, X_test, **score_params) + else: + scores = scorer(estimator, X_test, y_test, **score_params) + except Exception: + if isinstance(scorer, _MultimetricScorer): + # If `_MultimetricScorer` raises exception, the `error_score` + # parameter is equal to "raise". + raise + else: + if error_score == "raise": + raise + else: + scores = error_score + warnings.warn( + ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}. Details: \n" + f"{format_exc()}" + ), + UserWarning, + ) + + # Check non-raised error messages in `_MultimetricScorer` + if isinstance(scorer, _MultimetricScorer): + exception_messages = [ + (name, str_e) for name, str_e in scores.items() if isinstance(str_e, str) + ] + if exception_messages: + # error_score != "raise" + for name, str_e in exception_messages: + scores[name] = error_score + warnings.warn( + ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}. Details: \n" + f"{str_e}" + ), + UserWarning, + ) + + error_msg = "scoring must return a number, got %s (%s) instead. (scorer=%s)" + if isinstance(scores, dict): + for name, score in scores.items(): + if hasattr(score, "item"): + with suppress(ValueError): + # e.g. unwrap memmapped scalars + score = score.item() + if not isinstance(score, numbers.Number): + raise ValueError(error_msg % (score, type(score), name)) + scores[name] = score + else: # scalar + if hasattr(scores, "item"): + with suppress(ValueError): + # e.g. unwrap memmapped scalars + scores = scores.item() + if not isinstance(scores, numbers.Number): + raise ValueError(error_msg % (scores, type(scores), scorer)) + return scores + + +@validate_params( + { + "estimator": [HasMethods(["fit", "predict"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", "sparse matrix", None], + "groups": ["array-like", None], + "cv": ["cv_object"], + "n_jobs": [Integral, None], + "verbose": ["verbose"], + "params": [dict, None], + "pre_dispatch": [Integral, str, None], + "method": [ + StrOptions( + { + "predict", + "predict_proba", + "predict_log_proba", + "decision_function", + } + ) + ], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def cross_val_predict( + estimator, + X, + y=None, + *, + groups=None, + cv=None, + n_jobs=None, + verbose=0, + params=None, + pre_dispatch="2*n_jobs", + method="predict", +): + """Generate cross-validated estimates for each input data point. + + The data is split according to the cv parameter. 
Each sample belongs + to exactly one test set, and its prediction is computed with an + estimator fitted on the corresponding training set. + + Passing these predictions into an evaluation metric may not be a valid + way to measure generalization performance. Results can differ from + :func:`cross_validate` and :func:`cross_val_score` unless all tests sets + have equal size and the metric decomposes over samples. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator + The estimator instance to use to fit the data. It must implement a `fit` + method and the method given by the `method` parameter. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to fit. Can be, for example a list, or an array at least 2d. + + y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.4 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``cross_val_predict(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable that generates (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + predicting are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + verbose : int, default=0 + The verbosity level. + + params : dict, default=None + Parameters to pass to the underlying estimator's ``fit`` and the CV + splitter. + + .. versionadded:: 1.4 + + pre_dispatch : int or str, default='2*n_jobs' + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately created and spawned. 
Use + this for lightweight and fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + - An int, giving the exact number of total jobs that are spawned + - A str, giving an expression as a function of n_jobs, as in '2*n_jobs' + + method : {'predict', 'predict_proba', 'predict_log_proba', \ + 'decision_function'}, default='predict' + The method to be invoked by `estimator`. + + Returns + ------- + predictions : ndarray + This is the result of calling `method`. Shape: + + - When `method` is 'predict' and in special case where `method` is + 'decision_function' and the target is binary: (n_samples,) + - When `method` is one of {'predict_proba', 'predict_log_proba', + 'decision_function'} (unless special case above): + (n_samples, n_classes) + - If `estimator` is :term:`multioutput`, an extra dimension + 'n_outputs' is added to the end of each shape above. + + See Also + -------- + cross_val_score : Calculate score for each CV split. + cross_validate : Calculate one or more scores and timings for each CV + split. + + Notes + ----- + In the case that one or more classes are absent in a training portion, a + default score needs to be assigned to all instances for that class if + ``method`` produces columns per class, as in {'decision_function', + 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is + 0. In order to ensure finite output, we approximate negative infinity by + the minimum finite float value for the dtype in other cases. + + Examples + -------- + >>> from sklearn import datasets, linear_model + >>> from sklearn.model_selection import cross_val_predict + >>> diabetes = datasets.load_diabetes() + >>> X = diabetes.data[:150] + >>> y = diabetes.target[:150] + >>> lasso = linear_model.Lasso() + >>> y_pred = cross_val_predict(lasso, X, y, cv=3) + + For a detailed example of using ``cross_val_predict`` to visualize + prediction errors, please see + :ref:`sphx_glr_auto_examples_model_selection_plot_cv_predict.py`. + """ + _check_groups_routing_disabled(groups) + X, y = indexable(X, y) + params = {} if params is None else params + + if _routing_enabled(): + # For estimators, a MetadataRouter is created in get_metadata_routing + # methods. For these router methods, we create the router to use + # `process_routing` on it. + router = ( + MetadataRouter(owner="cross_val_predict") + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata for the predict method. + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. 
+ raise UnsetMetadataPassedError( + message=str(e).replace("cross_val_predict.fit", "cross_val_predict"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + else: + routed_params = Bunch() + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.estimator = Bunch(fit=params) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + splits = list(cv.split(X, y, **routed_params.splitter.split)) + + test_indices = np.concatenate([test for _, test in splits]) + if not _check_is_permutation(test_indices, _num_samples(X)): + raise ValueError("cross_val_predict only works for partitions") + + # If classification methods produce multiple columns of output, + # we need to manually encode classes to ensure consistent column ordering. + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + if encode: + y = np.asarray(y) + if y.ndim == 1: + le = LabelEncoder() + y = le.fit_transform(y) + elif y.ndim == 2: + y_enc = np.zeros_like(y, dtype=int) + for i_label in range(y.shape[1]): + y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label]) + y = y_enc + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. + parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) + predictions = parallel( + delayed(_fit_and_predict)( + clone(estimator), + X, + y, + train, + test, + routed_params.estimator.fit, + method, + ) + for train, test in splits + ) + + inv_test_indices = np.empty(len(test_indices), dtype=int) + inv_test_indices[test_indices] = np.arange(len(test_indices)) + + if sp.issparse(predictions[0]): + predictions = sp.vstack(predictions, format=predictions[0].format) + elif encode and isinstance(predictions[0], list): + # `predictions` is a list of method outputs from each fold. + # If each of those is also a list, then treat this as a + # multioutput-multiclass task. We need to separately concatenate + # the method outputs for each label into an `n_labels` long list. + n_labels = y.shape[1] + concat_pred = [] + for i_label in range(n_labels): + label_preds = np.concatenate([p[i_label] for p in predictions]) + concat_pred.append(label_preds) + predictions = concat_pred + else: + predictions = np.concatenate(predictions) + + if isinstance(predictions, list): + return [p[inv_test_indices] for p in predictions] + else: + return predictions[inv_test_indices] + + +def _fit_and_predict(estimator, X, y, train, test, fit_params, method): + """Fit estimator and predict values for a given dataset split. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' and 'predict' + The object to use to fit the data. + + X : array-like of shape (n_samples, n_features) + The data to fit. + + .. versionchanged:: 0.20 + X is only required to be an object with finite length or shape now + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + train : array-like of shape (n_train_samples,) + Indices of training samples. + + test : array-like of shape (n_test_samples,) + Indices of test samples. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + method : str + Invokes the passed method name of the passed estimator. 
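+        For instance, ``method='predict_proba'`` makes this helper fit the
+        estimator on the training indices and then call
+        ``estimator.predict_proba`` on the test portion of ``X``.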
+ + Returns + ------- + predictions : sequence + Result of calling 'estimator.method' + """ + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + fit_params = _check_method_params(X, params=fit_params, indices=train) + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, _ = _safe_split(estimator, X, y, test, train) + + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + func = getattr(estimator, method) + predictions = func(X_test) + + encode = ( + method in ["decision_function", "predict_proba", "predict_log_proba"] + and y is not None + ) + + if encode: + if isinstance(predictions, list): + predictions = [ + _enforce_prediction_order( + estimator.classes_[i_label], + predictions[i_label], + n_classes=len(set(y[:, i_label])), + method=method, + ) + for i_label in range(len(predictions)) + ] + else: + # A 2D y array should be a binary label indicator matrix + n_classes = len(set(y)) if y.ndim == 1 else y.shape[1] + predictions = _enforce_prediction_order( + estimator.classes_, predictions, n_classes, method + ) + return predictions + + +def _enforce_prediction_order(classes, predictions, n_classes, method): + """Ensure that prediction arrays have correct column order + + When doing cross-validation, if one or more classes are + not present in the subset of data used for training, + then the output prediction array might not have the same + columns as other folds. Use the list of class names + (assumed to be ints) to enforce the correct column order. + + Note that `classes` is the list of classes in this fold + (a subset of the classes in the full training set) + and `n_classes` is the number of classes in the full training set. + """ + if n_classes != len(classes): + recommendation = ( + "To fix this, use a cross-validation " + "technique resulting in properly " + "stratified folds" + ) + warnings.warn( + "Number of classes in training fold ({}) does " + "not match total number of classes ({}). " + "Results may not be appropriate for your use case. " + "{}".format(len(classes), n_classes, recommendation), + RuntimeWarning, + ) + if method == "decision_function": + if predictions.ndim == 2 and predictions.shape[1] != len(classes): + # This handles the case when the shape of predictions + # does not match the number of classes used to train + # it with. This case is found when sklearn.svm.SVC is + # set to `decision_function_shape='ovo'`. + raise ValueError( + "Output shape {} of {} does not match " + "number of classes ({}) in fold. " + "Irregular decision_function outputs " + "are not currently supported by " + "cross_val_predict".format(predictions.shape, method, len(classes)) + ) + if len(classes) <= 2: + # In this special case, `predictions` contains a 1D array. + raise ValueError( + "Only {} class/es in training fold, but {} " + "in overall dataset. This " + "is not supported for decision_function " + "with imbalanced folds. 
{}".format( + len(classes), n_classes, recommendation + ) + ) + + float_min = np.finfo(predictions.dtype).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + predictions_for_all_classes = np.full( + (_num_samples(predictions), n_classes), + default_values[method], + dtype=predictions.dtype, + ) + predictions_for_all_classes[:, classes] = predictions + predictions = predictions_for_all_classes + return predictions + + +def _check_is_permutation(indices, n_samples): + """Check whether indices is a reordering of the array np.arange(n_samples) + + Parameters + ---------- + indices : ndarray + int array to test + n_samples : int + number of expected elements + + Returns + ------- + is_partition : bool + True iff sorted(indices) is np.arange(n) + """ + if len(indices) != n_samples: + return False + hit = np.zeros(n_samples, dtype=bool) + hit[indices] = True + if not np.all(hit): + return False + return True + + +@validate_params( + { + "estimator": [HasMethods("fit")], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "cv": ["cv_object"], + "n_permutations": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "verbose": ["verbose"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def permutation_test_score( + estimator, + X, + y, + *, + groups=None, + cv=None, + n_permutations=100, + n_jobs=None, + random_state=0, + verbose=0, + scoring=None, + fit_params=None, + params=None, +): + """Evaluate the significance of a cross-validated score with permutations. + + Permutes targets to generate 'randomized data' and compute the empirical + p-value against the null hypothesis that features and targets are + independent. + + The p-value represents the fraction of randomized data sets where the + estimator performed as well or better than on the original data. A small + p-value suggests that there is a real dependency between features and + targets which has been used by the estimator to give good predictions. + A large p-value may be due to lack of real dependency between features + and targets or the estimator was not able to use the dependency to + give good predictions. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like of shape at least 2D + The data to fit. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + The target variable to try to predict in the case of + supervised learning. + + groups : array-like of shape (n_samples,), default=None + Labels to constrain permutation within groups, i.e. ``y`` values + are permuted among samples with the same group identifier. + When not specified, ``y`` values are permuted among all samples. + + When a grouped cross-validator is used, the group labels are + also passed on to the ``split`` method of the cross-validator. The + cross-validator uses them for grouping the samples while splitting + the dataset into train/test set. + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. 
When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``permutation_test_score(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - `None`, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For `int`/`None` inputs, if the estimator is a classifier and `y` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + `cv` default value if `None` changed from 3-fold to 5-fold. + + n_permutations : int, default=100 + Number of times to permute ``y``. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the cross-validated score are parallelized over the permutations. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + random_state : int, RandomState instance or None, default=0 + Pass an int for reproducible output for permutation of + ``y`` values among samples. See :term:`Glossary `. + + verbose : int, default=0 + The verbosity level. + + scoring : str or callable, default=None + Scoring method to use to evaluate the predictions on the validation set. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``, which should return only a single value. + See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.6. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the `fit` method of the estimator, the scorer + and the cv splitter. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator, `cv` object and `scorer`. See :ref:`Metadata Routing + User Guide ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + score : float + The true score without permuting targets. + + permutation_scores : array of shape (n_permutations,) + The scores obtained for each permutations. + + pvalue : float + The p-value, which approximates the probability that the score would + be obtained by chance. This is calculated as: + + `(C + 1) / (n_permutations + 1)` + + Where C is the number of permutations whose score >= the true score. + + The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. + + Notes + ----- + This function implements Test 1 in: + + Ojala and Garriga. `Permutation Tests for Studying Classifier Performance + `_. The + Journal of Machine Learning Research (2010) vol. 
11 + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import permutation_test_score + >>> X, y = make_classification(random_state=0) + >>> estimator = LogisticRegression() + >>> score, permutation_scores, pvalue = permutation_test_score( + ... estimator, X, y, random_state=0 + ... ) + >>> print(f"Original Score: {score:.3f}") + Original Score: 0.810 + >>> print( + ... f"Permutation Scores: {permutation_scores.mean():.3f} +/- " + ... f"{permutation_scores.std():.3f}" + ... ) + Permutation Scores: 0.505 +/- 0.057 + >>> print(f"P-value: {pvalue:.3f}") + P-value: 0.010 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + random_state = check_random_state(random_state) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="permutation_test_score") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=str(e).replace( + "permutation_test_score.fit", "permutation_test_score" + ), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. 
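+    # Sketch of the computation below: the cross-validated score on the
+    # original `y` is compared with the scores obtained on `n_permutations`
+    # shuffled copies of `y`, and the p-value is
+    #
+    #     (np.sum(permutation_scores >= score) + 1) / (n_permutations + 1)
+    #
+    # i.e. the formula documented in the docstring above.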
+ score = _permutation_test_score( + clone(estimator), + X, + y, + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(_permutation_test_score)( + clone(estimator), + X, + _shuffle(y, groups, random_state), + cv, + scorer, + split_params=routed_params.splitter.split, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + ) + for _ in range(n_permutations) + ) + permutation_scores = np.array(permutation_scores) + pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) + return score, permutation_scores, pvalue + + +def _permutation_test_score( + estimator, X, y, cv, scorer, split_params, fit_params, score_params +): + """Auxiliary function for permutation_test_score""" + # Adjust length of sample weights + fit_params = fit_params if fit_params is not None else {} + score_params = score_params if score_params is not None else {} + + avg_score = [] + for train, test in cv.split(X, y, **split_params): + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + fit_params_train = _check_method_params(X, params=fit_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + estimator.fit(X_train, y_train, **fit_params_train) + avg_score.append(scorer(estimator, X_test, y_test, **score_params_test)) + return np.mean(avg_score) + + +def _shuffle(y, groups, random_state): + """Return a shuffled copy of y eventually shuffle among same groups.""" + if groups is None: + indices = random_state.permutation(len(y)) + else: + indices = np.arange(len(groups)) + for group in np.unique(groups): + this_mask = groups == group + indices[this_mask] = random_state.permutation(indices[this_mask]) + return _safe_indexing(y, indices) + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "train_sizes": ["array-like"], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "exploit_incremental_learning": ["boolean"], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "error_score": [StrOptions({"raise"}), Real], + "return_times": ["boolean"], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def learning_curve( + estimator, + X, + y, + *, + groups=None, + train_sizes=np.linspace(0.1, 1.0, 5), + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=None, + pre_dispatch="all", + verbose=0, + shuffle=False, + random_state=None, + error_score=np.nan, + return_times=False, + fit_params=None, + params=None, +): + """Learning curve. + + Determines cross-validated training and test scores for different training + set sizes. + + A cross-validation generator splits the whole dataset k times in training + and test data. Subsets of the training set with varying sizes will be used + to train the estimator and a score for each training subset size and the + test set will be computed. Afterwards, the scores will be averaged over + all k runs for each training subset size. + + Read more in the :ref:`User Guide `. 
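+
+    For instance, with the default ``train_sizes=np.linspace(0.1, 1.0, 5)``
+    and 5-fold cross-validation on 100 samples, each training fold holds 80
+    samples, so estimators are fitted on 8, 26, 44, 62 and 80 samples within
+    each fold before being scored on the corresponding test fold.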
+ + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``learning_curve(..., params={'groups': groups})``. + + train_sizes : array-like of shape (n_ticks,), \ + default=np.linspace(0.1, 1.0, 5) + Relative or absolute numbers of training examples that will be used to + generate the learning curve. If the dtype is float, it is regarded as a + fraction of the maximum size of the training set (that is determined + by the selected validation method), i.e. it has to be within (0, 1]. + Otherwise it is interpreted as absolute sizes of the training sets. + Note that for classification the number of samples usually has to + be big enough to contain at least one sample from each class. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + Scoring method to use to evaluate the training and test sets. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion ` is used. + + exploit_incremental_learning : bool, default=False + If the estimator supports incremental learning, this will be + used to speed up fitting for different training set sizes. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the different training and test sets. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. 
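+        For example, 5-fold cross-validation combined with five training-set
+        sizes gives 25 fit/score tasks for the default (non-incremental) path
+        to distribute across workers.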
+ + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + shuffle : bool, default=False + Whether to shuffle training data before taking prefixes of it + based on``train_sizes``. + + random_state : int, RandomState instance or None, default=None + Used when ``shuffle`` is True. Pass an int for reproducible + output across multiple function calls. + See :term:`Glossary `. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + return_times : bool, default=False + Whether to return the fit and score times. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.8. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the `fit` method of the estimator and to the scorer. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator. See :ref:`Metadata Routing User Guide + ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + train_sizes_abs : array of shape (n_unique_ticks,) + Numbers of training examples that has been used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. + + train_scores : array of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array of shape (n_ticks, n_cv_folds) + Scores on test set. + + fit_times : array of shape (n_ticks, n_cv_folds) + Times spent for fitting in seconds. Only present if ``return_times`` + is True. + + score_times : array of shape (n_ticks, n_cv_folds) + Times spent for scoring in seconds. Only present if ``return_times`` + is True. + + See Also + -------- + LearningCurveDisplay.from_estimator : Plot a learning curve using an + estimator and data. + + Examples + -------- + >>> from sklearn.datasets import make_classification + >>> from sklearn.tree import DecisionTreeClassifier + >>> from sklearn.model_selection import learning_curve + >>> X, y = make_classification(n_samples=100, n_features=10, random_state=42) + >>> tree = DecisionTreeClassifier(max_depth=4, random_state=42) + >>> train_size_abs, train_scores, test_scores = learning_curve( + ... tree, X, y, train_sizes=[0.3, 0.6, 0.9] + ... ) + >>> for train_size, cv_train_scores, cv_test_scores in zip( + ... train_size_abs, train_scores, test_scores + ... ): + ... print(f"{train_size} samples were used to train the model") + ... print(f"The average train accuracy is {cv_train_scores.mean():.2f}") + ... 
print(f"The average test accuracy is {cv_test_scores.mean():.2f}") + 24 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.85 + 48 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.90 + 72 samples were used to train the model + The average train accuracy is 1.00 + The average test accuracy is 0.93 + """ + if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): + raise ValueError( + "An estimator must support the partial_fit interface " + "to exploit incremental learning" + ) + + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="learning_curve") + .add( + estimator=estimator, + # TODO(SLEP6): also pass metadata to the predict method for + # scoring? + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="fit", callee="partial_fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. + raise UnsetMetadataPassedError( + message=str(e).replace("learning_curve.fit", "learning_curve"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params, partial_fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + # Store cv as list as we will be iterating over the list multiple times + cv_iter = list(cv.split(X, y, **routed_params.splitter.split)) + + n_max_training_samples = len(cv_iter[0][0]) + # Because the lengths of folds can be significantly different, it is + # not guaranteed that we use all of the available training data when we + # use the first 'n_max_training_samples' samples. 
+ train_sizes_abs = _translate_train_sizes(train_sizes, n_max_training_samples) + n_unique_ticks = train_sizes_abs.shape[0] + if verbose > 0: + print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) + + if shuffle: + rng = check_random_state(random_state) + cv_iter = ((rng.permutation(train), test) for train, test in cv_iter) + + if exploit_incremental_learning: + classes = np.unique(y) if is_classifier(estimator) else None + out = parallel( + delayed(_incremental_fit_estimator)( + clone(estimator), + X, + y, + classes, + train, + test, + train_sizes_abs, + scorer, + return_times, + error_score=error_score, + fit_params=routed_params.estimator.partial_fit, + score_params=routed_params.scorer.score, + ) + for train, test in cv_iter + ) + out = np.asarray(out).transpose((2, 1, 0)) + else: + train_test_proportions = [] + for train, test in cv_iter: + for n_train_samples in train_sizes_abs: + train_test_proportions.append((train[:n_train_samples], test)) + + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=True, + error_score=error_score, + return_times=return_times, + ) + for train, test in train_test_proportions + ) + _warn_or_raise_about_fit_failures(results, error_score) + results = _aggregate_score_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T + test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T + out = [train_scores, test_scores] + + if return_times: + fit_times = results["fit_time"].reshape(-1, n_unique_ticks).T + score_times = results["score_time"].reshape(-1, n_unique_ticks).T + out.extend([fit_times, score_times]) + + ret = train_sizes_abs, out[0], out[1] + + if return_times: + ret = ret + (out[2], out[3]) + + return ret + + +def _translate_train_sizes(train_sizes, n_max_training_samples): + """Determine absolute sizes of training subsets and validate 'train_sizes'. + + Examples: + _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] + _translate_train_sizes([5, 10], 10) -> [5, 10] + + Parameters + ---------- + train_sizes : array-like of shape (n_ticks,) + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. + + n_max_training_samples : int + Maximum number of training samples (upper bound of 'train_sizes'). + + Returns + ------- + train_sizes_abs : array of shape (n_unique_ticks,) + Numbers of training examples that will be used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. + """ + train_sizes_abs = np.asarray(train_sizes) + n_ticks = train_sizes_abs.shape[0] + n_min_required_samples = np.min(train_sizes_abs) + n_max_required_samples = np.max(train_sizes_abs) + if np.issubdtype(train_sizes_abs.dtype, np.floating): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError( + "train_sizes has been interpreted as fractions " + "of the maximum number of training samples and " + "must be within (0, 1], but is within [%f, %f]." 
+ % (n_min_required_samples, n_max_required_samples) + ) + train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype( + dtype=int, copy=False + ) + train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) + else: + if ( + n_min_required_samples <= 0 + or n_max_required_samples > n_max_training_samples + ): + raise ValueError( + "train_sizes has been interpreted as absolute " + "numbers of training samples and must be within " + "(0, %d], but is within [%d, %d]." + % ( + n_max_training_samples, + n_min_required_samples, + n_max_required_samples, + ) + ) + + train_sizes_abs = np.unique(train_sizes_abs) + if n_ticks > train_sizes_abs.shape[0]: + warnings.warn( + "Removed duplicate entries from 'train_sizes'. Number " + "of ticks will be less than the size of " + "'train_sizes': %d instead of %d." % (train_sizes_abs.shape[0], n_ticks), + RuntimeWarning, + ) + + return train_sizes_abs + + +def _incremental_fit_estimator( + estimator, + X, + y, + classes, + train, + test, + train_sizes, + scorer, + return_times, + error_score, + fit_params, + score_params, +): + """Train estimator on training subsets incrementally and compute scores.""" + train_scores, test_scores, fit_times, score_times = [], [], [], [] + partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) + if fit_params is None: + fit_params = {} + if classes is None: + partial_fit_func = partial(estimator.partial_fit, **fit_params) + else: + partial_fit_func = partial(estimator.partial_fit, classes=classes, **fit_params) + score_params = score_params if score_params is not None else {} + score_params_train = _check_method_params(X, params=score_params, indices=train) + score_params_test = _check_method_params(X, params=score_params, indices=test) + + for n_train_samples, partial_train in partitions: + train_subset = train[:n_train_samples] + X_train, y_train = _safe_split(estimator, X, y, train_subset) + X_partial_train, y_partial_train = _safe_split(estimator, X, y, partial_train) + X_test, y_test = _safe_split(estimator, X, y, test, train_subset) + start_fit = time.time() + if y_partial_train is None: + partial_fit_func(X_partial_train) + else: + partial_fit_func(X_partial_train, y_partial_train) + fit_time = time.time() - start_fit + fit_times.append(fit_time) + + start_score = time.time() + + test_scores.append( + _score( + estimator, + X_test, + y_test, + scorer, + score_params=score_params_test, + error_score=error_score, + ) + ) + train_scores.append( + _score( + estimator, + X_train, + y_train, + scorer, + score_params=score_params_train, + error_score=error_score, + ) + ) + score_time = time.time() - start_score + score_times.append(score_time) + + ret = ( + (train_scores, test_scores, fit_times, score_times) + if return_times + else (train_scores, test_scores) + ) + + return np.array(ret).T + + +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "param_name": [str], + "param_range": ["array-like"], + "groups": ["array-like", None], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "error_score": [StrOptions({"raise"}), Real], + "fit_params": [dict, None], + "params": [dict, None], + }, + prefer_skip_nested_validation=False, # estimator is not validated yet +) +def validation_curve( + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + 
n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + params=None, +): + """Validation curve. + + Determine training and test scores for varying parameter values. + + Compute scores for an estimator with different values of a specified + parameter. This is similar to grid search with one parameter. However, this + will also compute training scores and is merely a utility for plotting the + results. + + Read more in the :ref:`User Guide <validation_curve>`. + + Parameters + ---------- + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + .. versionchanged:: 1.6 + ``groups`` can only be passed if metadata routing is not enabled + via ``sklearn.set_config(enable_metadata_routing=True)``. When routing + is enabled, pass ``groups`` alongside other metadata via the ``params`` + argument instead. E.g.: + ``validation_curve(..., params={'groups': groups})``. + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and ``y`` is + either binary or multiclass, :class:`StratifiedKFold` is used. In all + other cases, :class:`KFold` is used. These splitters are instantiated + with `shuffle=False` so the splits will be the same across calls. + + Refer :ref:`User Guide <cross_validation>` for the various + cross-validation strategies that can be used here. + + .. versionchanged:: 0.22 + ``cv`` default value if None changed from 3-fold to 5-fold. + + scoring : str or callable, default=None + Scoring method to use to evaluate the training and test sets. + + - str: see :ref:`scoring_string_names` for options. + - callable: a scorer callable object (e.g., function) with signature + ``scorer(estimator, X, y)``. See :ref:`scoring_callable` for details. + - `None`: the `estimator`'s + :ref:`default evaluation criterion <scoring_api_overview>` is used. + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and computing + the score are parallelized over the combinations of each parameter + value and each cross-validation split. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary <n_jobs>` + for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all).
The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. + + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator fitting. + If set to 'raise', the error is raised. + If a numeric value is given, FitFailedWarning is raised. + + .. versionadded:: 0.20 + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + .. deprecated:: 1.6 + This parameter is deprecated and will be removed in version 1.8. Use + ``params`` instead. + + params : dict, default=None + Parameters to pass to the estimator, scorer and cross-validation object. + + - If `enable_metadata_routing=False` (default): Parameters directly passed to + the `fit` method of the estimator. + + - If `enable_metadata_routing=True`: Parameters safely routed to the `fit` + method of the estimator, to the scorer and to the cross-validation object. + See :ref:`Metadata Routing User Guide ` for more details. + + .. versionadded:: 1.6 + + Returns + ------- + train_scores : array of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array of shape (n_ticks, n_cv_folds) + Scores on test set. + + See Also + -------- + ValidationCurveDisplay.from_estimator : Plot the validation curve + given an estimator, the data, and the parameter to vary. + + Notes + ----- + See :ref:`sphx_glr_auto_examples_model_selection_plot_train_error_vs_test_error.py` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> print(f"The average train accuracy is {train_scores.mean():.2f}") + The average train accuracy is 0.81 + >>> print(f"The average test accuracy is {test_scores.mean():.2f}") + The average test accuracy is 0.81 + """ + params = _check_params_groups_deprecation(fit_params, params, groups, "1.8") + X, y, groups = indexable(X, y, groups) + + cv = check_cv(cv, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + + if _routing_enabled(): + router = ( + MetadataRouter(owner="validation_curve") + .add( + estimator=estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + .add( + splitter=cv, + method_mapping=MethodMapping().add(caller="fit", callee="split"), + ) + .add( + scorer=scorer, + method_mapping=MethodMapping().add(caller="fit", callee="score"), + ) + ) + + try: + routed_params = process_routing(router, "fit", **params) + except UnsetMetadataPassedError as e: + # The default exception would mention `fit` since in the above + # `process_routing` code, we pass `fit` as the caller. However, + # the user is not calling `fit` directly, so we change the message + # to make it more suitable for this case. 
+ raise UnsetMetadataPassedError( + message=str(e).replace("validation_curve.fit", "validation_curve"), + unrequested_params=e.unrequested_params, + routed_params=e.routed_params, + ) + + else: + routed_params = Bunch() + routed_params.estimator = Bunch(fit=params) + routed_params.splitter = Bunch(split={"groups": groups}) + routed_params.scorer = Bunch(score={}) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) + results = parallel( + delayed(_fit_and_score)( + clone(estimator), + X, + y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters={param_name: v}, + fit_params=routed_params.estimator.fit, + score_params=routed_params.scorer.score, + return_train_score=True, + error_score=error_score, + ) + # NOTE do not change order of iteration to allow one time cv splitters + for train, test in cv.split(X, y, **routed_params.splitter.split) + for v in param_range + ) + n_params = len(param_range) + + results = _aggregate_score_dicts(results) + train_scores = results["train_scores"].reshape(-1, n_params).T + test_scores = results["test_scores"].reshape(-1, n_params).T + + return train_scores, test_scores + + +def _aggregate_score_dicts(scores): + """Aggregate the list of dict to dict of np ndarray + + The aggregated output of _aggregate_score_dicts will be a list of dict + of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...] + Convert it to a dict of array {'prec': np.array([0.1 ...]), ...} + + Parameters + ---------- + + scores : list of dict + List of dicts of the scores for all scorers. This is a flat list, + assumed originally to be of row major order. + + Example + ------- + + >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3}, + ... {'a': 10, 'b': 10}] # doctest: +SKIP + >>> _aggregate_score_dicts(scores) # doctest: +SKIP + {'a': array([1, 2, 3, 10]), + 'b': array([10, 2, 3, 10])} + """ + return { + key: ( + np.asarray([score[key] for score in scores]) + if isinstance(scores[0][key], numbers.Number) + else [score[key] for score in scores] + ) + for key in scores[0] + } diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/common.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/common.py new file mode 100644 index 0000000000000000000000000000000000000000..54a993db76933a5e710f0ddd20a4efd0118ecf95 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/common.py @@ -0,0 +1,24 @@ +""" +Common utilities for testing model selection. 
+""" + +import numpy as np + +from sklearn.model_selection import KFold + + +class OneTimeSplitter: + """A wrapper to make KFold single entry cv iterator""" + + def __init__(self, n_splits=4, n_samples=99): + self.n_splits = n_splits + self.n_samples = n_samples + self.indices = iter(KFold(n_splits=n_splits).split(np.ones(n_samples))) + + def split(self, X=None, y=None, groups=None): + """Split can be called only once""" + for index in self.indices: + yield index + + def get_n_splits(self, X=None, y=None, groups=None): + return self.n_splits diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_classification_threshold.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_classification_threshold.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba4dcea369748622d366df0477e3b7911873593 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_classification_threshold.py @@ -0,0 +1,618 @@ +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import clone +from sklearn.datasets import ( + load_breast_cancer, + load_iris, + make_classification, + make_multilabel_classification, +) +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import ( + balanced_accuracy_score, + f1_score, + fbeta_score, + make_scorer, +) +from sklearn.metrics._scorer import _CurveScorer +from sklearn.model_selection import ( + FixedThresholdClassifier, + StratifiedShuffleSplit, + TunedThresholdClassifierCV, +) +from sklearn.model_selection._classification_threshold import ( + _fit_and_score_over_thresholds, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._mocking import CheckingClassifier +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + + +def test_fit_and_score_over_thresholds_curve_scorers(): + """Check that `_fit_and_score_over_thresholds` returns thresholds in ascending order + for the different accepted curve scorers.""" + X, y = make_classification(n_samples=100, random_state=0) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + classifier = LogisticRegression() + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert isinstance(scores, np.ndarray) + assert np.logical_and(scores >= 0, scores <= 1).all() + + +def test_fit_and_score_over_thresholds_prefit(): + """Check the behaviour with a prefit classifier.""" + X, y = make_classification(n_samples=100, random_state=0) + + # `train_idx is None` to indicate that the classifier is prefit + train_idx, val_idx = None, np.arange(50, 100) + classifier = DecisionTreeClassifier(random_state=0).fit(X, y) + # make sure that the classifier memorized the full dataset such that + # we get perfect predictions and thus match the expected score + assert classifier.score(X[val_idx], y[val_idx]) == pytest.approx(1.0) + + 
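+    # Editor's note (comment added in this copy, not upstream scikit-learn): with
+    # the memorizing tree above and only two candidate thresholds, the lowest
+    # threshold predicts the positive class for every validation sample (balanced
+    # accuracy 0.5) while the highest threshold reproduces the memorized labels
+    # (balanced accuracy 1.0), which is what `assert_allclose(scores, [0.5, 1.0])`
+    # below verifies.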
curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=2, + kwargs={}, + ) + scores, thresholds = _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params={}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + assert np.all(thresholds[:-1] <= thresholds[1:]) + assert_allclose(scores, [0.5, 1.0]) + + +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_sample_weight(): + """Check that we dispatch the sample-weight to fit and score the classifier.""" + X, y = load_iris(return_X_y=True) + X, y = X[:100], y[:100] # only 2 classes + + # create a dataset and repeat twice the sample of class #0 + X_repeated, y_repeated = np.vstack([X, X[y == 0]]), np.hstack([y, y[y == 0]]) + # create a sample weight vector that is equivalent to the repeated dataset + sample_weight = np.ones_like(y) + sample_weight[:50] *= 2 + + classifier = LogisticRegression() + train_repeated_idx = np.arange(X_repeated.shape[0]) + val_repeated_idx = np.arange(X_repeated.shape[0]) + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + scores_repeated, thresholds_repeated = _fit_and_score_over_thresholds( + classifier, + X_repeated, + y_repeated, + fit_params={}, + train_idx=train_repeated_idx, + val_idx=val_repeated_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + train_idx, val_idx = np.arange(X.shape[0]), np.arange(X.shape[0]) + scores, thresholds = _fit_and_score_over_thresholds( + classifier.set_fit_request(sample_weight=True), + X, + y, + fit_params={"sample_weight": sample_weight}, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer.set_score_request(sample_weight=True), + score_params={"sample_weight": sample_weight}, + ) + + assert_allclose(thresholds_repeated, thresholds) + assert_allclose(scores_repeated, scores) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_fit_and_score_over_thresholds_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + train_idx, val_idx = np.arange(50), np.arange(50, 100) + + curve_scorer = _CurveScorer( + score_func=balanced_accuracy_score, + sign=1, + response_method="predict_proba", + thresholds=10, + kwargs={}, + ) + _fit_and_score_over_thresholds( + classifier, + X, + y, + fit_params=fit_params, + train_idx=train_idx, + val_idx=val_idx, + curve_scorer=curve_scorer, + score_params={}, + ) + + +@pytest.mark.parametrize( + "data", + [ + make_classification(n_classes=3, n_clusters_per_class=1, random_state=0), + make_multilabel_classification(random_state=0), + ], +) +def test_tuned_threshold_classifier_no_binary(data): + """Check that we raise an informative error message for non-binary problem.""" + err_msg = "Only binary classification is supported." 
+ with pytest.raises(ValueError, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression()).fit(*data) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ( + {"cv": "prefit", "refit": True}, + ValueError, + "When cv='prefit', refit cannot be True.", + ), + ( + {"cv": 10, "refit": False}, + ValueError, + "When cv has several folds, refit cannot be False.", + ), + ( + {"cv": "prefit", "refit": False}, + NotFittedError, + "`estimator` must be fitted.", + ), + ], +) +def test_tuned_threshold_classifier_conflict_cv_refit(params, err_type, err_msg): + """Check that we raise an informative error message when `cv` and `refit` + cannot be used together. + """ + X, y = make_classification(n_samples=100, random_state=0) + with pytest.raises(err_type, match=err_msg): + TunedThresholdClassifierCV(LogisticRegression(), **params).fit(X, y) + + +@pytest.mark.parametrize( + "estimator", + [LogisticRegression(), SVC(), GradientBoostingClassifier(n_estimators=4)], +) +@pytest.mark.parametrize( + "response_method", ["predict_proba", "predict_log_proba", "decision_function"] +) +@pytest.mark.parametrize( + "ThresholdClassifier", [FixedThresholdClassifier, TunedThresholdClassifierCV] +) +def test_threshold_classifier_estimator_response_methods( + ThresholdClassifier, estimator, response_method +): + """Check that `TunedThresholdClassifierCV` exposes the same response methods as the + underlying estimator. + """ + X, y = make_classification(n_samples=100, random_state=0) + + model = ThresholdClassifier(estimator=estimator) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + model.fit(X, y) + assert hasattr(model, response_method) == hasattr(estimator, response_method) + + if hasattr(model, response_method): + y_pred_cutoff = getattr(model, response_method)(X) + y_pred_underlying_estimator = getattr(model.estimator_, response_method)(X) + + assert_allclose(y_pred_cutoff, y_pred_underlying_estimator) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +def test_tuned_threshold_classifier_without_constraint_value(response_method): + """Check that `TunedThresholdClassifierCV` is optimizing a given objective + metric.""" + X, y = load_breast_cancer(return_X_y=True) + # remove feature to degrade performances + X = X[:, :5] + + # make the problem completely imbalanced such that the balanced accuracy is low + indices_pos = np.flatnonzero(y == 1) + indices_pos = indices_pos[: indices_pos.size // 50] + indices_neg = np.flatnonzero(y == 0) + + X = np.vstack([X[indices_neg], X[indices_pos]]) + y = np.hstack([y[indices_neg], y[indices_pos]]) + + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + thresholds = 100 + model = TunedThresholdClassifierCV( + estimator=lr, + scoring="balanced_accuracy", + response_method=response_method, + thresholds=thresholds, + store_cv_results=True, + ) + score_optimized = balanced_accuracy_score(y, model.fit(X, y).predict(X)) + score_baseline = balanced_accuracy_score(y, lr.predict(X)) + assert score_optimized > score_baseline + assert model.cv_results_["thresholds"].shape == (thresholds,) + assert model.cv_results_["scores"].shape == (thresholds,) + + +def test_tuned_threshold_classifier_metric_with_parameter(): + """Check that we can pass a metric with a parameter in addition check that + `f_beta` with `beta=1` is equivalent to `f1` and different from `f_beta` with + `beta=2`. 
+ """ + X, y = load_breast_cancer(return_X_y=True) + lr = make_pipeline(StandardScaler(), LogisticRegression()).fit(X, y) + model_fbeta_1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=1) + ).fit(X, y) + model_fbeta_2 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(fbeta_score, beta=2) + ).fit(X, y) + model_f1 = TunedThresholdClassifierCV( + estimator=lr, scoring=make_scorer(f1_score) + ).fit(X, y) + + assert model_fbeta_1.best_threshold_ == pytest.approx(model_f1.best_threshold_) + assert model_fbeta_1.best_threshold_ != pytest.approx(model_fbeta_2.best_threshold_) + + +@pytest.mark.parametrize( + "response_method", ["auto", "decision_function", "predict_proba"] +) +@pytest.mark.parametrize( + "metric", + [ + make_scorer(balanced_accuracy_score), + make_scorer(f1_score, pos_label="cancer"), + ], +) +def test_tuned_threshold_classifier_with_string_targets(response_method, metric): + """Check that targets represented by str are properly managed. + Also, check with several metrics to be sure that `pos_label` is properly + dispatched. + """ + X, y = load_breast_cancer(return_X_y=True) + # Encode numeric targets by meaningful strings. We purposely designed the class + # names such that the `pos_label` is the first alphabetically sorted class and thus + # encoded as 0. + classes = np.array(["cancer", "healthy"], dtype=object) + y = classes[y] + model = TunedThresholdClassifierCV( + estimator=make_pipeline(StandardScaler(), LogisticRegression()), + scoring=metric, + response_method=response_method, + thresholds=100, + ).fit(X, y) + assert_array_equal(model.classes_, np.sort(classes)) + y_pred = model.predict(X) + assert_array_equal(np.unique(y_pred), np.sort(classes)) + + +@pytest.mark.parametrize("with_sample_weight", [True, False]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_refit(with_sample_weight, global_random_seed): + """Check the behaviour of the `refit` parameter.""" + rng = np.random.RandomState(global_random_seed) + X, y = make_classification(n_samples=100, random_state=0) + if with_sample_weight: + sample_weight = rng.randn(X.shape[0]) + sample_weight = np.abs(sample_weight, out=sample_weight) + else: + sample_weight = None + + # check that `estimator_` if fitted on the full dataset when `refit=True` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model = TunedThresholdClassifierCV(estimator, refit=True).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is not estimator + estimator.fit(X, y, sample_weight=sample_weight) + assert_allclose(model.estimator_.coef_, estimator.coef_) + assert_allclose(model.estimator_.intercept_, estimator.intercept_) + + # check that `estimator_` was not altered when `refit=False` and `cv="prefit"` + estimator = LogisticRegression().set_fit_request(sample_weight=True) + estimator.fit(X, y, sample_weight=sample_weight) + coef = estimator.coef_.copy() + model = TunedThresholdClassifierCV(estimator, cv="prefit", refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ is estimator + assert_allclose(model.estimator_.coef_, coef) + + # check that we train `estimator_` on the training split of a given cross-validation + estimator = LogisticRegression().set_fit_request(sample_weight=True) + cv = [ + (np.arange(50), np.arange(50, 100)), + ] # single split + model = TunedThresholdClassifierCV(estimator, cv=cv, refit=False).fit( + X, y, sample_weight=sample_weight + ) + + assert model.estimator_ 
is not estimator + if with_sample_weight: + sw_train = sample_weight[cv[0][0]] + else: + sw_train = None + estimator.fit(X[cv[0][0]], y[cv[0][0]], sample_weight=sw_train) + assert_allclose(model.estimator_.coef_, estimator.coef_) + + +@pytest.mark.parametrize("fit_params_type", ["list", "array"]) +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_fit_params(fit_params_type): + """Check that we pass `fit_params` to the classifier when calling `fit`.""" + X, y = make_classification(n_samples=100, random_state=0) + fit_params = { + "a": _convert_container(y, fit_params_type), + "b": _convert_container(y, fit_params_type), + } + + classifier = CheckingClassifier(expected_fit_params=["a", "b"], random_state=0) + classifier.set_fit_request(a=True, b=True) + model = TunedThresholdClassifierCV(classifier) + model.fit(X, y, **fit_params) + + +@config_context(enable_metadata_routing=True) +def test_tuned_threshold_classifier_cv_zeros_sample_weights_equivalence(): + """Check that passing removing some sample from the dataset `X` is + equivalent to passing a `sample_weight` with a factor 0.""" + X, y = load_iris(return_X_y=True) + # Scale the data to avoid any convergence issue + X = StandardScaler().fit_transform(X) + # Only use 2 classes and select samples such that 2-fold cross-validation + # split will lead to an equivalence with a `sample_weight` of 0 + X = np.vstack((X[:40], X[50:90])) + y = np.hstack((y[:40], y[50:90])) + sample_weight = np.zeros_like(y) + sample_weight[::2] = 1 + + estimator = LogisticRegression().set_fit_request(sample_weight=True) + model_without_weights = TunedThresholdClassifierCV(estimator, cv=2) + model_with_weights = clone(model_without_weights) + + model_with_weights.fit(X, y, sample_weight=sample_weight) + model_without_weights.fit(X[::2], y[::2]) + + assert_allclose( + model_with_weights.estimator_.coef_, model_without_weights.estimator_.coef_ + ) + + y_pred_with_weights = model_with_weights.predict_proba(X) + y_pred_without_weights = model_without_weights.predict_proba(X) + assert_allclose(y_pred_with_weights, y_pred_without_weights) + + +def test_tuned_threshold_classifier_thresholds_array(): + """Check that we can pass an array to `thresholds` and it is used as candidate + threshold internally.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + thresholds = np.linspace(0, 1, 11) + tuned_model = TunedThresholdClassifierCV( + estimator, + thresholds=thresholds, + response_method="predict_proba", + store_cv_results=True, + ).fit(X, y) + assert_allclose(tuned_model.cv_results_["thresholds"], thresholds) + + +@pytest.mark.parametrize("store_cv_results", [True, False]) +def test_tuned_threshold_classifier_store_cv_results(store_cv_results): + """Check that if `cv_results_` exists depending on `store_cv_results`.""" + X, y = make_classification(random_state=0) + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifierCV( + estimator, store_cv_results=store_cv_results + ).fit(X, y) + if store_cv_results: + assert hasattr(tuned_model, "cv_results_") + else: + assert not hasattr(tuned_model, "cv_results_") + + +def test_tuned_threshold_classifier_cv_float(): + """Check the behaviour when `cv` is set to a float.""" + X, y = make_classification(random_state=0) + + # case where `refit=False` and cv is a float: the underlying estimator will be fit + # on the training set given by a ShuffleSplit. We check that we get the same model + # coefficients. 
+ test_size = 0.3 + estimator = LogisticRegression() + tuned_model = TunedThresholdClassifierCV( + estimator, cv=test_size, refit=False, random_state=0 + ).fit(X, y) + tuned_model.fit(X, y) + + cv = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0) + train_idx, val_idx = next(cv.split(X, y)) + cloned_estimator = clone(estimator).fit(X[train_idx], y[train_idx]) + + assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) + + # case where `refit=True`, then the underlying estimator is fitted on the full + # dataset. + tuned_model.set_params(refit=True).fit(X, y) + cloned_estimator = clone(estimator).fit(X, y) + + assert_allclose(tuned_model.estimator_.coef_, cloned_estimator.coef_) + + +def test_tuned_threshold_classifier_error_constant_predictor(): + """Check that we raise a ValueError if the underlying classifier returns constant + probabilities such that we cannot find any threshold. + """ + X, y = make_classification(random_state=0) + estimator = DummyClassifier(strategy="constant", constant=1) + tuned_model = TunedThresholdClassifierCV(estimator, response_method="predict_proba") + err_msg = "The provided estimator makes constant predictions" + with pytest.raises(ValueError, match=err_msg): + tuned_model.fit(X, y) + + +@pytest.mark.parametrize( + "response_method", ["auto", "predict_proba", "decision_function"] +) +def test_fixed_threshold_classifier_equivalence_default(response_method): + """Check that `FixedThresholdClassifier` has the same behaviour as the vanilla + classifier. + """ + X, y = make_classification(random_state=0) + classifier = LogisticRegression().fit(X, y) + classifier_default_threshold = FixedThresholdClassifier( + estimator=clone(classifier), response_method=response_method + ) + classifier_default_threshold.fit(X, y) + + # emulate the response method that should take into account the `pos_label` + if response_method in ("auto", "predict_proba"): + y_score = classifier_default_threshold.predict_proba(X)[:, 1] + threshold = 0.5 + else: # response_method == "decision_function" + y_score = classifier_default_threshold.decision_function(X) + threshold = 0.0 + + y_pred_lr = (y_score >= threshold).astype(int) + assert_allclose(classifier_default_threshold.predict(X), y_pred_lr) + + +@pytest.mark.parametrize( + "response_method, threshold", [("predict_proba", 0.7), ("decision_function", 2.0)] +) +@pytest.mark.parametrize("pos_label", [0, 1]) +def test_fixed_threshold_classifier(response_method, threshold, pos_label): + """Check that applying `predict` lead to the same prediction as applying the + threshold to the output of the response method. 
+ """ + X, y = make_classification(n_samples=50, random_state=0) + logistic_regression = LogisticRegression().fit(X, y) + model = FixedThresholdClassifier( + estimator=clone(logistic_regression), + threshold=threshold, + response_method=response_method, + pos_label=pos_label, + ).fit(X, y) + + # check that the underlying estimator is the same + assert_allclose(model.estimator_.coef_, logistic_regression.coef_) + + # emulate the response method that should take into account the `pos_label` + if response_method == "predict_proba": + y_score = model.predict_proba(X)[:, pos_label] + else: # response_method == "decision_function" + y_score = model.decision_function(X) + y_score = y_score if pos_label == 1 else -y_score + + # create a mapping from boolean values to class labels + map_to_label = np.array([0, 1]) if pos_label == 1 else np.array([1, 0]) + y_pred_lr = map_to_label[(y_score >= threshold).astype(int)] + assert_allclose(model.predict(X), y_pred_lr) + + for method in ("predict_proba", "predict_log_proba", "decision_function"): + assert_allclose( + getattr(model, method)(X), getattr(logistic_regression, method)(X) + ) + assert_allclose( + getattr(model.estimator_, method)(X), + getattr(logistic_regression, method)(X), + ) + + +@config_context(enable_metadata_routing=True) +def test_fixed_threshold_classifier_metadata_routing(): + """Check that everything works with metadata routing.""" + X, y = make_classification(random_state=0) + sample_weight = np.ones_like(y) + sample_weight[::2] = 2 + classifier = LogisticRegression().set_fit_request(sample_weight=True) + classifier.fit(X, y, sample_weight=sample_weight) + classifier_default_threshold = FixedThresholdClassifier(estimator=clone(classifier)) + classifier_default_threshold.fit(X, y, sample_weight=sample_weight) + assert_allclose(classifier_default_threshold.estimator_.coef_, classifier.coef_) + + +@pytest.mark.parametrize( + "method", ["predict_proba", "decision_function", "predict", "predict_log_proba"] +) +def test_fixed_threshold_classifier_fitted_estimator(method): + """Check that if the underlying estimator is already fitted, no fit is required.""" + X, y = make_classification(random_state=0) + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + # This should not raise an error + getattr(fixed_threshold_classifier, method)(X) + + +def test_fixed_threshold_classifier_classes_(): + """Check that the classes_ attribute is properly set.""" + X, y = make_classification(random_state=0) + with pytest.raises( + AttributeError, match="The underlying estimator is not fitted yet." 
+ ): + FixedThresholdClassifier(estimator=LogisticRegression()).classes_ + + classifier = LogisticRegression().fit(X, y) + fixed_threshold_classifier = FixedThresholdClassifier(estimator=classifier) + assert_array_equal(fixed_threshold_classifier.classes_, classifier.classes_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_plot.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..4e884755174545ababe24d423fd84cf7882104cb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_plot.py @@ -0,0 +1,572 @@ +import numpy as np +import pytest + +from sklearn.datasets import load_iris +from sklearn.model_selection import ( + LearningCurveDisplay, + ValidationCurveDisplay, + learning_curve, + validation_curve, +) +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils import shuffle +from sklearn.utils._testing import assert_allclose, assert_array_equal + + +@pytest.fixture +def data(): + return shuffle(*load_iris(return_X_y=True), random_state=0) + + +@pytest.mark.parametrize( + "params, err_type, err_msg", + [ + ({"std_display_style": "invalid"}, ValueError, "Unknown std_display_style:"), + ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), + ], +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params +): + """Check that we raise a proper error when passing invalid parameters.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.raises(err_type, match=err_msg): + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) + + +def test_learning_curve_display_default_usage(pyplot, data): + """Check the default usage of the LearningCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + train_sizes = [0.3, 0.6, 0.9] + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=train_sizes + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == "Number of samples in the training set" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + assert_array_equal(display.train_sizes, train_sizes_abs) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, 
param_range=param_range + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the `negate_score` parameter calling `from_estimator` and + `plot`. + """ + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + positive_scores = display.lines_[0].get_data()[1] + assert (positive_scores >= 0).all() + assert display.ax_.get_ylabel() == "Score" + + negate_score = True + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + + negative_scores = display.lines_[0].get_data()[1] + assert (negative_scores <= 0).all() + assert_allclose(negative_scores, -positive_scores) + assert display.ax_.get_ylabel() == "Negative score" + + negate_score = False + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score + ) + assert display.ax_.get_ylabel() == "Score" + display.plot(negate_score=not negate_score) + assert display.ax_.get_ylabel() == "Score" + assert (display.lines_[0].get_data()[1] < 0).all() + + +@pytest.mark.parametrize( + "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] +) +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): + """Check that we can overwrite the default score name shown on the y-axis.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.ax_.get_ylabel() == ylabel + X, y = data + estimator = DecisionTreeClassifier(max_depth=1, random_state=0) + + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name + ) + + assert display.score_name == ylabel + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_learning_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = 
DecisionTreeClassifier(random_state=0) + + train_sizes = [0.3, 0.6, 0.9] + train_sizes_abs, train_scores, test_scores = learning_curve( + estimator, X, y, train_sizes=train_sizes + ) + + score_type = "train" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, train_sizes_abs) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = LearningCurveDisplay.from_estimator( + estimator, + X, + y, + train_sizes=train_sizes, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, train_sizes_abs) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, train_sizes_abs) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + 
+ assert_array_equal(x_data, param_range) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the parameter `std_display_style`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + import matplotlib as mpl + + std_display_style = None + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert 
len(legend_label) == 2 + + std_display_style = "fill_between" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + assert display.errorbar_ is None + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + std_display_style = "errorbar" + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + ) + + assert display.lines_ is None + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) + assert display.fill_between_ is None + _, legend_label = display.ax_.get_legend_handles_labels() + assert len(legend_label) == 2 + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): + """Check the behaviour of the different plotting keyword arguments: `line_kw`, + `fill_between_kw`, and `errorbar_kw`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + std_display_style = "fill_between" + line_kw = {"color": "red"} + fill_between_kw = {"color": "red", "alpha": 1.0} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + ) + + assert display.lines_[0].get_color() == "red" + assert_allclose( + display.fill_between_[0].get_facecolor(), + [[1.0, 0.0, 0.0, 1.0]], # trust me, it's red + ) + + std_display_style = "errorbar" + errorbar_kw = {"color": "red"} + display = CurveDisplay.from_estimator( + estimator, + X, + y, + **specific_params, + std_display_style=std_display_style, + errorbar_kw=errorbar_kw, + ) + + assert display.errorbar_[0].lines[0].get_color() == "red" + + +@pytest.mark.parametrize( + "param_range, xscale", + [([5, 10, 15], "linear"), ([-50, 5, 50, 500], "symlog"), ([5, 50, 500], "log")], +) +def test_validation_curve_xscale_from_param_range_provided_as_a_list( + pyplot, data, param_range, xscale +): + """Check the induced xscale from the provided param_range values.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name = "max_depth" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + ) + + assert display.ax_.get_xscale() == xscale + + +@pytest.mark.parametrize( + "Display, params", + [ + (LearningCurveDisplay, {}), + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + ], +) +def test_subclassing_displays(pyplot, data, Display, params): + """Check that named constructors return the correct type when subclassed. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/pull/27675 + """ + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + class SubclassOfDisplay(Display): + pass + + display = SubclassOfDisplay.from_estimator(estimator, X, y, **params) + assert isinstance(display, SubclassOfDisplay) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_search.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_search.py new file mode 100644 index 0000000000000000000000000000000000000000..7888dd2d1766b411549f29c995ac9bd58595158d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_search.py @@ -0,0 +1,2966 @@ +"""Test the search module""" + +import pickle +import re +import sys +import warnings +from collections.abc import Iterable, Sized +from functools import partial +from io import StringIO +from itertools import chain, product +from types import GeneratorType + +import numpy as np +import pytest +from scipy.stats import bernoulli, expon, uniform + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, clone, is_classifier +from sklearn.cluster import KMeans +from sklearn.compose import ColumnTransformer +from sklearn.datasets import ( + make_blobs, + make_classification, + make_multilabel_classification, +) +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import HistGradientBoostingClassifier +from sklearn.exceptions import FitFailedWarning +from sklearn.experimental import enable_halving_search_cv # noqa: F401 +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.impute import SimpleImputer +from sklearn.linear_model import ( + LinearRegression, + LogisticRegression, + Ridge, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + f1_score, + make_scorer, + r2_score, + recall_score, + roc_auc_score, +) +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ParameterGrid, + ParameterSampler, + RandomizedSearchCV, + StratifiedKFold, + StratifiedShuffleSplit, + train_test_split, +) +from sklearn.model_selection._search import ( + BaseSearchCV, + _yield_masked_array_for_each_param, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.naive_bayes import ComplementNB +from sklearn.neighbors import KernelDensity, KNeighborsClassifier, LocalOutlierFactor +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import ( + OneHotEncoder, + OrdinalEncoder, + SplineTransformer, + StandardScaler, +) +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingScorer, + _Registry, + check_recorded_metadata, +) +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._array_api import ( + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + MinimalClassifier, + MinimalRegressor, + MinimalTransformer, + _array_api_for_tests, + assert_allclose, + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + set_random_state, +) +from 
sklearn.utils.estimator_checks import _enforce_estimator_tags_y +from sklearn.utils.fixes import CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +# Neither of the following two estimators inherit from BaseEstimator, +# to test hyperparameter search on user-defined classifiers. +class MockClassifier(ClassifierMixin, BaseEstimator): + """Dummy classifier to test the parameter search algorithms""" + + def __init__(self, foo_param=0): + self.foo_param = foo_param + + def fit(self, X, Y): + assert len(X) == len(Y) + self.classes_ = np.unique(Y) + return self + + def predict(self, T): + return T.shape[0] + + def transform(self, X): + return X + self.foo_param + + def inverse_transform(self, X): + return X - self.foo_param + + predict_proba = predict + predict_log_proba = predict + decision_function = predict + + def score(self, X=None, Y=None): + if self.foo_param > 1: + score = 1.0 + else: + score = 0.0 + return score + + def get_params(self, deep=False): + return {"foo_param": self.foo_param} + + def set_params(self, **params): + self.foo_param = params["foo_param"] + return self + + +class LinearSVCNoScore(LinearSVC): + """A LinearSVC classifier that has no score method.""" + + @property + def score(self): + raise AttributeError + + +X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) +y = np.array([1, 1, 2, 2]) + + +def assert_grid_iter_equals_getitem(grid): + assert list(grid) == [grid[i] for i in range(len(grid))] + + +@pytest.mark.parametrize("klass", [ParameterGrid, partial(ParameterSampler, n_iter=10)]) +@pytest.mark.parametrize( + "input, error_type, error_message", + [ + (0, TypeError, r"Parameter .* a dict or a list, got: 0 of type int"), + ([{"foo": [0]}, 0], TypeError, r"Parameter .* is not a dict \(0\)"), + ( + {"foo": 0}, + TypeError, + r"Parameter (grid|distribution) for parameter 'foo' (is not|needs to be) " + r"(a list or a numpy array|iterable or a distribution).*", + ), + ], +) +def test_validate_parameter_input(klass, input, error_type, error_message): + with pytest.raises(error_type, match=error_message): + klass(input) + + +def test_parameter_grid(): + # Test basic properties of ParameterGrid. 
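+    # Editor's note (comment added in this copy, not upstream scikit-learn):
+    # ParameterGrid expands a dict of value lists into the full cross-product of
+    # parameter settings, e.g. ParameterGrid({"foo": [1, 2, 3]}) yields {"foo": 1},
+    # {"foo": 2} and {"foo": 3}; it also supports len() and integer indexing, which
+    # the assertions below exercise.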
+ params1 = {"foo": [1, 2, 3]} + grid1 = ParameterGrid(params1) + assert isinstance(grid1, Iterable) + assert isinstance(grid1, Sized) + assert len(grid1) == 3 + assert_grid_iter_equals_getitem(grid1) + + params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]} + grid2 = ParameterGrid(params2) + assert len(grid2) == 6 + + # loop to assert we can iterate over the grid multiple times + for i in range(2): + # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2) + points = set(tuple(chain(*(sorted(p.items())))) for p in grid2) + assert points == set( + ("bar", x, "foo", y) for x, y in product(params2["bar"], params2["foo"]) + ) + assert_grid_iter_equals_getitem(grid2) + + # Special case: empty grid (useful to get default estimator settings) + empty = ParameterGrid({}) + assert len(empty) == 1 + assert list(empty) == [{}] + assert_grid_iter_equals_getitem(empty) + with pytest.raises(IndexError): + empty[1] + + has_empty = ParameterGrid([{"C": [1, 10]}, {}, {"C": [0.5]}]) + assert len(has_empty) == 4 + assert list(has_empty) == [{"C": 1}, {"C": 10}, {}, {"C": 0.5}] + assert_grid_iter_equals_getitem(has_empty) + + +def test_grid_search(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + # make sure it selects the smallest parameter in case of ties + old_stdout = sys.stdout + sys.stdout = StringIO() + grid_search.fit(X, y) + sys.stdout = old_stdout + assert grid_search.best_estimator_.foo_param == 2 + + assert_array_equal(grid_search.cv_results_["param_foo_param"].data, [1, 2, 3]) + + # Smoke test the score etc: + grid_search.score(X, y) + grid_search.predict_proba(X) + grid_search.decision_function(X) + grid_search.transform(X) + + # Test exception handling on scoring + grid_search.scoring = "sklearn" + with pytest.raises(ValueError): + grid_search.fit(X, y) + + +def test_grid_search_pipeline_steps(): + # check that parameters that are estimators are cloned before fitting + pipe = Pipeline([("regressor", LinearRegression())]) + param_grid = {"regressor": [LinearRegression(), Ridge()]} + grid_search = GridSearchCV(pipe, param_grid, cv=2) + grid_search.fit(X, y) + regressor_results = grid_search.cv_results_["param_regressor"] + assert isinstance(regressor_results[0], LinearRegression) + assert isinstance(regressor_results[1], Ridge) + assert not hasattr(regressor_results[0], "coef_") + assert not hasattr(regressor_results[1], "coef_") + assert regressor_results[0] is not grid_search.best_estimator_ + assert regressor_results[1] is not grid_search.best_estimator_ + # check that we didn't modify the parameter grid that was passed + assert not hasattr(param_grid["regressor"][0], "coef_") + assert not hasattr(param_grid["regressor"][1], "coef_") + + +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_SearchCV_with_fit_params(SearchCV): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_fit_params=["spam", "eggs"]) + searcher = SearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, error_score="raise") + + # The CheckingClassifier generates an assertion error if + # a parameter is missing or has length != len(X). + err_msg = r"Expected fit parameter\(s\) \['eggs'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(10)) + + err_msg = "Fit parameter spam has length 1; expected" + with pytest.raises(AssertionError, match=err_msg): + searcher.fit(X, y, spam=np.ones(1), eggs=np.zeros(10)) + searcher.fit(X, y, spam=np.ones(10), eggs=np.zeros(10)) + + +def test_grid_search_no_score(): + # Test grid-search on classifier that has no score function. + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + clf_no_score = LinearSVCNoScore(random_state=0) + grid_search = GridSearchCV(clf, {"C": Cs}, scoring="accuracy") + grid_search.fit(X, y) + + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}, scoring="accuracy") + # smoketest grid search + grid_search_no_score.fit(X, y) + + # check that best params are equal + assert grid_search_no_score.best_params_ == grid_search.best_params_ + # check that we can call score and that it gives the correct result + assert grid_search.score(X, y) == grid_search_no_score.score(X, y) + + # giving no scoring function raises an error + grid_search_no_score = GridSearchCV(clf_no_score, {"C": Cs}) + with pytest.raises(TypeError, match="no scoring"): + grid_search_no_score.fit([[1]]) + + +def test_grid_search_score_method(): + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) + search_accuracy = GridSearchCV(clf, grid, scoring="accuracy").fit(X, y) + search_no_score_method_auc = GridSearchCV( + LinearSVCNoScore(), grid, scoring="roc_auc" + ).fit(X, y) + search_auc = GridSearchCV(clf, grid, scoring="roc_auc").fit(X, y) + + # Check warning only occurs in situation where behavior changed: + # estimator requires score method to compete with scoring parameter + score_no_scoring = search_no_scoring.score(X, y) + score_accuracy = search_accuracy.score(X, y) + score_no_score_auc = search_no_score_method_auc.score(X, y) + score_auc = search_auc.score(X, y) + + # ensure the test is sane + assert score_auc < 1.0 + assert score_accuracy < 1.0 + assert score_auc != score_accuracy + + assert_almost_equal(score_accuracy, score_no_scoring) + assert_almost_equal(score_auc, score_no_score_auc) + + +def test_grid_search_groups(): + # Check if ValueError (when groups is None) propagates to GridSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 15) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(), + ] + error_msg = "The 'groups' parameter should not be None." 
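+    # Group-aware splitters must raise this error when `groups` is omitted and
+    # succeed when it is provided; the non-group splitters checked afterwards
+    # must fit without any `groups` argument.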
+ for cv in group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()] + for cv in non_group_cvs: + gs = GridSearchCV(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +def test_classes__property(): + # Test that classes_ property matches best_estimator_.classes_ + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + Cs = [0.1, 1, 10] + + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + grid_search.fit(X, y) + assert_array_equal(grid_search.best_estimator_.classes_, grid_search.classes_) + + # Test that regressors do not have a classes_ attribute + grid_search = GridSearchCV(Ridge(), {"alpha": [1.0, 2.0]}) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute before it's fit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}) + assert not hasattr(grid_search, "classes_") + + # Test that the grid searcher has no classes_ attribute without a refit + grid_search = GridSearchCV(LinearSVC(random_state=0), {"C": Cs}, refit=False) + grid_search.fit(X, y) + assert not hasattr(grid_search, "classes_") + + +def test_trivial_cv_results_attr(): + # Test search over a "grid" with only one point. + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1]}, cv=2) + grid_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + random_search = RandomizedSearchCV(clf, {"foo_param": [0]}, n_iter=1, cv=2) + random_search.fit(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_no_refit(): + # Test that GSCV can be used for model selection alone without refitting + clf = MockClassifier() + for scoring in [None, ["accuracy", "precision"]]: + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=False, cv=2) + grid_search.fit(X, y) + assert ( + not hasattr(grid_search, "best_estimator_") + and hasattr(grid_search, "best_index_") + and hasattr(grid_search, "best_params_") + ) + + # Make sure the functions predict/transform etc. raise meaningful + # error messages + for fn_name in ( + "predict", + "predict_proba", + "predict_log_proba", + "transform", + "inverse_transform", + ): + outer_msg = f"has no attribute '{fn_name}'" + inner_msg = ( + f"`refit=False`. 
{fn_name} is available only after " + "refitting on the best parameters" + ) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(grid_search, fn_name)(X) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # Test that an invalid refit param raises appropriate error messages + error_msg = ( + "For multi-metric scoring, the parameter refit must be set to a scorer key" + ) + for refit in [True, "recall", "accuracy"]: + with pytest.raises(ValueError, match=error_msg): + GridSearchCV( + clf, {}, refit=refit, scoring={"acc": "accuracy", "prec": "precision"} + ).fit(X, y) + + +def test_grid_search_error(): + # Test that grid search will capture errors on data with different length + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(X_[:180], y_) + + +def test_grid_search_one_grid_point(): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + param_dict = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]} + + clf = SVC(gamma="auto") + cv = GridSearchCV(clf, param_dict) + cv.fit(X_, y_) + + clf = SVC(C=1.0, kernel="rbf", gamma=0.1) + clf.fit(X_, y_) + + assert_array_equal(clf.dual_coef_, cv.best_estimator_.dual_coef_) + + +def test_grid_search_when_param_grid_includes_range(): + # Test that the best estimator contains the right value for foo_param + clf = MockClassifier() + grid_search = None + grid_search = GridSearchCV(clf, {"foo_param": range(1, 4)}, cv=2) + grid_search.fit(X, y) + assert grid_search.best_estimator_.foo_param == 2 + + +def test_grid_search_bad_param_grid(): + X, y = make_classification(n_samples=10, n_features=5, random_state=0) + param_dict = {"C": 1} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or " + "a numpy array, but got 1 (of type int) instead. Single " + "values need to be wrapped in a list with one element." + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": []} + clf = SVC() + error_msg = re.escape( + "Parameter grid for parameter 'C' need to be a non-empty sequence, got: []" + ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": "1,2,3"} + clf = SVC(gamma="auto") + error_msg = re.escape( + "Parameter grid for parameter 'C' needs to be a list or a numpy array, " + "but got '1,2,3' (of type str) instead. Single values need to be " + "wrapped in a list with one element." 
+ ) + search = GridSearchCV(clf, param_dict) + with pytest.raises(TypeError, match=error_msg): + search.fit(X, y) + + param_dict = {"C": np.ones((3, 2))} + clf = SVC() + search = GridSearchCV(clf, param_dict) + with pytest.raises(ValueError): + search.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse(csr_container): + # Test that grid search works with both dense and sparse matrices + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(X_[:180].tocoo(), y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert np.mean(y_pred == y_pred2) >= 0.9 + assert C == C2 + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_grid_search_sparse_scoring(csr_container): + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred = cv.predict(X_[180:]) + C = cv.best_estimator_.C + + X_ = csr_container(X_) + clf = LinearSVC() + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring="f1") + cv.fit(X_[:180], y_[:180]) + y_pred2 = cv.predict(X_[180:]) + C2 = cv.best_estimator_.C + + assert_array_equal(y_pred, y_pred2) + assert C == C2 + # Smoke test the score + # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), + # cv.score(X_[:180], y[:180])) + + # test loss where greater is worse + def f1_loss(y_true_, y_pred_): + return -f1_score(y_true_, y_pred_) + + F1Loss = make_scorer(f1_loss, greater_is_better=False) + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}, scoring=F1Loss) + cv.fit(X_[:180], y_[:180]) + y_pred3 = cv.predict(X_[180:]) + C3 = cv.best_estimator_.C + + assert C == C3 + assert_array_equal(y_pred, y_pred3) + + +def test_grid_search_precomputed_kernel(): + # Test that grid search works when the input features are given in the + # form of a precomputed kernel matrix + X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) + + # compute the training kernel matrix corresponding to the linear kernel + K_train = np.dot(X_[:180], X_[:180].T) + y_train = y_[:180] + + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + cv.fit(K_train, y_train) + + assert cv.best_score_ >= 0 + + # compute the test kernel matrix + K_test = np.dot(X_[180:], X_[:180].T) + y_test = y_[180:] + + y_pred = cv.predict(K_test) + + assert np.mean(y_pred == y_test) >= 0 + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cv.fit(K_train.tolist(), y_train) + + +def test_grid_search_precomputed_kernel_error_nonsquare(): + # Test that grid search returns an error with a non-square precomputed + # training kernel matrix + K_train = np.zeros((10, 20)) + y_train = np.ones((10,)) + clf = SVC(kernel="precomputed") + cv = GridSearchCV(clf, {"C": [0.1, 1.0]}) + with pytest.raises(ValueError): + cv.fit(K_train, y_train) + + +class BrokenClassifier(BaseEstimator): + """Broken classifier that cannot be fit twice""" + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y): + assert not hasattr(self, "has_been_fit_") + self.has_been_fit_ = True + + def predict(self, X): + return 
np.zeros(X.shape[0]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.UndefinedMetricWarning") +def test_refit(): + # Regression test for bug in refitting + # Simulates re-fitting a broken estimator; this used to break with + # sparse SVMs. + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = GridSearchCV( + BrokenClassifier(), [{"parameter": [0, 1]}], scoring="precision", refit=True + ) + clf.fit(X, y) + + +def test_refit_callable(): + """ + Test refit=callable, which adds flexibility in identifying the + "best" estimator. + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_score`. + """ + # Fit a dummy clf with `refit=True` to get a list of keys in + # clf.cv_results_. + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=True, + ) + clf.fit(X, y) + # Ensure that `best_index_ != 0` for this dummy clf + assert clf.best_index_ != 0 + + # Assert every key matches those in `cv_results` + for key in clf.cv_results_.keys(): + assert key in cv_results + + return cv_results["mean_test_score"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring="precision", + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_refit_callable_invalid_type(): + """ + Test implementation catches the errors when 'best_index_' returns an + invalid result. + """ + + def refit_callable_invalid_type(cv_results): + """ + A dummy function tests when returned 'best_index_' is not integer. + """ + return None + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_invalid_type, + ) + with pytest.raises(TypeError, match="best_index_ returned is not an integer"): + clf.fit(X, y) + + +@pytest.mark.parametrize("out_bound_value", [-1, 2]) +@pytest.mark.parametrize("search_cv", [RandomizedSearchCV, GridSearchCV]) +def test_refit_callable_out_bound(out_bound_value, search_cv): + """ + Test implementation catches the errors when 'best_index_' returns an + out of bound result. + """ + + def refit_callable_out_bound(cv_results): + """ + A dummy function tests when returned 'best_index_' is out of bounds. + """ + return out_bound_value + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + + clf = search_cv( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring="precision", + refit=refit_callable_out_bound, + ) + with pytest.raises(IndexError, match="best_index_ index out of range"): + clf.fit(X, y) + + +def test_refit_callable_multi_metric(): + """ + Test refit=callable in multiple metric evaluation setting + """ + + def refit_callable(cv_results): + """ + A dummy function tests `refit=callable` interface. + Return the index of a model that has the least + `mean_test_prec`. 
+ """ + assert "mean_test_prec" in cv_results + return cv_results["mean_test_prec"].argmin() + + X, y = make_classification(n_samples=100, n_features=4, random_state=42) + scoring = {"Accuracy": make_scorer(accuracy_score), "prec": "precision"} + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.01, 0.1, 1]}, + scoring=scoring, + refit=refit_callable, + ) + clf.fit(X, y) + + assert clf.best_index_ == 0 + # Ensure `best_score_` is disabled when using `refit=callable` + assert not hasattr(clf, "best_score_") + + +def test_gridsearch_nd(): + # Pass X as list in GridSearchCV + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + + def check_X(x): + return x.shape[1:] == (5, 3, 2) + + def check_y(x): + return x.shape[1:] == (7, 11) + + clf = CheckingClassifier( + check_X=check_X, + check_y=check_y, + methods_to_check=["fit"], + ) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_4d, y_3d).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_X_as_list(): + # Pass X as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_X=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X.tolist(), y).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_y_as_list(): + # Pass y as list in GridSearchCV + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + clf = CheckingClassifier( + check_y=lambda x: isinstance(x, list), + methods_to_check=["fit"], + ) + cv = KFold(n_splits=3) + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=cv) + grid_search.fit(X, y.tolist()).score(X, y) + assert hasattr(grid_search, "cv_results_") + + +def test_pandas_input(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((DataFrame, Series)) + except ImportError: + pass + + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + + for InputFeatureType, TargetType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y) + + def check_df(x): + return isinstance(x, InputFeatureType) + + def check_series(x): + return isinstance(x, TargetType) + + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}) + grid_search.fit(X_df, y_ser).score(X_df, y_ser) + grid_search.predict(X_df) + assert hasattr(grid_search, "cv_results_") + + +def test_unsupervised_grid_search(): + # test grid-search with unsupervised estimator + X, y = make_blobs(n_samples=50, random_state=0) + km = KMeans(random_state=0, init="random", n_init=1) + + # Multi-metric evaluation unsupervised + scoring = ["adjusted_rand_score", "fowlkes_mallows_score"] + for refit in ["adjusted_rand_score", "fowlkes_mallows_score"]: + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring=scoring, refit=refit + ) + grid_search.fit(X, y) + # Both ARI and FMS can find the right number :) + assert grid_search.best_params_["n_clusters"] == 3 + + # Single metric evaluation unsupervised + grid_search = GridSearchCV( + km, param_grid=dict(n_clusters=[2, 3, 4]), scoring="fowlkes_mallows_score" + ) + grid_search.fit(X, y) + assert grid_search.best_params_["n_clusters"] == 3 + + # Now without a 
score, and without y + grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4])) + grid_search.fit(X) + assert grid_search.best_params_["n_clusters"] == 4 + + +def test_gridsearch_no_predict(): + # test grid-search with an estimator without predict. + # slight duplication of a test from KDE + def custom_scoring(estimator, X): + return 42 if estimator.bandwidth == 0.1 else 0 + + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + search = GridSearchCV( + KernelDensity(), + param_grid=dict(bandwidth=[0.01, 0.1, 1]), + scoring=custom_scoring, + ) + search.fit(X) + assert search.best_params_["bandwidth"] == 0.1 + assert search.best_score_ == 42 + + +def test_param_sampler(): + # test basic properties of param sampler + param_distributions = {"kernel": ["rbf", "linear"], "C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + samples = [x for x in sampler] + assert len(samples) == 10 + for sample in samples: + assert sample["kernel"] in ["rbf", "linear"] + assert 0 <= sample["C"] <= 1 + + # test that repeated calls yield identical parameters + param_distributions = {"C": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=3, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + param_distributions = {"C": uniform(0, 1)} + sampler = ParameterSampler( + param_distributions=param_distributions, n_iter=10, random_state=0 + ) + assert [x for x in sampler] == [x for x in sampler] + + +def check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds +): + # Check if the search `cv_results`'s array are of correct types + cv_results = search.cv_results_ + assert all(isinstance(cv_results[param], np.ma.MaskedArray) for param in param_keys) + assert { + key: cv_results[key].dtype.kind for key in param_keys + } == expected_cv_results_kinds + assert not any(isinstance(cv_results[key], np.ma.MaskedArray) for key in score_keys) + assert all( + cv_results[key].dtype == np.float64 + for key in score_keys + if not key.startswith("rank") + ) + + scorer_keys = search.scorer_.keys() if search.multimetric_ else ["score"] + + for key in scorer_keys: + assert cv_results["rank_test_%s" % key].dtype == np.int32 + + +def check_cv_results_keys(cv_results, param_keys, score_keys, n_cand, extra_keys=()): + # Test the search.cv_results_ contains all the required results + all_keys = param_keys + score_keys + extra_keys + assert_array_equal(sorted(cv_results.keys()), sorted(all_keys + ("params",))) + assert all(cv_results[key].shape == (n_cand,) for key in param_keys + score_keys) + + +def test_grid_search_cv_results(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_grid_points = 6 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel") + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + n_candidates = n_grid_points + + search = GridSearchCV(SVC(), cv=3, param_grid=params, return_train_score=True) + search.fit(X, y) + 
cv_results = search.cv_results_
+    # Check if score and timing are reasonable
+    assert all(cv_results["rank_test_score"] >= 1)
+    assert all(
+        np.all(cv_results[k] >= 0) for k in score_keys if k != "rank_test_score"
+    )
+    assert all(
+        np.all(cv_results[k] <= 1)
+        for k in score_keys
+        if "time" not in k and k != "rank_test_score"
+    )
+    # Check cv_results structure
+    expected_cv_results_kinds = {
+        "param_C": "i",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    # Check masking
+    cv_results = search.cv_results_
+
+    poly_results = [
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    ]
+    assert all(poly_results)
+    assert len(poly_results) == 2
+
+    rbf_results = [
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    ]
+    assert all(rbf_results)
+    assert len(rbf_results) == 4
+
+
+def test_random_search_cv_results():
+    X, y = make_classification(n_samples=50, n_features=4, random_state=42)
+
+    n_search_iter = 30
+
+    params = [
+        {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)},
+        {"kernel": ["poly"], "degree": [2, 3]},
+    ]
+    param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel")
+    score_keys = (
+        "mean_test_score",
+        "mean_train_score",
+        "rank_test_score",
+        "split0_test_score",
+        "split1_test_score",
+        "split2_test_score",
+        "split0_train_score",
+        "split1_train_score",
+        "split2_train_score",
+        "std_test_score",
+        "std_train_score",
+        "mean_fit_time",
+        "std_fit_time",
+        "mean_score_time",
+        "std_score_time",
+    )
+    n_candidates = n_search_iter
+
+    search = RandomizedSearchCV(
+        SVC(),
+        n_iter=n_search_iter,
+        cv=3,
+        param_distributions=params,
+        return_train_score=True,
+    )
+    search.fit(X, y)
+    cv_results = search.cv_results_
+    # Check results structure
+    expected_cv_results_kinds = {
+        "param_C": "f",
+        "param_degree": "i",
+        "param_gamma": "f",
+        "param_kernel": "O",
+    }
+    check_cv_results_array_types(
+        search, param_keys, score_keys, expected_cv_results_kinds
+    )
+    check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
+    assert all(
+        (
+            cv_results["param_C"].mask[i]
+            and cv_results["param_gamma"].mask[i]
+            and not cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "poly"
+    )
+    assert all(
+        (
+            not cv_results["param_C"].mask[i]
+            and not cv_results["param_gamma"].mask[i]
+            and cv_results["param_degree"].mask[i]
+        )
+        for i in range(n_candidates)
+        if cv_results["param_kernel"][i] == "rbf"
+    )
+
+
+@pytest.mark.parametrize(
+    "SearchCV, specialized_params",
+    [
+        (GridSearchCV, {"param_grid": {"C": [1, 10]}}),
+        (RandomizedSearchCV, {"param_distributions": {"C": [1, 10]}, "n_iter": 2}),
+    ],
+)
+def test_search_default_iid(SearchCV, specialized_params):
+    # Check that cv_results_ test scores are aggregated with an unweighted mean
+    # and std across folds, even when the folds have different test set sizes.
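+    # Concretely: with the two complementary folds constructed below, the
+    # classifier scores 1 on one split and 1/3 on the other, so the unweighted
+    # aggregates checked at the end are mean = (1 + 1/3) / 2 = 2/3 and std = 1/3.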
+ # noise-free simple 2d-data + X, y = make_blobs( + centers=[[0, 0], [1, 0], [0, 1], [1, 1]], + random_state=0, + cluster_std=0.1, + shuffle=False, + n_samples=80, + ) + # split dataset into two folds that are not iid + # first one contains data of all 4 blobs, second only from two. + mask = np.ones(X.shape[0], dtype=bool) + mask[np.where(y == 1)[0][::2]] = 0 + mask[np.where(y == 2)[0][::2]] = 0 + # this leads to perfect classification on one fold and a score of 1/3 on + # the other + # create "cv" for splits + cv = [[mask, ~mask], [~mask, mask]] + + common_params = {"estimator": SVC(), "cv": cv, "return_train_score": True} + search = SearchCV(**common_params, **specialized_params) + search.fit(X, y) + + test_cv_scores = np.array( + [ + search.cv_results_["split%d_test_score" % s][0] + for s in range(search.n_splits_) + ] + ) + test_mean = search.cv_results_["mean_test_score"][0] + test_std = search.cv_results_["std_test_score"][0] + + train_cv_scores = np.array( + [ + search.cv_results_["split%d_train_score" % s][0] + for s in range(search.n_splits_) + ] + ) + train_mean = search.cv_results_["mean_train_score"][0] + train_std = search.cv_results_["std_train_score"][0] + + assert search.cv_results_["param_C"][0] == 1 + # scores are the same as above + assert_allclose(test_cv_scores, [1, 1.0 / 3.0]) + assert_allclose(train_cv_scores, [1, 1]) + # Unweighted mean/std is used + assert test_mean == pytest.approx(np.mean(test_cv_scores)) + assert test_std == pytest.approx(np.std(test_cv_scores)) + + # For the train scores, we do not take a weighted mean irrespective of + # i.i.d. or not + assert train_mean == pytest.approx(1) + assert train_std == pytest.approx(0) + + +def test_grid_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + params = [ + dict( + kernel=[ + "rbf", + ], + C=[1, 10], + gamma=[0.1, 1], + ), + dict( + kernel=[ + "poly", + ], + degree=[1, 2], + ), + ] + + grid_searches = [] + for scoring in ( + {"accuracy": make_scorer(accuracy_score), "recall": make_scorer(recall_score)}, + "accuracy", + "recall", + ): + grid_search = GridSearchCV( + SVC(), cv=n_splits, param_grid=params, scoring=scoring, refit=False + ) + grid_search.fit(X, y) + grid_searches.append(grid_search) + + compare_cv_results_multimetric_with_single(*grid_searches) + + +def test_random_search_cv_results_multimetric(): + X, y = make_classification(n_samples=50, n_features=4, random_state=42) + + n_splits = 3 + n_search_iter = 30 + + # Scipy 0.12's stats dists do not accept seed, hence we use param grid + params = dict(C=np.logspace(-4, 1, 3), gamma=np.logspace(-5, 0, 3, base=0.1)) + for refit in (True, False): + random_searches = [] + for scoring in (("accuracy", "recall"), "accuracy", "recall"): + # If True, for multi-metric pass refit='accuracy' + if refit: + probability = True + refit = "accuracy" if isinstance(scoring, tuple) else refit + else: + probability = False + clf = SVC(probability=probability, random_state=42) + random_search = RandomizedSearchCV( + clf, + n_iter=n_search_iter, + cv=n_splits, + param_distributions=params, + scoring=scoring, + refit=refit, + random_state=0, + ) + random_search.fit(X, y) + random_searches.append(random_search) + + compare_cv_results_multimetric_with_single(*random_searches) + compare_refit_methods_when_refit_with_acc( + random_searches[0], random_searches[1], refit + ) + + +def compare_cv_results_multimetric_with_single(search_multi, search_acc, search_rec): + """Compare multi-metric cv_results 
with the ensemble of multiple + single metric cv_results from single metric grid/random search""" + + assert search_multi.multimetric_ + assert_array_equal(sorted(search_multi.scorer_), ("accuracy", "recall")) + + cv_results_multi = search_multi.cv_results_ + cv_results_acc_rec = { + re.sub("_score$", "_accuracy", k): v for k, v in search_acc.cv_results_.items() + } + cv_results_acc_rec.update( + {re.sub("_score$", "_recall", k): v for k, v in search_rec.cv_results_.items()} + ) + + # Check if score and timing are reasonable, also checks if the keys + # are present + assert all( + ( + np.all(cv_results_multi[k] <= 1) + for k in ( + "mean_score_time", + "std_score_time", + "mean_fit_time", + "std_fit_time", + ) + ) + ) + + # Compare the keys, other than time keys, among multi-metric and + # single metric grid search results. np.testing.assert_equal performs a + # deep nested comparison of the two cv_results dicts + np.testing.assert_equal( + {k: v for k, v in cv_results_multi.items() if not k.endswith("_time")}, + {k: v for k, v in cv_results_acc_rec.items() if not k.endswith("_time")}, + ) + + +def compare_refit_methods_when_refit_with_acc(search_multi, search_acc, refit): + """Compare refit multi-metric search methods with single metric methods""" + assert search_acc.refit == refit + if refit: + assert search_multi.refit == "accuracy" + else: + assert not search_multi.refit + return # search cannot predict/score without refit + + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + for method in ("predict", "predict_proba", "predict_log_proba"): + assert_almost_equal( + getattr(search_multi, method)(X), getattr(search_acc, method)(X) + ) + assert_almost_equal(search_multi.score(X, y), search_acc.score(X, y)) + for key in ("best_index_", "best_score_", "best_params_"): + assert getattr(search_multi, key) == getattr(search_acc, key) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=DecisionTreeClassifier(), + param_distributions={"max_depth": [5, 10]}, + ), + GridSearchCV( + estimator=DecisionTreeClassifier(), param_grid={"max_depth": [5, 10]} + ), + ], +) +def test_search_cv_score_samples_error(search_cv): + X, y = make_blobs(n_samples=100, n_features=4, random_state=42) + search_cv.fit(X, y) + + # Make sure to error out when underlying estimator does not implement + # the method `score_samples` + outer_msg = f"'{search_cv.__class__.__name__}' has no attribute 'score_samples'" + inner_msg = "'DecisionTreeClassifier' object has no attribute 'score_samples'" + + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + search_cv.score_samples(X) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg == str(exec_info.value.__cause__) + + +def test_unsupported_sample_weight_scorer(): + """Checks that fitting with sample_weight raises a warning if the scorer does not + support sample_weight""" + + def fake_score_func(y_true, y_pred): + "Fake scoring function that does not support sample_weight" + return 0.5 + + fake_scorer = make_scorer(fake_score_func) + + X, y = make_classification(n_samples=10, n_features=4, random_state=42) + sw = np.ones_like(y) + search_cv = GridSearchCV(estimator=LogisticRegression(), param_grid={"C": [1, 10]}) + # function + search_cv.set_params(scoring=fake_score_func) + with pytest.warns(UserWarning, match="does not support sample_weight"): + search_cv.fit(X, y, sample_weight=sw) + # scorer + search_cv.set_params(scoring=fake_scorer) + with pytest.warns(UserWarning, match="does not 
support sample_weight"): + search_cv.fit(X, y, sample_weight=sw) + # multi-metric evaluation + search_cv.set_params( + scoring=dict(fake=fake_scorer, accuracy="accuracy"), refit=False + ) + # only fake scorer does not support sample_weight + with pytest.warns( + UserWarning, match=r"The scoring fake=.* does not support sample_weight" + ): + search_cv.fit(X, y, sample_weight=sw) + + +@pytest.mark.parametrize( + "estimator", + [ + GridSearchCV(estimator=LogisticRegression(), param_grid={"C": [1, 10, 100]}), + RandomizedSearchCV( + estimator=Ridge(), param_distributions={"alpha": [1, 0.1, 0.01]} + ), + ], +) +def test_search_cv_sample_weight_equivalence(estimator): + estimator_weighted = clone(estimator) + estimator_repeated = clone(estimator) + set_random_state(estimator_weighted, random_state=0) + set_random_state(estimator_repeated, random_state=0) + + rng = np.random.RandomState(42) + n_classes = 3 + n_samples_per_group = 30 + n_groups = 4 + n_samples = n_groups * n_samples_per_group + X = rng.rand(n_samples, n_samples * 2) + y = rng.randint(0, n_classes, size=n_samples) + sw = rng.randint(0, 5, size=n_samples) + # we use groups with LeaveOneGroupOut to ensure that + # the splits are the same in the repeated/weighted datasets + groups = np.tile(np.arange(n_groups), n_samples_per_group) + + X_weighted = X + y_weighted = y + groups_weighted = groups + splits_weighted = list(LeaveOneGroupOut().split(X_weighted, groups=groups_weighted)) + estimator_weighted.set_params(cv=splits_weighted) + # repeat samples according to weights + X_repeated = X_weighted.repeat(repeats=sw, axis=0) + y_repeated = y_weighted.repeat(repeats=sw) + groups_repeated = groups_weighted.repeat(repeats=sw) + splits_repeated = list(LeaveOneGroupOut().split(X_repeated, groups=groups_repeated)) + estimator_repeated.set_params(cv=splits_repeated) + + y_weighted = _enforce_estimator_tags_y(estimator_weighted, y_weighted) + y_repeated = _enforce_estimator_tags_y(estimator_repeated, y_repeated) + + estimator_repeated.fit(X_repeated, y=y_repeated, sample_weight=None) + estimator_weighted.fit(X_weighted, y=y_weighted, sample_weight=sw) + + # check that scores stored in cv_results_ + # are equal for the weighted/repeated datasets + score_keys = [ + key for key in estimator_repeated.cv_results_ if key.endswith("score") + ] + for key in score_keys: + s1 = estimator_repeated.cv_results_[key] + s2 = estimator_weighted.cv_results_[key] + err_msg = f"{key} values are not equal for weighted/repeated datasets" + assert_allclose(s1, s2, err_msg=err_msg) + + for key in ["best_score_", "best_index_"]: + s1 = getattr(estimator_repeated, key) + s2 = getattr(estimator_weighted, key) + err_msg = f"{key} values are not equal for weighted/repeated datasets" + assert_almost_equal(s1, s2, err_msg=err_msg) + + for method in ["predict_proba", "decision_function", "predict", "transform"]: + if hasattr(estimator, method): + s1 = getattr(estimator_repeated, method)(X) + s2 = getattr(estimator_weighted, method)(X) + err_msg = ( + f"Comparing the output of {method} revealed that fitting " + "with `sample_weight` is not equivalent to fitting with removed " + "or repeated data points." 
+ ) + assert_allclose_dense_sparse(s1, s2, err_msg=err_msg) + + +@pytest.mark.parametrize( + "search_cv", + [ + RandomizedSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_distributions={"n_neighbors": [5, 10]}, + scoring="precision", + ), + GridSearchCV( + estimator=LocalOutlierFactor(novelty=True), + param_grid={"n_neighbors": [5, 10]}, + scoring="precision", + ), + ], +) +def test_search_cv_score_samples_method(search_cv): + # Set parameters + rng = np.random.RandomState(42) + n_samples = 300 + outliers_fraction = 0.15 + n_outliers = int(outliers_fraction * n_samples) + n_inliers = n_samples - n_outliers + + # Create dataset + X = make_blobs( + n_samples=n_inliers, + n_features=2, + centers=[[0, 0], [0, 0]], + cluster_std=0.5, + random_state=0, + )[0] + # Add some noisy points + X = np.concatenate([X, rng.uniform(low=-6, high=6, size=(n_outliers, 2))], axis=0) + + # Define labels to be able to score the estimator with `search_cv` + y_true = np.array([1] * n_samples) + y_true[-n_outliers:] = -1 + + # Fit on data + search_cv.fit(X, y_true) + + # Verify that the stand alone estimator yields the same results + # as the ones obtained with *SearchCV + assert_allclose( + search_cv.score_samples(X), search_cv.best_estimator_.score_samples(X) + ) + + +def test_search_cv_results_rank_tie_breaking(): + X, y = make_blobs(n_samples=50, random_state=42) + + # The two C values are close enough to give similar models + # which would result in a tie of their mean cv-scores + param_grid = {"C": [1, 1.001, 0.001]} + + grid_search = GridSearchCV(SVC(), param_grid=param_grid, return_train_score=True) + random_search = RandomizedSearchCV( + SVC(), n_iter=3, param_distributions=param_grid, return_train_score=True + ) + + for search in (grid_search, random_search): + search.fit(X, y) + cv_results = search.cv_results_ + # Check tie breaking strategy - + # Check that there is a tie in the mean scores between + # candidates 1 and 2 alone + assert_almost_equal( + cv_results["mean_test_score"][0], cv_results["mean_test_score"][1] + ) + assert_almost_equal( + cv_results["mean_train_score"][0], cv_results["mean_train_score"][1] + ) + assert not np.allclose( + cv_results["mean_test_score"][1], cv_results["mean_test_score"][2] + ) + assert not np.allclose( + cv_results["mean_train_score"][1], cv_results["mean_train_score"][2] + ) + # 'min' rank should be assigned to the tied candidates + assert_almost_equal(search.cv_results_["rank_test_score"], [1, 1, 3]) + + +def test_search_cv_results_none_param(): + X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1] + estimators = (DecisionTreeRegressor(), DecisionTreeClassifier()) + est_parameters = {"random_state": [0, None]} + cv = KFold() + + for est in estimators: + grid_search = GridSearchCV( + est, + est_parameters, + cv=cv, + ).fit(X, y) + assert_array_equal(grid_search.cv_results_["param_random_state"], [0, None]) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning") +def test_search_cv_timing(): + svc = LinearSVC(random_state=0) + + X = [ + [ + 1, + ], + [ + 2, + ], + [ + 3, + ], + [ + 4, + ], + ] + y = [0, 1, 1, 0] + + gs = GridSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0) + rs = RandomizedSearchCV(svc, {"C": [0, 1]}, cv=2, error_score=0, n_iter=2) + + for search in (gs, rs): + search.fit(X, y) + for key in ["mean_fit_time", "std_fit_time"]: + # NOTE The precision of time.time in windows is not high + # enough for the fit/score times to be non-zero for trivial X and y + assert np.all(search.cv_results_[key] >= 0) + assert 
np.all(search.cv_results_[key] < 1) + + for key in ["mean_score_time", "std_score_time"]: + assert search.cv_results_[key][1] >= 0 + assert search.cv_results_[key][0] == 0.0 + assert np.all(search.cv_results_[key] < 1) + + assert hasattr(search, "refit_time_") + assert isinstance(search.refit_time_, float) + assert search.refit_time_ >= 0 + + +def test_grid_search_correct_score_results(): + # test that correct scores are used + n_splits = 3 + clf = LinearSVC(random_state=0) + X, y = make_blobs(random_state=0, centers=2) + Cs = [0.1, 1, 10] + for score in ["f1", "roc_auc"]: + grid_search = GridSearchCV(clf, {"C": Cs}, scoring=score, cv=n_splits) + cv_results = grid_search.fit(X, y).cv_results_ + + # Test scorer names + result_keys = list(cv_results.keys()) + expected_keys = ("mean_test_score", "rank_test_score") + tuple( + "split%d_test_score" % cv_i for cv_i in range(n_splits) + ) + assert all(np.isin(expected_keys, result_keys)) + + cv = StratifiedKFold(n_splits=n_splits) + n_splits = grid_search.n_splits_ + for candidate_i, C in enumerate(Cs): + clf.set_params(C=C) + cv_scores = np.array( + [ + grid_search.cv_results_["split%d_test_score" % s][candidate_i] + for s in range(n_splits) + ] + ) + for i, (train, test) in enumerate(cv.split(X, y)): + clf.fit(X[train], y[train]) + if score == "f1": + correct_score = f1_score(y[test], clf.predict(X[test])) + elif score == "roc_auc": + dec = clf.decision_function(X[test]) + correct_score = roc_auc_score(y[test], dec) + assert_almost_equal(correct_score, cv_scores[i]) + + +def test_pickle(): + # Test that a fit search can be pickled + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, refit=True, cv=2) + grid_search.fit(X, y) + grid_search_pickled = pickle.loads(pickle.dumps(grid_search)) + assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X)) + + random_search = RandomizedSearchCV( + clf, {"foo_param": [1, 2, 3]}, refit=True, n_iter=3, cv=2 + ) + random_search.fit(X, y) + random_search_pickled = pickle.loads(pickle.dumps(random_search)) + assert_array_almost_equal( + random_search.predict(X), random_search_pickled.predict(X) + ) + + +def test_grid_search_with_multioutput_data(): + # Test search with multi-output estimator + + X, y = make_multilabel_classification(return_indicator=True, random_state=0) + + est_parameters = {"max_depth": [1, 2, 3, 4]} + cv = KFold() + + estimators = [ + DecisionTreeRegressor(random_state=0), + DecisionTreeClassifier(random_state=0), + ] + + # Test with grid search cv + for est in estimators: + grid_search = GridSearchCV(est, est_parameters, cv=cv) + grid_search.fit(X, y) + res_params = grid_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + grid_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + # Test with a randomized search + for est in estimators: + random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3) + random_search.fit(X, y) + res_params = random_search.cv_results_["params"] + for cand_i in range(len(res_params)): + est.set_params(**res_params[cand_i]) + + for i, (train, test) in enumerate(cv.split(X, y)): + est.fit(X[train], y[train]) + correct_score = est.score(X[test], y[test]) + assert_almost_equal( + correct_score, + random_search.cv_results_["split%d_test_score" % i][cand_i], + ) + + +def 
test_predict_proba_disabled(): + # Test predict_proba when disabled on estimator. + X = np.arange(20).reshape(5, -1) + y = [0, 0, 1, 1, 1] + clf = SVC(probability=False) + gs = GridSearchCV(clf, {}, cv=2).fit(X, y) + assert not hasattr(gs, "predict_proba") + + +def test_grid_search_allows_nans(): + # Test GridSearchCV with SimpleImputer + X = np.arange(20, dtype=np.float64).reshape(5, -1) + X[2, :] = np.nan + y = [0, 0, 1, 1, 1] + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + GridSearchCV(p, {"classifier__foo_param": [1, 2, 3]}, cv=2).fit(X, y) + + +class FailingClassifier(BaseEstimator): + """Classifier that raises a ValueError on fit()""" + + FAILING_PARAMETER = 2 + + def __init__(self, parameter=None): + self.parameter = parameter + + def fit(self, X, y=None): + if self.parameter == FailingClassifier.FAILING_PARAMETER: + raise ValueError("Failing classifier failed as required") + + def predict(self, X): + return np.zeros(X.shape[0]) + + def score(self, X=None, Y=None): + return 0.0 + + +def test_grid_search_failing_classifier(): + # GridSearchCV with on_error != 'raise' + # Ensures that a warning is raised and score reset where appropriate. + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we only want to check that errors caused by fits + # to individual folds will be caught and warnings raised instead. If + # refit was done, then an exception would be raised on refit and not + # caught by grid_search (expected behavior), and this would cause an + # error in this test. + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=0.0, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.0.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + + # Ensure that grid scores were set to zero as required for those fits + # that are expected to fail. 
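+    # With the grid [0, 1, 2], only parameter == FailingClassifier.FAILING_PARAMETER
+    # (i.e. 2) triggers the failure, so exactly one candidate should have all of its
+    # per-split scores replaced by the error_score.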
+ def get_cand_scores(i): + return np.array( + [gs.cv_results_["split%d_test_score" % s][i] for s in range(gs.n_splits_)] + ) + + assert all( + ( + np.all(get_cand_scores(cand_i) == 0.0) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + ) + + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score=float("nan"), + ) + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to nan.+" + "5 fits failed with the following error.+ValueError.+Failing classifier failed" + " as required", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + n_candidates = len(gs.cv_results_["params"]) + assert all( + np.all(np.isnan(get_cand_scores(cand_i))) + for cand_i in range(n_candidates) + if gs.cv_results_["param_parameter"][cand_i] + == FailingClassifier.FAILING_PARAMETER + ) + + ranks = gs.cv_results_["rank_test_score"] + + # Check that succeeded estimators have lower ranks + assert ranks[0] <= 2 and ranks[1] <= 2 + # Check that failed estimator has the highest rank + assert ranks[clf.FAILING_PARAMETER] == 3 + assert gs.best_index_ != clf.FAILING_PARAMETER + + +def test_grid_search_classifier_all_fits_fail(): + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + error_score=0.0, + ) + + warning_message = re.compile( + ( + "All the 15 fits failed.+15 fits failed with the following" + " error.+ValueError.+Failing classifier failed as required" + ), + flags=re.DOTALL, + ) + with pytest.raises(ValueError, match=warning_message): + gs.fit(X, y) + + +def test_grid_search_failing_classifier_raise(): + # GridSearchCV with on_error == 'raise' raises the error + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + # refit=False because we want to test the behaviour of the grid search part + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring="accuracy", + refit=False, + error_score="raise", + ) + + # FailingClassifier issues a ValueError so this is what we look for. + with pytest.raises(ValueError): + gs.fit(X, y) + + +def test_parameters_sampler_replacement(): + # raise warning if n_iter is bigger than total parameter space + params = [ + {"first": [0, 1], "second": ["a", "b", "c"]}, + {"third": ["two", "values"]}, + ] + sampler = ParameterSampler(params, n_iter=9) + n_iter = 9 + grid_size = 8 + expected_warning = ( + "The total space of parameters %d is smaller " + "than n_iter=%d. Running %d iterations. For " + "exhaustive searches, use GridSearchCV." 
% (grid_size, n_iter, grid_size) + ) + with pytest.warns(UserWarning, match=expected_warning): + list(sampler) + + # degenerates to GridSearchCV if n_iter the same as grid_size + sampler = ParameterSampler(params, n_iter=8) + samples = list(sampler) + assert len(samples) == 8 + for values in ParameterGrid(params): + assert values in samples + assert len(ParameterSampler(params, n_iter=1000)) == 8 + + # test sampling without replacement in a large grid + params = {"a": range(10), "b": range(10), "c": range(10)} + sampler = ParameterSampler(params, n_iter=99, random_state=42) + samples = list(sampler) + assert len(samples) == 99 + hashable_samples = ["a%db%dc%d" % (p["a"], p["b"], p["c"]) for p in samples] + assert len(set(hashable_samples)) == 99 + + # doesn't go into infinite loops + params_distribution = {"first": bernoulli(0.5), "second": ["a", "b", "c"]} + sampler = ParameterSampler(params_distribution, n_iter=7) + samples = list(sampler) + assert len(samples) == 7 + + +def test_stochastic_gradient_loss_param(): + # Make sure the predict_proba works when loss is specified + # as one of the parameters in the param_grid. + param_grid = { + "loss": ["log_loss"], + } + X = np.arange(24).reshape(6, -1) + y = [0, 0, 0, 1, 1, 1] + clf = GridSearchCV( + estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3 + ) + + # When the estimator is not fitted, `predict_proba` is not available as the + # loss is 'hinge'. + assert not hasattr(clf, "predict_proba") + clf.fit(X, y) + clf.predict_proba(X) + clf.predict_log_proba(X) + + # Make sure `predict_proba` is not available when setting loss=['hinge'] + # in param_grid + param_grid = { + "loss": ["hinge"], + } + clf = GridSearchCV( + estimator=SGDClassifier(loss="hinge"), param_grid=param_grid, cv=3 + ) + assert not hasattr(clf, "predict_proba") + clf.fit(X, y) + assert not hasattr(clf, "predict_proba") + + +def test_search_train_scores_set_to_false(): + X = np.arange(6).reshape(6, -1) + y = [0, 0, 0, 1, 1, 1] + clf = LinearSVC(random_state=0) + + gs = GridSearchCV(clf, param_grid={"C": [0.1, 0.2]}, cv=3) + gs.fit(X, y) + + +def test_grid_search_cv_splits_consistency(): + # Check if a one time iterable is accepted as a cv parameter. 
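+    # A one-time iterable of (train, test) splits can be consumed only once, so the
+    # search must materialize the splits a single time and reuse them for every
+    # candidate; the comparisons against equivalent KFold-based searches below
+    # check exactly that.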
+ n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=n_samples, random_state=0) + + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + return_train_score=True, + ) + gs.fit(X, y) + + gs2 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits), + return_train_score=True, + ) + gs2.fit(X, y) + + # Give generator as a cv parameter + assert isinstance( + KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + GeneratorType, + ) + gs3 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X, y), + return_train_score=True, + ) + gs3.fit(X, y) + + gs4 = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, random_state=0), + return_train_score=True, + ) + gs4.fit(X, y) + + def _pop_time_keys(cv_results): + for key in ( + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ): + cv_results.pop(key) + return cv_results + + # Check if generators are supported as cv and + # that the splits are consistent + np.testing.assert_equal( + _pop_time_keys(gs3.cv_results_), _pop_time_keys(gs4.cv_results_) + ) + + # OneTimeSplitter is a non-re-entrant cv where split can be called only + # once if ``cv.split`` is called once per param setting in GridSearchCV.fit + # the 2nd and 3rd parameter will not be evaluated as no train/test indices + # will be generated for the 2nd and subsequent cv.split calls. + # This is a check to make sure cv.split is not called once per param + # setting. + np.testing.assert_equal( + {k: v for k, v in gs.cv_results_.items() if not k.endswith("_time")}, + {k: v for k, v in gs2.cv_results_.items() if not k.endswith("_time")}, + ) + + # Check consistency of folds across the parameters + gs = GridSearchCV( + LinearSVC(random_state=0), + param_grid={"C": [0.1, 0.1, 0.2, 0.2]}, + cv=KFold(n_splits=n_splits, shuffle=True), + return_train_score=True, + ) + gs.fit(X, y) + + # As the first two param settings (C=0.1) and the next two param + # settings (C=0.2) are same, the test and train scores must also be + # same as long as the same train/test indices are generated for all + # the cv splits, for both param setting + for score_type in ("train", "test"): + per_param_scores = {} + for param_i in range(4): + per_param_scores[param_i] = [ + gs.cv_results_["split%d_%s_score" % (s, score_type)][param_i] + for s in range(5) + ] + + assert_array_almost_equal(per_param_scores[0], per_param_scores[1]) + assert_array_almost_equal(per_param_scores[2], per_param_scores[3]) + + +def test_transform_inverse_transform_round_trip(): + clf = MockClassifier() + grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, cv=2, verbose=3) + + grid_search.fit(X, y) + X_round_trip = grid_search.inverse_transform(grid_search.transform(X)) + assert_array_equal(X, X_round_trip) + + +def test_custom_run_search(): + def check_results(results, gscv): + exp_results = gscv.cv_results_ + assert sorted(results.keys()) == sorted(exp_results) + for k in results: + if not k.endswith("_time"): + # XXX: results['params'] is a list :| + results[k] = np.asanyarray(results[k]) + if results[k].dtype.kind == "O": + assert_array_equal( + exp_results[k], results[k], err_msg="Checking " + k + ) + else: + assert_allclose(exp_results[k], results[k], err_msg="Checking " + 
k) + + def fit_grid(param_grid): + return GridSearchCV(clf, param_grid, return_train_score=True).fit(X, y) + + class CustomSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def _run_search(self, evaluate): + results = evaluate([{"max_depth": 1}, {"max_depth": 2}]) + check_results(results, fit_grid({"max_depth": [1, 2]})) + results = evaluate([{"min_samples_split": 5}, {"min_samples_split": 10}]) + check_results( + results, + fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]), + ) + + # Using regressor to make sure each score differs + clf = DecisionTreeRegressor(random_state=0) + X, y = make_classification(n_samples=100, n_informative=4, random_state=0) + mycv = CustomSearchCV(clf, return_train_score=True).fit(X, y) + gscv = fit_grid([{"max_depth": [1, 2]}, {"min_samples_split": [5, 10]}]) + + results = mycv.cv_results_ + check_results(results, gscv) + for attr in dir(gscv): + if ( + attr[0].islower() + and attr[-1:] == "_" + and attr + not in { + "cv_results_", + "best_estimator_", + "refit_time_", + "classes_", + "scorer_", + } + ): + assert getattr(gscv, attr) == getattr(mycv, attr), ( + "Attribute %s not equal" % attr + ) + + +def test__custom_fit_no_run_search(): + class NoRunSearchSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + def fit(self, X, y=None, groups=None, **fit_params): + return self + + # this should not raise any exceptions + NoRunSearchSearchCV(SVC()).fit(X, y) + + class BadSearchCV(BaseSearchCV): + def __init__(self, estimator, **kwargs): + super().__init__(estimator, **kwargs) + + with pytest.raises(NotImplementedError, match="_run_search not implemented."): + # this should raise a NotImplementedError + BadSearchCV(SVC()).fit(X, y) + + +def test_empty_cv_iterator_error(): + # Use global X, y + + # create cv + cv = KFold(n_splits=3).split(X) + + # pop all of it, this should cause the expected ValueError + [u for u in cv] + # cv is empty now + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "No fits were performed. " + "Was the CV iterator empty\\? " + "Were there no candidates\\?" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +def test_random_search_bad_cv(): + # Use global X, y + + class BrokenKFold(KFold): + def get_n_splits(self, *args, **kw): + return 1 + + # create bad cv + cv = BrokenKFold(n_splits=3) + + train_size = 100 + ridge = RandomizedSearchCV(Ridge(), {"alpha": [1e-3, 1e-2, 1e-1]}, cv=cv, n_jobs=4) + + # assert that this raises an error + with pytest.raises( + ValueError, + match=( + "cv.split and cv.get_n_splits returned " + "inconsistent results. 
Expected \\d+ " + "splits, got \\d+" + ), + ): + ridge.fit(X[:train_size], y[:train_size]) + + +@pytest.mark.parametrize("return_train_score", [False, True]) +@pytest.mark.parametrize( + "SearchCV, specialized_params", + [ + (GridSearchCV, {"param_grid": {"max_depth": [2, 3, 5, 8]}}), + ( + RandomizedSearchCV, + {"param_distributions": {"max_depth": [2, 3, 5, 8]}, "n_iter": 4}, + ), + ], +) +def test_searchcv_raise_warning_with_non_finite_score( + SearchCV, specialized_params, return_train_score +): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/10529 + # Check that we raise a UserWarning when a non-finite score is + # computed in the SearchCV + X, y = make_classification(n_classes=2, random_state=0) + + class FailingScorer: + """Scorer that will fail for some split but not all.""" + + def __init__(self): + self.n_counts = 0 + + def __call__(self, estimator, X, y): + self.n_counts += 1 + if self.n_counts % 5 == 0: + return np.nan + return 1 + + grid = SearchCV( + DecisionTreeClassifier(), + scoring=FailingScorer(), + cv=3, + return_train_score=return_train_score, + **specialized_params, + ) + + with pytest.warns(UserWarning) as warn_msg: + grid.fit(X, y) + + set_with_warning = ["test", "train"] if return_train_score else ["test"] + assert len(warn_msg) == len(set_with_warning) + for msg, dataset in zip(warn_msg, set_with_warning): + assert f"One or more of the {dataset} scores are non-finite" in str(msg.message) + + # all non-finite scores should be equally ranked last + last_rank = grid.cv_results_["rank_test_score"].max() + non_finite_mask = np.isnan(grid.cv_results_["mean_test_score"]) + assert_array_equal(grid.cv_results_["rank_test_score"][non_finite_mask], last_rank) + # all finite scores should be better ranked than the non-finite scores + assert np.all(grid.cv_results_["rank_test_score"][~non_finite_mask] < last_rank) + + +def test_callable_multimetric_confusion_matrix(): + # Test callable with many metrics inserts the correct names and metrics + # into the search cv object + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search = GridSearchCV(est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="fp") + + search.fit(X, y) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "mean_test_{}".format(name) in search.cv_results_ + + y_pred = search.predict(X) + cm = confusion_matrix(y, y_pred) + assert search.score(X, y) == pytest.approx(cm[0, 1]) + + +def test_callable_multimetric_same_as_list_of_strings(): + # Test callable multimetric is the same as a list of strings + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return { + "recall": recall_score(y, y_pred), + "accuracy": accuracy_score(y, y_pred), + } + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit="recall" + ) + search_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall", "accuracy"], refit="recall" + ) + + search_callable.fit(X, y) + search_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + 
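+# Illustrative sketch (not itself a test) of the callable-scorer contract
+# exercised by the surrounding tests: the callable receives the fitted
+# estimator and the evaluation data and returns either a single float
+# (single-metric) or a dict of named floats (multi-metric); in the
+# multi-metric case, a string `refit` must name one of the returned keys, e.g.:
+#
+#     def scorer(estimator, X, y):
+#         y_pred = estimator.predict(X)
+#         return {"acc": accuracy_score(y, y_pred), "rec": recall_score(y, y_pred)}
+#
+#     GridSearchCV(LinearSVC(), {"C": [0.1, 1]}, scoring=scorer, refit="acc")
+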
+def test_callable_single_metric_same_as_single_string(): + # Tests callable scorer is the same as scoring with a single string + def custom_scorer(est, X, y): + y_pred = est.predict(X) + return recall_score(y, y_pred) + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + search_callable = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=custom_scorer, refit=True + ) + search_str = GridSearchCV(est, {"C": [0.1, 1]}, scoring="recall", refit="recall") + search_list_str = GridSearchCV( + est, {"C": [0.1, 1]}, scoring=["recall"], refit="recall" + ) + search_callable.fit(X, y) + search_str.fit(X, y) + search_list_str.fit(X, y) + + assert search_callable.best_score_ == pytest.approx(search_str.best_score_) + assert search_callable.best_index_ == search_str.best_index_ + assert search_callable.score(X, y) == pytest.approx(search_str.score(X, y)) + + assert search_list_str.best_score_ == pytest.approx(search_str.best_score_) + assert search_list_str.best_index_ == search_str.best_index_ + assert search_list_str.score(X, y) == pytest.approx(search_str.score(X, y)) + + +def test_callable_multimetric_error_on_invalid_key(): + # Raises when the callable scorer does not return a dict with `refit` key. + def bad_scorer(est, X, y): + return {"bad_name": 1} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + clf = GridSearchCV( + LinearSVC(random_state=42), + {"C": [0.1, 1]}, + scoring=bad_scorer, + refit="good_name", + ) + + msg = ( + "For multi-metric scoring, the parameter refit must be set to a " + "scorer key or a callable to refit" + ) + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_callable_multimetric_error_failing_clf(): + # Warns when there is an estimator the fails to fit with a float + # error_score + def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + gs = GridSearchCV( + clf, + [{"parameter": [0, 1, 2]}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + warning_message = re.compile( + "5 fits failed.+total of 15.+The score on these" + r" train-test partitions for these parameters will be set to 0\.1", + flags=re.DOTALL, + ) + with pytest.warns(FitFailedWarning, match=warning_message): + gs.fit(X, y) + + assert_allclose(gs.cv_results_["mean_test_acc"], [1, 1, 0.1]) + + +def test_callable_multimetric_clf_all_fits_fail(): + # Warns and raises when all estimator fails to fit. 
+ def custom_scorer(est, X, y): + return {"acc": 1} + + X, y = make_classification(n_samples=20, n_features=10, random_state=0) + + clf = FailingClassifier() + + gs = GridSearchCV( + clf, + [{"parameter": [FailingClassifier.FAILING_PARAMETER] * 3}], + scoring=custom_scorer, + refit=False, + error_score=0.1, + ) + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 15 fits failed.+your model is misconfigured.+" + f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + gs.fit(X, y) + + +def test_n_features_in(): + # make sure grid search and random search delegate n_features_in to the + # best estimator + n_features = 4 + X, y = make_classification(n_features=n_features) + gbdt = HistGradientBoostingClassifier() + param_grid = {"max_iter": [3, 4]} + gs = GridSearchCV(gbdt, param_grid) + rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1) + assert not hasattr(gs, "n_features_in_") + assert not hasattr(rs, "n_features_in_") + gs.fit(X, y) + rs.fit(X, y) + assert gs.n_features_in_ == n_features + assert rs.n_features_in_ == n_features + + +@pytest.mark.parametrize("pairwise", [True, False]) +def test_search_cv_pairwise_property_delegated_to_base_estimator(pairwise): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class TestEstimator(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = pairwise + return tags + + est = TestEstimator() + attr_message = "BaseSearchCV pairwise tag must match estimator" + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert pairwise == cv.__sklearn_tags__().input_tags.pairwise, attr_message + + +def test_search_cv__pairwise_property_delegated_to_base_estimator(): + """ + Test implementation of BaseSearchCV has the pairwise property + which matches the pairwise tag of its estimator. + This test make sure pairwise tag is delegated to the base estimator. + + Non-regression test for issue #13920. + """ + + class EstimatorPairwise(BaseEstimator): + def __init__(self, pairwise=True): + self.pairwise = pairwise + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = self.pairwise + return tags + + est = EstimatorPairwise() + attr_message = "BaseSearchCV _pairwise property must match estimator" + + for _pairwise_setting in [True, False]: + est.set_params(pairwise=_pairwise_setting) + cv = GridSearchCV(est, {"n_neighbors": [10]}) + assert _pairwise_setting == cv.__sklearn_tags__().input_tags.pairwise, ( + attr_message + ) + + +def test_search_cv_pairwise_property_equivalence_of_precomputed(): + """ + Test implementation of BaseSearchCV has the pairwise tag + which matches the pairwise tag of its estimator. + This test ensures the equivalence of 'precomputed'. + + Non-regression test for issue #13920. 
+ """ + n_samples = 50 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + grid_params = {"n_neighbors": [10]} + + # defaults to euclidean metric (minkowski p = 2) + clf = KNeighborsClassifier() + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X, y) + preds_original = cv.predict(X) + + # precompute euclidean metric to validate pairwise is working + X_precomputed = euclidean_distances(X) + clf = KNeighborsClassifier(metric="precomputed") + cv = GridSearchCV(clf, grid_params, cv=n_splits) + cv.fit(X_precomputed, y) + preds_precomputed = cv.predict(X_precomputed) + + attr_message = "GridSearchCV not identical with precomputed metric" + assert (preds_original == preds_precomputed).all(), attr_message + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [(GridSearchCV, {"a": [0.1, 0.01]}), (RandomizedSearchCV, {"a": uniform(1, 3)})], +) +def test_scalar_fit_param(SearchCV, param_search): + # unofficially sanctioned tolerance for scalar values in fit_params + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + class TestEstimator(ClassifierMixin, BaseEstimator): + def __init__(self, a=None): + self.a = a + + def fit(self, X, y, r=None): + self.r_ = r + + def predict(self, X): + return np.zeros(shape=(len(X))) + + model = SearchCV(TestEstimator(), param_search) + X, y = make_classification(random_state=42) + model.fit(X, y, r=42) + assert model.best_estimator_.r_ == 42 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, {"alpha": [0.1, 0.01]}), + (RandomizedSearchCV, {"alpha": uniform(0.01, 0.1)}), + ], +) +def test_scalar_fit_param_compat(SearchCV, param_search): + # check support for scalar values in fit_params, for instance in LightGBM + # that do not exactly respect the scikit-learn API contract but that we do + # not want to break without an explicit deprecation cycle and API + # recommendations for implementing early stopping with a user provided + # validation set. non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15805 + X_train, X_valid, y_train, y_valid = train_test_split( + *make_classification(random_state=42), random_state=42 + ) + + class _FitParamClassifier(SGDClassifier): + def fit( + self, + X, + y, + sample_weight=None, + tuple_of_arrays=None, + scalar_param=None, + callable_param=None, + ): + super().fit(X, y, sample_weight=sample_weight) + assert scalar_param > 0 + assert callable(callable_param) + + # The tuple of arrays should be preserved as tuple. + assert isinstance(tuple_of_arrays, tuple) + assert tuple_of_arrays[0].ndim == 2 + assert tuple_of_arrays[1].ndim == 1 + return self + + def _fit_param_callable(): + pass + + model = SearchCV(_FitParamClassifier(), param_search) + + # NOTE: `fit_params` should be data dependent (e.g. `sample_weight`) which + # is not the case for the following parameters. But this abuse is common in + # popular third-party libraries and we should tolerate this behavior for + # now and be careful not to break support for those without following + # proper deprecation cycle. + fit_params = { + "tuple_of_arrays": (X_valid, y_valid), + "callable_param": _fit_param_callable, + "scalar_param": 42, + } + model.fit(X_train, y_train, **fit_params) + + +# FIXME: Replace this test with a full `check_estimator` once we have API only +# checks. 
+@pytest.mark.filterwarnings("ignore:The total space of parameters 4 is") +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +@pytest.mark.parametrize("Predictor", [MinimalRegressor, MinimalClassifier]) +def test_search_cv_using_minimal_compatible_estimator(SearchCV, Predictor): + # Check that third-party library can run tests without inheriting from + # BaseEstimator. + rng = np.random.RandomState(0) + X, y = rng.randn(25, 2), np.array([0] * 5 + [1] * 20) + + model = Pipeline( + [("transformer", MinimalTransformer()), ("predictor", Predictor())] + ) + + params = { + "transformer__param": [1, 10], + "predictor__parama": [1, 10], + } + search = SearchCV(model, params, error_score="raise") + search.fit(X, y) + + assert search.best_params_.keys() == params.keys() + + y_pred = search.predict(X) + if is_classifier(search): + assert_array_equal(y_pred, 1) + assert search.score(X, y) == pytest.approx(accuracy_score(y, y_pred)) + else: + assert_allclose(y_pred, y.mean()) + assert search.score(X, y) == pytest.approx(r2_score(y, y_pred)) + + +@pytest.mark.parametrize("return_train_score", [True, False]) +def test_search_cv_verbose_3(capsys, return_train_score): + """Check that search cv with verbose>2 shows the score for single + metrics. non-regression test for #19658.""" + X, y = make_classification(n_samples=100, n_classes=2, flip_y=0.2, random_state=0) + clf = LinearSVC(random_state=0) + grid = {"C": [0.1]} + + GridSearchCV( + clf, + grid, + scoring="accuracy", + verbose=3, + cv=3, + return_train_score=return_train_score, + ).fit(X, y) + captured = capsys.readouterr().out + if return_train_score: + match = re.findall(r"score=\(train=[\d\.]+, test=[\d.]+\)", captured) + else: + match = re.findall(r"score=[\d\.]+", captured) + assert len(match) == 3 + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_search_estimator_param(SearchCV, param_search): + # test that SearchCV object doesn't change the object given in the parameter grid + X, y = make_classification(random_state=42) + + params = {"clf": [LinearSVC()], "clf__C": [0.01]} + orig_C = params["clf"][0].C + + pipe = Pipeline([("trs", MinimalTransformer()), ("clf", None)]) + + param_grid_search = {param_search: params} + gs = SearchCV(pipe, refit=True, cv=2, scoring="accuracy", **param_grid_search).fit( + X, y + ) + + # testing that the original object in params is not changed + assert params["clf"][0].C == orig_C + # testing that the GS is setting the parameter of the step correctly + assert gs.best_estimator_.named_steps["clf"].C == 0.01 + + +def test_search_with_2d_array(): + parameter_grid = { + "vect__ngram_range": ((1, 1), (1, 2)), # unigrams or bigrams + "vect__norm": ("l1", "l2"), + } + pipeline = Pipeline( + [ + ("vect", TfidfVectorizer()), + ("clf", ComplementNB()), + ] + ) + random_search = RandomizedSearchCV( + estimator=pipeline, + param_distributions=parameter_grid, + n_iter=3, + random_state=0, + n_jobs=2, + verbose=1, + cv=3, + ) + data_train = ["one", "two", "three", "four", "five"] + data_target = [0, 0, 1, 0, 1] + random_search.fit(data_train, data_target) + result = random_search.cv_results_["param_vect__ngram_range"] + expected_data = np.empty(3, dtype=object) + expected_data[:] = [(1, 2), (1, 2), (1, 1)] + np.testing.assert_array_equal(result.data, expected_data) + + +def test_search_html_repr(): + """Test different HTML representations for GridSearchCV.""" 
+ X, y = make_classification(random_state=42) + + pipeline = Pipeline([("scale", StandardScaler()), ("clf", DummyClassifier())]) + param_grid = {"clf": [DummyClassifier(), LogisticRegression()]} + + # Unfitted shows the original pipeline + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=False) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=False` shows the original pipeline + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" in repr_html + + # Fitted with `refit=True` shows the best estimator + search_cv = GridSearchCV(pipeline, param_grid=param_grid, refit=True) + search_cv.fit(X, y) + with config_context(display="diagram"): + repr_html = search_cv._repr_html_() + assert "
<pre>DummyClassifier()</pre>
" not in repr_html + assert "
<pre>LogisticRegression()</pre>
" in repr_html + + +# Metadata Routing Tests +# ====================== + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + ], +) +@config_context(enable_metadata_routing=True) +def test_multi_metric_search_forwards_metadata(SearchCV, param_search): + """Test that *SearchCV forwards metadata correctly when passed multiple metrics.""" + X, y = make_classification(random_state=42) + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + scoring = dict(my_scorer=scorer, accuracy="accuracy") + SearchCV(est, refit="accuracy", cv=2, scoring=scoring, **param_grid_search).fit( + X, y, score_weights=score_weights, score_metadata=score_metadata + ) + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent="_score", + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + +@pytest.mark.parametrize( + "SearchCV, param_search", + [ + (GridSearchCV, "param_grid"), + (RandomizedSearchCV, "param_distributions"), + (HalvingGridSearchCV, "param_grid"), + ], +) +def test_score_rejects_params_with_no_routing_enabled(SearchCV, param_search): + """*SearchCV should reject **params when metadata routing is not enabled + since this is added only when routing is enabled.""" + X, y = make_classification(random_state=42) + est = LinearSVC() + param_grid_search = {param_search: {"C": [1]}} + + gs = SearchCV(est, cv=2, **param_grid_search).fit(X, y) + + with pytest.raises(ValueError, match="is only supported if"): + gs.score(X, y, metadata=1) + + +# End of Metadata Routing Tests +# ============================= + + +def test_cv_results_dtype_issue_29074(): + """Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29074""" + + class MetaEstimator(BaseEstimator, ClassifierMixin): + def __init__( + self, + base_clf, + parameter1=None, + parameter2=None, + parameter3=None, + parameter4=None, + ): + self.base_clf = base_clf + self.parameter1 = parameter1 + self.parameter2 = parameter2 + self.parameter3 = parameter3 + self.parameter4 = parameter4 + + def fit(self, X, y=None): + self.base_clf.fit(X, y) + return self + + def score(self, X, y): + return self.base_clf.score(X, y) + + # Values of param_grid are such that np.result_type gives slightly + # different errors, in particular ValueError and TypeError + param_grid = { + "parameter1": [None, {"option": "A"}, {"option": "B"}], + "parameter2": [None, [1, 2]], + "parameter3": [{"a": 1}], + "parameter4": ["str1", "str2"], + } + grid_search = GridSearchCV( + estimator=MetaEstimator(LogisticRegression()), + param_grid=param_grid, + cv=3, + ) + + X, y = make_blobs(random_state=0) + grid_search.fit(X, y) + for param in param_grid: + assert grid_search.cv_results_[f"param_{param}"].dtype == object + + +def test_search_with_estimators_issue_29157(): + """Check cv_results_ for estimators with a `dtype` parameter, e.g. 
OneHotEncoder.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "numeric_1": [1, 2, 3, 4, 5], + "object_1": ["a", "a", "a", "a", "a"], + "target": [1.0, 4.1, 2.0, 3.0, 1.0], + } + ) + X = df.drop("target", axis=1) + y = df["target"] + enc = ColumnTransformer( + [("enc", OneHotEncoder(sparse_output=False), ["object_1"])], + remainder="passthrough", + ) + pipe = Pipeline( + [ + ("enc", enc), + ("regressor", LinearRegression()), + ] + ) + grid_params = { + "enc__enc": [ + OneHotEncoder(sparse_output=False), + OrdinalEncoder(), + ] + } + grid_search = GridSearchCV(pipe, grid_params, cv=2) + grid_search.fit(X, y) + assert grid_search.cv_results_["param_enc__enc"].dtype == object + + +def test_cv_results_multi_size_array(): + """Check that GridSearchCV works with params that are arrays of different sizes. + + Non-regression test for #29277. + """ + n_features = 10 + X, y = make_classification(n_features=10) + + spline_reg_pipe = make_pipeline( + SplineTransformer(extrapolation="periodic"), + LogisticRegression(), + ) + + n_knots_list = [n_features * i for i in [10, 11, 12]] + knots_list = [ + np.linspace(0, np.pi * 2, n_knots).reshape((-1, n_features)) + for n_knots in n_knots_list + ] + spline_reg_pipe_cv = GridSearchCV( + estimator=spline_reg_pipe, + param_grid={ + "splinetransformer__knots": knots_list, + }, + ) + + spline_reg_pipe_cv.fit(X, y) + assert ( + spline_reg_pipe_cv.cv_results_["param_splinetransformer__knots"].dtype == object + ) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize("SearchCV", [GridSearchCV, RandomizedSearchCV]) +def test_array_api_search_cv_classifier(SearchCV, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + X_np = X.astype(dtype) + X_xp = xp.asarray(X_np, device=device) + + # y should always be an integer, no matter what `dtype` is + y_np = np.array([0] * 5 + [1] * 5) + y_xp = xp.asarray(y_np, device=device) + + with config_context(array_api_dispatch=True): + searcher = SearchCV( + LinearDiscriminantAnalysis(), + {"tol": [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]}, + cv=2, + error_score="raise", + ) + searcher.fit(X_xp, y_xp) + searcher.score(X_xp, y_xp) + + +# Construct these outside the tests so that the same object is used +# for both input and `expected` +one_hot_encoder = OneHotEncoder() +ordinal_encoder = OrdinalEncoder() + +# If we construct this directly via `MaskedArray`, the list of tuples +# gets auto-converted to a 2D array. 
+ma_with_tuples = np.ma.MaskedArray(np.empty(2), mask=True, dtype=object) # type: ignore[var-annotated] +ma_with_tuples[0] = (1, 2) +ma_with_tuples[1] = (3, 4) + + +@pytest.mark.parametrize( + ("candidate_params", "expected"), + [ + pytest.param( + [{"foo": 1}, {"foo": 2}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2]))), + ], + id="simple numeric, single param", + ), + pytest.param( + [{"foo": 1, "bar": 3}, {"foo": 2, "bar": 4}, {"foo": 3}], + [ + ("param_foo", np.ma.MaskedArray(np.array([1, 2, 3]))), + ( + "param_bar", + np.ma.MaskedArray(np.array([3, 4, 0]), mask=[False, False, True]), + ), + ], + id="simple numeric, one param is missing in one round", + ), + pytest.param( + [{"foo": [[1], [2], [3]]}, {"foo": [[1], [2]]}], + [ + ( + "param_foo", + np.ma.MaskedArray([[[1], [2], [3]], [[1], [2]]], dtype=object), + ), + ], + id="lists of different lengths", + ), + pytest.param( + [{"foo": (1, 2)}, {"foo": (3, 4)}], + [ + ( + "param_foo", + ma_with_tuples, + ), + ], + id="lists tuples", + ), + pytest.param( + [{"foo": ordinal_encoder}, {"foo": one_hot_encoder}], + [ + ( + "param_foo", + np.ma.MaskedArray([ordinal_encoder, one_hot_encoder], dtype=object), + ), + ], + id="estimators", + ), + ], +) +def test_yield_masked_array_for_each_param(candidate_params, expected): + result = list(_yield_masked_array_for_each_param(candidate_params)) + for (key, value), (expected_key, expected_value) in zip(result, expected): + assert key == expected_key + assert value.dtype == expected_value.dtype + np.testing.assert_array_equal(value, expected_value) + np.testing.assert_array_equal(value.mask, expected_value.mask) + + +def test_yield_masked_array_no_runtime_warning(): + # non-regression test for https://github.com/scikit-learn/scikit-learn/issues/29929 + candidate_params = [{"param": i} for i in range(1000)] + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + list(_yield_masked_array_for_each_param(candidate_params)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_split.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_split.py new file mode 100644 index 0000000000000000000000000000000000000000..0f31055d9b7f959c36888efbd4adae01f0a06822 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_split.py @@ -0,0 +1,2102 @@ +"""Test the split module""" + +import re +import warnings +from itertools import combinations, combinations_with_replacement, permutations + +import numpy as np +import pytest +from scipy import stats +from scipy.sparse import issparse +from scipy.special import comb + +from sklearn import config_context +from sklearn.datasets import load_digits, make_classification +from sklearn.dummy import DummyClassifier +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + LeavePOut, + PredefinedSplit, + RepeatedKFold, + RepeatedStratifiedKFold, + ShuffleSplit, + StratifiedGroupKFold, + StratifiedKFold, + StratifiedShuffleSplit, + TimeSeriesSplit, + check_cv, + cross_val_score, + train_test_split, +) +from sklearn.model_selection._split import ( + _build_repr, + _validate_shuffle_split, + _yields_constant_splits, +) +from sklearn.svm import SVC +from sklearn.tests.metadata_routing_common import assert_request_is_empty +from sklearn.utils._array_api import ( + _convert_to_numpy, + _get_namespace_device_dtype_ids, + get_namespace, + 
yield_namespace_device_dtype_combinations, +) +from sklearn.utils._array_api import ( + device as array_api_device, +) +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.estimator_checks import ( + _array_api_for_tests, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + +NO_GROUP_SPLITTERS = [ + KFold(), + StratifiedKFold(), + TimeSeriesSplit(), + LeaveOneOut(), + LeavePOut(p=2), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + PredefinedSplit([1, 1, 2, 2]), + RepeatedKFold(), + RepeatedStratifiedKFold(), +] + +GROUP_SPLITTERS = [ + GroupKFold(), + LeavePGroupsOut(n_groups=1), + StratifiedGroupKFold(), + LeaveOneGroupOut(), + GroupShuffleSplit(), +] +GROUP_SPLITTER_NAMES = set(splitter.__class__.__name__ for splitter in GROUP_SPLITTERS) + +ALL_SPLITTERS = NO_GROUP_SPLITTERS + GROUP_SPLITTERS # type: ignore[list-item] + +SPLITTERS_REQUIRING_TARGET = [ + StratifiedKFold(), + StratifiedShuffleSplit(), + RepeatedStratifiedKFold(), +] + +X = np.ones(10) +y = np.arange(10) // 2 +test_groups = ( + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], +) +digits = load_digits() + +pytestmark = pytest.mark.filterwarnings( + "error:The groups parameter:UserWarning:sklearn.*" +) + + +def _split(splitter, X, y, groups): + if splitter.__class__.__name__ in GROUP_SPLITTER_NAMES: + return splitter.split(X, y, groups=groups) + else: + return splitter.split(X, y) + + +def test_cross_validator_with_default_params(): + n_samples = 4 + n_unique_groups = 4 + n_splits = 2 + p = 2 + n_shuffle_splits = 10 # (the default value) + + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + X_1d = np.array([1, 2, 3, 4]) + y = np.array([1, 1, 2, 2]) + groups = np.array([1, 2, 3, 4]) + loo = LeaveOneOut() + lpo = LeavePOut(p) + kf = KFold(n_splits) + skf = StratifiedKFold(n_splits) + lolo = LeaveOneGroupOut() + lopo = LeavePGroupsOut(p) + ss = ShuffleSplit(random_state=0) + ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 + sgkf = StratifiedGroupKFold(n_splits) + + loo_repr = "LeaveOneOut()" + lpo_repr = "LeavePOut(p=2)" + kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" + skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" + lolo_repr = "LeaveOneGroupOut()" + lopo_repr = "LeavePGroupsOut(n_groups=2)" + ss_repr = ( + "ShuffleSplit(n_splits=10, random_state=0, test_size=None, train_size=None)" + ) + ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" + sgkf_repr = "StratifiedGroupKFold(n_splits=2, random_state=None, shuffle=False)" + + n_splits_expected = [ + n_samples, + comb(n_samples, p), + n_splits, + n_splits, + n_unique_groups, + comb(n_unique_groups, p), + n_shuffle_splits, + 2, + n_splits, + ] + + for i, (cv, cv_repr) in enumerate( + zip( + [loo, lpo, kf, skf, lolo, lopo, ss, ps, sgkf], + [ + loo_repr, + lpo_repr, + kf_repr, + skf_repr, + lolo_repr, + lopo_repr, + ss_repr, + ps_repr, + sgkf_repr, + ], + ) + ): + # Test if get_n_splits works correctly + assert n_splits_expected[i] == cv.get_n_splits(X, y, groups) + + # Test if the cross-validator 
works as expected even if + # the data is 1d + np.testing.assert_equal( + list(_split(cv, X, y, groups)), list(_split(cv, X_1d, y, groups)) + ) + # Test that train, test indices returned are integers + for train, test in _split(cv, X, y, groups): + assert np.asarray(train).dtype.kind == "i" + assert np.asarray(test).dtype.kind == "i" + + # Test if the repr works without any errors + assert cv_repr == repr(cv) + + # ValueError for get_n_splits methods + msg = "The 'X' parameter should not be None." + with pytest.raises(ValueError, match=msg): + loo.get_n_splits(None, y, groups) + with pytest.raises(ValueError, match=msg): + lpo.get_n_splits(None, y, groups) + + +def test_2d_y(): + # smoke test for 2d y and multi-label + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + y_2d = y.reshape(-1, 1) + y_multilabel = rng.randint(0, 2, size=(n_samples, 3)) + groups = rng.randint(0, 3, size=(n_samples,)) + splitters = [ + LeaveOneOut(), + LeavePOut(p=2), + KFold(), + StratifiedKFold(), + RepeatedKFold(), + RepeatedStratifiedKFold(), + StratifiedGroupKFold(), + ShuffleSplit(), + StratifiedShuffleSplit(test_size=0.5), + GroupShuffleSplit(), + LeaveOneGroupOut(), + LeavePGroupsOut(n_groups=2), + GroupKFold(n_splits=3), + TimeSeriesSplit(), + PredefinedSplit(test_fold=groups), + ] + for splitter in splitters: + list(_split(splitter, X, y, groups=groups)) + list(_split(splitter, X, y_2d, groups=groups)) + try: + list(_split(splitter, X, y_multilabel, groups=groups)) + except ValueError as e: + allowed_target_types = ("binary", "multiclass") + msg = "Supported target types are: {}. Got 'multilabel".format( + allowed_target_types + ) + assert msg in str(e) + + +def check_valid_split(train, test, n_samples=None): + # Use python sets to get more informative assertion failure messages + train, test = set(train), set(test) + + # Train and test split should not overlap + assert train.intersection(test) == set() + + if n_samples is not None: + # Check that the union of train an test split cover all the indices + assert train.union(test) == set(range(n_samples)) + + +def check_cv_coverage(cv, X, y, groups, expected_n_splits): + n_samples = _num_samples(X) + # Check that a all the samples appear at least once in a test fold + assert cv.get_n_splits(X, y, groups) == expected_n_splits + + collected_test_samples = set() + iterations = 0 + for train, test in cv.split(X, y, groups): + check_valid_split(train, test, n_samples=n_samples) + iterations += 1 + collected_test_samples.update(test) + + # Check that the accumulated test samples cover the whole dataset + assert iterations == expected_n_splits + if n_samples is not None: + assert collected_test_samples == set(range(n_samples)) + + +def test_kfold_valueerrors(): + X1 = np.array([[1, 2], [3, 4], [5, 6]]) + X2 = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]) + # Check that errors are raised if there is not enough samples + (ValueError, next, KFold(4).split(X1)) + + # Check that a warning is raised if the least populated class has too few + # members. 
+ y = np.array([3, 3, -1, -1, 3]) + + skf_3 = StratifiedKFold(3) + with pytest.warns(Warning, match="The least populated class"): + next(skf_3.split(X2, y)) + + sgkf_3 = StratifiedGroupKFold(3) + naive_groups = np.arange(len(y)) + with pytest.warns(Warning, match="The least populated class"): + next(sgkf_3.split(X2, y, naive_groups)) + + # Check that despite the warning the folds are still computed even + # though all the classes are not necessarily represented at on each + # side of the split at each split + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(skf_3, X2, y, groups=None, expected_n_splits=3) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + check_cv_coverage(sgkf_3, X2, y, groups=naive_groups, expected_n_splits=3) + + # Check that errors are raised if all n_groups for individual + # classes are less than n_splits. + y = np.array([3, 3, -1, -1, 2]) + + with pytest.raises(ValueError): + next(skf_3.split(X2, y)) + with pytest.raises(ValueError): + next(sgkf_3.split(X2, y)) + + # Error when number of folds is <= 1 + with pytest.raises(ValueError): + KFold(0) + with pytest.raises(ValueError): + KFold(1) + error_string = "k-fold cross-validation requires at least one train/test split" + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedKFold(1) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(0) + with pytest.raises(ValueError, match=error_string): + StratifiedGroupKFold(1) + + # When n_splits is not integer: + with pytest.raises(ValueError): + KFold(1.5) + with pytest.raises(ValueError): + KFold(2.0) + with pytest.raises(ValueError): + StratifiedKFold(1.5) + with pytest.raises(ValueError): + StratifiedKFold(2.0) + with pytest.raises(ValueError): + StratifiedGroupKFold(1.5) + with pytest.raises(ValueError): + StratifiedGroupKFold(2.0) + + # When shuffle is not a bool: + with pytest.raises(TypeError): + KFold(n_splits=4, shuffle=None) + + +def test_kfold_indices(): + # Check all indices are returned in the test folds + X1 = np.ones(18) + kf = KFold(3) + check_cv_coverage(kf, X1, y=None, groups=None, expected_n_splits=3) + + # Check all indices are returned in the test folds even when equal-sized + # folds are not possible + X2 = np.ones(17) + kf = KFold(3) + check_cv_coverage(kf, X2, y=None, groups=None, expected_n_splits=3) + + # Check if get_n_splits returns the number of folds + assert 5 == KFold(5).get_n_splits(X2) + + +def test_kfold_no_shuffle(): + # Manually check that KFold preserves the data ordering on toy datasets + X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + + splits = KFold(2).split(X2[:-1]) + train, test = next(splits) + assert_array_equal(test, [0, 1]) + assert_array_equal(train, [2, 3]) + + train, test = next(splits) + assert_array_equal(test, [2, 3]) + assert_array_equal(train, [0, 1]) + + splits = KFold(2).split(X2) + train, test = next(splits) + assert_array_equal(test, [0, 1, 2]) + assert_array_equal(train, [3, 4]) + + train, test = next(splits) + assert_array_equal(test, [3, 4]) + assert_array_equal(train, [0, 1, 2]) + + +def test_stratified_kfold_no_shuffle(): + # Manually check that StratifiedKFold preserves the data ordering as much + # as possible on toy datasets in order to avoid hiding sample dependencies + # when possible + X, y = np.ones(4), [1, 1, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 2]) + 
assert_array_equal(train, [1, 3]) + + train, test = next(splits) + assert_array_equal(test, [1, 3]) + assert_array_equal(train, [0, 2]) + + X, y = np.ones(7), [1, 1, 1, 0, 0, 0, 0] + splits = StratifiedKFold(2).split(X, y) + train, test = next(splits) + assert_array_equal(test, [0, 1, 3, 4]) + assert_array_equal(train, [2, 5, 6]) + + train, test = next(splits) + assert_array_equal(test, [2, 5, 6]) + assert_array_equal(train, [0, 1, 3, 4]) + + # Check if get_n_splits returns the number of folds + assert 5 == StratifiedKFold(5).get_n_splits(X, y) + + # Make sure string labels are also supported + X = np.ones(7) + y1 = ["1", "1", "1", "0", "0", "0", "0"] + y2 = [1, 1, 1, 0, 0, 0, 0] + np.testing.assert_equal( + list(StratifiedKFold(2).split(X, y1)), list(StratifiedKFold(2).split(X, y2)) + ) + + # Check equivalence to KFold + y = [0, 1, 0, 1, 0, 1, 0, 1] + X = np.ones_like(y) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y)), list(KFold(3).split(X, y)) + ) + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 5, 6, 7, 8, 9, 10]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_ratios(k, shuffle, kfold): + # Check that stratified kfold preserves class ratios in individual splits + # Repeat with shuffling turned off and on + n_samples = 1000 + X = np.ones(n_samples) + y = np.array( + [4] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + distr = np.bincount(y) / len(y) + + test_sizes = [] + random_state = None if not shuffle else 0 + skf = kfold(k, random_state=random_state, shuffle=shuffle) + for train, test in _split(skf, X, y, groups=groups): + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize("shuffle", [False, True]) +@pytest.mark.parametrize("k", [4, 6, 7]) +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratified_kfold_label_invariance(k, shuffle, kfold): + # Check that stratified kfold gives the same indices regardless of labels + n_samples = 100 + y = np.array( + [2] * int(0.10 * n_samples) + + [0] * int(0.89 * n_samples) + + [1] * int(0.01 * n_samples) + ) + X = np.ones(len(y)) + # ensure perfect stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + def get_splits(y): + random_state = None if not shuffle else 0 + return [ + (list(train), list(test)) + for train, test in _split( + kfold(k, random_state=random_state, shuffle=shuffle), + X, + y, + groups=groups, + ) + ] + + splits_base = get_splits(y) + for perm in permutations([0, 1, 2]): + y_perm = np.take(perm, y) + splits_perm = get_splits(y_perm) + assert splits_perm == splits_base + + +def test_kfold_balance(): + # Check that KFold returns folds with balanced sizes + for i in range(11, 17): + kf = KFold(5).split(X=np.ones(i)) + sizes = [len(test) for _, test in kf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +@pytest.mark.parametrize("kfold", [StratifiedKFold, StratifiedGroupKFold]) +def test_stratifiedkfold_balance(kfold): + # Check that KFold returns folds with balanced sizes (only when + # stratification is possible) + # Repeat with shuffling turned off and on + X = np.ones(17) + y = [0] * 3 + [1] * 14 + # ensure perfect 
stratification with StratifiedGroupKFold + groups = np.arange(len(y)) + + for shuffle in (True, False): + cv = kfold(3, shuffle=shuffle) + for i in range(11, 17): + skf = _split(cv, X[:i], y[:i], groups[:i]) + sizes = [len(test) for _, test in skf] + + assert (np.max(sizes) - np.min(sizes)) <= 1 + assert np.sum(sizes) == i + + +def test_shuffle_kfold(): + # Check the indices are shuffled properly + kf = KFold(3) + kf2 = KFold(3, shuffle=True, random_state=0) + kf3 = KFold(3, shuffle=True, random_state=1) + + X = np.ones(300) + + all_folds = np.zeros(300) + for (tr1, te1), (tr2, te2), (tr3, te3) in zip( + kf.split(X), kf2.split(X), kf3.split(X) + ): + for tr_a, tr_b in combinations((tr1, tr2, tr3), 2): + # Assert that there is no complete overlap + assert len(np.intersect1d(tr_a, tr_b)) != len(tr1) + + # Set all test indices in successive iterations of kf2 to 1 + all_folds[te2] = 1 + + # Check that all indices are returned in the different test folds + assert sum(all_folds) == 300 + + +@pytest.mark.parametrize("kfold", [KFold, StratifiedKFold, StratifiedGroupKFold]) +def test_shuffle_kfold_stratifiedkfold_reproducibility(kfold): + X = np.ones(15) # Divisible by 3 + y = [0] * 7 + [1] * 8 + groups_1 = np.arange(len(y)) + X2 = np.ones(16) # Not divisible by 3 + y2 = [0] * 8 + [1] * 8 + groups_2 = np.arange(len(y2)) + + # Check that when the shuffle is True, multiple split calls produce the + # same split when random_state is int + kf = kfold(3, shuffle=True, random_state=0) + + np.testing.assert_equal( + list(_split(kf, X, y, groups_1)), list(_split(kf, X, y, groups_1)) + ) + + # Check that when the shuffle is True, multiple split calls often + # (not always) produce different splits when random_state is + # RandomState instance or None + kf = kfold(3, shuffle=True, random_state=np.random.RandomState(0)) + for data in zip((X, X2), (y, y2), (groups_1, groups_2)): + # Test if the two splits are different cv + for (_, test_a), (_, test_b) in zip(_split(kf, *data), _split(kf, *data)): + # cv.split(...) 
returns an array of tuples, each tuple + # consisting of an array with train indices and test indices + # Ensure that the splits for data are not same + # when random state is not set + with pytest.raises(AssertionError): + np.testing.assert_array_equal(test_a, test_b) + + +def test_shuffle_stratifiedkfold(): + # Check that shuffling is happening when requested, and for proper + # sample coverage + X_40 = np.ones(40) + y = [0] * 20 + [1] * 20 + kf0 = StratifiedKFold(5, shuffle=True, random_state=0) + kf1 = StratifiedKFold(5, shuffle=True, random_state=1) + for (_, test0), (_, test1) in zip(kf0.split(X_40, y), kf1.split(X_40, y)): + assert set(test0) != set(test1) + check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5) + + # Ensure that we shuffle each class's samples with different + # random_state in StratifiedKFold + # See https://github.com/scikit-learn/scikit-learn/pull/13124 + X = np.arange(10) + y = [0] * 5 + [1] * 5 + kf1 = StratifiedKFold(5, shuffle=True, random_state=0) + kf2 = StratifiedKFold(5, shuffle=True, random_state=1) + test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)]) + test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)]) + assert test_set1 != test_set2 + + +def test_shuffle_groupkfold(): + # Check that shuffling is happening when requested, and for proper + # sample coverage + X = np.ones(40) + y = [0] * 20 + [1] * 20 + groups = np.arange(40) // 3 + gkf0 = GroupKFold(4, shuffle=True, random_state=0) + gkf1 = GroupKFold(4, shuffle=True, random_state=1) + + # Check that the groups are shuffled differently + test_groups0 = [ + set(groups[test_idx]) for _, test_idx in gkf0.split(X, None, groups) + ] + test_groups1 = [ + set(groups[test_idx]) for _, test_idx in gkf1.split(X, None, groups) + ] + for g0, g1 in zip(test_groups0, test_groups1): + assert g0 != g1, "Test groups should differ with different random states" + + # Check coverage and splits + check_cv_coverage(gkf0, X, y, groups, expected_n_splits=4) + check_cv_coverage(gkf1, X, y, groups, expected_n_splits=4) + + +def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 + # The digits samples are dependent: they are apparently grouped by authors + # although we don't have any information on the groups segment locations + # for this data. We can highlight this fact by computing k-fold cross- + # validation with and without shuffling: we observe that the shuffling case + # wrongly makes the IID assumption and is therefore too optimistic: it + # estimates a much higher accuracy (around 0.93) than that the non + # shuffling variant (around 0.81). 
+ + X, y = digits.data[:600], digits.target[:600] + model = SVC(C=10, gamma=0.005) + + n_splits = 3 + + cv = KFold(n_splits=n_splits, shuffle=False) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.92 > mean_score + assert mean_score > 0.80 + + # Shuffling the data artificially breaks the dependency and hides the + # overfitting of the model with regards to the writing style of the authors + # by yielding a seriously overestimated score: + + cv = KFold(n_splits, shuffle=True, random_state=0) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + cv = KFold(n_splits, shuffle=True, random_state=1) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert mean_score > 0.92 + + # Similarly, StratifiedKFold should try to shuffle the data as little + # as possible (while respecting the balanced class constraints) + # and thus be able to detect the dependency by not overestimating + # the CV score either. As the digits dataset is approximately balanced + # the estimated mean score is close to the score measured with + # non-shuffled KFold + + cv = StratifiedKFold(n_splits) + mean_score = cross_val_score(model, X, y, cv=cv).mean() + assert 0.94 > mean_score + assert mean_score > 0.80 + + +def test_stratified_group_kfold_trivial(): + sgkf = StratifiedGroupKFold(n_splits=3) + # Trivial example - groups with the same distribution + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.asarray((1, 2, 3, 4, 5, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6)) + distr = np.bincount(y) / len(y) + test_sizes = [] + for train, test in sgkf.split(X, y, groups): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + # check y distribution + assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02) + assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +def test_stratified_group_kfold_approximate(): + # Not perfect stratification (even though it is possible) because of + # iteration over groups + sgkf = StratifiedGroupKFold(n_splits=3) + y = np.array([1] * 6 + [0] * 12) + X = np.ones_like(y).reshape(-1, 1) + groups = np.array([1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]) + expected = np.asarray([[0.833, 0.166], [0.666, 0.333], [0.5, 0.5]]) + test_sizes = [] + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + test_sizes.append(len(test)) + assert np.ptp(test_sizes) <= 1 + + +@pytest.mark.parametrize( + "y, groups, expected", + [ + ( + np.array([0] * 6 + [1] * 6), + np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]), + np.asarray([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]), + ), + ( + np.array([0] * 9 + [1] * 3), + np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6]), + np.asarray([[0.75, 0.25], [0.75, 0.25], [0.75, 0.25]]), + ), + ], +) +def test_stratified_group_kfold_homogeneous_groups(y, groups, expected): + sgkf = StratifiedGroupKFold(n_splits=3) + X = np.ones_like(y).reshape(-1, 1) + for (train, test), expect_dist in zip(sgkf.split(X, y, groups), expected): + # check group constraint + assert np.intersect1d(groups[train], groups[test]).size == 0 + split_dist = np.bincount(y[test]) / len(test) + assert_allclose(split_dist, expect_dist, atol=0.001) + + 
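The StratifiedGroupKFold tests above check two properties at once: no group may straddle the train/test boundary, and class proportions in each test fold should stay close to the overall distribution. A small self-contained usage sketch of that behaviour, with synthetic data that is not taken from the tests:

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

rng = np.random.RandomState(0)
y = rng.choice([0, 1], size=60, p=[0.7, 0.3])
X = np.ones((60, 1))
groups = rng.randint(0, 12, size=60)  # 12 groups spread over 60 samples

sgkf = StratifiedGroupKFold(n_splits=3)
for train_idx, test_idx in sgkf.split(X, y, groups=groups):
    # Group constraint: no group appears on both sides of a split.
    assert np.intersect1d(groups[train_idx], groups[test_idx]).size == 0
    # Class proportions per test fold approximate the overall distribution.
    print(np.bincount(y[test_idx]) / len(test_idx))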
+@pytest.mark.parametrize("cls_distr", [(0.4, 0.6), (0.3, 0.7), (0.2, 0.8), (0.8, 0.2)]) +@pytest.mark.parametrize("n_groups", [5, 30, 70]) +def test_stratified_group_kfold_against_group_kfold(cls_distr, n_groups): + # Check that given sufficient amount of samples StratifiedGroupKFold + # produces better stratified folds than regular GroupKFold + n_splits = 5 + sgkf = StratifiedGroupKFold(n_splits=n_splits) + gkf = GroupKFold(n_splits=n_splits) + rng = np.random.RandomState(0) + n_points = 1000 + y = rng.choice(2, size=n_points, p=cls_distr) + X = np.ones_like(y).reshape(-1, 1) + g = rng.choice(n_groups, n_points) + sgkf_folds = sgkf.split(X, y, groups=g) + gkf_folds = gkf.split(X, y, groups=g) + sgkf_entr = 0 + gkf_entr = 0 + for (sgkf_train, sgkf_test), (_, gkf_test) in zip(sgkf_folds, gkf_folds): + # check group constraint + assert np.intersect1d(g[sgkf_train], g[sgkf_test]).size == 0 + sgkf_distr = np.bincount(y[sgkf_test]) / len(sgkf_test) + gkf_distr = np.bincount(y[gkf_test]) / len(gkf_test) + sgkf_entr += stats.entropy(sgkf_distr, qk=cls_distr) + gkf_entr += stats.entropy(gkf_distr, qk=cls_distr) + sgkf_entr /= n_splits + gkf_entr /= n_splits + assert sgkf_entr <= gkf_entr + + +def test_shuffle_split(): + ss1 = ShuffleSplit(test_size=0.2, random_state=0).split(X) + ss2 = ShuffleSplit(test_size=2, random_state=0).split(X) + ss3 = ShuffleSplit(test_size=np.int32(2), random_state=0).split(X) + ss4 = ShuffleSplit(test_size=2, random_state=0).split(X) + for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): + assert_array_equal(t1[0], t2[0]) + assert_array_equal(t2[0], t3[0]) + assert_array_equal(t3[0], t4[0]) + assert_array_equal(t1[1], t2[1]) + assert_array_equal(t2[1], t3[1]) + assert_array_equal(t3[1], t4[1]) + + +@pytest.mark.parametrize("split_class", [ShuffleSplit, StratifiedShuffleSplit]) +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 9, 1), (8, 8, 2), (0.8, 8, 2)] +) +def test_shuffle_split_default_test_size(split_class, train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.1 if both + # unspecified or complement train_size unless both are specified. + X = np.ones(10) + y = np.ones(10) + + X_train, X_test = next(split_class(train_size=train_size).split(X, y)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 8, 2), (7, 7, 3), (0.7, 7, 3)] +) +def test_group_shuffle_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. 0.2 if both + # unspecified or complement train_size unless both are specified. 
+ X = np.ones(10) + y = np.ones(10) + groups = range(10) + + X_train, X_test = next(GroupShuffleSplit(train_size=train_size).split(X, y, groups)) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +def test_stratified_shuffle_split_init(): + X = np.arange(7) + y = np.asarray([0, 1, 1, 1, 2, 2, 2]) + # Check that error is raised if there is a class with only one sample + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=0.2).split(X, y)) + + # Check that error is raised if the test set size is smaller than n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=2).split(X, y)) + # Check that error is raised if the train set size is smaller than + # n_classes + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(3, test_size=3, train_size=2).split(X, y)) + + X = np.arange(9) + y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) + + # Train size or test size too small + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(train_size=2).split(X, y)) + with pytest.raises(ValueError): + next(StratifiedShuffleSplit(test_size=2).split(X, y)) + + +def test_stratified_shuffle_split_respects_test_size(): + y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]) + test_size = 5 + train_size = 10 + sss = StratifiedShuffleSplit( + 6, test_size=test_size, train_size=train_size, random_state=0 + ).split(np.ones(len(y)), y) + for train, test in sss: + assert len(train) == train_size + assert len(test) == test_size + + +def test_stratified_shuffle_split_iter(): + ys = [ + np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), + np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), + np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), + np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), + np.array([-1] * 800 + [1] * 50), + np.concatenate([[i] * (100 + i) for i in range(11)]), + [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], + ["1", "1", "1", "1", "2", "2", "2", "3", "3", "3", "3", "3"], + ] + + for y in ys: + sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split( + np.ones(len(y)), y + ) + y = np.asanyarray(y) # To make it indexable for y[train] + # this is how test-size is computed internally + # in _validate_shuffle_split + test_size = np.ceil(0.33 * len(y)) + train_size = len(y) - test_size + for train, test in sss: + assert_array_equal(np.unique(y[train]), np.unique(y[test])) + # Checks if folds keep classes proportions + p_train = np.bincount(np.unique(y[train], return_inverse=True)[1]) / float( + len(y[train]) + ) + p_test = np.bincount(np.unique(y[test], return_inverse=True)[1]) / float( + len(y[test]) + ) + assert_array_almost_equal(p_train, p_test, 1) + assert len(train) + len(test) == y.size + assert len(train) == train_size + assert len(test) == test_size + assert_array_equal(np.intersect1d(train, test), []) + + +def test_stratified_shuffle_split_even(): + # Test the StratifiedShuffleSplit, indices are drawn with a + # equal chance + n_folds = 5 + n_splits = 1000 + + def assert_counts_are_ok(idx_counts, p): + # Here we test that the distribution of the counts + # per index is close enough to a binomial + threshold = 0.05 / n_splits + bf = stats.binom(n_splits, p) + for count in idx_counts: + prob = bf.pmf(count) + assert prob > threshold, ( + "An index is not drawn with chance corresponding to even draws" + ) + + for n_samples in (6, 22): + groups = np.array((n_samples // 2) * [0, 1]) + splits = StratifiedShuffleSplit( + n_splits=n_splits, test_size=1.0 / n_folds, random_state=0 + ) + + 
train_counts = [0] * n_samples + test_counts = [0] * n_samples + n_splits_actual = 0 + for train, test in splits.split(X=np.ones(n_samples), y=groups): + n_splits_actual += 1 + for counter, ids in [(train_counts, train), (test_counts, test)]: + for id in ids: + counter[id] += 1 + assert n_splits_actual == n_splits + + n_train, n_test = _validate_shuffle_split( + n_samples, test_size=1.0 / n_folds, train_size=1.0 - (1.0 / n_folds) + ) + + assert len(train) == n_train + assert len(test) == n_test + assert len(set(train).intersection(test)) == 0 + + group_counts = np.unique(groups) + assert splits.test_size == 1.0 / n_folds + assert n_train + n_test == len(groups) + assert len(group_counts) == 2 + ex_test_p = float(n_test) / n_samples + ex_train_p = float(n_train) / n_samples + + assert_counts_are_ok(train_counts, ex_train_p) + assert_counts_are_ok(test_counts, ex_test_p) + + +def test_stratified_shuffle_split_overlap_train_test_bug(): + # See https://github.com/scikit-learn/scikit-learn/issues/6121 for + # the original bug report + y = [0, 1, 2, 3] * 3 + [4, 5] * 5 + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + + train, test = next(sss.split(X=X, y=y)) + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + +def test_stratified_shuffle_split_multilabel(): + # fix for issue 9037 + for y in [ + np.array([[0, 1], [1, 0], [1, 0], [0, 1]]), + np.array([[0, 1], [1, 1], [1, 1], [0, 1]]), + ]: + X = np.ones_like(y) + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # no overlap + assert_array_equal(np.intersect1d(train, test), []) + + # complete partition + assert_array_equal(np.union1d(train, test), np.arange(len(y))) + + # correct stratification of entire rows + # (by design, here y[:, 0] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 0]) + assert expected_ratio == np.mean(y_train[:, 0]) + assert expected_ratio == np.mean(y_test[:, 0]) + + +def test_stratified_shuffle_split_multilabel_many_labels(): + # fix in PR #9922: for multilabel data with > 1000 labels, str(row) + # truncates with an ellipsis for elements in positions 4 through + # len(row) - 4, so labels were not being correctly split using the powerset + # method for transforming a multilabel problem to a multiclass one; this + # test checks that this problem is fixed. + row_with_many_zeros = [1, 0, 1] + [0] * 1000 + [1, 0, 1] + row_with_many_ones = [1, 0, 1] + [1] * 1000 + [1, 0, 1] + y = np.array([row_with_many_zeros] * 10 + [row_with_many_ones] * 100) + X = np.ones_like(y) + + sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0) + train, test = next(sss.split(X=X, y=y)) + y_train = y[train] + y_test = y[test] + + # correct stratification of entire rows + # (by design, here y[:, 4] uniquely determines the entire row of y) + expected_ratio = np.mean(y[:, 4]) + assert expected_ratio == np.mean(y_train[:, 4]) + assert expected_ratio == np.mean(y_test[:, 4]) + + +def test_predefinedsplit_with_kfold_split(): + # Check that PredefinedSplit can reproduce a split generated by Kfold. 
+ folds = np.full(10, -1.0) + kf_train = [] + kf_test = [] + for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)): + kf_train.append(train_ind) + kf_test.append(test_ind) + folds[test_ind] = i + ps = PredefinedSplit(folds) + # n_splits is simply the no of unique folds + assert len(np.unique(folds)) == ps.get_n_splits() + ps_train, ps_test = zip(*ps.split()) + assert_array_equal(ps_train, kf_train) + assert_array_equal(ps_test, kf_test) + + +def test_group_shuffle_split(): + for groups_i in test_groups: + X = y = np.ones(len(groups_i)) + n_splits = 6 + test_size = 1.0 / 3 + slo = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0) + + # Make sure the repr works + repr(slo) + + # Test that the length is correct + assert slo.get_n_splits(X, y, groups=groups_i) == n_splits + + l_unique = np.unique(groups_i) + l = np.asarray(groups_i) + + for train, test in slo.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + l_train_unique = np.unique(l[train]) + l_test_unique = np.unique(l[test]) + assert not np.any(np.isin(l[train], l_test_unique)) + assert not np.any(np.isin(l[test], l_train_unique)) + + # Second test: train and test add up to all the data + assert l[train].size + l[test].size == l.size + + # Third test: train and test are disjoint + assert_array_equal(np.intersect1d(train, test), []) + + # Fourth test: + # unique train and test groups are correct, +- 1 for rounding error + assert abs(len(l_test_unique) - round(test_size * len(l_unique))) <= 1 + assert ( + abs(len(l_train_unique) - round((1.0 - test_size) * len(l_unique))) <= 1 + ) + + +def test_leave_one_p_group_out(): + logo = LeaveOneGroupOut() + lpgo_1 = LeavePGroupsOut(n_groups=1) + lpgo_2 = LeavePGroupsOut(n_groups=2) + + # Make sure the repr works + assert repr(logo) == "LeaveOneGroupOut()" + assert repr(lpgo_1) == "LeavePGroupsOut(n_groups=1)" + assert repr(lpgo_2) == "LeavePGroupsOut(n_groups=2)" + assert repr(LeavePGroupsOut(n_groups=3)) == "LeavePGroupsOut(n_groups=3)" + + for j, (cv, p_groups_out) in enumerate(((logo, 1), (lpgo_1, 1), (lpgo_2, 2))): + for i, groups_i in enumerate(test_groups): + n_groups = len(np.unique(groups_i)) + n_splits = n_groups if p_groups_out == 1 else n_groups * (n_groups - 1) / 2 + X = y = np.ones(len(groups_i)) + + # Test that the length is correct + assert cv.get_n_splits(X, y, groups=groups_i) == n_splits + + groups_arr = np.asarray(groups_i) + + # Split using the original list / array / list of string groups_i + for train, test in cv.split(X, y, groups=groups_i): + # First test: no train group is in the test set and vice versa + assert_array_equal( + np.intersect1d(groups_arr[train], groups_arr[test]).tolist(), [] + ) + + # Second test: train and test add up to all the data + assert len(train) + len(test) == len(groups_i) + + # Third test: + # The number of groups in test must be equal to p_groups_out + assert np.unique(groups_arr[test]).shape[0], p_groups_out + + # check get_n_splits() with dummy parameters + assert logo.get_n_splits(None, None, ["a", "b", "c", "b", "c"]) == 3 + assert logo.get_n_splits(groups=[1.0, 1.1, 1.0, 1.2]) == 3 + assert lpgo_2.get_n_splits(None, None, np.arange(4)) == 6 + assert lpgo_1.get_n_splits(groups=np.arange(4)) == 4 + + # raise ValueError if a `groups` parameter is illegal + with pytest.raises(ValueError): + logo.get_n_splits(None, None, [0.0, np.nan, 0.0]) + with pytest.raises(ValueError): + lpgo_2.get_n_splits(None, None, [0.0, np.inf, 0.0]) + + msg = "The 'groups' parameter 
should not be None." + with pytest.raises(ValueError, match=msg): + logo.get_n_splits(None, None, None) + with pytest.raises(ValueError, match=msg): + lpgo_1.get_n_splits(None, None, None) + + +def test_leave_group_out_changing_groups(): + # Check that LeaveOneGroupOut and LeavePGroupsOut work normally if + # the groups variable is changed before calling split + groups = np.array([0, 1, 2, 1, 1, 2, 0, 0]) + X = np.ones(len(groups)) + groups_changing = np.array(groups, copy=True) + lolo = LeaveOneGroupOut().split(X, groups=groups) + lolo_changing = LeaveOneGroupOut().split(X, groups=groups) + lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups) + groups_changing[:] = 0 + for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: + for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): + assert_array_equal(train, train_chan) + assert_array_equal(test, test_chan) + + # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3 + assert 3 == LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, groups=groups) + # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups) + assert 3 == LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups) + + +def test_leave_group_out_order_dependence(): + # Check that LeaveOneGroupOut orders the splits according to the index + # of the group left out. + groups = np.array([2, 2, 0, 0, 1, 1]) + X = np.ones(len(groups)) + + splits = iter(LeaveOneGroupOut().split(X, groups=groups)) + + expected_indices = [ + ([0, 1, 4, 5], [2, 3]), + ([0, 1, 2, 3], [4, 5]), + ([2, 3, 4, 5], [0, 1]), + ] + + for expected_train, expected_test in expected_indices: + train, test = next(splits) + assert_array_equal(train, expected_train) + assert_array_equal(test, expected_test) + + +def test_leave_one_p_group_out_error_on_fewer_number_of_groups(): + X = y = groups = np.ones(0) + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + f"The groups parameter contains fewer than 2 unique groups ({groups})." + " LeaveOneGroupOut expects at least 2." + ) + with pytest.raises(ValueError, match=msg): + next(LeaveOneGroupOut().split(X, y, groups)) + + X = y = groups = np.ones(1) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + X = y = groups = np.arange(3) + msg = re.escape( + "The groups parameter contains fewer than (or equal to) n_groups " + f"(3) numbers of unique groups ({groups}). 
LeavePGroupsOut expects " + "that at least n_groups + 1 (4) unique groups " + "be present" + ) + with pytest.raises(ValueError, match=msg): + next(LeavePGroupsOut(n_groups=3).split(X, y, groups)) + + +def test_repeated_cv_value_errors(): + # n_repeats is not integer or <= 0 + for cv in (RepeatedKFold, RepeatedStratifiedKFold): + with pytest.raises(ValueError): + cv(n_repeats=0) + with pytest.raises(ValueError): + cv(n_repeats=1.5) + + +@pytest.mark.parametrize("RepeatedCV", [RepeatedKFold, RepeatedStratifiedKFold]) +def test_repeated_cv_repr(RepeatedCV): + n_splits, n_repeats = 2, 6 + repeated_cv = RepeatedCV(n_splits=n_splits, n_repeats=n_repeats) + repeated_cv_repr = "{}(n_repeats=6, n_splits=2, random_state=None)".format( + repeated_cv.__class__.__name__ + ) + assert repeated_cv_repr == repr(repeated_cv) + + +def test_repeated_kfold_determinstic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + random_state = 258173307 + rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce same and deterministic splits on + # each call + for _ in range(3): + splits = rkf.split(X) + train, test = next(splits) + assert_array_equal(train, [2, 4]) + assert_array_equal(test, [0, 1, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 3]) + assert_array_equal(test, [2, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4]) + assert_array_equal(test, [0, 1]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_get_n_splits_for_repeated_kfold(): + n_splits = 3 + n_repeats = 4 + rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rkf.get_n_splits() + + +def test_get_n_splits_for_repeated_stratified_kfold(): + n_splits = 3 + n_repeats = 4 + rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats) + expected_n_splits = n_splits * n_repeats + assert expected_n_splits == rskf.get_n_splits() + + +def test_repeated_stratified_kfold_determinstic_split(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] + y = [1, 1, 1, 0, 0] + random_state = 1944695409 + rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=random_state) + + # split should produce same and deterministic splits on + # each call + for _ in range(3): + splits = rskf.split(X, y) + train, test = next(splits) + assert_array_equal(train, [1, 4]) + assert_array_equal(test, [0, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 2, 3]) + assert_array_equal(test, [1, 4]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [0, 1, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 4]) + assert_array_equal(test, [2, 3]) + + with pytest.raises(StopIteration): + next(splits) + + +def test_train_test_split_errors(): + pytest.raises(ValueError, train_test_split) + + pytest.raises(ValueError, train_test_split, range(3), train_size=1.1) + + pytest.raises(ValueError, train_test_split, range(3), test_size=0.6, train_size=0.6) + pytest.raises( + ValueError, + train_test_split, + range(3), + test_size=np.float32(0.6), + train_size=np.float32(0.6), + ) + pytest.raises(ValueError, train_test_split, range(3), test_size="wrong_type") + pytest.raises(ValueError, train_test_split, range(3), test_size=2, train_size=4) + pytest.raises(TypeError, train_test_split, range(3), some_argument=1.1) 
+ pytest.raises(ValueError, train_test_split, range(3), range(42)) + pytest.raises(ValueError, train_test_split, range(10), shuffle=False, stratify=True) + + with pytest.raises( + ValueError, + match=r"train_size=11 should be either positive and " + r"smaller than the number of samples 10 or a " + r"float in the \(0, 1\) range", + ): + train_test_split(range(10), train_size=11, test_size=1) + + +@pytest.mark.parametrize( + "train_size, exp_train, exp_test", [(None, 7, 3), (8, 8, 2), (0.8, 8, 2)] +) +def test_train_test_split_default_test_size(train_size, exp_train, exp_test): + # Check that the default value has the expected behavior, i.e. complement + # train_size unless both are specified. + X_train, X_test = train_test_split(X, train_size=train_size) + + assert len(X_train) == exp_train + assert len(X_test) == exp_test + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "shuffle,stratify", + ( + (True, None), + (True, np.hstack((np.ones(6), np.zeros(4)))), + # stratification only works with shuffling + (False, None), + ), +) +def test_array_api_train_test_split( + shuffle, stratify, array_namespace, device, dtype_name +): + xp = _array_api_for_tests(array_namespace, device) + + X = np.arange(100).reshape((10, 10)) + y = np.arange(10) + + X_np = X.astype(dtype_name) + X_xp = xp.asarray(X_np, device=device) + + y_np = y.astype(dtype_name) + y_xp = xp.asarray(y_np, device=device) + + X_train_np, X_test_np, y_train_np, y_test_np = train_test_split( + X_np, y, random_state=0, shuffle=shuffle, stratify=stratify + ) + with config_context(array_api_dispatch=True): + if stratify is not None: + stratify_xp = xp.asarray(stratify) + else: + stratify_xp = stratify + X_train_xp, X_test_xp, y_train_xp, y_test_xp = train_test_split( + X_xp, y_xp, shuffle=shuffle, stratify=stratify_xp, random_state=0 + ) + + # Check that namespace is preserved, has to happen with + # array_api_dispatch enabled. 
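+ # Illustrative note (added commentary, not part of the original test): under
+ # config_context(array_api_dispatch=True), train_test_split operates directly on the
+ # array API inputs, so its outputs are expected to stay in the caller's namespace
+ # (e.g. torch or array_api_strict) and on the caller's device instead of being
+ # converted to NumPy; the assertions below check exactly that, and _convert_to_numpy
+ # is only used at the end to compare values against the plain NumPy run.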
+ assert get_namespace(X_train_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(X_test_xp)[0] == get_namespace(X_xp)[0] + assert get_namespace(y_train_xp)[0] == get_namespace(y_xp)[0] + assert get_namespace(y_test_xp)[0] == get_namespace(y_xp)[0] + + # Check device and dtype is preserved on output + assert array_api_device(X_train_xp) == array_api_device(X_xp) + assert array_api_device(y_train_xp) == array_api_device(y_xp) + assert array_api_device(X_test_xp) == array_api_device(X_xp) + assert array_api_device(y_test_xp) == array_api_device(y_xp) + + assert X_train_xp.dtype == X_xp.dtype + assert y_train_xp.dtype == y_xp.dtype + assert X_test_xp.dtype == X_xp.dtype + assert y_test_xp.dtype == y_xp.dtype + + assert_allclose( + _convert_to_numpy(X_train_xp, xp=xp), + X_train_np, + ) + assert_allclose( + _convert_to_numpy(X_test_xp, xp=xp), + X_test_np, + ) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_train_test_split(coo_container): + X = np.arange(100).reshape((10, 10)) + X_s = coo_container(X) + y = np.arange(10) + + # simple test + split = train_test_split(X, y, test_size=None, train_size=0.5) + X_train, X_test, y_train, y_test = split + assert len(y_test) == len(y_train) + # test correspondence of X and y + assert_array_equal(X_train[:, 0], y_train * 10) + assert_array_equal(X_test[:, 0], y_test * 10) + + # don't convert lists to anything else by default + split = train_test_split(X, X_s, y.tolist()) + X_train, X_test, X_s_train, X_s_test, y_train, y_test = split + assert isinstance(y_train, list) + assert isinstance(y_test, list) + + # allow nd-arrays + X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) + y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) + split = train_test_split(X_4d, y_3d) + assert split[0].shape == (7, 5, 3, 2) + assert split[1].shape == (3, 5, 3, 2) + assert split[2].shape == (7, 7, 11) + assert split[3].shape == (3, 7, 11) + + # test stratification option + y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75], [2, 4, 2, 4, 6]): + train, test = train_test_split( + y, test_size=test_size, stratify=y, random_state=0 + ) + assert len(test) == exp_test_size + assert len(test) + len(train) == len(y) + # check the 1:1 ratio of ones and twos in the data is preserved + assert np.sum(train == 1) == np.sum(train == 2) + + # test unshuffled split + y = np.arange(10) + for test_size in [2, 0.2]: + train, test = train_test_split(y, shuffle=False, test_size=test_size) + assert_array_equal(test, [8, 9]) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6, 7]) + + +def test_train_test_split_32bit_overflow(): + """Check for integer overflow on 32-bit platforms. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20774 + """ + + # A number 'n' big enough for expression 'n * n * train_size' to cause + # an overflow for signed 32-bit integer + big_number = 100000 + + # Definition of 'y' is a part of reproduction - population for at least + # one class should be in the same order of magnitude as size of X + X = np.arange(big_number) + y = X > (0.99 * big_number) + + split = train_test_split(X, y, stratify=y, train_size=0.25) + X_train, X_test, y_train, y_test = split + + assert X_train.size + X_test.size == big_number + assert y_train.size + y_test.size == big_number + + +def test_train_test_split_pandas(): + # check train_test_split doesn't destroy pandas dataframe + types = [MockDataFrame] + try: + from pandas import DataFrame + + types.append(DataFrame) + except ImportError: + pass + for InputFeatureType in types: + # X dataframe + X_df = InputFeatureType(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, InputFeatureType) + assert isinstance(X_test, InputFeatureType) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_train_test_split_sparse(sparse_container): + # check that train_test_split converts scipy sparse matrices + # to csr, as stated in the documentation + X = np.arange(100).reshape((10, 10)) + X_s = sparse_container(X) + X_train, X_test = train_test_split(X_s) + assert issparse(X_train) and X_train.format == "csr" + assert issparse(X_test) and X_test.format == "csr" + + +def test_train_test_split_mock_pandas(): + # X mock dataframe + X_df = MockDataFrame(X) + X_train, X_test = train_test_split(X_df) + assert isinstance(X_train, MockDataFrame) + assert isinstance(X_test, MockDataFrame) + X_train_arr, X_test_arr = train_test_split(X_df) + + +def test_train_test_split_list_input(): + # Check that when y is a list / list of string labels, it works. + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + for stratify in (True, False): + X_train1, X_test1, y_train1, y_test1 = train_test_split( + X, y1, stratify=y1 if stratify else None, random_state=0 + ) + X_train2, X_test2, y_train2, y_test2 = train_test_split( + X, y2, stratify=y2 if stratify else None, random_state=0 + ) + X_train3, X_test3, y_train3, y_test3 = train_test_split( + X, y3, stratify=y3 if stratify else None, random_state=0 + ) + + np.testing.assert_equal(X_train1, X_train2) + np.testing.assert_equal(y_train2, y_train3) + np.testing.assert_equal(X_test1, X_test3) + np.testing.assert_equal(y_test3, y_test2) + + +@pytest.mark.parametrize( + "test_size, train_size", + [(2.0, None), (1.0, None), (0.1, 0.95), (None, 1j), (11, None), (10, None), (8, 3)], +) +def test_shufflesplit_errors(test_size, train_size): + with pytest.raises(ValueError): + next(ShuffleSplit(test_size=test_size, train_size=train_size).split(X)) + + +def test_shufflesplit_reproducible(): + # Check that iterating twice on the ShuffleSplit gives the same + # sequence of train-test when the random_state is given + ss = ShuffleSplit(random_state=21) + assert_array_equal([a for a, b in ss.split(X)], [a for a, b in ss.split(X)]) + + +def test_stratifiedshufflesplit_list_input(): + # Check that when y is a list / list of string labels, it works. 
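+ # Illustrative note (added commentary, not part of the original test): stratification
+ # depends only on label identity, not on dtype, so the string labels y1 and the
+ # numeric labels y2/y3 below are expected to yield identical index splits for a
+ # fixed random_state.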
+ sss = StratifiedShuffleSplit(test_size=2, random_state=42) + X = np.ones(7) + y1 = ["1"] * 4 + ["0"] * 3 + y2 = np.hstack((np.ones(4), np.zeros(3))) + y3 = y2.tolist() + + np.testing.assert_equal(list(sss.split(X, y1)), list(sss.split(X, y2))) + np.testing.assert_equal(list(sss.split(X, y3)), list(sss.split(X, y2))) + + +def test_train_test_split_allow_nans(): + # Check that train_test_split allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + train_test_split(X, y, test_size=0.2, random_state=42) + + +def test_check_cv(): + X = np.ones(9) + cv = check_cv(3, classifier=False) + # Use numpy.testing.assert_equal which recursively compares + # lists of lists + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) + cv = check_cv(3, y_binary, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_binary)), list(cv.split(X, y_binary)) + ) + + y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) + cv = check_cv(3, y_multiclass, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass)), list(cv.split(X, y_multiclass)) + ) + # also works with 2d multiclass + y_multiclass_2d = y_multiclass.reshape(-1, 1) + cv = check_cv(3, y_multiclass_2d, classifier=True) + np.testing.assert_equal( + list(StratifiedKFold(3).split(X, y_multiclass_2d)), + list(cv.split(X, y_multiclass_2d)), + ) + + assert not np.all( + next(StratifiedKFold(3).split(X, y_multiclass_2d))[0] + == next(KFold(3).split(X, y_multiclass_2d))[0] + ) + + X = np.ones(5) + y_multilabel = np.array( + [[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1], [1, 1, 0, 1], [0, 0, 1, 0]] + ) + cv = check_cv(3, y_multilabel, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) + cv = check_cv(3, y_multioutput, classifier=True) + np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X))) + + with pytest.raises(ValueError): + check_cv(cv="lolo") + + +def test_cv_iterable_wrapper(): + kf_iter = KFold().split(X, y) + kf_iter_wrapped = check_cv(kf_iter) + # Since the wrapped iterable is enlisted and stored, + # split can be called any number of times to produce + # consistent results. 
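+ # Illustrative note (added commentary, not part of the original test): check_cv wraps
+ # a raw (train, test) generator in an iterable wrapper that stores the splits in a
+ # list, which is why the generator exhausted above can still be replayed. A rough
+ # sketch of that behaviour, assuming a plain generator `gen` of index pairs:
+ #     wrapped = check_cv(gen)
+ #     first = list(wrapped.split(X, y))
+ #     second = list(wrapped.split(X, y))  # identical to `first`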
+ np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), list(kf_iter_wrapped.split(X, y)) + ) + # If the splits are randomized, successive calls to split yields different + # results + kf_randomized_iter = KFold(shuffle=True, random_state=0).split(X, y) + kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) + # numpy's assert_array_equal properly compares nested lists + np.testing.assert_equal( + list(kf_randomized_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + + try: + splits_are_equal = True + np.testing.assert_equal( + list(kf_iter_wrapped.split(X, y)), + list(kf_randomized_iter_wrapped.split(X, y)), + ) + except AssertionError: + splits_are_equal = False + assert not splits_are_equal, ( + "If the splits are randomized, " + "successive calls to split should yield different results" + ) + + +@pytest.mark.parametrize("kfold", [GroupKFold, StratifiedGroupKFold]) +@pytest.mark.parametrize("shuffle", [True, False]) +def test_group_kfold(kfold, shuffle, global_random_seed): + rng = np.random.RandomState(global_random_seed) + + # Parameters of the test + n_groups = 15 + n_samples = 1000 + n_splits = 5 + + X = y = np.ones(n_samples) + + # Construct the test data + tolerance = 0.05 * n_samples # 5 percent error allowed + groups = rng.randint(0, n_groups, n_samples) + + ideal_n_groups_per_fold = n_samples // n_splits + + len(np.unique(groups)) + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + random_state = None if not shuffle else global_random_seed + lkf = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + for group in np.unique(groups): + assert len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # Construct the test data + groups = np.array( + [ + "Albert", + "Jean", + "Bertrand", + "Michel", + "Jean", + "Francis", + "Robert", + "Michel", + "Rachel", + "Lois", + "Michelle", + "Bernard", + "Marion", + "Laura", + "Jean", + "Rachel", + "Franck", + "John", + "Gael", + "Anna", + "Alix", + "Robert", + "Marion", + "David", + "Tony", + "Abel", + "Becky", + "Madmood", + "Cary", + "Mary", + "Alexandre", + "David", + "Francis", + "Barack", + "Abdoul", + "Rasha", + "Xi", + "Silvia", + ] + ) + + n_groups = len(np.unique(groups)) + n_samples = len(groups) + n_splits = 5 + tolerance = 0.05 * n_samples # 5 percent error allowed + ideal_n_groups_per_fold = n_samples // n_splits + + X = y = np.ones(n_samples) + + # Get the test fold indices from the test set indices of each fold + folds = np.zeros(n_samples) + for i, (_, test) in enumerate(lkf.split(X, y, groups)): + folds[test] = i + + # Check that folds have approximately the same size + assert len(folds) == len(groups) + if not shuffle: + for i in np.unique(folds): + assert tolerance >= abs(sum(folds == i) - ideal_n_groups_per_fold) + + # Check that each group appears only in 1 fold + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + for group in np.unique(groups): + assert 
len(np.unique(folds[groups == group])) == 1 + + # Check that no group is on both sides of the split + groups = np.asarray(groups, dtype=object) + for train, test in lkf.split(X, y, groups): + assert len(np.intersect1d(groups[train], groups[test])) == 0 + + # groups can also be a list + # use a new instance for reproducibility when shuffle=True + lkf_copy = kfold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) + cv_iter = list(lkf.split(X, y, groups.tolist())) + for (train1, test1), (train2, test2) in zip(lkf_copy.split(X, y, groups), cv_iter): + assert_array_equal(train1, train2) + assert_array_equal(test1, test2) + + # Should fail if there are more folds than groups + groups = np.array([1, 1, 1, 2, 2]) + X = y = np.ones(len(groups)) + with pytest.raises(ValueError, match="Cannot have number of splits.*greater"): + next(GroupKFold(n_splits=3).split(X, y, groups)) + + +def test_time_series_cv(): + X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] + + # Should fail if there are more folds than samples + with pytest.raises(ValueError, match="Cannot have number of folds.*greater"): + next(TimeSeriesSplit(n_splits=7).split(X)) + + tscv = TimeSeriesSplit(2) + + # Manually check that Time Series CV preserves the data + # ordering on toy datasets + splits = tscv.split(X[:-1]) + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5]) + + splits = TimeSeriesSplit(2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2]) + assert_array_equal(test, [3, 4]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [5, 6]) + + # Check get_n_splits returns the correct number of splits + splits = TimeSeriesSplit(2).split(X) + n_splits_actual = len(list(splits)) + assert n_splits_actual == tscv.get_n_splits() + assert n_splits_actual == 2 + + +def _check_time_series_max_train_size(splits, check_splits, max_train_size): + for (train, test), (check_train, check_test) in zip(splits, check_splits): + assert_array_equal(test, check_test) + assert len(check_train) <= max_train_size + suffix_start = max(len(train) - max_train_size, 0) + assert_array_equal(check_train, train[suffix_start:]) + + +def test_time_series_max_train_size(): + X = np.zeros((6, 1)) + splits = TimeSeriesSplit(n_splits=3).split(X) + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=3) + + # Test for the case where the size of a fold is greater than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + # Test for the case where the size of each fold is less than max_train_size + check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X) + _check_time_series_max_train_size(splits, check_splits, max_train_size=2) + + +def test_time_series_test_size(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=3, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0]) + assert_array_equal(test, [1, 2, 3]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4, 5, 6]) + assert_array_equal(test, [7, 8, 9]) + + # 
Test with max_train_size + splits = TimeSeriesSplit(n_splits=2, test_size=2, max_train_size=4).split(X) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5, 6, 7]) + assert_array_equal(test, [8, 9]) + + # Should fail with not enough data points for configuration + with pytest.raises(ValueError, match="Too many splits.*with test_size"): + splits = TimeSeriesSplit(n_splits=5, test_size=2).split(X) + next(splits) + + +def test_time_series_gap(): + X = np.zeros((10, 1)) + + # Test alone + splits = TimeSeriesSplit(n_splits=2, gap=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Test with max_train_size + splits = TimeSeriesSplit(n_splits=3, gap=2, max_train_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5]) + + train, test = next(splits) + assert_array_equal(train, [2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, max_train_size=4, test_size=2).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3]) + assert_array_equal(test, [6, 7]) + + train, test = next(splits) + assert_array_equal(train, [2, 3, 4, 5]) + assert_array_equal(test, [8, 9]) + + # Test with additional test_size + splits = TimeSeriesSplit(n_splits=2, gap=2, test_size=3).split(X) + + train, test = next(splits) + assert_array_equal(train, [0, 1]) + assert_array_equal(test, [4, 5, 6]) + + train, test = next(splits) + assert_array_equal(train, [0, 1, 2, 3, 4]) + assert_array_equal(test, [7, 8, 9]) + + # Verify proper error is thrown + with pytest.raises(ValueError, match="Too many splits.*and gap"): + splits = TimeSeriesSplit(n_splits=4, gap=2).split(X) + next(splits) + + +@ignore_warnings +def test_nested_cv(): + # Test if nested cross validation works with different combinations of cv + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=15, n_classes=2, random_state=0) + groups = rng.randint(0, 5, 15) + + cvs = [ + LeaveOneGroupOut(), + StratifiedKFold(n_splits=2), + LeaveOneOut(), + GroupKFold(n_splits=3), + StratifiedKFold(), + StratifiedGroupKFold(), + StratifiedShuffleSplit(n_splits=3, random_state=0), + ] + + for inner_cv, outer_cv in combinations_with_replacement(cvs, 2): + gs = GridSearchCV( + DummyClassifier(), + param_grid={"strategy": ["stratified", "most_frequent"]}, + cv=inner_cv, + error_score="raise", + ) + cross_val_score( + gs, X=X, y=y, groups=groups, cv=outer_cv, params={"groups": groups} + ) + + +def test_build_repr(): + class MockSplitter: + def __init__(self, a, b=0, c=None): + self.a = a + self.b = b + self.c = c + + def __repr__(self): + return _build_repr(self) + + assert repr(MockSplitter(5, 6)) == "MockSplitter(a=5, b=6, c=None)" + + +@pytest.mark.parametrize( + "CVSplitter", (ShuffleSplit, GroupShuffleSplit, StratifiedShuffleSplit) +) +def test_shuffle_split_empty_trainset(CVSplitter): + cv = CVSplitter(test_size=0.99) + X, y = [[1]], [0] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + next(_split(cv, X, y, 
groups=[1])) + + +def test_train_test_split_empty_trainset(): + (X,) = [[1]] # 1 sample + with pytest.raises( + ValueError, + match=( + "With n_samples=1, test_size=0.99 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.99) + + X = [[1], [1], [1]] # 3 samples, ask for more than 2 thirds + with pytest.raises( + ValueError, + match=( + "With n_samples=3, test_size=0.67 and train_size=None, " + "the resulting train set will be empty" + ), + ): + train_test_split(X, test_size=0.67) + + +def test_leave_one_out_empty_trainset(): + # LeaveOneGroup out expect at least 2 groups so no need to check + cv = LeaveOneOut() + X, y = [[1]], [0] # 1 sample + with pytest.raises(ValueError, match="Cannot perform LeaveOneOut with n_samples=1"): + next(cv.split(X, y)) + + +def test_leave_p_out_empty_trainset(): + # No need to check LeavePGroupsOut + cv = LeavePOut(p=2) + X, y = [[1], [2]], [0, 3] # 2 samples + with pytest.raises( + ValueError, match="p=2 must be strictly less than the number of samples=2" + ): + next(cv.split(X, y)) + + +@pytest.mark.parametrize( + "Klass", (KFold, StratifiedKFold, StratifiedGroupKFold, GroupKFold) +) +def test_random_state_shuffle_false(Klass): + # passing a non-default random_state when shuffle=False makes no sense + with pytest.raises(ValueError, match="has no effect since shuffle is False"): + Klass(3, shuffle=False, random_state=0) + + +@pytest.mark.parametrize( + "cv, expected", + [ + (KFold(), True), + (KFold(shuffle=True, random_state=123), True), + (StratifiedKFold(), True), + (StratifiedKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(shuffle=True, random_state=123), True), + (StratifiedGroupKFold(), True), + (RepeatedKFold(random_state=123), True), + (RepeatedStratifiedKFold(random_state=123), True), + (ShuffleSplit(random_state=123), True), + (GroupShuffleSplit(random_state=123), True), + (StratifiedShuffleSplit(random_state=123), True), + (GroupKFold(), True), + (GroupKFold(shuffle=True, random_state=123), True), + (TimeSeriesSplit(), True), + (LeaveOneOut(), True), + (LeaveOneGroupOut(), True), + (LeavePGroupsOut(n_groups=2), True), + (LeavePOut(p=2), True), + (KFold(shuffle=True, random_state=None), False), + (KFold(shuffle=True, random_state=None), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (StratifiedKFold(shuffle=True, random_state=np.random.RandomState(0)), False), + (RepeatedKFold(random_state=None), False), + (RepeatedKFold(random_state=np.random.RandomState(0)), False), + (RepeatedStratifiedKFold(random_state=None), False), + (RepeatedStratifiedKFold(random_state=np.random.RandomState(0)), False), + (ShuffleSplit(random_state=None), False), + (ShuffleSplit(random_state=np.random.RandomState(0)), False), + (GroupShuffleSplit(random_state=None), False), + (GroupShuffleSplit(random_state=np.random.RandomState(0)), False), + (StratifiedShuffleSplit(random_state=None), False), + (StratifiedShuffleSplit(random_state=np.random.RandomState(0)), False), + ], +) +def test_yields_constant_splits(cv, expected): + assert _yields_constant_splits(cv) == expected + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_get_metadata_routing(cv): + """Check get_metadata_routing returns the correct MetadataRouter.""" + assert hasattr(cv, "get_metadata_routing") + metadata = cv.get_metadata_routing() + if cv in GROUP_SPLITTERS: + assert metadata.split.requests["groups"] is True + elif cv in 
NO_GROUP_SPLITTERS: + assert not metadata.split.requests + + assert_request_is_empty(metadata, exclude=["split"]) + + +@pytest.mark.parametrize("cv", ALL_SPLITTERS, ids=[str(cv) for cv in ALL_SPLITTERS]) +def test_splitter_set_split_request(cv): + """Check set_split_request is defined for group splitters and not for others.""" + if cv in GROUP_SPLITTERS: + assert hasattr(cv, "set_split_request") + elif cv in NO_GROUP_SPLITTERS: + assert not hasattr(cv, "set_split_request") + + +@pytest.mark.parametrize("cv", NO_GROUP_SPLITTERS, ids=str) +def test_no_group_splitters_warns_with_groups(cv): + msg = f"The groups parameter is ignored by {cv.__class__.__name__}" + + n_samples = 30 + rng = np.random.RandomState(1) + X = rng.randint(0, 3, size=(n_samples, 2)) + y = rng.randint(0, 3, size=(n_samples,)) + groups = rng.randint(0, 3, size=(n_samples,)) + + with pytest.warns(UserWarning, match=msg): + cv.split(X, y, groups=groups) + + +@pytest.mark.parametrize( + "cv", SPLITTERS_REQUIRING_TARGET, ids=[str(cv) for cv in SPLITTERS_REQUIRING_TARGET] +) +def test_stratified_splitter_without_y(cv): + msg = "missing 1 required positional argument: 'y'" + with pytest.raises(TypeError, match=msg): + cv.split(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_successive_halving.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_successive_halving.py new file mode 100644 index 0000000000000000000000000000000000000000..bdfab45b4f7ca337ce7e3ce92df517a00bc42a8e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_successive_halving.py @@ -0,0 +1,853 @@ +from math import ceil + +import numpy as np +import pytest +from scipy.stats import expon, norm, randint + +from sklearn.datasets import make_classification +from sklearn.dummy import DummyClassifier +from sklearn.experimental import enable_halving_search_cv # noqa: F401 +from sklearn.model_selection import ( + GroupKFold, + GroupShuffleSplit, + HalvingGridSearchCV, + HalvingRandomSearchCV, + KFold, + LeaveOneGroupOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + StratifiedShuffleSplit, +) +from sklearn.model_selection._search_successive_halving import ( + _SubsampleMetaSplitter, + _top_k, +) +from sklearn.model_selection.tests.test_search import ( + check_cv_results_array_types, + check_cv_results_keys, +) +from sklearn.svm import SVC, LinearSVC + + +class FastClassifier(DummyClassifier): + """Dummy classifier that accepts parameters a, b, ... z. 
+ + These parameter don't affect the predictions and are useful for fast + grid searching.""" + + # update the constraints such that we accept all parameters from a to z + _parameter_constraints: dict = { + **DummyClassifier._parameter_constraints, + **{chr(key): "no_validation" for key in range(ord("a"), ord("z") + 1)}, + } + + def __init__( + self, strategy="stratified", random_state=None, constant=None, **kwargs + ): + super().__init__( + strategy=strategy, random_state=random_state, constant=constant + ) + + def get_params(self, deep=False): + params = super().get_params(deep=deep) + for char in range(ord("a"), ord("z") + 1): + params[chr(char)] = "whatever" + return params + + +class SometimesFailClassifier(DummyClassifier): + def __init__( + self, + strategy="stratified", + random_state=None, + constant=None, + n_estimators=10, + fail_fit=False, + fail_predict=False, + a=0, + ): + self.fail_fit = fail_fit + self.fail_predict = fail_predict + self.n_estimators = n_estimators + self.a = a + + super().__init__( + strategy=strategy, random_state=random_state, constant=constant + ) + + def fit(self, X, y): + if self.fail_fit: + raise Exception("fitting failed") + return super().fit(X, y) + + def predict(self, X): + if self.fail_predict: + raise Exception("predict failed") + return super().predict(X) + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning") +@pytest.mark.filterwarnings("ignore:Scoring failed:UserWarning") +@pytest.mark.filterwarnings("ignore:One or more of the:UserWarning") +@pytest.mark.parametrize("HalvingSearch", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize("fail_at", ("fit", "predict")) +def test_nan_handling(HalvingSearch, fail_at): + """Check the selection of the best scores in presence of failure represented by + NaN values.""" + n_samples = 1_000 + X, y = make_classification(n_samples=n_samples, random_state=0) + + search = HalvingSearch( + SometimesFailClassifier(), + {f"fail_{fail_at}": [False, True], "a": range(3)}, + resource="n_estimators", + max_resources=6, + min_resources=1, + factor=2, + ) + + search.fit(X, y) + + # estimators that failed during fit/predict should always rank lower + # than ones where the fit/predict succeeded + assert not search.best_params_[f"fail_{fail_at}"] + scores = search.cv_results_["mean_test_score"] + ranks = search.cv_results_["rank_test_score"] + + # some scores should be NaN + assert np.isnan(scores).any() + + unique_nan_ranks = np.unique(ranks[np.isnan(scores)]) + # all NaN scores should have the same rank + assert unique_nan_ranks.shape[0] == 1 + # NaNs should have the lowest rank + assert (unique_nan_ranks[0] >= ranks).all() + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ( + "aggressive_elimination," + "max_resources," + "expected_n_iterations," + "expected_n_required_iterations," + "expected_n_possible_iterations," + "expected_n_remaining_candidates," + "expected_n_candidates," + "expected_n_resources," + ), + [ + # notice how it loops at the beginning + # also, the number of candidates evaluated at the last iteration is + # <= factor + (True, "limited", 4, 4, 3, 1, [60, 20, 7, 3], [20, 20, 60, 180]), + # no aggressive elimination: we end up with less iterations, and + # the number of candidates at the last iter is > factor, which isn't + # ideal + (False, "limited", 3, 4, 3, 3, [60, 20, 7], [20, 60, 180]), + # # When the amount of resource isn't limited, aggressive_elimination + # # has no effect. 
Here the default min_resources='exhaust' will take + # # over. + (True, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + (False, "unlimited", 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]), + ], +) +def test_aggressive_elimination( + Est, + aggressive_elimination, + max_resources, + expected_n_iterations, + expected_n_required_iterations, + expected_n_possible_iterations, + expected_n_remaining_candidates, + expected_n_candidates, + expected_n_resources, +): + # Test the aggressive_elimination parameter. + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + if max_resources == "limited": + max_resources = 180 + else: + max_resources = n_samples + + sh = Est( + base_estimator, + param_grid, + aggressive_elimination=aggressive_elimination, + max_resources=max_resources, + factor=3, + ) + sh.set_params(verbose=True) # just for test coverage + + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + assert sh.n_candidates_ == expected_n_candidates + assert sh.n_remaining_candidates_ == expected_n_remaining_candidates + assert ceil(sh.n_candidates_[-1] / sh.factor) == sh.n_remaining_candidates_ + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + ( + "min_resources," + "max_resources," + "expected_n_iterations," + "expected_n_possible_iterations," + "expected_n_resources," + ), + [ + # with enough resources + ("smallest", "auto", 2, 4, [20, 60]), + # with enough resources but min_resources set manually + (50, "auto", 2, 3, [50, 150]), + # without enough resources, only one iteration can be done + ("smallest", 30, 1, 1, [20]), + # with exhaust: use as much resources as possible at the last iter + ("exhaust", "auto", 2, 2, [333, 999]), + ("exhaust", 1000, 2, 2, [333, 999]), + ("exhaust", 999, 2, 2, [333, 999]), + ("exhaust", 600, 2, 2, [200, 600]), + ("exhaust", 599, 2, 2, [199, 597]), + ("exhaust", 300, 2, 2, [100, 300]), + ("exhaust", 60, 2, 2, [20, 60]), + ("exhaust", 50, 1, 1, [20]), + ("exhaust", 20, 1, 1, [20]), + ], +) +def test_min_max_resources( + Est, + min_resources, + max_resources, + expected_n_iterations, + expected_n_possible_iterations, + expected_n_resources, +): + # Test the min_resources and max_resources parameters, and how they affect + # the number of resources used at each iteration + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": [1, 2, 3]} + base_estimator = FastClassifier() + + sh = Est( + base_estimator, + param_grid, + factor=3, + min_resources=min_resources, + max_resources=max_resources, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=6) # same number as with the grid + + sh.fit(X, y) + + expected_n_required_iterations = 2 # given 6 combinations and factor = 3 + assert sh.n_iterations_ == expected_n_iterations + assert sh.n_required_iterations_ == expected_n_required_iterations + assert sh.n_possible_iterations_ == expected_n_possible_iterations + assert sh.n_resources_ == expected_n_resources + if min_resources == "exhaust": + assert 
sh.n_possible_iterations_ == sh.n_iterations_ == len(sh.n_resources_) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +@pytest.mark.parametrize( + "max_resources, n_iterations, n_possible_iterations", + [ + ("auto", 5, 9), # all resources are used + (1024, 5, 9), + (700, 5, 8), + (512, 5, 8), + (511, 5, 7), + (32, 4, 4), + (31, 3, 3), + (16, 3, 3), + (4, 1, 1), # max_resources == min_resources, only one iteration is + # possible + ], +) +def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations): + # test the number of actual iterations that were run depending on + # max_resources + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=1) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + factor = 2 + + sh = Est( + base_estimator, + param_grid, + cv=2, + factor=factor, + max_resources=max_resources, + min_resources=4, + ) + if Est is HalvingRandomSearchCV: + sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV + sh.fit(X, y) + assert sh.n_required_iterations_ == 5 + assert sh.n_iterations_ == n_iterations + assert sh.n_possible_iterations_ == n_possible_iterations + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_resource_parameter(Est): + # Test the resource parameter + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": [1, 2], "b": list(range(10))} + base_estimator = FastClassifier() + sh = Est(base_estimator, param_grid, cv=2, resource="c", max_resources=10, factor=3) + sh.fit(X, y) + assert set(sh.n_resources_) == set([1, 3, 9]) + for r_i, params, param_c in zip( + sh.cv_results_["n_resources"], + sh.cv_results_["params"], + sh.cv_results_["param_c"], + ): + assert r_i == params["c"] == param_c + + with pytest.raises( + ValueError, match="Cannot use resource=1234 which is not supported " + ): + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="1234", max_resources=10 + ) + sh.fit(X, y) + + with pytest.raises( + ValueError, + match=( + "Cannot use parameter c as the resource since it is part " + "of the searched parameters." 
+ ), + ): + param_grid = {"a": [1, 2], "b": [1, 2], "c": [1, 3]} + sh = HalvingGridSearchCV( + base_estimator, param_grid, cv=2, resource="c", max_resources=10 + ) + sh.fit(X, y) + + +@pytest.mark.parametrize( + "max_resources, n_candidates, expected_n_candidates", + [ + (512, "exhaust", 128), # generate exactly as much as needed + (32, "exhaust", 8), + (32, 8, 8), + (32, 7, 7), # ask for less than what we could + (32, 9, 9), # ask for more than 'reasonable' + ], +) +def test_random_search(max_resources, n_candidates, expected_n_candidates): + # Test random search and make sure the number of generated candidates is + # as expected + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": norm, "b": norm} + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV( + base_estimator, + param_grid, + n_candidates=n_candidates, + cv=2, + max_resources=max_resources, + factor=2, + min_resources=4, + ) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + if n_candidates == "exhaust": + # Make sure 'exhaust' makes the last iteration use as much resources as + # we can + assert sh.n_resources_[-1] == max_resources + + +@pytest.mark.parametrize( + "param_distributions, expected_n_candidates", + [ + ({"a": [1, 2]}, 2), # all lists, sample less than n_candidates + ({"a": randint(1, 3)}, 10), # not all list, respect n_candidates + ], +) +def test_random_search_discrete_distributions( + param_distributions, expected_n_candidates +): + # Make sure random search samples the appropriate number of candidates when + # we ask for more than what's possible. How many parameters are sampled + # depends whether the distributions are 'all lists' or not (see + # ParameterSampler for details). This is somewhat redundant with the checks + # in ParameterSampler but interaction bugs were discovered during + # development of SH + + n_samples = 1024 + X, y = make_classification(n_samples=n_samples, random_state=0) + base_estimator = FastClassifier() + sh = HalvingRandomSearchCV(base_estimator, param_distributions, n_candidates=10) + sh.fit(X, y) + assert sh.n_candidates_[0] == expected_n_candidates + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"resource": "not_a_parameter"}, + "Cannot use resource=not_a_parameter which is not supported", + ), + ( + {"resource": "a", "max_resources": 100}, + "Cannot use parameter a as the resource since it is part of", + ), + ( + {"max_resources": "auto", "resource": "b"}, + "resource can only be 'n_samples' when max_resources='auto'", + ), + ( + {"min_resources": 15, "max_resources": 14}, + "min_resources_=15 is greater than max_resources_=14", + ), + ({"cv": KFold(shuffle=True)}, "must yield consistent folds"), + ({"cv": ShuffleSplit()}, "must yield consistent folds"), + ], +) +def test_input_errors(Est, params, expected_error_message): + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X, y = make_classification(100) + + sh = Est(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "params, expected_error_message", + [ + ( + {"n_candidates": "exhaust", "min_resources": "exhaust"}, + "cannot be both set to 'exhaust'", + ), + ], +) +def test_input_errors_randomized(params, expected_error_message): + # tests specific to HalvingRandomSearchCV + + base_estimator = FastClassifier() + param_grid = 
{"a": [1]} + X, y = make_classification(100) + + sh = HalvingRandomSearchCV(base_estimator, param_grid, **params) + + with pytest.raises(ValueError, match=expected_error_message): + sh.fit(X, y) + + +@pytest.mark.parametrize( + "fraction, subsample_test, expected_train_size, expected_test_size", + [ + (0.5, True, 40, 10), + (0.5, False, 40, 20), + (0.2, True, 16, 4), + (0.2, False, 16, 20), + ], +) +def test_subsample_splitter_shapes( + fraction, subsample_test, expected_train_size, expected_test_size +): + # Make sure splits returned by SubsampleMetaSplitter are of appropriate + # size + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), + fraction=fraction, + subsample_test=subsample_test, + random_state=None, + ) + + for train, test in cv.split(X, y): + assert train.shape[0] == expected_train_size + assert test.shape[0] == expected_test_size + if subsample_test: + assert train.shape[0] + test.shape[0] == int(n_samples * fraction) + else: + assert test.shape[0] == n_samples // cv.base_cv.get_n_splits() + + +@pytest.mark.parametrize("subsample_test", (True, False)) +def test_subsample_splitter_determinism(subsample_test): + # Make sure _SubsampleMetaSplitter is consistent across calls to split(): + # - we're OK having training sets differ (they're always sampled with a + # different fraction anyway) + # - when we don't subsample the test set, we want it to be always the same. + # This check is the most important. This is ensured by the determinism + # of the base_cv. + + # Note: we could force both train and test splits to be always the same if + # we drew an int seed in _SubsampleMetaSplitter.__init__ + + n_samples = 100 + X, y = make_classification(n_samples) + cv = _SubsampleMetaSplitter( + base_cv=KFold(5), fraction=0.5, subsample_test=subsample_test, random_state=None + ) + + folds_a = list(cv.split(X, y, groups=None)) + folds_b = list(cv.split(X, y, groups=None)) + + for (train_a, test_a), (train_b, test_b) in zip(folds_a, folds_b): + assert not np.all(train_a == train_b) + + if subsample_test: + assert not np.all(test_a == test_b) + else: + assert np.all(test_a == test_b) + assert np.all(X[test_a] == X[test_b]) + + +@pytest.mark.parametrize( + "k, itr, expected", + [ + (1, 0, ["c"]), + (2, 0, ["a", "c"]), + (4, 0, ["d", "b", "a", "c"]), + (10, 0, ["d", "b", "a", "c"]), + (1, 1, ["e"]), + (2, 1, ["f", "e"]), + (10, 1, ["f", "e"]), + (1, 2, ["i"]), + (10, 2, ["g", "h", "i"]), + ], +) +def test_top_k(k, itr, expected): + results = { # this isn't a 'real world' result dict + "iter": [0, 0, 0, 0, 1, 1, 2, 2, 2], + "mean_test_score": [4, 3, 5, 1, 11, 10, 5, 6, 9], + "params": ["a", "b", "c", "d", "e", "f", "g", "h", "i"], + } + got = _top_k(results, k=k, itr=itr) + assert np.all(got == expected) + + +@pytest.mark.parametrize("Est", (HalvingRandomSearchCV, HalvingGridSearchCV)) +def test_cv_results(Est): + # test that the cv_results_ matches correctly the logic of the + # tournament: in particular that the candidates continued in each + # successive iteration are those that were best in the previous iteration + pd = pytest.importorskip("pandas") + + rng = np.random.RandomState(0) + + n_samples = 1000 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifier() + + # generate random scores: we want to avoid ties, which would otherwise + # mess with the ordering and make testing harder + def scorer(est, X, y): + return rng.rand() + + sh 
= Est(base_estimator, param_grid, factor=2, scoring=scorer) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + # non-regression check for + # https://github.com/scikit-learn/scikit-learn/issues/19203 + assert isinstance(sh.cv_results_["iter"], np.ndarray) + assert isinstance(sh.cv_results_["n_resources"], np.ndarray) + + cv_results_df = pd.DataFrame(sh.cv_results_) + + # just make sure we don't have ties + assert len(cv_results_df["mean_test_score"].unique()) == len(cv_results_df) + + cv_results_df["params_str"] = cv_results_df["params"].apply(str) + table = cv_results_df.pivot( + index="params_str", columns="iter", values="mean_test_score" + ) + + # table looks like something like this: + # iter 0 1 2 3 4 5 + # params_str + # {'a': 'l2', 'b': 23} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l1', 'b': 30} 0.90 0.875 NaN NaN NaN NaN + # {'a': 'l1', 'b': 0} 0.75 NaN NaN NaN NaN NaN + # {'a': 'l2', 'b': 3} 0.85 0.925 0.9125 0.90625 NaN NaN + # {'a': 'l1', 'b': 5} 0.80 NaN NaN NaN NaN NaN + # ... + + # where a NaN indicates that the candidate wasn't evaluated at a given + # iteration, because it wasn't part of the top-K at some previous + # iteration. We here make sure that candidates that aren't in the top-k at + # any given iteration are indeed not evaluated at the subsequent + # iterations. + nan_mask = pd.isna(table) + n_iter = sh.n_iterations_ + for it in range(n_iter - 1): + already_discarded_mask = nan_mask[it] + + # make sure that if a candidate is already discarded, we don't evaluate + # it later + assert ( + already_discarded_mask & nan_mask[it + 1] == already_discarded_mask + ).all() + + # make sure that the number of discarded candidate is correct + discarded_now_mask = ~already_discarded_mask & nan_mask[it + 1] + kept_mask = ~already_discarded_mask & ~discarded_now_mask + assert kept_mask.sum() == sh.n_candidates_[it + 1] + + # make sure that all discarded candidates have a lower score than the + # kept candidates + discarded_max_score = table[it].where(discarded_now_mask).max() + kept_min_score = table[it].where(kept_mask).min() + assert discarded_max_score < kept_min_score + + # We now make sure that the best candidate is chosen only from the last + # iteration. + # We also make sure this is true even if there were higher scores in + # earlier rounds (this isn't generally the case, but worth ensuring it's + # possible). + + last_iter = cv_results_df["iter"].max() + idx_best_last_iter = cv_results_df[cv_results_df["iter"] == last_iter][ + "mean_test_score" + ].idxmax() + idx_best_all_iters = cv_results_df["mean_test_score"].idxmax() + + assert sh.best_params_ == cv_results_df.iloc[idx_best_last_iter]["params"] + assert ( + cv_results_df.iloc[idx_best_last_iter]["mean_test_score"] + < cv_results_df.iloc[idx_best_all_iters]["mean_test_score"] + ) + assert ( + cv_results_df.iloc[idx_best_last_iter]["params"] + != cv_results_df.iloc[idx_best_all_iters]["params"] + ) + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_base_estimator_inputs(Est): + # make sure that the base estimators are passed the correct parameters and + # number of samples at each iteration. 
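+ # Illustrative note (added commentary, not part of the original test): with factor=2,
+ # successive halving keeps roughly the best half of the candidates at each iteration
+ # while doubling the per-candidate budget (here, the number of samples), e.g.
+ # 60 -> 30 -> 15 -> ... candidates; the exact resource counts depend on
+ # min_resources/max_resources. The bookkeeping subclass below records the sample
+ # counts seen by fit/predict so this can be checked against cv_results_.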
+ pd = pytest.importorskip("pandas") + + passed_n_samples_fit = [] + passed_n_samples_predict = [] + passed_params = [] + + class FastClassifierBookKeeping(FastClassifier): + def fit(self, X, y): + passed_n_samples_fit.append(X.shape[0]) + return super().fit(X, y) + + def predict(self, X): + passed_n_samples_predict.append(X.shape[0]) + return super().predict(X) + + def set_params(self, **params): + passed_params.append(params) + return super().set_params(**params) + + n_samples = 1024 + n_splits = 2 + X, y = make_classification(n_samples=n_samples, random_state=0) + param_grid = {"a": ("l1", "l2"), "b": list(range(30))} + base_estimator = FastClassifierBookKeeping() + + sh = Est( + base_estimator, + param_grid, + factor=2, + cv=n_splits, + return_train_score=False, + refit=False, + ) + if Est is HalvingRandomSearchCV: + # same number of candidates as with the grid + sh.set_params(n_candidates=2 * 30, min_resources="exhaust") + + sh.fit(X, y) + + assert len(passed_n_samples_fit) == len(passed_n_samples_predict) + passed_n_samples = [ + x + y for (x, y) in zip(passed_n_samples_fit, passed_n_samples_predict) + ] + + # Lists are of length n_splits * n_iter * n_candidates_at_i. + # Each chunk of size n_splits corresponds to the n_splits folds for the + # same candidate at the same iteration, so they contain equal values. We + # subsample such that the lists are of length n_iter * n_candidates_at_it + passed_n_samples = passed_n_samples[::n_splits] + passed_params = passed_params[::n_splits] + + cv_results_df = pd.DataFrame(sh.cv_results_) + + assert len(passed_params) == len(passed_n_samples) == len(cv_results_df) + + uniques, counts = np.unique(passed_n_samples, return_counts=True) + assert (sh.n_resources_ == uniques).all() + assert (sh.n_candidates_ == counts).all() + + assert (cv_results_df["params"] == passed_params).all() + assert (cv_results_df["n_resources"] == passed_n_samples).all() + + +@pytest.mark.parametrize("Est", (HalvingGridSearchCV, HalvingRandomSearchCV)) +def test_groups_support(Est): + # Check if ValueError (when groups is None) propagates to + # HalvingGridSearchCV and HalvingRandomSearchCV + # And also check if groups is correctly passed to the cv object + rng = np.random.RandomState(0) + + X, y = make_classification(n_samples=50, n_classes=2, random_state=0) + groups = rng.randint(0, 3, 50) + + clf = LinearSVC(random_state=0) + grid = {"C": [1]} + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(n_splits=3), + GroupShuffleSplit(random_state=0), + ] + error_msg = "The 'groups' parameter should not be None." + for cv in group_cvs: + gs = Est(clf, grid, cv=cv, random_state=0) + with pytest.raises(ValueError, match=error_msg): + gs.fit(X, y) + gs.fit(X, y, groups=groups) + + non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit(random_state=0)] + for cv in non_group_cvs: + gs = Est(clf, grid, cv=cv) + # Should not raise an error + gs.fit(X, y) + + +@pytest.mark.parametrize("SearchCV", [HalvingRandomSearchCV, HalvingGridSearchCV]) +def test_min_resources_null(SearchCV): + """Check that we raise an error if the minimum resources is set to 0.""" + base_estimator = FastClassifier() + param_grid = {"a": [1]} + X = np.empty(0).reshape(0, 3) + + search = SearchCV(base_estimator, param_grid, min_resources="smallest") + + err_msg = "min_resources_=0: you might have passed an empty dataset X." 
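+ # Illustrative note (added commentary; the exact heuristic described here is an
+ # assumption): with min_resources="smallest" the starting budget is derived from the
+ # cross-validation layout and, for classifiers, the number of classes (roughly
+ # 2 * n_splits * n_classes), so an empty dataset leaves min_resources_ at 0 and
+ # fit() is expected to raise the error matched below.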
+ with pytest.raises(ValueError, match=err_msg): + search.fit(X, []) + + +@pytest.mark.parametrize("SearchCV", [HalvingGridSearchCV, HalvingRandomSearchCV]) +def test_select_best_index(SearchCV): + """Check the selection strategy of the halving search.""" + results = { # this isn't a 'real world' result dict + "iter": np.array([0, 0, 0, 0, 1, 1, 2, 2, 2]), + "mean_test_score": np.array([4, 3, 5, 1, 11, 10, 5, 6, 9]), + "params": np.array(["a", "b", "c", "d", "e", "f", "g", "h", "i"]), + } + + # we expect the index of 'i' + best_index = SearchCV._select_best_index(None, None, results) + assert best_index == 8 + + +def test_halving_random_search_list_of_dicts(): + """Check the behaviour of the `HalvingRandomSearchCV` with `param_distribution` + being a list of dictionary. + """ + X, y = make_classification(n_samples=150, n_features=4, random_state=42) + + params = [ + {"kernel": ["rbf"], "C": expon(scale=10), "gamma": expon(scale=0.1)}, + {"kernel": ["poly"], "degree": [2, 3]}, + ] + param_keys = ( + "param_C", + "param_degree", + "param_gamma", + "param_kernel", + ) + score_keys = ( + "mean_test_score", + "mean_train_score", + "rank_test_score", + "split0_test_score", + "split1_test_score", + "split2_test_score", + "split0_train_score", + "split1_train_score", + "split2_train_score", + "std_test_score", + "std_train_score", + "mean_fit_time", + "std_fit_time", + "mean_score_time", + "std_score_time", + ) + extra_keys = ("n_resources", "iter") + + search = HalvingRandomSearchCV( + SVC(), cv=3, param_distributions=params, return_train_score=True, random_state=0 + ) + search.fit(X, y) + n_candidates = sum(search.n_candidates_) + cv_results = search.cv_results_ + # Check results structure + check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates, extra_keys) + expected_cv_results_kinds = { + "param_C": "f", + "param_degree": "i", + "param_gamma": "f", + "param_kernel": "O", + } + check_cv_results_array_types( + search, param_keys, score_keys, expected_cv_results_kinds + ) + + assert all( + ( + cv_results["param_C"].mask[i] + and cv_results["param_gamma"].mask[i] + and not cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "poly" + ) + assert all( + ( + not cv_results["param_C"].mask[i] + and not cv_results["param_gamma"].mask[i] + and cv_results["param_degree"].mask[i] + ) + for i in range(n_candidates) + if cv_results["param_kernel"][i] == "rbf" + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_validation.py b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..c20131b8d3f387d32a9abe5cbf80a6387e0017f3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/model_selection/tests/test_validation.py @@ -0,0 +1,2739 @@ +"""Test the validation module""" + +import os +import re +import sys +import tempfile +import warnings +from functools import partial +from io import StringIO +from time import sleep + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import config_context +from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.cluster import KMeans +from sklearn.datasets import ( + load_diabetes, + load_digits, + load_iris, + make_classification, + make_multilabel_classification, + make_regression, +) +from sklearn.ensemble import RandomForestClassifier +from sklearn.exceptions import FitFailedWarning, UnsetMetadataPassedError 
+from sklearn.impute import SimpleImputer +from sklearn.linear_model import ( + LogisticRegression, + PassiveAggressiveClassifier, + Ridge, + RidgeClassifier, + SGDClassifier, +) +from sklearn.metrics import ( + accuracy_score, + check_scoring, + confusion_matrix, + explained_variance_score, + make_scorer, + mean_squared_error, + precision_recall_fscore_support, + precision_score, + r2_score, +) +from sklearn.metrics._scorer import _MultimetricScorer +from sklearn.model_selection import ( + GridSearchCV, + GroupKFold, + GroupShuffleSplit, + KFold, + LeaveOneGroupOut, + LeaveOneOut, + LeavePGroupsOut, + ShuffleSplit, + StratifiedKFold, + cross_val_predict, + cross_val_score, + cross_validate, + learning_curve, + permutation_test_score, + validation_curve, +) +from sklearn.model_selection._validation import ( + _check_is_permutation, + _fit_and_score, + _score, +) +from sklearn.model_selection.tests.common import OneTimeSplitter +from sklearn.model_selection.tests.test_search import FailingClassifier +from sklearn.multiclass import OneVsRestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder, scale +from sklearn.svm import SVC, LinearSVC +from sklearn.tests.metadata_routing_common import ( + ConsumingClassifier, + ConsumingScorer, + ConsumingSplitter, + _Registry, + check_recorded_metadata, +) +from sklearn.utils import shuffle +from sklearn.utils._mocking import CheckingClassifier, MockDataFrame +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import COO_CONTAINERS, CSR_CONTAINERS +from sklearn.utils.validation import _num_samples + + +class MockImprovingEstimator(BaseEstimator): + """Dummy classifier to test the learning curve""" + + def __init__(self, n_max_train_sizes): + self.n_max_train_sizes = n_max_train_sizes + self.train_sizes = 0 + self.X_subset = None + + def fit(self, X_subset, y_subset=None): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, Y=None): + # training score becomes worse (2 -> 1), test error better (0 -> 1) + if self._is_training_data(X): + return 2.0 - float(self.train_sizes) / self.n_max_train_sizes + else: + return float(self.train_sizes) / self.n_max_train_sizes + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockIncrementalImprovingEstimator(MockImprovingEstimator): + """Dummy classifier that provides partial_fit""" + + def __init__(self, n_max_train_sizes, expected_fit_params=None): + super().__init__(n_max_train_sizes) + self.x = None + self.expected_fit_params = expected_fit_params + + def _is_training_data(self, X): + return self.x in X + + def partial_fit(self, X, y=None, **params): + self.train_sizes += X.shape[0] + self.x = X[0] + if self.expected_fit_params: + missing = set(self.expected_fit_params) - set(params) + if missing: + raise AssertionError( + f"Expected fit parameter(s) {list(missing)} not seen." + ) + for key, value in params.items(): + if key in self.expected_fit_params and _num_samples( + value + ) != _num_samples(X): + raise AssertionError( + f"Fit parameter {key} has length {_num_samples(value)}" + f"; expected {_num_samples(X)}." 
+ ) + + +class MockEstimatorWithParameter(BaseEstimator): + """Dummy classifier to test the validation curve""" + + def __init__(self, param=0.5): + self.X_subset = None + self.param = param + + def fit(self, X_subset, y_subset): + self.X_subset = X_subset + self.train_sizes = X_subset.shape[0] + return self + + def predict(self, X): + raise NotImplementedError + + def score(self, X=None, y=None): + return self.param if self._is_training_data(X) else 1 - self.param + + def _is_training_data(self, X): + return X is self.X_subset + + +class MockEstimatorWithSingleFitCallAllowed(MockEstimatorWithParameter): + """Dummy classifier that disallows repeated calls of fit method""" + + def fit(self, X_subset, y_subset): + assert not hasattr(self, "fit_called_"), "fit is called the second time" + self.fit_called_ = True + return super().fit(X_subset, y_subset) + + def predict(self, X): + raise NotImplementedError + + +class MockClassifier(ClassifierMixin, BaseEstimator): + """Dummy classifier to test the cross-validation""" + + def __init__(self, a=0, allow_nd=False): + self.a = a + self.allow_nd = allow_nd + + def fit( + self, + X, + Y=None, + sample_weight=None, + class_prior=None, + sparse_sample_weight=None, + sparse_param=None, + dummy_int=None, + dummy_str=None, + dummy_obj=None, + callback=None, + ): + """The dummy arguments are to test that this fit function can + accept non-array arguments through cross-validation, such as: + - int + - str (this is actually array-like) + - object + - function + """ + self.dummy_int = dummy_int + self.dummy_str = dummy_str + self.dummy_obj = dummy_obj + if callback is not None: + callback(self) + + if self.allow_nd: + X = X.reshape(len(X), -1) + if X.ndim >= 3 and not self.allow_nd: + raise ValueError("X cannot be d") + if sample_weight is not None: + assert sample_weight.shape[0] == X.shape[0], ( + "MockClassifier extra fit_param " + "sample_weight.shape[0] is {0}, should be {1}".format( + sample_weight.shape[0], X.shape[0] + ) + ) + if class_prior is not None: + assert class_prior.shape[0] == len(np.unique(y)), ( + "MockClassifier extra fit_param class_prior.shape[0]" + " is {0}, should be {1}".format(class_prior.shape[0], len(np.unique(y))) + ) + if sparse_sample_weight is not None: + fmt = ( + "MockClassifier extra fit_param sparse_sample_weight" + ".shape[0] is {0}, should be {1}" + ) + assert sparse_sample_weight.shape[0] == X.shape[0], fmt.format( + sparse_sample_weight.shape[0], X.shape[0] + ) + if sparse_param is not None: + fmt = ( + "MockClassifier extra fit_param sparse_param.shape " + "is ({0}, {1}), should be ({2}, {3})" + ) + assert sparse_param.shape == P.shape, fmt.format( + sparse_param.shape[0], + sparse_param.shape[1], + P.shape[0], + P.shape[1], + ) + self.classes_ = np.unique(y) + return self + + def predict(self, T): + if self.allow_nd: + T = T.reshape(len(T), -1) + return T[:, 0] + + def predict_proba(self, T): + return T + + def score(self, X=None, Y=None): + return 1.0 / (1 + np.abs(self.a)) + + def get_params(self, deep=False): + return {"a": self.a, "allow_nd": self.allow_nd} + + +# XXX: use 2D array, since 1D X is being detected as a single sample in +# check_consistent_length +X = np.ones((15, 2)) +y = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6]) +# The number of samples per class needs to be > n_splits, +# for StratifiedKFold(n_splits=3) +y2 = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) +P = np.eye(5) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score(coo_container): + 
clf = MockClassifier() + X_sparse = coo_container(X) + + for a in range(-10, 10): + clf.a = a + # Smoke test + scores = cross_val_score(clf, X, y2) + assert_array_equal(scores, clf.score(X, y2)) + + # test with multioutput y + multioutput_y = np.column_stack([y2, y2[::-1]]) + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + scores = cross_val_score(clf, X_sparse, y2) + assert_array_equal(scores, clf.score(X_sparse, y2)) + + # test with multioutput y + scores = cross_val_score(clf, X_sparse, multioutput_y) + assert_array_equal(scores, clf.score(X_sparse, multioutput_y)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + scores = cross_val_score(clf, X.tolist(), y2.tolist(), cv=3) + + clf = CheckingClassifier(check_y=list_check) + scores = cross_val_score(clf, X, y2.tolist(), cv=3) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + clf = MockClassifier(allow_nd=True) + scores = cross_val_score(clf, X_3d, y2) + + clf = MockClassifier(allow_nd=False) + with pytest.raises(ValueError): + cross_val_score(clf, X_3d, y2, error_score="raise") + + +def test_cross_validate_many_jobs(): + # regression test for #12154: cv='warn' with n_jobs>1 trigger a copy of + # the parameters leading to a failure in check_cv due to cv is 'warn' + # instead of cv == 'warn'. + X, y = load_iris(return_X_y=True) + clf = SVC(gamma="auto") + grid = GridSearchCV(clf, param_grid={"C": [1, 10]}) + cross_validate(grid, X, y, n_jobs=2) + + +def test_cross_validate_invalid_scoring_param(): + X, y = make_classification(random_state=0) + estimator = MockClassifier() + + # Test the errors + error_message_regexp = ".*must be unique strings.*" + + # List/tuple of callables should raise a message advising users to use + # dict of names to callables mapping + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate( + estimator, + X, + y, + scoring=(make_scorer(precision_score), make_scorer(accuracy_score)), + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=(make_scorer(precision_score),)) + + # So should empty lists/tuples + with pytest.raises(ValueError, match=error_message_regexp + "Empty list.*"): + cross_validate(estimator, X, y, scoring=()) + + # So should duplicated entries + with pytest.raises(ValueError, match=error_message_regexp + "Duplicate.*"): + cross_validate(estimator, X, y, scoring=("f1_micro", "f1_micro")) + + # Nested Lists should raise a generic error message + with pytest.raises(ValueError, match=error_message_regexp): + cross_validate(estimator, X, y, scoring=[[make_scorer(precision_score)]]) + + # Empty dict should raise invalid scoring error + with pytest.raises(ValueError, match="An empty dict"): + cross_validate(estimator, X, y, scoring=(dict())) + + multiclass_scorer = make_scorer(precision_recall_fscore_support) + + # Multiclass Scorers that return multiple values are not supported yet + # the warning message we're expecting to see + warning_message = ( + "Scoring failed. The score on this train-test " + f"partition for these parameters will be set to {np.nan}. 
" + "Details: \n" + ) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring=multiclass_scorer) + + with pytest.warns(UserWarning, match=warning_message): + cross_validate(estimator, X, y, scoring={"foo": multiclass_scorer}) + + +def test_cross_validate_nested_estimator(): + # Non-regression test to ensure that nested + # estimators are properly returned in a list + # https://github.com/scikit-learn/scikit-learn/pull/17745 + (X, y) = load_iris(return_X_y=True) + pipeline = Pipeline( + [ + ("imputer", SimpleImputer()), + ("classifier", MockClassifier()), + ] + ) + + results = cross_validate(pipeline, X, y, return_estimator=True) + estimators = results["estimator"] + + assert isinstance(estimators, list) + assert all(isinstance(estimator, Pipeline) for estimator in estimators) + + +@pytest.mark.parametrize("use_sparse", [False, True]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_validate(use_sparse: bool, csr_container): + # Compute train and test mse/r2 scores + cv = KFold() + + # Regression + X_reg, y_reg = make_regression(n_samples=30, random_state=0) + reg = Ridge(random_state=0) + + # Classification + X_clf, y_clf = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + + if use_sparse: + X_reg = csr_container(X_reg) + X_clf = csr_container(X_clf) + + for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): + # It's okay to evaluate regression metrics on classification too + mse_scorer = check_scoring(est, scoring="neg_mean_squared_error") + r2_scorer = check_scoring(est, scoring="r2") + train_mse_scores = [] + test_mse_scores = [] + train_r2_scores = [] + test_r2_scores = [] + fitted_estimators = [] + + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + train_mse_scores.append(mse_scorer(est, X[train], y[train])) + train_r2_scores.append(r2_scorer(est, X[train], y[train])) + test_mse_scores.append(mse_scorer(est, X[test], y[test])) + test_r2_scores.append(r2_scorer(est, X[test], y[test])) + fitted_estimators.append(est) + + train_mse_scores = np.array(train_mse_scores) + test_mse_scores = np.array(test_mse_scores) + train_r2_scores = np.array(train_r2_scores) + test_r2_scores = np.array(test_r2_scores) + fitted_estimators = np.array(fitted_estimators) + + scores = ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) + + # To ensure that the test does not suffer from + # large statistical fluctuations due to slicing small datasets, + # we pass the cross-validation instance + check_cross_validate_single_metric(est, X, y, scores, cv) + check_cross_validate_multi_metric(est, X, y, scores, cv) + + +def check_cross_validate_single_metric(clf, X, y, scores, cv): + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + # Test single metric evaluation when scoring is string or singleton list + for return_train_score, dict_len in ((True, 4), (False, 3)): + # Single metric passed as a string + if return_train_score: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=True, + cv=cv, + ) + assert_array_almost_equal(mse_scores_dict["train_score"], train_mse_scores) + else: + mse_scores_dict = cross_validate( + clf, + X, + y, + scoring="neg_mean_squared_error", + return_train_score=False, + cv=cv, + ) + assert isinstance(mse_scores_dict, dict) + assert len(mse_scores_dict) == dict_len + 
assert_array_almost_equal(mse_scores_dict["test_score"], test_mse_scores) + + # Single metric passed as a list + if return_train_score: + # It must be True by default - deprecated + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=True, cv=cv + ) + assert_array_almost_equal(r2_scores_dict["train_r2"], train_r2_scores, True) + else: + r2_scores_dict = cross_validate( + clf, X, y, scoring=["r2"], return_train_score=False, cv=cv + ) + assert isinstance(r2_scores_dict, dict) + assert len(r2_scores_dict) == dict_len + assert_array_almost_equal(r2_scores_dict["test_r2"], test_r2_scores) + + # Test return_estimator option + mse_scores_dict = cross_validate( + clf, X, y, scoring="neg_mean_squared_error", return_estimator=True, cv=cv + ) + for k, est in enumerate(mse_scores_dict["estimator"]): + est_coef = est.coef_.copy() + if issparse(est_coef): + est_coef = est_coef.toarray() + + fitted_est_coef = fitted_estimators[k].coef_.copy() + if issparse(fitted_est_coef): + fitted_est_coef = fitted_est_coef.toarray() + + assert_almost_equal(est_coef, fitted_est_coef) + assert_almost_equal(est.intercept_, fitted_estimators[k].intercept_) + + +def check_cross_validate_multi_metric(clf, X, y, scores, cv): + # Test multimetric evaluation when scoring is a list / dict + ( + train_mse_scores, + test_mse_scores, + train_r2_scores, + test_r2_scores, + fitted_estimators, + ) = scores + + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + return { + "r2": r2_score(y, y_pred), + "neg_mean_squared_error": -mean_squared_error(y, y_pred), + } + + all_scoring = ( + ("r2", "neg_mean_squared_error"), + { + "r2": make_scorer(r2_score), + "neg_mean_squared_error": "neg_mean_squared_error", + }, + custom_scorer, + ) + + keys_sans_train = { + "test_r2", + "test_neg_mean_squared_error", + "fit_time", + "score_time", + } + keys_with_train = keys_sans_train.union( + {"train_r2", "train_neg_mean_squared_error"} + ) + + for return_train_score in (True, False): + for scoring in all_scoring: + if return_train_score: + # return_train_score must be True by default - deprecated + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=True, cv=cv + ) + assert_array_almost_equal(cv_results["train_r2"], train_r2_scores) + assert_array_almost_equal( + cv_results["train_neg_mean_squared_error"], train_mse_scores + ) + else: + cv_results = cross_validate( + clf, X, y, scoring=scoring, return_train_score=False, cv=cv + ) + assert isinstance(cv_results, dict) + assert set(cv_results.keys()) == ( + keys_with_train if return_train_score else keys_sans_train + ) + assert_array_almost_equal(cv_results["test_r2"], test_r2_scores) + assert_array_almost_equal( + cv_results["test_neg_mean_squared_error"], test_mse_scores + ) + + # Make sure all the arrays are of np.ndarray type + assert isinstance(cv_results["test_r2"], np.ndarray) + assert isinstance(cv_results["test_neg_mean_squared_error"], np.ndarray) + assert isinstance(cv_results["fit_time"], np.ndarray) + assert isinstance(cv_results["score_time"], np.ndarray) + + # Ensure all the times are within sane limits + assert np.all(cv_results["fit_time"] >= 0) + assert np.all(cv_results["fit_time"] < 10) + assert np.all(cv_results["score_time"] >= 0) + assert np.all(cv_results["score_time"] < 10) + + +def test_cross_val_score_predict_groups(): + # Check if ValueError (when groups is None) propagates to cross_val_score + # and cross_val_predict + # And also check if groups is correctly passed to the cv object + X, y = 
make_classification(n_samples=20, n_classes=2, random_state=0) + + clf = SVC(kernel="linear") + + group_cvs = [ + LeaveOneGroupOut(), + LeavePGroupsOut(2), + GroupKFold(), + GroupShuffleSplit(), + ] + error_message = "The 'groups' parameter should not be None." + for cv in group_cvs: + with pytest.raises(ValueError, match=error_message): + cross_val_score(estimator=clf, X=X, y=y, cv=cv) + with pytest.raises(ValueError, match=error_message): + cross_val_predict(estimator=clf, X=X, y=y, cv=cv) + + +def test_cross_val_score_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + # 3 fold cross val is used so we need at least 3 samples per class + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_score(clf, X_df, y_ser, cv=3) + + +def test_cross_val_score_mask(): + # test that cross_val_score works with boolean masks + svm = SVC(kernel="linear") + iris = load_iris() + X, y = iris.data, iris.target + kfold = KFold(5) + scores_indices = cross_val_score(svm, X, y, cv=kfold) + kfold = KFold(5) + cv_masks = [] + for train, test in kfold.split(X, y): + mask_train = np.zeros(len(y), dtype=bool) + mask_test = np.zeros(len(y), dtype=bool) + mask_train[train] = 1 + mask_test[test] = 1 + cv_masks.append((mask_train, mask_test)) + scores_masks = cross_val_score(svm, X, y, cv=cv_masks) + assert_array_equal(scores_indices, scores_masks) + + +def test_cross_val_score_precomputed(): + # test for svm with precomputed kernel + svm = SVC(kernel="precomputed") + iris = load_iris() + X, y = iris.data, iris.target + linear_kernel = np.dot(X, X.T) + score_precomputed = cross_val_score(svm, linear_kernel, y) + svm = SVC(kernel="linear") + score_linear = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_linear) + + # test with callable + svm = SVC(kernel=lambda x, y: np.dot(x, y.T)) + score_callable = cross_val_score(svm, X, y) + assert_array_almost_equal(score_precomputed, score_callable) + + # Error raised for non-square X + svm = SVC(kernel="precomputed") + with pytest.raises(ValueError): + cross_val_score(svm, X, y) + + # test error is raised when the precomputed kernel is not array-like + # or sparse + with pytest.raises(ValueError): + cross_val_score(svm, linear_kernel.tolist(), y) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_fit_params(coo_container): + clf = MockClassifier() + n_samples = X.shape[0] + n_classes = len(np.unique(y)) + + W_sparse = coo_container( + (np.array([1]), (np.array([1]), np.array([0]))), shape=(15, 1) + ) + P_sparse = coo_container(np.eye(5)) + + DUMMY_INT = 42 + DUMMY_STR = "42" + DUMMY_OBJ = object() + + def assert_fit_params(clf): + # Function to test that the values are passed correctly to the + # classifier arguments for non-array type + + assert clf.dummy_int == DUMMY_INT + assert clf.dummy_str == DUMMY_STR + assert clf.dummy_obj == DUMMY_OBJ + + fit_params = { + "sample_weight": np.ones(n_samples), + "class_prior": np.full(n_classes, 1.0 / n_classes), + "sparse_sample_weight": W_sparse, + "sparse_param": P_sparse, + "dummy_int": DUMMY_INT, + "dummy_str": DUMMY_STR, + "dummy_obj": DUMMY_OBJ, + 
"callback": assert_fit_params, + } + cross_val_score(clf, X, y2, params=fit_params) + + +def test_cross_val_score_score_func(): + clf = MockClassifier() + _score_func_args = [] + + def score_func(y_test, y_predict): + _score_func_args.append((y_test, y_predict)) + return 1.0 + + with warnings.catch_warnings(record=True): + scoring = make_scorer(score_func) + score = cross_val_score(clf, X, y, scoring=scoring, cv=3) + assert_array_equal(score, [1.0, 1.0, 1.0]) + # Test that score function is called only 3 times (for cv=3) + assert len(_score_func_args) == 3 + + +def test_cross_val_score_with_score_func_classification(): + iris = load_iris() + clf = SVC(kernel="linear") + + # Default score (should be the accuracy score) + scores = cross_val_score(clf, iris.data, iris.target) + assert_array_almost_equal(scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # Correct classification score (aka. zero / one score) - should be the + # same as the default estimator score + zo_scores = cross_val_score(clf, iris.data, iris.target, scoring="accuracy") + assert_array_almost_equal(zo_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + # F1 score (class are balanced so f1_score should be equal to zero/one + # score + f1_scores = cross_val_score(clf, iris.data, iris.target, scoring="f1_weighted") + assert_array_almost_equal(f1_scores, [0.97, 1.0, 0.97, 0.97, 1.0], 2) + + +def test_cross_val_score_with_score_func_regression(): + X, y = make_regression(n_samples=30, n_features=20, n_informative=5, random_state=0) + reg = Ridge() + + # Default score of the Ridge regression estimator + scores = cross_val_score(reg, X, y) + assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # R2 score (aka. determination coefficient) - should be the + # same as the default estimator score + r2_scores = cross_val_score(reg, X, y, scoring="r2") + assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + # Mean squared error; this is a loss function, so "scores" are negative + neg_mse_scores = cross_val_score(reg, X, y, scoring="neg_mean_squared_error") + expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99]) + assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2) + + # Explained variance + scoring = make_scorer(explained_variance_score) + ev_scores = cross_val_score(reg, X, y, scoring=scoring) + assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_permutation_score(coo_container): + iris = load_iris() + X = iris.data + X_sparse = coo_container(X) + y = iris.target + svm = SVC(kernel="linear") + cv = StratifiedKFold(2) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + assert score > 0.9 + assert_almost_equal(pvalue, 0.0, 1) + + score_group, _, pvalue_group = permutation_test_score( + svm, + X, + y, + n_permutations=30, + cv=cv, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + assert score_group == score + assert pvalue_group == pvalue + + # check that we obtain the same results with a sparse representation + svm_sparse = SVC(kernel="linear") + cv_sparse = StratifiedKFold(2) + score_group, _, pvalue_group = permutation_test_score( + svm_sparse, + X_sparse, + y, + n_permutations=30, + cv=cv_sparse, + scoring="accuracy", + groups=np.ones(y.size), + random_state=0, + ) + + assert score_group == score + assert pvalue_group == pvalue + + # test with custom scoring object + def custom_score(y_true, 
y_pred): + return ((y_true == y_pred).sum() - (y_true != y_pred).sum()) / y_true.shape[0] + + scorer = make_scorer(custom_score) + score, _, pvalue = permutation_test_score( + svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0 + ) + assert_almost_equal(score, 0.93, 2) + assert_almost_equal(pvalue, 0.01, 3) + + # set random y + y = np.mod(np.arange(len(y)), 3) + + score, scores, pvalue = permutation_test_score( + svm, X, y, n_permutations=30, cv=cv, scoring="accuracy" + ) + + assert score < 0.5 + assert pvalue > 0.2 + + +def test_permutation_test_score_allow_nans(): + # Check that permutation_test_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + permutation_test_score(p, X, y) + + +def test_permutation_test_score_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + permutation_test_score(clf, X, y) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(1)}) + permutation_test_score(clf, X, y, params={"sample_weight": np.ones(10)}) + + +def test_cross_val_score_allow_nans(): + # Check that cross_val_score allows input data with NaNs + X = np.arange(200, dtype=np.float64).reshape(10, -1) + X[2, :] = np.nan + y = np.repeat([0, 1], X.shape[0] / 2) + p = Pipeline( + [ + ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)), + ("classifier", MockClassifier()), + ] + ) + cross_val_score(p, X, y) + + +def test_cross_val_score_multilabel(): + X = np.array( + [ + [-3, 4], + [2, 4], + [3, 3], + [0, 2], + [-3, 1], + [-2, 1], + [0, 0], + [-2, -1], + [-1, -2], + [1, -2], + ] + ) + y = np.array( + [[1, 1], [0, 1], [0, 1], [0, 1], [1, 1], [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]] + ) + clf = KNeighborsClassifier(n_neighbors=1) + scoring_micro = make_scorer(precision_score, average="micro") + scoring_macro = make_scorer(precision_score, average="macro") + scoring_samples = make_scorer(precision_score, average="samples") + score_micro = cross_val_score(clf, X, y, scoring=scoring_micro) + score_macro = cross_val_score(clf, X, y, scoring=scoring_macro) + score_samples = cross_val_score(clf, X, y, scoring=scoring_samples) + assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]) + assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict(coo_container): + X, y = load_diabetes(return_X_y=True) + cv = KFold() + + est = Ridge() + + # Naive loop (should be same as cross_val_predict): + preds2 = np.zeros_like(y) + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + preds2[test] = est.predict(X[test]) + + preds = cross_val_predict(est, X, y, cv=cv) + assert_array_almost_equal(preds, preds2) + + preds = cross_val_predict(est, X, y) + assert len(preds) == len(y) + + cv = LeaveOneOut() + preds = cross_val_predict(est, X, y, cv=cv) + assert len(preds) == len(y) + + Xsp = X.copy() + Xsp *= Xsp > np.median(Xsp) + Xsp = coo_container(Xsp) + preds = 
cross_val_predict(est, Xsp, y) + assert_array_almost_equal(len(preds), len(y)) + + preds = cross_val_predict(KMeans(n_init="auto"), X) + assert len(preds) == len(y) + + class BadCV: + def split(self, X, y=None, groups=None): + for i in range(4): + yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8]) + + with pytest.raises(ValueError): + cross_val_predict(est, X, y, cv=BadCV()) + + X, y = load_iris(return_X_y=True) + + warning_message = ( + r"Number of classes in training fold \(2\) does " + r"not match total number of classes \(3\). " + "Results may not be appropriate for your use case." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + cross_val_predict( + LogisticRegression(solver="liblinear"), + X, + y, + method="predict_proba", + cv=KFold(2), + ) + + +def test_cross_val_predict_decision_function_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict(LogisticRegression(), X, y, method="decision_function") + assert preds.shape == (50,) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, method="decision_function") + assert preds.shape == (150, 3) + + # This specifically tests imbalanced splits for binary + # classification with decision_function. This is only + # applicable to classifiers that can be fit on a single + # class. + X = X[:100] + y = y[:100] + error_message = ( + "Only 1 class/es in training fold," + " but 2 in overall dataset. This" + " is not supported for decision_function" + " with imbalanced folds. To fix " + "this, use a cross-validation technique " + "resulting in properly stratified folds" + ) + with pytest.raises(ValueError, match=error_message): + cross_val_predict( + RidgeClassifier(), X, y, method="decision_function", cv=KFold(2) + ) + + X, y = load_digits(return_X_y=True) + est = SVC(kernel="linear", decision_function_shape="ovo") + + preds = cross_val_predict(est, X, y, method="decision_function") + assert preds.shape == (1797, 45) + + ind = np.argsort(y) + X, y = X[ind], y[ind] + error_message_regexp = ( + r"Output shape \(599L?, 21L?\) of " + "decision_function does not match number of " + r"classes \(7\) in fold. 
Irregular " + "decision_function .*" + ) + with pytest.raises(ValueError, match=error_message_regexp): + cross_val_predict(est, X, y, cv=KFold(n_splits=3), method="decision_function") + + +def test_cross_val_predict_predict_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_proba") + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_proba") + assert preds.shape == (150, 3) + + +def test_cross_val_predict_predict_log_proba_shape(): + X, y = make_classification(n_classes=2, n_samples=50, random_state=0) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_log_proba") + assert preds.shape == (50, 2) + + X, y = load_iris(return_X_y=True) + + preds = cross_val_predict(LogisticRegression(), X, y, method="predict_log_proba") + assert preds.shape == (150, 3) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_predict_input_types(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + X_sparse = coo_container(X) + multioutput_y = np.column_stack([y, y[::-1]]) + + clf = Ridge(fit_intercept=False, random_state=0) + # 3 fold cv is used --> at least 3 samples per class + # Smoke test + predictions = cross_val_predict(clf, X, y) + assert predictions.shape == (150,) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert predictions.shape == (150, 2) + + predictions = cross_val_predict(clf, X_sparse, y) + assert_array_equal(predictions.shape, (150,)) + + # test with multioutput y + predictions = cross_val_predict(clf, X_sparse, multioutput_y) + assert_array_equal(predictions.shape, (150, 2)) + + # test with X and y as list + list_check = lambda x: isinstance(x, list) + clf = CheckingClassifier(check_X=list_check) + predictions = cross_val_predict(clf, X.tolist(), y.tolist()) + + clf = CheckingClassifier(check_y=list_check) + predictions = cross_val_predict(clf, X, y.tolist()) + + # test with X and y as list and non empty method + predictions = cross_val_predict( + LogisticRegression(), + X.tolist(), + y.tolist(), + method="decision_function", + ) + predictions = cross_val_predict( + LogisticRegression(), + X, + y.tolist(), + method="decision_function", + ) + + # test with 3d X and + X_3d = X[:, :, np.newaxis] + check_3d = lambda x: x.ndim == 3 + clf = CheckingClassifier(check_X=check_3d) + predictions = cross_val_predict(clf, X_3d, y) + assert_array_equal(predictions.shape, (150,)) + + +def test_cross_val_predict_pandas(): + # check cross_val_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + X_df, y_ser = InputFeatureType(X), TargetType(y2) + check_df = lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + cross_val_predict(clf, X_df, y_ser, cv=3) + + +def test_cross_val_predict_unbalanced(): + X, y = make_classification( + n_samples=100, + n_features=2, + n_redundant=0, + n_informative=2, + n_clusters_per_class=1, + random_state=1, + ) + # Change the first sample to a new class + y[0] = 2 + clf = LogisticRegression(random_state=1) + cv = StratifiedKFold(n_splits=2) + 
train, test = list(cv.split(X, y)) + yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba") + assert y[test[0]][0] == 2 # sanity check for further assertions + assert np.all(yhat_proba[test[0]][:, 2] == 0) + assert np.all(yhat_proba[test[0]][:, 0:1] > 0) + assert np.all(yhat_proba[test[1]] > 0) + assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape), decimal=12) + + +def test_cross_val_predict_y_none(): + # ensure that cross_val_predict works when y is None + mock_classifier = MockClassifier() + rng = np.random.RandomState(42) + X = rng.rand(100, 10) + y_hat = cross_val_predict(mock_classifier, X, y=None, cv=5, method="predict") + assert_allclose(X[:, 0], y_hat) + y_hat_proba = cross_val_predict( + mock_classifier, X, y=None, cv=5, method="predict_proba" + ) + assert_allclose(X, y_hat_proba) + + +@pytest.mark.parametrize("coo_container", COO_CONTAINERS) +def test_cross_val_score_sparse_fit_params(coo_container): + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + fit_params = {"sparse_sample_weight": coo_container(np.eye(X.shape[0]))} + a = cross_val_score(clf, X, y, params=fit_params, cv=3) + assert_array_equal(a, np.ones(3)) + + +def test_learning_curve(): + n_samples = 30 + n_splits = 3 + X, y = make_classification( + n_samples=n_samples, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(n_samples * ((n_splits - 1) / n_splits)) + for shuffle_train in [False, True]: + with warnings.catch_warnings(record=True) as w: + ( + train_sizes, + train_scores, + test_scores, + fit_times, + score_times, + ) = learning_curve( + estimator, + X, + y, + cv=KFold(n_splits=n_splits), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + return_times=True, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert train_scores.shape == (10, 3) + assert test_scores.shape == (10, 3) + assert fit_times.shape == (10, 3) + assert score_times.shape == (10, 3) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + # Cannot use assert_array_almost_equal for fit and score times because + # the values are hardware-dependant + assert fit_times.dtype == "float64" + assert score_times.dtype == "float64" + + # Test a custom cv splitter that can iterate only once + with warnings.catch_warnings(record=True) as w: + train_sizes2, train_scores2, test_scores2 = learning_curve( + estimator, + X, + y, + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + assert_array_almost_equal(train_scores2, train_scores) + assert_array_almost_equal(test_scores2, test_scores) + + +def test_learning_curve_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y=None, cv=3, train_sizes=np.linspace(0.1, 1.0, 10) + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + 
assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_verbose(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=3, verbose=1 + ) + finally: + out = sys.stdout.getvalue() + sys.stdout.close() + sys.stdout = old_stdout + + assert "[learning_curve]" in out + + +def test_learning_curve_incremental_learning_not_possible(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + # The mockup does not have partial_fit() + estimator = MockImprovingEstimator(1) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, exploit_incremental_learning=True) + + +def test_learning_curve_incremental_learning(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + for shuffle_train in [False, True]: + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + shuffle=shuffle_train, + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_incremental_learning_unsupervised(): + X, _ = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20) + train_sizes, train_scores, test_scores = learning_curve( + estimator, + X, + y=None, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_batch_and_incremental_learning_are_equal(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + train_sizes = np.linspace(0.2, 1.0, 5) + estimator = PassiveAggressiveClassifier(max_iter=1, tol=None, shuffle=False) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + train_sizes=train_sizes, + cv=3, + exploit_incremental_learning=True, + ) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=3, + train_sizes=train_sizes, + exploit_incremental_learning=False, + ) + + assert_array_equal(train_sizes_inc, train_sizes_batch) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def test_learning_curve_n_sample_range_out_of_bounds(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + 
n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.0, 1.0]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0.1, 1.1]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[0, 20]) + with pytest.raises(ValueError): + learning_curve(estimator, X, y, cv=3, train_sizes=[1, 21]) + + +def test_learning_curve_remove_duplicate_sample_sizes(): + X, y = make_classification( + n_samples=3, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(2) + warning_message = ( + "Removed duplicate entries from 'train_sizes'. Number of ticks " + "will be less than the size of 'train_sizes': 2 instead of 3." + ) + with pytest.warns(RuntimeWarning, match=warning_message): + train_sizes, _, _ = learning_curve( + estimator, X, y, cv=3, train_sizes=np.linspace(0.33, 1.0, 3) + ) + assert_array_equal(train_sizes, [1, 2]) + + +def test_learning_curve_with_boolean_indices(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockImprovingEstimator(20) + cv = KFold(n_splits=3) + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10) + ) + assert_array_equal(train_sizes, np.linspace(2, 20, 10)) + assert_array_almost_equal(train_scores.mean(axis=1), np.linspace(1.9, 1.0, 10)) + assert_array_almost_equal(test_scores.mean(axis=1), np.linspace(0.1, 1.0, 10)) + + +def test_learning_curve_with_shuffle(): + # Following test case was designed this way to verify the code + # changes made in pull request: #7506. + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + [19, 20], + [7, 8], + [9, 10], + [11, 12], + [13, 14], + [15, 16], + [17, 18], + ] + ) + y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4]) + groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4]) + # Splits on these groups fail without shuffle as the first iteration + # of the learning curve doesn't contain label 4 in the training set. 
+ estimator = PassiveAggressiveClassifier(max_iter=5, tol=None, shuffle=False) + + cv = GroupKFold(n_splits=2) + train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + ) + assert_array_almost_equal( + train_scores_batch.mean(axis=1), np.array([0.75, 0.3, 0.36111111]) + ) + assert_array_almost_equal( + test_scores_batch.mean(axis=1), np.array([0.36111111, 0.25, 0.25]) + ) + with pytest.raises(ValueError): + learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + error_score="raise", + ) + + train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve( + estimator, + X, + y, + cv=cv, + n_jobs=1, + train_sizes=np.linspace(0.3, 1.0, 3), + groups=groups, + shuffle=True, + random_state=2, + exploit_incremental_learning=True, + ) + assert_array_almost_equal( + train_scores_inc.mean(axis=1), train_scores_batch.mean(axis=1) + ) + assert_array_almost_equal( + test_scores_inc.mean(axis=1), test_scores_batch.mean(axis=1) + ) + + +def test_learning_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + learning_curve(clf, X, y, error_score="raise") + + err_msg = r"sample_weight.shape == \(1,\), expected \(2,\)!" + with pytest.raises(ValueError, match=err_msg): + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(1)} + ) + learning_curve( + clf, X, y, error_score="raise", params={"sample_weight": np.ones(10)} + ) + + +def test_learning_curve_incremental_learning_params(): + X, y = make_classification( + n_samples=30, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + estimator = MockIncrementalImprovingEstimator(20, ["sample_weight"]) + err_msg = r"Expected fit parameter\(s\) \['sample_weight'\] not seen." 
+ with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + ) + + err_msg = "Fit parameter sample_weight has length 3; expected" + with pytest.raises(AssertionError, match=err_msg): + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(3)}, + ) + + learning_curve( + estimator, + X, + y, + cv=3, + exploit_incremental_learning=True, + train_sizes=np.linspace(0.1, 1.0, 10), + error_score="raise", + params={"sample_weight": np.ones(2)}, + ) + + +def test_validation_curve(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + param_range = np.linspace(0, 1, 10) + with warnings.catch_warnings(record=True) as w: + train_scores, test_scores = validation_curve( + MockEstimatorWithParameter(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + if len(w) > 0: + raise RuntimeError("Unexpected warning: %r" % w[0].message) + + assert_array_almost_equal(train_scores.mean(axis=1), param_range) + assert_array_almost_equal(test_scores.mean(axis=1), 1 - param_range) + + +def test_validation_curve_clone_estimator(): + X, y = make_classification( + n_samples=2, + n_features=1, + n_informative=1, + n_redundant=0, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + + param_range = np.linspace(1, 0, 10) + _, _ = validation_curve( + MockEstimatorWithSingleFitCallAllowed(), + X, + y, + param_name="param", + param_range=param_range, + cv=2, + ) + + +def test_validation_curve_cv_splits_consistency(): + n_samples = 100 + n_splits = 5 + X, y = make_classification(n_samples=100, random_state=0) + + scores1 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples), + ) + # The OneTimeSplitter is a non-re-entrant cv splitter. Unless, the + # `split` is called for each parameter, the following should produce + # identical results for param setting 1 and param setting 2 as both have + # the same C value. + assert_array_almost_equal(*np.vsplit(np.hstack(scores1)[(0, 2, 1, 3), :], 2)) + + scores2 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits, shuffle=True), + ) + + # For scores2, compare the 1st and 2nd parameter's scores + # (Since the C value for 1st two param setting is 0.1, they must be + # consistent unless the train test folds differ between the param settings) + assert_array_almost_equal(*np.vsplit(np.hstack(scores2)[(0, 2, 1, 3), :], 2)) + + scores3 = validation_curve( + SVC(kernel="linear", random_state=0), + X, + y, + param_name="C", + param_range=[0.1, 0.1, 0.2, 0.2], + cv=KFold(n_splits=n_splits), + ) + + # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check. 
+ assert_array_almost_equal(np.array(scores3), np.array(scores1)) + + +def test_validation_curve_params(): + X = np.arange(100).reshape(10, 10) + y = np.array([0] * 5 + [1] * 5) + clf = CheckingClassifier(expected_sample_weight=True) + + err_msg = r"Expected sample_weight to be passed" + with pytest.raises(AssertionError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + ) + + err_msg = r"sample_weight.shape == \(1,\), expected \(8,\)!" + with pytest.raises(ValueError, match=err_msg): + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(1)}, + ) + validation_curve( + clf, + X, + y, + param_name="foo_param", + param_range=[1, 2, 3], + error_score="raise", + params={"sample_weight": np.ones(10)}, + ) + + +def test_check_is_permutation(): + rng = np.random.RandomState(0) + p = np.arange(100) + rng.shuffle(p) + assert _check_is_permutation(p, 100) + assert not _check_is_permutation(np.delete(p, 23), 100) + + p[0] = 23 + assert not _check_is_permutation(p, 100) + + # Check if the additional duplicate indices are caught + assert not _check_is_permutation(np.hstack((p, 0)), 100) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_cross_val_predict_sparse_prediction(csr_container): + # check that cross_val_predict gives same result for sparse and dense input + X, y = make_multilabel_classification( + n_classes=2, + n_labels=1, + allow_unlabeled=False, + return_indicator=True, + random_state=1, + ) + X_sparse = csr_container(X) + y_sparse = csr_container(y) + classif = OneVsRestClassifier(SVC(kernel="linear")) + preds = cross_val_predict(classif, X, y, cv=10) + preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10) + preds_sparse = preds_sparse.toarray() + assert_array_almost_equal(preds_sparse, preds) + + +def check_cross_val_predict_binary(est, X, y, method): + """Helper for tests of cross_val_predict with binary classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + if y.ndim == 1: + exp_shape = (len(X),) if method == "decision_function" else (len(X), 2) + else: + exp_shape = y.shape + expected_predictions = np.zeros(exp_shape) + for train, test in cv.split(X, y): + est = clone(est).fit(X[train], y[train]) + expected_predictions[test] = getattr(est, method)(X[test]) + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multiclass(est, X, y, method): + """Helper for tests of cross_val_predict with multiclass classification""" + cv = KFold(n_splits=3, shuffle=False) + + # Generate expected outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + expected_predictions = np.full( + (len(X), len(set(y))), default_values[method], dtype=np.float64 + ) + _, y_enc = np.unique(y, return_inverse=True) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + i_cols_fit = np.unique(y_enc[train]) + expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + assert_allclose( + cross_val_predict(est, 
X, tg, method=method, cv=cv), expected_predictions + ) + + +def check_cross_val_predict_multilabel(est, X, y, method): + """Check the output of cross_val_predict for 2D targets using + Estimators which provide a predictions as a list with one + element per class. + """ + cv = KFold(n_splits=3, shuffle=False) + + # Create empty arrays of the correct size to hold outputs + float_min = np.finfo(np.float64).min + default_values = { + "decision_function": float_min, + "predict_log_proba": float_min, + "predict_proba": 0, + } + n_targets = y.shape[1] + expected_preds = [] + for i_col in range(n_targets): + n_classes_in_label = len(set(y[:, i_col])) + if n_classes_in_label == 2 and method == "decision_function": + exp_shape = (len(X),) + else: + exp_shape = (len(X), n_classes_in_label) + expected_preds.append( + np.full(exp_shape, default_values[method], dtype=np.float64) + ) + + # Generate expected outputs + y_enc_cols = [ + np.unique(y[:, i], return_inverse=True)[1][:, np.newaxis] + for i in range(y.shape[1]) + ] + y_enc = np.concatenate(y_enc_cols, axis=1) + for train, test in cv.split(X, y_enc): + est = clone(est).fit(X[train], y_enc[train]) + fold_preds = getattr(est, method)(X[test]) + for i_col in range(n_targets): + fold_cols = np.unique(y_enc[train][:, i_col]) + if expected_preds[i_col].ndim == 1: + # Decision function with <=2 classes + expected_preds[i_col][test] = fold_preds[i_col] + else: + idx = np.ix_(test, fold_cols) + expected_preds[i_col][idx] = fold_preds[i_col] + + # Check actual outputs for several representations of y + for tg in [y, y + 1, y - 2, y.astype("str")]: + cv_predict_output = cross_val_predict(est, X, tg, method=method, cv=cv) + assert len(cv_predict_output) == len(expected_preds) + for i in range(len(cv_predict_output)): + assert_allclose(cv_predict_output[i], expected_preds[i]) + + +def check_cross_val_predict_with_method_binary(est): + # This test includes the decision_function with two classes. + # This is a special case: it has only one column of output. + X, y = make_classification(n_classes=2, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_binary(est, X, y, method) + + +def check_cross_val_predict_with_method_multiclass(est): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method(): + check_cross_val_predict_with_method_binary(LogisticRegression()) + check_cross_val_predict_with_method_multiclass(LogisticRegression()) + + +def test_cross_val_predict_method_checking(): + # Regression test for issue #9639. Tests that cross_val_predict does not + # check estimator methods (e.g. 
predict_proba) before fitting + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + est = SGDClassifier(loss="log_loss", random_state=2) + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_gridsearchcv_cross_val_predict_with_method(): + iris = load_iris() + X, y = iris.data, iris.target + X, y = shuffle(X, y, random_state=0) + est = GridSearchCV(LogisticRegression(random_state=42), {"C": [0.1, 1]}, cv=2) + for method in ["decision_function", "predict_proba", "predict_log_proba"]: + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_ovr(): + # OVR does multilabel predictions, but only arrays of + # binary indicator columns. The output of predict_proba + # is a 2D array with shape (n_samples, n_classes). + n_samp = 100 + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=n_samp, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + est = OneVsRestClassifier(LogisticRegression(solver="liblinear", random_state=0)) + for method in ["predict_proba", "decision_function"]: + check_cross_val_predict_binary(est, X, y, method=method) + + +class RFWithDecisionFunction(RandomForestClassifier): + # None of the current multioutput-multiclass estimators have + # decision function methods. Create a mock decision function + # to test the cross_val_predict function's handling of this case. + def decision_function(self, X): + probs = self.predict_proba(X) + msg = "This helper should only be used on multioutput-multiclass tasks" + assert isinstance(probs, list), msg + probs = [p[:, -1] if p.shape[1] == 2 else p for p in probs] + return probs + + +def test_cross_val_predict_with_method_multilabel_rf(): + # The RandomForest allows multiple classes in each label. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + n_classes = 4 + X, y = make_multilabel_classification( + n_samples=100, n_labels=3, n_classes=n_classes, n_features=5, random_state=42 + ) + y[:, 0] += y[:, 1] # Put three classes in the first column + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def test_cross_val_predict_with_method_rare_class(): + # Test a multiclass problem where one class will be missing from + # one of the CV training sets. + rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(14, 10)) + y = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 3]) + est = LogisticRegression() + for method in ["predict_proba", "predict_log_proba", "decision_function"]: + with warnings.catch_warnings(): + # Suppress warning about too few examples of a class + warnings.simplefilter("ignore") + check_cross_val_predict_multiclass(est, X, y, method) + + +def test_cross_val_predict_with_method_multilabel_rf_rare_class(): + # The RandomForest allows anything for the contents of the labels. + # Output of predict_proba is a list of outputs of predict_proba + # for each individual label. + # In this test, the first label has a class with a single example. + # We'll have one CV fold where the training data don't include it. 
+ rng = np.random.RandomState(0) + X = rng.normal(0, 1, size=(5, 10)) + y = np.array([[0, 0], [1, 1], [2, 1], [0, 1], [1, 0]]) + for method in ["predict_proba", "predict_log_proba"]: + est = RFWithDecisionFunction(n_estimators=5, random_state=0) + with warnings.catch_warnings(): + # Suppress "RuntimeWarning: divide by zero encountered in log" + warnings.simplefilter("ignore") + check_cross_val_predict_multilabel(est, X, y, method=method) + + +def get_expected_predictions(X, y, cv, classes, est, method): + expected_predictions = np.zeros([len(y), classes]) + func = getattr(est, method) + + for train, test in cv.split(X, y): + est.fit(X[train], y[train]) + expected_predictions_ = func(X[test]) + # To avoid 2 dimensional indexing + if method == "predict_proba": + exp_pred_test = np.zeros((len(test), classes)) + else: + exp_pred_test = np.full( + (len(test), classes), np.finfo(expected_predictions.dtype).min + ) + exp_pred_test[:, est.classes_] = expected_predictions_ + expected_predictions[test] = exp_pred_test + + return expected_predictions + + +def test_cross_val_predict_class_subset(): + X = np.arange(200).reshape(100, 2) + y = np.array([x // 10 for x in range(100)]) + classes = 10 + + kfold3 = KFold(n_splits=3) + kfold4 = KFold(n_splits=4) + + le = LabelEncoder() + + methods = ["decision_function", "predict_proba", "predict_log_proba"] + for method in methods: + est = LogisticRegression() + + # Test with n_splits=3 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + + # Runs a naive loop (should be same as cross_val_predict): + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Test with n_splits=4 + predictions = cross_val_predict(est, X, y, method=method, cv=kfold4) + expected_predictions = get_expected_predictions( + X, y, kfold4, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + # Testing unordered labels + y = shuffle(np.repeat(range(10), 10), random_state=0) + predictions = cross_val_predict(est, X, y, method=method, cv=kfold3) + y = le.fit_transform(y) + expected_predictions = get_expected_predictions( + X, y, kfold3, classes, est, method + ) + assert_array_almost_equal(expected_predictions, predictions) + + +def test_score_memmap(): + # Ensure a scalar score of memmap type is accepted + iris = load_iris() + X, y = iris.data, iris.target + clf = MockClassifier() + tf = tempfile.NamedTemporaryFile(mode="wb", delete=False) + tf.write(b"Hello world!!!!!") + tf.close() + scores = np.memmap(tf.name, dtype=np.float64) + score = np.memmap(tf.name, shape=(), mode="r", dtype=np.float64) + try: + cross_val_score(clf, X, y, scoring=lambda est, X, y: score) + with pytest.raises(ValueError): + cross_val_score(clf, X, y, scoring=lambda est, X, y: scores) + finally: + # Best effort to release the mmap file handles before deleting the + # backing file under Windows + scores, score = None, None + for _ in range(3): + try: + os.unlink(tf.name) + break + except OSError: + sleep(1.0) + + +def test_permutation_test_score_pandas(): + # check permutation_test_score doesn't destroy pandas dataframe + types = [(MockDataFrame, MockDataFrame)] + try: + from pandas import DataFrame, Series + + types.append((Series, DataFrame)) + except ImportError: + pass + for TargetType, InputFeatureType in types: + # X dataframe, y series + iris = load_iris() + X, y = iris.data, iris.target + X_df, y_ser = InputFeatureType(X), TargetType(y) + check_df = 
lambda x: isinstance(x, InputFeatureType) + check_series = lambda x: isinstance(x, TargetType) + clf = CheckingClassifier(check_X=check_df, check_y=check_series) + permutation_test_score(clf, X_df, y_ser) + + +def test_fit_and_score_failing(): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + train, test = np.arange(0, 5), np.arange(5, 9) + fit_and_score_args = dict( + estimator=failing_clf, + X=X, + y=None, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters=None, + fit_params=None, + score_params=None, + ) + # passing error score to trigger the warning message + fit_and_score_args["error_score"] = "raise" + # check if exception was raised, with default error_score='raise' + with pytest.raises(ValueError, match="Failing classifier failed as required"): + _fit_and_score(**fit_and_score_args) + + assert failing_clf.score() == 0.0 # FailingClassifier coverage + + +def test_fit_and_score_working(): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + # Test return_parameters option + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=dict(), + train=train, + test=test, + verbose=0, + parameters={"max_iter": 100, "tol": 0.1}, + fit_params=None, + score_params=None, + return_parameters=True, + ) + result = _fit_and_score(**fit_and_score_args) + assert result["parameters"] == fit_and_score_args["parameters"] + + +class DataDependentFailingClassifier(BaseEstimator): + def __init__(self, max_x_value=None): + self.max_x_value = max_x_value + + def fit(self, X, y=None): + num_values_too_high = (X > self.max_x_value).sum() + if num_values_too_high: + raise ValueError( + f"Classifier fit failed with {num_values_too_high} values too high" + ) + + def score(self, X=None, Y=None): + return 0.0 + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_some_failing_fits_warning(error_score): + # Create a failing classifier to deliberately fail + failing_clf = DataDependentFailingClassifier(max_x_value=8) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + # passing error score to trigger the warning message + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 3, "error_score": error_score} + # check if the warning message type is as expected + + individual_fit_error_message = ( + "ValueError: Classifier fit failed with 1 values too high" + ) + warning_message = re.compile( + ( + "2 fits failed.+total of 3.+The score on these" + " train-test partitions for these parameters will be set to" + f" {cross_validate_kwargs['error_score']}.+{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.warns(FitFailedWarning, match=warning_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +@pytest.mark.parametrize("error_score", [np.nan, 0]) +def test_cross_validate_all_failing_fits_error(error_score): + # Create a failing classifier to deliberately fail + failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) + # dummy X data + X = np.arange(1, 10) + y = np.ones(9) + + cross_validate_args = [failing_clf, X, y] + cross_validate_kwargs = {"cv": 7, "error_score": error_score} + + individual_fit_error_message = "ValueError: Failing classifier failed as required" + error_message = re.compile( + ( + "All the 7 fits failed.+your model is misconfigured.+" + 
f"{individual_fit_error_message}" + ), + flags=re.DOTALL, + ) + + with pytest.raises(ValueError, match=error_message): + cross_validate(*cross_validate_args, **cross_validate_kwargs) + + +def _failing_scorer(estimator, X, y, error_msg): + raise ValueError(error_msg) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +def test_cross_val_score_failing_scorer(error_score): + # check that an estimator can fail during scoring in `cross_val_score` and + # that we can optionally replaced it with `error_score` + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + scores = cross_val_score( + clf, X, y, cv=3, scoring=failing_scorer, error_score=error_score + ) + assert_allclose(scores, error_score) + + +@pytest.mark.filterwarnings("ignore:lbfgs failed to converge") +@pytest.mark.parametrize("error_score", [np.nan, 0, "raise"]) +@pytest.mark.parametrize("return_train_score", [True, False]) +@pytest.mark.parametrize("with_multimetric", [False, True]) +def test_cross_validate_failing_scorer( + error_score, return_train_score, with_multimetric +): + # Check that an estimator can fail during scoring in `cross_validate` and + # that we can optionally replace it with `error_score`. In the multimetric + # case also check the result of a non-failing scorer where the other scorers + # are failing. + X, y = load_iris(return_X_y=True) + clf = LogisticRegression(max_iter=5).fit(X, y) + + error_msg = "This scorer is supposed to fail!!!" + failing_scorer = partial(_failing_scorer, error_msg=error_msg) + if with_multimetric: + non_failing_scorer = make_scorer(mean_squared_error) + scoring = { + "score_1": failing_scorer, + "score_2": non_failing_scorer, + "score_3": failing_scorer, + } + else: + scoring = failing_scorer + + if error_score == "raise": + with pytest.raises(ValueError, match=error_msg): + cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + else: + warning_msg = ( + "Scoring failed. The score on this train-test partition for " + f"these parameters will be set to {error_score}" + ) + with pytest.warns(UserWarning, match=warning_msg): + results = cross_validate( + clf, + X, + y, + cv=3, + scoring=scoring, + return_train_score=return_train_score, + error_score=error_score, + ) + for key in results: + if "_score" in key: + if "_score_2" in key: + # check the test (and optionally train) score for the + # scorer that should be non-failing + for i in results[key]: + assert isinstance(i, float) + else: + # check the test (and optionally train) score for all + # scorers that should be assigned to `error_score`. + assert_allclose(results[key], error_score) + + +def three_params_scorer(i, j, k): + return 3.4213 + + +@pytest.mark.parametrize( + "train_score, scorer, verbose, split_prg, cdt_prg, expected", + [ + ( + False, + three_params_scorer, + 2, + (1, 3), + (0, 1), + r"\[CV\] END ...................................................." 
+ r" total time= 0.\ds", + ), + ( + True, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 3, + (1, 3), + (0, 1), + r"\[CV 2/3\] END sc1: \(train=3.421, test=3.421\) sc2: " + r"\(train=3.421, test=3.421\) total time= 0.\ds", + ), + ( + False, + _MultimetricScorer( + scorers={"sc1": three_params_scorer, "sc2": three_params_scorer} + ), + 10, + (1, 3), + (0, 1), + r"\[CV 2/3; 1/1\] END ....... sc1: \(test=3.421\) sc2: \(test=3.421\)" + r" total time= 0.\ds", + ), + ], +) +def test_fit_and_score_verbosity( + capsys, train_score, scorer, verbose, split_prg, cdt_prg, expected +): + X, y = make_classification(n_samples=30, random_state=0) + clf = SVC(kernel="linear", random_state=0) + train, test = next(ShuffleSplit().split(X)) + + # test print without train score + fit_and_score_args = dict( + estimator=clf, + X=X, + y=y, + scorer=scorer, + train=train, + test=test, + verbose=verbose, + parameters=None, + fit_params=None, + score_params=None, + return_train_score=train_score, + split_progress=split_prg, + candidate_progress=cdt_prg, + ) + _fit_and_score(**fit_and_score_args) + out, _ = capsys.readouterr() + outlines = out.split("\n") + if len(outlines) > 2: + assert re.match(expected, outlines[1]) + else: + assert re.match(expected, outlines[0]) + + +def test_score(): + error_message = "scoring must return a number, got None" + + def two_params_scorer(estimator, X_test): + return None + + with pytest.raises(ValueError, match=error_message): + _score( + estimator=None, + X_test=None, + y_test=None, + scorer=two_params_scorer, + score_params=None, + error_score=np.nan, + ) + + +def test_callable_multimetric_confusion_matrix_cross_validate(): + def custom_scorer(clf, X, y): + y_pred = clf.predict(X) + cm = confusion_matrix(y, y_pred) + return {"tn": cm[0, 0], "fp": cm[0, 1], "fn": cm[1, 0], "tp": cm[1, 1]} + + X, y = make_classification(n_samples=40, n_features=4, random_state=42) + est = LinearSVC(random_state=42) + est.fit(X, y) + cv_results = cross_validate(est, X, y, cv=5, scoring=custom_scorer) + + score_names = ["tn", "fp", "fn", "tp"] + for name in score_names: + assert "test_{}".format(name) in cv_results + + +def test_learning_curve_partial_fit_regressors(): + """Check that regressors with partial_fit is supported. + + Non-regression test for #22981. 
+ """ + X, y = make_regression(random_state=42) + + # Does not error + learning_curve(MLPRegressor(), X, y, exploit_incremental_learning=True, cv=2) + + +def test_learning_curve_some_failing_fits_warning(global_random_seed): + """Checks for fit failures in `learning_curve` and raises the required warning""" + + X, y = make_classification( + n_samples=30, + n_classes=3, + n_informative=6, + shuffle=False, + random_state=global_random_seed, + ) + # sorting the target to trigger SVC error on the 2 first splits because a single + # class is present + sorted_idx = np.argsort(y) + X, y = X[sorted_idx], y[sorted_idx] + + svc = SVC() + warning_message = "10 fits failed out of a total of 25" + + with pytest.warns(FitFailedWarning, match=warning_message): + _, train_score, test_score, *_ = learning_curve( + svc, X, y, cv=5, error_score=np.nan + ) + + # the first 2 splits should lead to warnings and thus np.nan scores + for idx in range(2): + assert np.isnan(train_score[idx]).all() + assert np.isnan(test_score[idx]).all() + + for idx in range(2, train_score.shape[0]): + assert not np.isnan(train_score[idx]).any() + assert not np.isnan(test_score[idx]).any() + + +def test_cross_validate_return_indices(global_random_seed): + """Check the behaviour of `return_indices` in `cross_validate`.""" + X, y = load_iris(return_X_y=True) + X = scale(X) # scale features for better convergence + estimator = LogisticRegression() + + cv = KFold(n_splits=3, shuffle=True, random_state=global_random_seed) + cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=False) + assert "indices" not in cv_results + + cv_results = cross_validate(estimator, X, y, cv=cv, n_jobs=2, return_indices=True) + assert "indices" in cv_results + train_indices = cv_results["indices"]["train"] + test_indices = cv_results["indices"]["test"] + assert len(train_indices) == cv.n_splits + assert len(test_indices) == cv.n_splits + + assert_array_equal([indices.size for indices in train_indices], 100) + assert_array_equal([indices.size for indices in test_indices], 50) + + for split_idx, (expected_train_idx, expected_test_idx) in enumerate(cv.split(X, y)): + assert_array_equal(train_indices[split_idx], expected_train_idx) + assert_array_equal(test_indices[split_idx], expected_test_idx) + + +# Tests for metadata routing in cross_val* and in *curve +# ====================================================== + + +# TODO(1.8): remove `learning_curve`, `validation_curve` and `permutation_test_score`. 
+@pytest.mark.parametrize( + "func, extra_args", + [ + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +def test_fit_param_deprecation(func, extra_args): + """Check that we warn about deprecating `fit_params`.""" + with pytest.warns(FutureWarning, match="`fit_params` is deprecated"): + func( + estimator=ConsumingClassifier(), X=X, y=y, cv=2, fit_params={}, **extra_args + ) + + with pytest.raises( + ValueError, match="`params` and `fit_params` cannot both be provided" + ): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + fit_params={}, + params={}, + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_groups_with_routing_validation(func, extra_args): + """Check that we raise an error if `groups` are passed to the cv method instead + of `params` when metadata routing is enabled. + """ + with pytest.raises(ValueError, match="`groups` can only be passed if"): + func( + estimator=ConsumingClassifier(), + X=X, + y=y, + groups=[], + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_cross_validate_params_none(func, extra_args): + """Test that no errors are raised when passing `params=None`, which is the + default value. 
+ Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/30447 + """ + X, y = make_classification(n_samples=100, n_classes=2, random_state=0) + func(estimator=ConsumingClassifier(), X=X, y=y, **extra_args) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_passed_unrequested_metadata(func, extra_args): + """Check that we raise an error when passing metadata that is not + requested.""" + + err_msg = re.escape( + "[metadata] are passed but are not explicitly set as requested or not " + "requested for ConsumingClassifier.fit, which is used within" + ) + with pytest.raises(UnsetMetadataPassedError, match=err_msg): + func( + estimator=ConsumingClassifier(), + X=X, + y=y2, + params=dict(metadata=[]), + **extra_args, + ) + + # cross_val_predict doesn't use scoring + if func == cross_val_predict: + return + + err_msg = re.escape( + "[metadata] are passed but are not explicitly set as requested or not " + "requested for ConsumingClassifier.score, which is used within" + ) + with pytest.raises(UnsetMetadataPassedError, match=err_msg): + func( + estimator=ConsumingClassifier() + .set_fit_request(metadata=True) + .set_partial_fit_request(metadata=True), + X=X, + y=y2, + params=dict(metadata=[]), + **extra_args, + ) + + +@pytest.mark.parametrize( + "func, extra_args", + [ + (cross_validate, {}), + (cross_val_score, {}), + (cross_val_predict, {}), + (learning_curve, {}), + (permutation_test_score, {}), + (validation_curve, {"param_name": "alpha", "param_range": np.array([1])}), + ], +) +@config_context(enable_metadata_routing=True) +def test_validation_functions_routing(func, extra_args): + """Check that the respective cv method is properly dispatching the metadata + to the consumer.""" + scorer_registry = _Registry() + scorer = ConsumingScorer(registry=scorer_registry).set_score_request( + sample_weight="score_weights", metadata="score_metadata" + ) + splitter_registry = _Registry() + splitter = ConsumingSplitter(registry=splitter_registry).set_split_request( + groups="split_groups", metadata="split_metadata" + ) + estimator_registry = _Registry() + estimator = ConsumingClassifier(registry=estimator_registry).set_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + score_weights = rng.rand(n_samples) + score_metadata = rng.rand(n_samples) + split_groups = rng.randint(0, 3, n_samples) + split_metadata = rng.rand(n_samples) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + scoring_args = { + cross_validate: dict(scoring=dict(my_scorer=scorer, accuracy="accuracy")), + cross_val_score: dict(scoring=scorer), + learning_curve: dict(scoring=scorer), + validation_curve: dict(scoring=scorer), + permutation_test_score: dict(scoring=scorer), + cross_val_predict: dict(), + } + + params = dict( + split_groups=split_groups, + split_metadata=split_metadata, + fit_sample_weight=fit_sample_weight, + fit_metadata=fit_metadata, + ) + + if func is not cross_val_predict: + params.update( + score_weights=score_weights, + score_metadata=score_metadata, + ) + + func( + estimator, + X=X, + y=y, + cv=splitter, + **scoring_args[func], + **extra_args, + params=params, + ) + + if func is not cross_val_predict: + 
# cross_val_predict doesn't need a scorer + assert len(scorer_registry) + for _scorer in scorer_registry: + check_recorded_metadata( + obj=_scorer, + method="score", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=score_weights, + metadata=score_metadata, + ) + + assert len(splitter_registry) + for _splitter in splitter_registry: + check_recorded_metadata( + obj=_splitter, + method="split", + parent=func.__name__, + groups=split_groups, + metadata=split_metadata, + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="fit", + parent=func.__name__, + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +@config_context(enable_metadata_routing=True) +def test_learning_curve_exploit_incremental_learning_routing(): + """Test that learning_curve routes metadata to the estimator correctly while + partial_fitting it with `exploit_incremental_learning=True`.""" + + n_samples = _num_samples(X) + rng = np.random.RandomState(0) + fit_sample_weight = rng.rand(n_samples) + fit_metadata = rng.rand(n_samples) + + estimator_registry = _Registry() + estimator = ConsumingClassifier( + registry=estimator_registry + ).set_partial_fit_request( + sample_weight="fit_sample_weight", metadata="fit_metadata" + ) + + learning_curve( + estimator, + X=X, + y=y, + cv=ConsumingSplitter(), + exploit_incremental_learning=True, + params=dict(fit_sample_weight=fit_sample_weight, fit_metadata=fit_metadata), + ) + + assert len(estimator_registry) + for _estimator in estimator_registry: + check_recorded_metadata( + obj=_estimator, + method="partial_fit", + parent="learning_curve", + split_params=("sample_weight", "metadata"), + sample_weight=fit_sample_weight, + metadata=fit_metadata, + ) + + +# End of metadata routing tests +# ============================= diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e0de99f5e7e37bb92643ad29f3c859c689d4918 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/__init__.py @@ -0,0 +1,42 @@ +"""The k-nearest neighbors algorithms.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._ball_tree import BallTree +from ._base import VALID_METRICS, VALID_METRICS_SPARSE, sort_graph_by_row_values +from ._classification import KNeighborsClassifier, RadiusNeighborsClassifier +from ._graph import ( + KNeighborsTransformer, + RadiusNeighborsTransformer, + kneighbors_graph, + radius_neighbors_graph, +) +from ._kd_tree import KDTree +from ._kde import KernelDensity +from ._lof import LocalOutlierFactor +from ._nca import NeighborhoodComponentsAnalysis +from ._nearest_centroid import NearestCentroid +from ._regression import KNeighborsRegressor, RadiusNeighborsRegressor +from ._unsupervised import NearestNeighbors + +__all__ = [ + "VALID_METRICS", + "VALID_METRICS_SPARSE", + "BallTree", + "KDTree", + "KNeighborsClassifier", + "KNeighborsRegressor", + "KNeighborsTransformer", + "KernelDensity", + "LocalOutlierFactor", + "NearestCentroid", + "NearestNeighbors", + "NeighborhoodComponentsAnalysis", + "RadiusNeighborsClassifier", + "RadiusNeighborsRegressor", + "RadiusNeighborsTransformer", + "kneighbors_graph", + "radius_neighbors_graph", + "sort_graph_by_row_values", +] diff --git 
a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_ball_tree.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_ball_tree.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..44d876187c54f370a6acaa72645c39371526fac8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_ball_tree.pyx.tp @@ -0,0 +1,284 @@ +{{py: + +# Generated file: _ball_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +}} + + +__all__ = ['BallTree', 'BallTree64', 'BallTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'BallTree{{name_suffix}}', + 'binary_tree': 'ball_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'BrayCurtisDistance{{name_suffix}}', + 'CanberraDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'DiceDistance{{name_suffix}}', + 'EuclideanDistance{{name_suffix}}', + 'HammingDistance{{name_suffix}}', + 'HaversineDistance{{name_suffix}}', + 'JaccardDistance{{name_suffix}}', + 'MahalanobisDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}', + 'PyFuncDistance{{name_suffix}}', + 'RogersTanimotoDistance{{name_suffix}}', + 'RussellRaoDistance{{name_suffix}}', + 'SEuclideanDistance{{name_suffix}}', + 'SokalMichenerDistance{{name_suffix}}', + 'SokalSneathDistance{{name_suffix}}', + 'WMinkowskiDistance{{name_suffix}}', +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit BallTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class BallTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + +#---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a Ball Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. 
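#
# As a concrete example, for the Euclidean metric the reduced distance
# between p = (0, 0) and q = (3, 4) is 3**2 + 4**2 = 25, while the true
# distance is sqrt(25) = 5.  Since sqrt is monotonic, comparisons and
# pruning decisions based on reduced distances agree with those based on
# true distances; converting back to a true distance (as done with
# _rdist_to_dist when setting the node radius below) is only needed when
# a distance is actually reported.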
+ +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((1, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + + cdef intp_t i, j + cdef float64_t radius + cdef const {{INPUT_DTYPE_t}} *this_pt + + cdef intp_t* idx_array = &tree.idx_array[0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef {{INPUT_DTYPE_t}}* centroid = &tree.node_bounds[0, i_node, 0] + + cdef bint with_sample_weight = tree.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t sum_weight_node + if with_sample_weight: + sample_weight = &tree.sample_weight[0] + + # determine Node centroid + for j in range(n_features): + centroid[j] = 0 + + if with_sample_weight: + sum_weight_node = 0 + for i in range(idx_start, idx_end): + sum_weight_node += sample_weight[idx_array[i]] + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] * sample_weight[idx_array[i]] + + for j in range(n_features): + centroid[j] /= sum_weight_node + else: + for i in range(idx_start, idx_end): + this_pt = data + n_features * idx_array[i] + for j from 0 <= j < n_features: + centroid[j] += this_pt[j] + + for j in range(n_features): + centroid[j] /= n_points + + # determine Node radius + radius = 0 + for i in range(idx_start, idx_end): + radius = fmax(radius, + tree.rdist(centroid, + data + n_features * idx_array[i], + n_features)) + + node_data[i_node].radius = tree.dist_metric._rdist_to_dist(radius) + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + return 0 + + +cdef inline float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return fmax(0, dist_pt - tree.node_data[i_node].radius) + + +cdef inline float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + return dist_pt + tree.node_data[i_node].radius + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef float64_t dist_pt = tree.dist(pt, &tree.node_bounds[0, i_node, 0], + tree.data.shape[1]) + cdef float64_t rad = tree.node_data[i_node].radius + min_dist[0] = fmax(0, dist_pt - rad) + max_dist[0] = dist_pt + rad + return 0 + + +cdef inline float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a 
point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + min_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + if tree.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + else: + return tree.dist_metric._dist_to_rdist( + max_dist{{name_suffix}}(tree, i_node, pt) + ) + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return fmax(0, (dist_pt - tree1.node_data[i_node1].radius + - tree2.node_data[i_node2].radius)) + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum distance between two nodes""" + cdef float64_t dist_pt = tree1.dist(&tree2.node_bounds[0, i_node2, 0], + &tree1.node_bounds[0, i_node1, 0], + tree1.data.shape[1]) + return (dist_pt + tree1.node_data[i_node1].radius + + tree2.node_data[i_node2].radius) + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the minimum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + min_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """compute the maximum reduced distance between two nodes""" + if tree1.euclidean: + return euclidean_dist_to_rdist{{name_suffix}}( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + else: + return tree1.dist_metric._dist_to_rdist( + max_dist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class BallTree(BallTree64): + __doc__ = CLASS_DOC.format(BinaryTree="BallTree") + pass diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_base.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..767eee1358aa873808ab7796d080cea06bae97bc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_base.py @@ -0,0 +1,1404 @@ +"""Base and mixin classes for nearest neighbors.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import numbers +import warnings +from abc import ABCMeta, abstractmethod +from functools import partial +from numbers import Integral, Real + +import numpy as np +from joblib import effective_n_jobs +from scipy.sparse import csr_matrix, issparse + +from ..base import BaseEstimator, MultiOutputMixin, is_classifier +from ..exceptions import 
DataConversionWarning, EfficiencyWarning +from ..metrics import DistanceMetric, pairwise_distances_chunked +from ..metrics._pairwise_distances_reduction import ( + ArgKmin, + RadiusNeighbors, +) +from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..utils import ( + check_array, + gen_even_slices, + get_tags, +) +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.fixes import parse_version, sp_base_version +from ..utils.multiclass import check_classification_targets +from ..utils.parallel import Parallel, delayed +from ..utils.validation import _to_object_array, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +SCIPY_METRICS = [ + "braycurtis", + "canberra", + "chebyshev", + "correlation", + "cosine", + "dice", + "hamming", + "jaccard", + "mahalanobis", + "minkowski", + "rogerstanimoto", + "russellrao", + "seuclidean", + "sokalsneath", + "sqeuclidean", + "yule", +] +if sp_base_version < parse_version("1.17"): + # Deprecated in SciPy 1.15 and removed in SciPy 1.17 + SCIPY_METRICS += ["sokalmichener"] +if sp_base_version < parse_version("1.11"): + # Deprecated in SciPy 1.9 and removed in SciPy 1.11 + SCIPY_METRICS += ["kulsinski"] +if sp_base_version < parse_version("1.9"): + # Deprecated in SciPy 1.0 and removed in SciPy 1.9 + SCIPY_METRICS += ["matching"] + +VALID_METRICS = dict( + ball_tree=BallTree.valid_metrics, + kd_tree=KDTree.valid_metrics, + # The following list comes from the + # sklearn.metrics.pairwise doc string + brute=sorted(set(PAIRWISE_DISTANCE_FUNCTIONS).union(SCIPY_METRICS)), +) + +VALID_METRICS_SPARSE = dict( + ball_tree=[], + kd_tree=[], + brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - {"haversine", "nan_euclidean"}), +) + + +def _get_weights(dist, weights): + """Get the weights from an array of distances and a parameter ``weights``. + + Assume weights have already been validated. + + Parameters + ---------- + dist : ndarray + The input distances. + + weights : {'uniform', 'distance'}, callable or None + The kind of weighting used. + + Returns + ------- + weights_arr : array of the same shape as ``dist`` + If ``weights == 'uniform'``, then returns None. + """ + if weights in (None, "uniform"): + return None + + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + + if callable(weights): + return weights(dist) + + +def _is_sorted_by_data(graph): + """Return whether the graph's non-zero entries are sorted by data. + + The non-zero entries are stored in graph.data and graph.indices. + For each row (or sample), the non-zero entries can be either: + - sorted by indices, as after graph.sort_indices(); + - sorted by data, as after _check_precomputed(graph); + - not sorted. 
+ + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + Returns + ------- + res : bool + Whether input graph is sorted by data. + """ + assert graph.format == "csr" + out_of_order = graph.data[:-1] > graph.data[1:] + line_change = np.unique(graph.indptr[1:-1] - 1) + line_change = line_change[line_change < out_of_order.shape[0]] + return out_of_order.sum() == out_of_order[line_change].sum() + + +def _check_precomputed(X): + """Check precomputed distance matrix. + + If the precomputed distance matrix is sparse, it checks that the non-zero + entries are sorted by distances. If not, the matrix is copied and sorted. + + Parameters + ---------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + + Returns + ------- + X : {sparse matrix, array-like}, (n_samples, n_samples) + Distance matrix to other samples. X may be a sparse matrix, in which + case only non-zero elements may be considered neighbors. + """ + if not issparse(X): + X = check_array(X, ensure_non_negative=True, input_name="X") + return X + else: + graph = X + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + "Sparse matrix in {!r} format is not supported due to " + "its handling of explicit zeros".format(graph.format) + ) + copied = graph.format != "csr" + graph = check_array( + graph, + accept_sparse="csr", + ensure_non_negative=True, + input_name="precomputed distance matrix", + ) + graph = sort_graph_by_row_values(graph, copy=not copied, warn_when_not_sorted=True) + + return graph + + +@validate_params( + { + "graph": ["sparse matrix"], + "copy": ["boolean"], + "warn_when_not_sorted": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def sort_graph_by_row_values(graph, copy=False, warn_when_not_sorted=True): + """Sort a sparse graph such that each row is stored with increasing values. + + .. versionadded:: 1.2 + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is converted to CSR format if not already. + + copy : bool, default=False + If True, the graph is copied before sorting. If False, the sorting is + performed inplace. If the graph is not of CSR format, `copy` must be + True to allow the conversion to CSR format, otherwise an error is + raised. + + warn_when_not_sorted : bool, default=True + If True, a :class:`~sklearn.exceptions.EfficiencyWarning` is raised + when the input graph is not sorted by row values. + + Returns + ------- + graph : sparse matrix of shape (n_samples, n_samples) + Distance matrix to other samples, where only non-zero elements are + considered neighbors. Matrix is in CSR format. + + Examples + -------- + >>> from scipy.sparse import csr_matrix + >>> from sklearn.neighbors import sort_graph_by_row_values + >>> X = csr_matrix( + ... [[0., 3., 1.], + ... [3., 0., 2.], + ... [1., 2., 0.]]) + >>> X.data + array([3., 1., 3., 2., 1., 2.]) + >>> X_ = sort_graph_by_row_values(X) + >>> X_.data + array([1., 3., 2., 3., 1., 2.]) + """ + if graph.format == "csr" and _is_sorted_by_data(graph): + return graph + + if warn_when_not_sorted: + warnings.warn( + ( + "Precomputed sparse input was not sorted by row values. 
Use the" + " function sklearn.neighbors.sort_graph_by_row_values to sort the input" + " by row values, with warn_when_not_sorted=False to remove this" + " warning." + ), + EfficiencyWarning, + ) + + if graph.format not in ("csr", "csc", "coo", "lil"): + raise TypeError( + f"Sparse matrix in {graph.format!r} format is not supported due to " + "its handling of explicit zeros" + ) + elif graph.format != "csr": + if not copy: + raise ValueError( + "The input graph is not in CSR format. Use copy=True to allow " + "the conversion to CSR format." + ) + graph = graph.asformat("csr") + elif copy: # csr format with copy=True + graph = graph.copy() + + row_nnz = np.diff(graph.indptr) + if row_nnz.max() == row_nnz.min(): + # if each sample has the same number of provided neighbors + n_samples = graph.shape[0] + distances = graph.data.reshape(n_samples, -1) + + order = np.argsort(distances, kind="mergesort") + order += np.arange(n_samples)[:, None] * row_nnz[0] + order = order.ravel() + graph.data = graph.data[order] + graph.indices = graph.indices[order] + + else: + for start, stop in zip(graph.indptr, graph.indptr[1:]): + order = np.argsort(graph.data[start:stop], kind="mergesort") + graph.data[start:stop] = graph.data[start:stop][order] + graph.indices[start:stop] = graph.indices[start:stop][order] + + return graph + + +def _kneighbors_from_graph(graph, n_neighbors, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_samples, n_neighbors) + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples, n_neighbors) + Indices of nearest neighbors. + """ + n_samples = graph.shape[0] + assert graph.format == "csr" + + # number of neighbors by samples + row_nnz = np.diff(graph.indptr) + row_nnz_min = row_nnz.min() + if n_neighbors is not None and row_nnz_min < n_neighbors: + raise ValueError( + "%d neighbors per samples are required, but some samples have only" + " %d neighbors in precomputed graph matrix. Decrease number of " + "neighbors used or recompute the graph with more neighbors." + % (n_neighbors, row_nnz_min) + ) + + def extract(a): + # if each sample has the same number of provided neighbors + if row_nnz.max() == row_nnz_min: + return a.reshape(n_samples, -1)[:, :n_neighbors] + else: + idx = np.tile(np.arange(n_neighbors), (n_samples, 1)) + idx += graph.indptr[:-1, None] + return a.take(idx, mode="clip").reshape(n_samples, n_neighbors) + + if return_distance: + return extract(graph.data), extract(graph.indices) + else: + return extract(graph.indices) + + +def _radius_neighbors_from_graph(graph, radius, return_distance): + """Decompose a nearest neighbors sparse graph into distances and indices. + + Parameters + ---------- + graph : sparse matrix of shape (n_samples, n_samples) + Neighbors graph as given by `kneighbors_graph` or + `radius_neighbors_graph`. Matrix should be of format CSR format. + + radius : float + Radius of neighborhoods which should be strictly positive. + + return_distance : bool + Whether or not to return the distances. 
+ + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Distances to nearest neighbors. Only present if `return_distance=True`. + + neigh_ind : ndarray of shape (n_samples,) of arrays + Indices of nearest neighbors. + """ + assert graph.format == "csr" + + no_filter_needed = bool(graph.data.max() <= radius) + + if no_filter_needed: + data, indices, indptr = graph.data, graph.indices, graph.indptr + else: + mask = graph.data <= radius + if return_distance: + data = np.compress(mask, graph.data) + indices = np.compress(mask, graph.indices) + indptr = np.concatenate(([0], np.cumsum(mask)))[graph.indptr] + + indices = indices.astype(np.intp, copy=no_filter_needed) + + if return_distance: + neigh_dist = _to_object_array(np.split(data, indptr[1:-1])) + neigh_ind = _to_object_array(np.split(indices, indptr[1:-1])) + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind + + +class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for nearest neighbors estimators.""" + + _parameter_constraints: dict = { + "n_neighbors": [Interval(Integral, 1, None, closed="left"), None], + "radius": [Interval(Real, 0, None, closed="both"), None], + "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "metric_params": [dict, None], + "n_jobs": [Integral, None], + } + + @abstractmethod + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + self.n_neighbors = n_neighbors + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.metric_params = metric_params + self.p = p + self.n_jobs = n_jobs + + def _check_algorithm_metric(self): + if self.algorithm == "auto": + if self.metric == "precomputed": + alg_check = "brute" + elif ( + callable(self.metric) + or self.metric in VALID_METRICS["ball_tree"] + or isinstance(self.metric, DistanceMetric) + ): + alg_check = "ball_tree" + else: + alg_check = "brute" + else: + alg_check = self.algorithm + + if callable(self.metric): + if self.algorithm == "kd_tree": + # callable metric is only valid for brute force and ball_tree + raise ValueError( + "kd_tree does not support callable metric '%s'" + "Function call overhead will result" + "in very poor performance." % self.metric + ) + elif self.metric not in VALID_METRICS[alg_check] and not isinstance( + self.metric, DistanceMetric + ): + raise ValueError( + "Metric '%s' not valid. Use " + "sorted(sklearn.neighbors.VALID_METRICS['%s']) " + "to get valid options. " + "Metric can also be a callable function." % (self.metric, alg_check) + ) + + if self.metric_params is not None and "p" in self.metric_params: + if self.p is not None: + warnings.warn( + ( + "Parameter p is found in metric_params. " + "The corresponding parameter from __init__ " + "is ignored." 
+ ), + SyntaxWarning, + stacklevel=3, + ) + + def _fit(self, X, y=None): + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + if self.__sklearn_tags__().target_tags.required: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + multi_output=True, + order="C", + ensure_all_finite=ensure_all_finite, + ) + + if is_classifier(self): + # Classification targets require a specific format + if y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1): + if y.ndim != 1: + warnings.warn( + ( + "A column-vector y was passed when a " + "1d array was expected. Please change " + "the shape of y to (n_samples,), for " + "example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + self.outputs_2d_ = False + y = y.reshape((-1, 1)) + else: + self.outputs_2d_ = True + + check_classification_targets(y) + self.classes_ = [] + # Using `dtype=np.intp` is necessary since `np.bincount` + # (called in _classification.py) fails when dealing + # with a float64 array on 32bit systems. + self._y = np.empty(y.shape, dtype=np.intp) + for k in range(self._y.shape[1]): + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes) + + if not self.outputs_2d_: + self.classes_ = self.classes_[0] + self._y = self._y.ravel() + else: + self._y = y + + else: + if not isinstance(X, (KDTree, BallTree, NeighborsBase)): + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + order="C", + ) + + self._check_algorithm_metric() + if self.metric_params is None: + self.effective_metric_params_ = {} + else: + self.effective_metric_params_ = self.metric_params.copy() + + effective_p = self.effective_metric_params_.get("p", self.p) + if self.metric == "minkowski": + self.effective_metric_params_["p"] = effective_p + + self.effective_metric_ = self.metric + # For minkowski distance, use more efficient methods where available + if self.metric == "minkowski": + p = self.effective_metric_params_.pop("p", 2) + w = self.effective_metric_params_.pop("w", None) + + if p == 1 and w is None: + self.effective_metric_ = "manhattan" + elif p == 2 and w is None: + self.effective_metric_ = "euclidean" + elif p == np.inf and w is None: + self.effective_metric_ = "chebyshev" + else: + # Use the generic minkowski metric, possibly weighted. + self.effective_metric_params_["p"] = p + self.effective_metric_params_["w"] = w + + if isinstance(X, NeighborsBase): + self._fit_X = X._fit_X + self._tree = X._tree + self._fit_method = X._fit_method + self.n_samples_fit_ = X.n_samples_fit_ + return self + + elif isinstance(X, BallTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "ball_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + elif isinstance(X, KDTree): + self._fit_X = X.data + self._tree = X + self._fit_method = "kd_tree" + self.n_samples_fit_ = X.data.shape[0] + return self + + if self.metric == "precomputed": + X = _check_precomputed(X) + # Precomputed matrix X must be squared + if X.shape[0] != X.shape[1]: + raise ValueError( + "Precomputed matrix must be square." 
+ " Input is a {}x{} matrix.".format(X.shape[0], X.shape[1]) + ) + self.n_features_in_ = X.shape[1] + + n_samples = X.shape[0] + if n_samples == 0: + raise ValueError("n_samples must be greater than 0") + + if issparse(X): + if self.algorithm not in ("auto", "brute"): + warnings.warn("cannot use tree with sparse input: using brute force") + + if ( + self.effective_metric_ not in VALID_METRICS_SPARSE["brute"] + and not callable(self.effective_metric_) + and not isinstance(self.effective_metric_, DistanceMetric) + ): + raise ValueError( + "Metric '%s' not valid for sparse input. " + "Use sorted(sklearn.neighbors." + "VALID_METRICS_SPARSE['brute']) " + "to get valid options. " + "Metric can also be a callable function." % (self.effective_metric_) + ) + self._fit_X = X.copy() + self._tree = None + self._fit_method = "brute" + self.n_samples_fit_ = X.shape[0] + return self + + self._fit_method = self.algorithm + self._fit_X = X + self.n_samples_fit_ = X.shape[0] + + if self._fit_method == "auto": + # A tree approach is better for small number of neighbors or small + # number of features, with KDTree generally faster when available + if ( + self.metric == "precomputed" + or self._fit_X.shape[1] > 15 + or ( + self.n_neighbors is not None + and self.n_neighbors >= self._fit_X.shape[0] // 2 + ) + ): + self._fit_method = "brute" + else: + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + self._fit_method = "brute" + elif ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + # 'minkowski' with weights is not supported by KDTree but is + # supported byBallTree. + self._fit_method = "ball_tree" + elif self.effective_metric_ in VALID_METRICS["kd_tree"]: + self._fit_method = "kd_tree" + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): + self._fit_method = "ball_tree" + else: + self._fit_method = "brute" + + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + # For 0 < p < 1 Minkowski distances aren't valid distance + # metric as they do not satisfy triangular inequality: + # they are semi-metrics. + # algorithm="kd_tree" and algorithm="ball_tree" can't be used because + # KDTree and BallTree require a proper distance metric to work properly. + # However, the brute-force algorithm supports semi-metrics. + if self._fit_method == "brute": + warnings.warn( + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metrics. Continuing the execution with `algorithm='brute'`." + ) + else: # self._fit_method in ("kd_tree", "ball_tree") + raise ValueError( + f'algorithm="{self._fit_method}" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algorithm="brute".' + ) + + if self._fit_method == "ball_tree": + self._tree = BallTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "kd_tree": + if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_.get("w") is not None + ): + raise ValueError( + "algorithm='kd_tree' is not valid for " + "metric='minkowski' with a weight parameter 'w': " + "try algorithm='ball_tree' " + "or algorithm='brute' instead." 
+ ) + self._tree = KDTree( + X, + self.leaf_size, + metric=self.effective_metric_, + **self.effective_metric_params_, + ) + elif self._fit_method == "brute": + self._tree = None + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + # when input is precomputed metric values, all those values need to be positive + tags.input_tags.positive_only = tags.input_tags.pairwise + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags + + +class KNeighborsMixin: + """Mixin for k-neighbors searches.""" + + def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + n_neighbors : int + Number of neighbors required for each sample. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : array of shape (n_samples_chunk, n_neighbors) + Returned only if `return_distance=True`. + + neigh : array of shape (n_samples_chunk, n_neighbors) + The neighbors indices. + """ + sample_range = np.arange(dist.shape[0])[:, None] + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] + if return_distance: + if self.effective_metric_ == "euclidean": + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + return result + + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + """Find the K-neighbors of a point. + + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int, default=None + Number of neighbors required for each sample. The default is the + value passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_queries, n_neighbors) + Array representing the lengths to points, only present if + return_distance=True. + + neigh_ind : ndarray of shape (n_queries, n_neighbors) + Indices of the nearest points in the population matrix. 
+ + Examples + -------- + In the following example, we construct a NearestNeighbors + class from an array representing our data set and ask who's + the closest point to [1,1,1] + + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=1) + >>> neigh.fit(samples) + NearestNeighbors(n_neighbors=1) + >>> print(neigh.kneighbors([[1., 1., 1.]])) + (array([[0.5]]), array([[2]])) + + As you can see, it returns [[0.5]], and [[2]], which means that the + element is at distance 0.5 and is the third element of samples + (indexes start at 0). You can also query for multiple points: + + >>> X = [[0., 1., 0.], [1., 0., 1.]] + >>> neigh.kneighbors(X, return_distance=False) + array([[1], + [2]]...) + """ + check_is_fitted(self) + + if n_neighbors is None: + n_neighbors = self.n_neighbors + elif n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) + elif not isinstance(n_neighbors, numbers.Integral): + raise TypeError( + "n_neighbors does not take %s value, enter integer value" + % type(n_neighbors) + ) + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + if query_is_train: + n_neighbors -= 1 # ok to modify inplace because an error is raised + inequality_str = "n_neighbors < n_samples_fit" + else: + inequality_str = "n_neighbors <= n_samples_fit" + raise ValueError( + f"Expected {inequality_str}, but " + f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" # include n_samples for common tests + ) + + n_jobs = effective_n_jobs(self.n_jobs) + chunked_results = None + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and ArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if use_pairwise_distances_reductions: + results = ArgKmin.compute( + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _kneighbors_from_graph( + X, n_neighbors=n_neighbors, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
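# For illustration, a minimal standalone NumPy sketch of the per-chunk
# reduction performed below (not part of this module): argpartition moves
# the n_neighbors smallest distances of each row to the front in linear
# time, and only those candidates are then sorted by distance.
#
# >>> import numpy as np
# >>> dist = np.array([[4.0, 1.0, 3.0, 2.0],
# ...                  [0.0, 5.0, 2.0, 7.0]])
# >>> n_neighbors = 2
# >>> rows = np.arange(dist.shape[0])[:, None]
# >>> cand = np.argpartition(dist, n_neighbors - 1, axis=1)[:, :n_neighbors]
# >>> neigh_ind = cand[rows, np.argsort(dist[rows, cand], axis=1)]
# >>> neigh_ind
# array([[1, 3],
#        [0, 2]])
# >>> dist[rows, neigh_ind]
# array([[1., 2.],
#        [0., 2.]])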
+ reduce_func = partial( + self._kneighbors_reduce_func, + n_neighbors=n_neighbors, + return_distance=return_distance, + ) + + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + chunked_results = list( + pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=n_jobs, + **kwds, + ) + ) + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed(self._tree.query)(X[s], n_neighbors, return_distance) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + else: + raise ValueError("internal: _fit_method not recognized") + + if chunked_results is not None: + if return_distance: + neigh_dist, neigh_ind = zip(*chunked_results) + results = np.vstack(neigh_dist), np.vstack(neigh_ind) + else: + results = np.vstack(chunked_results) + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + + if return_distance: + neigh_dist = np.reshape( + neigh_dist[sample_mask], (n_queries, n_neighbors - 1) + ) + return neigh_dist, neigh_ind + return neigh_ind + + def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + For ``metric='precomputed'`` the shape should be + (n_queries, n_indexed). Otherwise the shape should be + (n_queries, n_features). + + n_neighbors : int, default=None + Number of neighbors for each sample. The default is the value + passed to the constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + NearestNeighbors.radius_neighbors_graph : Compute the (weighted) graph + of Neighbors for points in X. 
+ + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2) + >>> neigh.fit(X) + NearestNeighbors(n_neighbors=2) + >>> A = neigh.kneighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + if n_neighbors is None: + n_neighbors = self.n_neighbors + + # check the input only in self.kneighbors + + # construct CSR matrix representation of the k-NN graph + if mode == "connectivity": + A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + n_queries = A_ind.shape[0] + A_data = np.ones(n_queries * n_neighbors) + + elif mode == "distance": + A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + A_data = np.ravel(A_data) + + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_nonzero = n_queries * n_neighbors + A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + + kneighbors_graph = csr_matrix( + (A_data, A_ind.ravel(), A_indptr), shape=(n_queries, n_samples_fit) + ) + + return kneighbors_graph + + +class RadiusNeighborsMixin: + """Mixin for radius-based neighbors searches.""" + + def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): + """Reduce a chunk of distances to the nearest neighbors. + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : ndarray of shape (n_samples_chunk, n_samples) + The distance matrix. + + start : int + The index in X which the first row of dist corresponds to. + + radius : float + The radius considered when making the nearest neighbors search. + + return_distance : bool + Whether or not to return the distances. + + Returns + ------- + dist : list of ndarray of shape (n_samples_chunk,) + Returned only if `return_distance=True`. + + neigh : list of ndarray of shape (n_samples_chunk,) + The neighbors indices. + """ + neigh_ind = [np.where(d <= radius)[0] for d in dist] + + if return_distance: + if self.effective_metric_ == "euclidean": + dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] + else: + dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] + results = dist, neigh_ind + else: + results = neigh_ind + return results + + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + """Find the neighbors within a given radius of a point or points. + + Return the indices and distances of each point from the dataset + lying in a ball with size ``radius`` around the points of the query + array. Points lying on the boundary are included in the results. + + The result points are *not* necessarily sorted by distance to their + query point. + + Parameters + ---------- + X : {array-like, sparse matrix} of (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Limiting distance of neighbors to return. The default is the value + passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + sort_results : bool, default=False + If True, the distances and indices will be sorted by increasing + distances before being returned. If False, the results may not + be sorted. 
If `return_distance=False`, setting `sort_results=True` + will result in an error. + + .. versionadded:: 0.22 + + Returns + ------- + neigh_dist : ndarray of shape (n_samples,) of arrays + Array representing the distances to each point, only present if + `return_distance=True`. The distance values are computed according + to the ``metric`` constructor parameter. + + neigh_ind : ndarray of shape (n_samples,) of arrays + An array of arrays of indices of the approximate nearest points + from the population matrix that lie within a ball of size + ``radius`` around the query points. + + Notes + ----- + Because the number of neighbors of each point is not necessarily + equal, the results for multiple query points cannot be fit in a + standard data array. + For efficiency, `radius_neighbors` returns arrays of objects, where + each object is a 1D array of indices or distances. + + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from an array representing our data set and ask who's + the closest point to [1, 1, 1]: + + >>> import numpy as np + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.6) + >>> neigh.fit(samples) + NearestNeighbors(radius=1.6) + >>> rng = neigh.radius_neighbors([[1., 1., 1.]]) + >>> print(np.asarray(rng[0][0])) + [1.5 0.5] + >>> print(np.asarray(rng[1][0])) + [1 2] + + The first array returned contains the distances to all points which + are closer than 1.6, while the second array returned contains their + indices. In general, multiple points can be queried at the same time. + """ + check_is_fitted(self) + + if sort_results and not return_distance: + raise ValueError("return_distance must be True if sort_results is True.") + + ensure_all_finite = "allow-nan" if get_tags(self).input_tags.allow_nan else True + query_is_train = X is None + if query_is_train: + X = self._fit_X + else: + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + order="C", + ) + + if radius is None: + radius = self.radius + + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and RadiusNeighbors.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + + if use_pairwise_distances_reductions: + results = RadiusNeighbors.compute( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + strategy="auto", + return_distance=return_distance, + sort_results=sort_results, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): + results = _radius_neighbors_from_graph( + X, radius=radius, return_distance=return_distance + ) + + elif self._fit_method == "brute": + # Joblib-based backend, which is used when user-defined callable + # are passed for metric. + + # This won't be used in the future once PairwiseDistancesReductions + # support: + # - DistanceMetrics which work on supposedly binary data + # - CSR-dense and dense-CSR case if 'euclidean' in metric. 
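# For illustration, a minimal standalone NumPy sketch of the radius
# reduction used below (not part of this module): with squared euclidean
# distances the radius is squared as well, which selects exactly the same
# neighbors because x <= r and x**2 <= r**2 agree for x, r >= 0; square
# roots are only taken for the neighbors that are actually returned.
#
# >>> import numpy as np
# >>> sq_dist = np.array([[0.0, 1.0, 4.0, 9.0]])   # squared distances, one query row
# >>> radius = 2.0
# >>> [np.where(row <= radius ** 2)[0] for row in sq_dist]
# [array([0, 1, 2])]
# >>> [np.sqrt(row[row <= radius ** 2]) for row in sq_dist]   # true distances
# [array([0., 1., 2.])]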
+ + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + radius *= radius + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + + reduce_func = partial( + self._radius_neighbors_reduce_func, + radius=radius, + return_distance=return_distance, + ) + + chunked_results = pairwise_distances_chunked( + X, + self._fit_X, + reduce_func=reduce_func, + metric=self.effective_metric_, + n_jobs=self.n_jobs, + **kwds, + ) + if return_distance: + neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) + neigh_dist_list = list(itertools.chain.from_iterable(neigh_dist_chunks)) + neigh_ind_list = list(itertools.chain.from_iterable(neigh_ind_chunks)) + neigh_dist = _to_object_array(neigh_dist_list) + neigh_ind = _to_object_array(neigh_ind_list) + results = neigh_dist, neigh_ind + else: + neigh_ind_list = list(itertools.chain.from_iterable(chunked_results)) + results = _to_object_array(neigh_ind_list) + + if sort_results: + for ii in range(len(neigh_dist)): + order = np.argsort(neigh_dist[ii], kind="mergesort") + neigh_ind[ii] = neigh_ind[ii][order] + neigh_dist[ii] = neigh_dist[ii][order] + results = neigh_dist, neigh_ind + + elif self._fit_method in ["ball_tree", "kd_tree"]: + if issparse(X): + raise ValueError( + "%s does not work with sparse matrices. Densify the data, " + "or set algorithm='brute'" % self._fit_method + ) + + n_jobs = effective_n_jobs(self.n_jobs) + delayed_query = delayed(self._tree.query_radius) + chunked_results = Parallel(n_jobs, prefer="threads")( + delayed_query(X[s], radius, return_distance, sort_results=sort_results) + for s in gen_even_slices(X.shape[0], n_jobs) + ) + if return_distance: + neigh_ind, neigh_dist = tuple(zip(*chunked_results)) + results = np.hstack(neigh_dist), np.hstack(neigh_ind) + else: + results = np.hstack(chunked_results) + else: + raise ValueError("internal: _fit_method not recognized") + + if not query_is_train: + return results + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + for ind, ind_neighbor in enumerate(neigh_ind): + mask = ind_neighbor != ind + + neigh_ind[ind] = ind_neighbor[mask] + if return_distance: + neigh_dist[ind] = neigh_dist[ind][mask] + + if return_distance: + return neigh_dist, neigh_ind + return neigh_ind + + def radius_neighbors_graph( + self, X=None, radius=None, mode="connectivity", sort_results=False + ): + """Compute the (weighted) graph of Neighbors for points in X. + + Neighborhoods are restricted the points at a distance lower than + radius. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + radius : float, default=None + Radius of neighborhoods. The default is the value passed to the + constructor. + + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the + connectivity matrix with ones and zeros, in 'distance' the + edges are distances between points, type of distance + depends on the selected metric parameter in + NearestNeighbors class. + + sort_results : bool, default=False + If True, in each row of the result, the non-zero entries will be + sorted by increasing distances. 
If False, the non-zero entries may + not be sorted. Only used with mode='distance'. + + .. versionadded:: 0.22 + + Returns + ------- + A : sparse-matrix of shape (n_queries, n_samples_fit) + `n_samples_fit` is the number of samples in the fitted data. + `A[i, j]` gives the weight of the edge connecting `i` to `j`. + The matrix is of CSR format. + + See Also + -------- + kneighbors_graph : Compute the (weighted) graph of k-Neighbors for + points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(radius=1.5) + >>> neigh.fit(X) + NearestNeighbors(radius=1.5) + >>> A = neigh.radius_neighbors_graph(X) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + check_is_fitted(self) + + # check the input only in self.radius_neighbors + + if radius is None: + radius = self.radius + + # construct CSR matrix representation of the NN graph + if mode == "connectivity": + A_ind = self.radius_neighbors(X, radius, return_distance=False) + A_data = None + elif mode == "distance": + dist, A_ind = self.radius_neighbors( + X, radius, return_distance=True, sort_results=sort_results + ) + A_data = np.concatenate(list(dist)) + else: + raise ValueError( + 'Unsupported mode, must be one of "connectivity", ' + f'or "distance" but got "{mode}" instead' + ) + + n_queries = A_ind.shape[0] + n_samples_fit = self.n_samples_fit_ + n_neighbors = np.array([len(a) for a in A_ind]) + A_ind = np.concatenate(list(A_ind)) + if A_data is None: + A_data = np.ones(len(A_ind)) + A_indptr = np.concatenate((np.zeros(1, dtype=int), np.cumsum(n_neighbors))) + + return csr_matrix((A_data, A_ind, A_indptr), shape=(n_queries, n_samples_fit)) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_binary_tree.pxi.tp b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_binary_tree.pxi.tp new file mode 100644 index 0000000000000000000000000000000000000000..de3bcb0e5d916d3153b7d41c8c975927385b8aac --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_binary_tree.pxi.tp @@ -0,0 +1,2478 @@ +{{py: + +# Generated file: _binary_tree.pxi + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE + # + ('64', 'float64_t', 'np.float64', 'cnp.NPY_DOUBLE'), + ('32', 'float32_t', 'np.float32', 'cnp.NPY_FLOAT') +] + +# KD Tree and Ball Tree +# ===================== +# +# _binary_tree.pxi is generated and is then literally Cython included in +# ball_tree.pyx and kd_tree.pyx. See ball_tree.pyx.tp and kd_tree.pyx.tp. + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause +}} + + +# KD Tree and Ball Tree +# ===================== +# +# The routines here are the core algorithms of the KDTree and BallTree +# structures. If Cython supported polymorphism, we would be able to +# create a subclass and derive KDTree and BallTree from it. Because +# polymorphism is not an option, we use this single BinaryTree class +# as a literal include to avoid duplicating the entire file. +# +# A series of functions are implemented in kd_tree.pyx and ball_tree.pyx +# which use the information here to calculate the lower and upper bounds +# between a node and a point, and between two nodes. 
These functions are +# used here, and are all that are needed to differentiate between the two +# tree types. +# +# Description of Binary Tree Algorithms +# ------------------------------------- +# A binary tree can be thought of as a collection of nodes. The top node +# contains all the points. The next level consists of two nodes with half +# the points in each, and this continues recursively. Each node contains +# metadata which allow fast computation of distance bounds: in the case of +# a ball tree, the metadata is a center and a radius. In the case of a +# KD tree, the metadata is the minimum and maximum bound along each dimension. +# +# In a typical KD Tree or Ball Tree implementation, the nodes are implemented +# as dynamically allocated structures with pointers linking them. Here we +# take a different approach, storing all relevant data in a set of arrays +# so that the entire tree object can be saved in a pickle file. For efficiency, +# the data can be stored in such a way that explicit pointers are not +# necessary: for node data stored at index i, the two child nodes are at +# index (2 * i + 1) and (2 * i + 2); the parent node is (i - 1) // 2 +# (where // indicates integer division). +# +# The data arrays used here are as follows: +# data : the [n_samples x n_features] array of data from which the tree +# is built +# idx_array : the length n_samples array used to keep track of the indices +# of data within each node. Each node has values idx_start and +# idx_end: the points within the node are given by (using numpy +# syntax) data[idx_array[idx_start:idx_end]]. +# node_data : the length n_nodes array of structures which store the node +# indices, node radii, and leaf information for each node. +# node_bounds : the [* x n_nodes x n_features] array containing the node +# bound information. For ball tree, the first dimension is 1, and +# each row contains the centroid of the node. For kd tree, the first +# dimension is 2 and the rows for each point contain the arrays of +# lower bounds and upper bounds in each direction. +# +# The lack of dynamic allocation means the number of nodes must be computed +# before the building of the tree. This can be done assuming the points are +# divided equally between child nodes at each step; although this removes +# some flexibility in tree creation, it ensures a balanced tree and ensures +# that the number of nodes required can be computed beforehand. Given a +# specified leaf_size (the minimum number of points in any node), it is +# possible to show that a balanced tree will have +# +# n_levels = 1 + max(0, floor(log2((n_samples - 1) / leaf_size))) +# +# in order to satisfy +# +# leaf_size <= min(n_points) <= 2 * leaf_size +# +# with the exception of the special case where n_samples < leaf_size. +# for a given number of levels, the number of nodes in the tree is given by +# +# n_nodes = 2 ** n_levels - 1 +# +# both these results can be straightforwardly shown by induction. The +# following code uses these values in the construction of the tree. +# +# Distance Metrics +# ---------------- +# For flexibility, the trees can be built using a variety of distance metrics. +# The metrics are described in the DistanceMetric class: the standard +# Euclidean distance is the default, and is inlined to be faster than other +# metrics. In addition, each metric defines both a distance and a +# "reduced distance", which is often faster to compute, and is therefore +# used in the query architecture whenever possible. 
(For example, in the +# case of the standard Euclidean distance, the reduced distance is the +# squared-distance). +# +# Implementation Notes +# -------------------- +# This implementation uses the common object-oriented approach of having an +# abstract base class which is extended by the KDTree and BallTree +# specializations. +# +# The BinaryTree "base class" is defined here and then subclassed in the BallTree +# and KDTree pyx files. These files include implementations of the +# "abstract" methods. + +# Necessary Helper Functions +# -------------------------- +# These are the names and descriptions of the "abstract" functions which are +# defined in kd_tree.pyx and ball_tree.pyx: + +# cdef int allocate_data(BinaryTree tree, intp_t n_nodes, intp_t n_features): +# """Allocate arrays needed for the KD Tree""" + +# cdef int init_node(BinaryTree tree, intp_t i_node, +# intp_t idx_start, intp_t idx_end): +# """Initialize the node for the dataset stored in tree.data""" + +# cdef float64_t min_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum reduced-distance between a point and a node""" + +# cdef float64_t min_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the minimum distance between a point and a node""" + +# cdef float64_t max_rdist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum reduced-distance between a point and a node""" + +# cdef float64_t max_dist(BinaryTree tree, intp_t i_node, float64_t* pt): +# """Compute the maximum distance between a point and a node""" + +# cdef inline int min_max_dist(BinaryTree tree, intp_t i_node, float64_t* pt, +# float64_t* min_dist, float64_t* max_dist): +# """Compute the minimum and maximum distance between a point and a node""" + +# cdef inline float64_t min_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum reduced distance between two nodes""" + +# cdef inline float64_t min_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the minimum distance between two nodes""" + +# cdef inline float64_t max_rdist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum reduced distance between two nodes""" + +# cdef inline float64_t max_dist_dual(BinaryTree tree1, intp_t i_node1, +# BinaryTree tree2, intp_t i_node2): +# """Compute the maximum distance between two nodes""" + +cimport numpy as cnp +from cython cimport floating +from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma +from libc.math cimport fmin, fmax +from libc.stdlib cimport calloc, malloc, free +from libc.string cimport memcpy + +import numpy as np +import warnings + +from ..metrics._dist_metrics cimport ( + DistanceMetric, + DistanceMetric64, + DistanceMetric32, + euclidean_dist64, + euclidean_dist32, + euclidean_rdist64, + euclidean_rdist32, + euclidean_dist_to_rdist64, + euclidean_dist_to_rdist32, +) + +from ._partition_nodes cimport partition_node_indices + +from ..utils import check_array +from ..utils._typedefs cimport float32_t, float64_t, intp_t +from ..utils._heap cimport heap_push +from ..utils._sorting cimport simultaneous_sort as _simultaneous_sort + +cnp.import_array() + + +# TODO: use cnp.PyArray_ENABLEFLAGS when Cython>=3.0 is used. 
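# For illustration, a standalone sketch of the implicit array layout and the
# node-count arithmetic described in the comments above (not part of this
# module): children of node i sit at 2*i + 1 and 2*i + 2, the parent at
# (i - 1) // 2, and the node count follows from the number of levels.
#
# >>> import numpy as np
# >>> children = lambda i: (2 * i + 1, 2 * i + 2)
# >>> parent = lambda i: (i - 1) // 2
# >>> children(0), parent(2)
# ((1, 2), 0)
# >>> n_samples, leaf_size = 1000, 40
# >>> n_levels = 1 + max(0, int(np.floor(np.log2((n_samples - 1) / leaf_size))))
# >>> n_nodes = 2 ** n_levels - 1
# >>> n_levels, n_nodes
# (5, 31)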
+cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(cnp.ndarray arr, int flags) + + +# some handy constants +cdef float64_t INF = np.inf +cdef float64_t NEG_INF = -np.inf +cdef float64_t PI = np.pi +cdef float64_t ROOT_2PI = sqrt(2 * PI) +cdef float64_t LOG_PI = log(PI) +cdef float64_t LOG_2PI = log(2 * PI) + + +# Some compound datatypes used below: +cdef struct NodeHeapData_t: + float64_t val + intp_t i1 + intp_t i2 + +# build the corresponding numpy dtype for NodeHeapData +cdef NodeHeapData_t nhd_tmp +NodeHeapData = np.asarray((&nhd_tmp)).dtype + +cdef struct NodeData_t: + intp_t idx_start + intp_t idx_end + intp_t is_leaf + float64_t radius + +# build the corresponding numpy dtype for NodeData +cdef NodeData_t nd_tmp +NodeData = np.asarray((&nd_tmp)).dtype + + +###################################################################### +# Define doc strings, substituting the appropriate class name using +# the DOC_DICT variable defined in the pyx files. +CLASS_DOC = """{BinaryTree} for fast generalized N-point problems + +Read more in the :ref:`User Guide `. + +Parameters +---------- +X : array-like of shape (n_samples, n_features) + n_samples is the number of points in the data set, and + n_features is the dimension of the parameter space. + Note: if X is a C-contiguous array of doubles then data will + not be copied. Otherwise, an internal copy will be made. + +leaf_size : positive int, default=40 + Number of points at which to switch to brute-force. Changing + leaf_size will not affect the results of a query, but can + significantly impact the speed of a query and the memory required + to store the constructed tree. The amount of memory needed to + store the tree scales as approximately n_samples / leaf_size. + For a specified ``leaf_size``, a leaf node is guaranteed to + satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in + the case that ``n_samples < leaf_size``. + +metric : str or DistanceMetric64 object, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. + A list of valid metrics for {BinaryTree} is given by the attribute + `valid_metrics`. + See the documentation of `scipy.spatial.distance + `_ and + the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for + more information on any distance metric. + +Additional keywords are passed to the distance metric class. +Note: Callable functions in the metric parameter are NOT supported for KDTree +and Ball Tree. Function call overhead will result in very poor performance. + +Attributes +---------- +data : memory view + The training data +valid_metrics: list of str + List of valid distance metrics. + +Examples +-------- +Query for k-nearest neighbors + + >>> import numpy as np + >>> from sklearn.neighbors import {BinaryTree} + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> dist, ind = tree.query(X[:1], k=3) # doctest: +SKIP + >>> print(ind) # indices of 3 closest neighbors + [0 3 1] + >>> print(dist) # distances to 3 closest neighbors + [ 0. 0.19662693 0.29473397] + +Pickle and Unpickle a tree. Note that the state of the tree is saved in the +pickle operation: the tree needs not be rebuilt upon unpickling. 
+ + >>> import numpy as np + >>> import pickle + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> s = pickle.dumps(tree) # doctest: +SKIP + >>> tree_copy = pickle.loads(s) # doctest: +SKIP + >>> dist, ind = tree_copy.query(X[:1], k=3) # doctest: +SKIP + >>> print(ind) # indices of 3 closest neighbors + [0 3 1] + >>> print(dist) # distances to 3 closest neighbors + [ 0. 0.19662693 0.29473397] + +Query for neighbors within a given radius + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((10, 3)) # 10 points in 3 dimensions + >>> tree = {BinaryTree}(X, leaf_size=2) # doctest: +SKIP + >>> print(tree.query_radius(X[:1], r=0.3, count_only=True)) + 3 + >>> ind = tree.query_radius(X[:1], r=0.3) # doctest: +SKIP + >>> print(ind) # indices of neighbors within distance 0.3 + [3 0 1] + + +Compute a gaussian kernel density estimate: + + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian') + array([ 6.94114649, 7.83281226, 7.2071716 ]) + +Compute a two-point auto-correlation function + + >>> import numpy as np + >>> rng = np.random.RandomState(0) + >>> X = rng.random_sample((30, 3)) + >>> r = np.linspace(0, 1, 5) + >>> tree = {BinaryTree}(X) # doctest: +SKIP + >>> tree.two_point_correlation(X, r) + array([ 30, 62, 278, 580, 820]) + +""" + + +###################################################################### +# Utility functions +cdef float64_t logaddexp(float64_t x1, float64_t x2): + """logaddexp(x1, x2) -> log(exp(x1) + exp(x2))""" + cdef float64_t a = fmax(x1, x2) + if a == NEG_INF: + return NEG_INF + else: + return a + log(exp(x1 - a) + exp(x2 - a)) + +cdef float64_t logsubexp(float64_t x1, float64_t x2): + """logsubexp(x1, x2) -> log(exp(x1) - exp(x2))""" + if x1 <= x2: + return NEG_INF + else: + return x1 + log(1 - exp(x2 - x1)) + + +###################################################################### +# Kernel functions +# +# Note: Kernels assume dist is non-negative and h is positive +# All kernel functions are normalized such that K(0, h) = 1. +# The fully normalized kernel is: +# K = exp[kernel_norm(h, d, kernel) + compute_kernel(dist, h, kernel)] +# The code only works with non-negative kernels: i.e. K(d, h) >= 0 +# for all valid d and h. Note that for precision, the log of both +# the kernel and kernel norm is returned. 
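# For illustration, a standalone sketch (not part of this module) of how the
# log-norm and log-kernel combine for the gaussian kernel: in d dimensions
# the log normalization is -(d/2)*log(2*pi) - d*log(h), so the fully
# normalized kernel recovers the isotropic normal density with bandwidth h.
#
# >>> import numpy as np
# >>> d, h, dist = 1, 0.5, 0.3
# >>> log_norm = -0.5 * d * np.log(2 * np.pi) - d * np.log(h)
# >>> log_kernel = -0.5 * dist ** 2 / h ** 2
# >>> K = np.exp(log_norm + log_kernel)
# >>> expected = np.exp(-dist ** 2 / (2 * h ** 2)) / (h * np.sqrt(2 * np.pi))
# >>> bool(np.isclose(K, expected))
# True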
+cdef enum KernelType: + GAUSSIAN_KERNEL = 1 + TOPHAT_KERNEL = 2 + EPANECHNIKOV_KERNEL = 3 + EXPONENTIAL_KERNEL = 4 + LINEAR_KERNEL = 5 + COSINE_KERNEL = 6 + + +cdef inline float64_t log_gaussian_kernel(float64_t dist, float64_t h): + """log of the gaussian kernel for bandwidth h (unnormalized)""" + return -0.5 * (dist * dist) / (h * h) + + +cdef inline float64_t log_tophat_kernel(float64_t dist, float64_t h): + """log of the tophat kernel for bandwidth h (unnormalized)""" + if dist < h: + return 0.0 + else: + return NEG_INF + + +cdef inline float64_t log_epanechnikov_kernel(float64_t dist, float64_t h): + """log of the epanechnikov kernel for bandwidth h (unnormalized)""" + if dist < h: + return log(1.0 - (dist * dist) / (h * h)) + else: + return NEG_INF + + +cdef inline float64_t log_exponential_kernel(float64_t dist, float64_t h): + """log of the exponential kernel for bandwidth h (unnormalized)""" + return -dist / h + + +cdef inline float64_t log_linear_kernel(float64_t dist, float64_t h): + """log of the linear kernel for bandwidth h (unnormalized)""" + if dist < h: + return log(1 - dist / h) + else: + return NEG_INF + + +cdef inline float64_t log_cosine_kernel(float64_t dist, float64_t h): + """log of the cosine kernel for bandwidth h (unnormalized)""" + if dist < h: + return log(cos(0.5 * PI * dist / h)) + else: + return NEG_INF + + +cdef inline float64_t compute_log_kernel(float64_t dist, float64_t h, + KernelType kernel): + """Given a KernelType enumeration, compute the appropriate log-kernel""" + if kernel == GAUSSIAN_KERNEL: + return log_gaussian_kernel(dist, h) + elif kernel == TOPHAT_KERNEL: + return log_tophat_kernel(dist, h) + elif kernel == EPANECHNIKOV_KERNEL: + return log_epanechnikov_kernel(dist, h) + elif kernel == EXPONENTIAL_KERNEL: + return log_exponential_kernel(dist, h) + elif kernel == LINEAR_KERNEL: + return log_linear_kernel(dist, h) + elif kernel == COSINE_KERNEL: + return log_cosine_kernel(dist, h) + + +# ------------------------------------------------------------ +# Kernel norms are defined via the volume element V_n +# and surface element S_(n-1) of an n-sphere. +cdef float64_t logVn(intp_t n): + """V_n = pi^(n/2) / gamma(n/2 - 1)""" + return 0.5 * n * LOG_PI - lgamma(0.5 * n + 1) + + +cdef float64_t logSn(intp_t n): + """V_(n+1) = int_0^1 S_n r^n dr""" + return LOG_2PI + logVn(n - 1) + + +cdef float64_t _log_kernel_norm(float64_t h, intp_t d, + KernelType kernel) except -1: + """Given a KernelType enumeration, compute the kernel normalization. + + h is the bandwidth, d is the dimension. + """ + cdef float64_t tmp, factor = 0 + cdef intp_t k + if kernel == GAUSSIAN_KERNEL: + factor = 0.5 * d * LOG_2PI + elif kernel == TOPHAT_KERNEL: + factor = logVn(d) + elif kernel == EPANECHNIKOV_KERNEL: + factor = logVn(d) + log(2. / (d + 2.)) + elif kernel == EXPONENTIAL_KERNEL: + factor = logSn(d - 1) + lgamma(d) + elif kernel == LINEAR_KERNEL: + factor = logVn(d) - log(d + 1.) + elif kernel == COSINE_KERNEL: + # this is derived from a chain rule integration + factor = 0 + tmp = 2. / PI + for k in range(1, d + 1, 2): + factor += tmp + tmp *= -(d - k) * (d - k - 1) * (2. / PI) ** 2 + factor = log(factor) + logSn(d - 1) + else: + raise ValueError("Kernel code not recognized") + return -factor - d * log(h) + + +def kernel_norm(h, d, kernel, return_log=False): + """Given a string specification of a kernel, compute the normalization. + + Parameters + ---------- + h : float + The bandwidth of the kernel. 
+ d : int + The dimension of the space in which the kernel norm is computed. + kernel : str + The kernel identifier. Must be one of + ['gaussian'|'tophat'|'epanechnikov'| + 'exponential'|'linear'|'cosine'] + return_log : bool, default=False + If True, return the log of the kernel norm. Otherwise, return the + kernel norm. + Returns + ------- + knorm or log_knorm : float + the kernel norm or logarithm of the kernel norm. + """ + if kernel == 'gaussian': + result = _log_kernel_norm(h, d, GAUSSIAN_KERNEL) + elif kernel == 'tophat': + result = _log_kernel_norm(h, d, TOPHAT_KERNEL) + elif kernel == 'epanechnikov': + result = _log_kernel_norm(h, d, EPANECHNIKOV_KERNEL) + elif kernel == 'exponential': + result = _log_kernel_norm(h, d, EXPONENTIAL_KERNEL) + elif kernel == 'linear': + result = _log_kernel_norm(h, d, LINEAR_KERNEL) + elif kernel == 'cosine': + result = _log_kernel_norm(h, d, COSINE_KERNEL) + else: + raise ValueError('kernel not recognized') + + if return_log: + return result + else: + return np.exp(result) + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +cdef class NeighborsHeap{{name_suffix}}: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. + """ + cdef {{INPUT_DTYPE_t}}[:, ::1] distances + cdef intp_t[:, ::1] indices + + def __cinit__(self): + # One-element arrays are used as placeholders to prevent + # any problem due to potential access to those attributes + # (e.g. assigning to NULL or a to value in another segment). + self.distances = np.zeros((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.indices = np.zeros((1, 1), dtype=np.intp, order='C') + + def __init__(self, n_pts, n_nbrs): + self.distances = np.full( + (n_pts, n_nbrs), np.inf, dtype={{INPUT_DTYPE}}, order='C' + ) + self.indices = np.zeros((n_pts, n_nbrs), dtype=np.intp, order='C') + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. 
+ """ + if sort: + self._sort() + return self.distances.base, self.indices.base + + cdef inline float64_t largest(self, intp_t row) except -1 nogil: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + def push(self, intp_t row, float64_t val, intp_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, intp_t row, float64_t val, + intp_t i_val) except -1 nogil: + """push (val, i_val) into the given row""" + return heap_push( + values=&self.distances[row, 0], + indices=&self.indices[row, 0], + size=self.distances.shape[1], + val=val, + val_idx=i_val, + ) + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef intp_t row + for row in range(self.distances.shape[0]): + _simultaneous_sort( + dist=&self.distances[row, 0], + idx=&self.indices[row, 0], + size=self.distances.shape[1], + ) + return 0 + +{{endfor}} + +#------------------------------------------------------------ +# find_node_split_dim: +# this computes the equivalent of +# j_max = np.argmax(np.max(data, 0) - np.min(data, 0)) +cdef intp_t find_node_split_dim(const floating* data, + const intp_t* node_indices, + intp_t n_features, + intp_t n_points) except -1: + """Find the dimension with the largest spread. + + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. + + Returns + ------- + i_max : int + The index of the feature (dimension) within the node that has the + largest spread. + + Notes + ----- + In numpy, this operation is equivalent to + + def find_node_split_dim(data, node_indices): + return np.argmax(data[node_indices].max(0) - data[node_indices].min(0)) + + The cython version is much more efficient in both computation and memory. + """ + cdef float64_t min_val, max_val, val, spread, max_spread + cdef intp_t i, j, j_max + + j_max = 0 + max_spread = 0 + + for j in range(n_features): + max_val = data[node_indices[0] * n_features + j] + min_val = max_val + for i in range(1, n_points): + val = data[node_indices[i] * n_features + j] + max_val = fmax(max_val, val) + min_val = fmin(min_val, val) + spread = max_val - min_val + if spread > max_spread: + max_spread = spread + j_max = j + return j_max + + +###################################################################### +# NodeHeap : min-heap used to keep track of nodes during +# breadth-first query +cdef inline void swap_nodes(NodeHeapData_t* arr, intp_t i1, intp_t i2): + cdef NodeHeapData_t tmp = arr[i1] + arr[i1] = arr[i2] + arr[i2] = tmp + + +cdef class NodeHeap: + """NodeHeap + + This is a min-heap implementation for keeping track of nodes + during a breadth-first search. Unlike the NeighborsHeap above, + the NodeHeap does not have a fixed size and must be able to grow + as elements are added. + + Internally, the data is stored in a simple binary heap which meets + the min heap condition: + + heap[i].val < min(heap[2 * i + 1].val, heap[2 * i + 2].val) + """ + cdef NodeHeapData_t[:] data + cdef intp_t n + + def __cinit__(self): + # A one-elements array is used as a placeholder to prevent + # any problem due to potential access to this attribute + # (e.g. assigning to NULL or a to value in another segment). 
+ self.data = np.zeros(1, dtype=NodeHeapData, order='C') + + def __init__(self, size_guess=100): + size_guess = max(size_guess, 1) # need space for at least one item + self.data = np.zeros(size_guess, dtype=NodeHeapData, order='C') + self.n = size_guess + self.clear() + + cdef int resize(self, intp_t new_size) except -1: + """Resize the heap to be either larger or smaller""" + cdef: + NodeHeapData_t *data_ptr + NodeHeapData_t *new_data_ptr + intp_t i + intp_t size = self.data.shape[0] + NodeHeapData_t[:] new_data = np.zeros( + new_size, + dtype=NodeHeapData, + ) + + if size > 0 and new_size > 0: + data_ptr = &self.data[0] + new_data_ptr = &new_data[0] + for i in range(min(size, new_size)): + new_data_ptr[i] = data_ptr[i] + + if new_size < size: + self.n = new_size + + self.data = new_data + return 0 + + cdef int push(self, NodeHeapData_t data) except -1: + """Push a new item onto the heap""" + cdef intp_t i, i_parent + cdef NodeHeapData_t* data_arr + self.n += 1 + if self.n > self.data.shape[0]: + self.resize(2 * self.n) + + # put the new element at the end, + # and then perform swaps until the heap is in order + data_arr = &self.data[0] + i = self.n - 1 + data_arr[i] = data + + while i > 0: + i_parent = (i - 1) // 2 + if data_arr[i_parent].val <= data_arr[i].val: + break + else: + swap_nodes(data_arr, i, i_parent) + i = i_parent + return 0 + + cdef NodeHeapData_t peek(self): + """Peek at the root of the heap, without removing it""" + return self.data[0] + + cdef NodeHeapData_t pop(self): + """Remove the root of the heap, and update the remaining nodes""" + if self.n == 0: + raise ValueError('cannot pop on empty heap') + + cdef intp_t i, i_child1, i_child2, i_swap + cdef NodeHeapData_t* data_arr = &self.data[0] + cdef NodeHeapData_t popped_element = data_arr[0] + + # pop off the first element, move the last element to the front, + # and then perform swaps until the heap is back in order + data_arr[0] = data_arr[self.n - 1] + self.n -= 1 + + i = 0 + + while (i < self.n): + i_child1 = 2 * i + 1 + i_child2 = 2 * i + 2 + i_swap = 0 + + if i_child2 < self.n: + if data_arr[i_child1].val <= data_arr[i_child2].val: + i_swap = i_child1 + else: + i_swap = i_child2 + elif i_child1 < self.n: + i_swap = i_child1 + else: + break + + if (i_swap > 0) and (data_arr[i_swap].val <= data_arr[i].val): + swap_nodes(data_arr, i, i_swap) + i = i_swap + else: + break + + return popped_element + + cdef void clear(self): + """Clear the heap""" + self.n = 0 + + +###################################################################### +# newObj function +# this is a helper function for pickling +def newObj(obj): + return obj.__new__(obj) + + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE, NPY_TYPE in implementation_specific_values}} + +###################################################################### +# define the reverse mapping of VALID_METRICS{{name_suffix}} +from sklearn.metrics._dist_metrics import get_valid_metric_ids +VALID_METRIC_IDS{{name_suffix}} = get_valid_metric_ids(VALID_METRICS{{name_suffix}}) + + +###################################################################### +# Binary Tree class +cdef class BinaryTree{{name_suffix}}: + + cdef readonly const {{INPUT_DTYPE_t}}[:, ::1] data + cdef readonly const {{INPUT_DTYPE_t}}[::1] sample_weight + cdef public float64_t sum_weight + + # TODO: idx_array and node_bounds must not be const, but this change needs + # to happen in a way which preserves pickling + # See also: https://github.com/cython/cython/issues/5639 + cdef public const intp_t[::1] idx_array + 
cdef public const NodeData_t[::1] node_data + cdef public const {{INPUT_DTYPE_t}}[:, :, ::1] node_bounds + + cdef intp_t leaf_size + cdef intp_t n_levels + cdef intp_t n_nodes + + cdef DistanceMetric{{name_suffix}} dist_metric + cdef int euclidean + + # variables to keep track of building & querying stats + cdef int n_trims + cdef int n_leaves + cdef int n_splits + cdef int n_calls + + valid_metrics = VALID_METRIC_IDS{{name_suffix}} + + # Use cinit to initialize all arrays to empty: this will prevent memory + # errors and seg-faults in rare cases where __init__ is not called + # A one-elements array is used as a placeholder to prevent + # any problem due to potential access to this attribute + # (e.g. assigning to NULL or a to value in another segment). + def __cinit__(self): + self.data = np.empty((1, 1), dtype={{INPUT_DTYPE}}, order='C') + self.sample_weight = np.empty(1, dtype={{INPUT_DTYPE}}, order='C') + self.idx_array = np.empty(1, dtype=np.intp, order='C') + self.node_data = np.empty(1, dtype=NodeData, order='C') + self.node_bounds = np.empty((1, 1, 1), dtype={{INPUT_DTYPE}}) + + self.leaf_size = 0 + self.n_levels = 0 + self.n_nodes = 0 + + self.euclidean = False + + self.n_trims = 0 + self.n_leaves = 0 + self.n_splits = 0 + self.n_calls = 0 + + def __init__(self, data, + leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): + # validate data + self.data = check_array(data, dtype={{INPUT_DTYPE}}, order='C') + if self.data.size == 0: + raise ValueError("X is an empty array") + + n_samples = self.data.shape[0] + n_features = self.data.shape[1] + + if leaf_size < 1: + raise ValueError("leaf_size must be greater than or equal to 1") + self.leaf_size = leaf_size + + self.dist_metric = DistanceMetric.get_metric(metric, dtype={{INPUT_DTYPE}}, **kwargs) + self.euclidean = (self.dist_metric.__class__.__name__ + == 'EuclideanDistance{{name_suffix}}') + + metric = self.dist_metric.__class__.__name__ + if metric not in VALID_METRICS{{name_suffix}}: + raise ValueError('metric {metric} is not valid for ' + '{BinaryTree}'.format(metric=metric, + **DOC_DICT{{name_suffix}})) + self.dist_metric._validate_data(self.data) + + # determine number of levels in the tree, and from this + # the number of nodes in the tree. 
This results in leaf nodes + # with numbers of points between leaf_size and 2 * leaf_size + self.n_levels = int( + np.log2(fmax(1, (n_samples - 1) / self.leaf_size)) + 1) + self.n_nodes = (2 ** self.n_levels) - 1 + + # allocate arrays for storage + self.idx_array = np.arange(n_samples, dtype=np.intp) + self.node_data = np.zeros(self.n_nodes, dtype=NodeData) + + self._update_sample_weight(n_samples, sample_weight) + + # Allocate tree-specific data + allocate_data{{name_suffix}}(self, self.n_nodes, n_features) + self._recursive_build( + node_data=self.node_data.base, + i_node=0, + idx_start=0, + idx_end=n_samples + ) + + def _update_sample_weight(self, n_samples, sample_weight): + if sample_weight is not None: + self.sample_weight = np.asarray( + sample_weight, dtype={{INPUT_DTYPE}}, order='C') + self.sum_weight = np.sum(self.sample_weight) + else: + self.sample_weight = None + self.sum_weight = n_samples + + def __reduce__(self): + """ + reduce method used for pickling + """ + return (newObj, (type(self),), self.__getstate__()) + + def __getstate__(self): + """ + get state for pickling + """ + if self.sample_weight is not None: + # pass the numpy array + sample_weight = self.sample_weight.base + else: + # pass None to avoid confusion with the empty place holder + # of size 1 from __cinit__ + sample_weight = None + return (self.data.base, + self.idx_array.base, + self.node_data.base, + self.node_bounds.base, + int(self.leaf_size), + int(self.n_levels), + int(self.n_nodes), + int(self.n_trims), + int(self.n_leaves), + int(self.n_splits), + int(self.n_calls), + self.dist_metric, + sample_weight) + + def __setstate__(self, state): + """ + set state for pickling + """ + self.data = state[0] + self.idx_array = state[1] + self.node_data = state[2] + self.node_bounds = state[3] + self.leaf_size = state[4] + self.n_levels = state[5] + self.n_nodes = state[6] + self.n_trims = state[7] + self.n_leaves = state[8] + self.n_splits = state[9] + self.n_calls = state[10] + self.dist_metric = state[11] + sample_weight = state[12] + + self.euclidean = (self.dist_metric.__class__.__name__ + == 'EuclideanDistance64') + n_samples = self.data.shape[0] + self._update_sample_weight(n_samples, sample_weight) + + def get_tree_stats(self): + """ + get_tree_stats() + + Get tree status. + + Returns + ------- + tree_stats: tuple of int + (number of trims, number of leaves, number of splits) + """ + return (self.n_trims, self.n_leaves, self.n_splits) + + def reset_n_calls(self): + """ + reset_n_calls() + + Reset number of calls to 0. + """ + self.n_calls = 0 + + def get_n_calls(self): + """ + get_n_calls() + + Get number of calls. + + Returns + ------- + n_calls: int + number of distance computation calls + """ + return self.n_calls + + def get_arrays(self): + """ + get_arrays() + + Get data and node arrays. + + Returns + ------- + arrays: tuple of array + Arrays for storing tree data, index, node data and node bounds. 
+ """ + return ( + self.data.base, + self.idx_array.base, + self.node_data.base, + self.node_bounds.base, + ) + + cdef inline float64_t dist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: + """Compute the distance between arrays x1 and x2""" + self.n_calls += 1 + if self.euclidean: + return euclidean_dist{{name_suffix}}(x1, x2, size) + else: + return self.dist_metric.dist(x1, x2, size) + + cdef inline float64_t rdist(self, const {{INPUT_DTYPE_t}}* x1, const {{INPUT_DTYPE_t}}* x2, + intp_t size) except -1 nogil: + """Compute the reduced distance between arrays x1 and x2. + + The reduced distance, defined for some metrics, is a quantity which + is more efficient to compute than the distance, but preserves the + relative rankings of the true distance. For example, the reduced + distance for the Euclidean metric is the squared-euclidean distance. + """ + self.n_calls += 1 + if self.euclidean: + return euclidean_rdist{{name_suffix}}(x1, x2, size) + else: + return self.dist_metric.rdist(x1, x2, size) + + cdef int _recursive_build(self, NodeData_t[::1] node_data, intp_t i_node, intp_t idx_start, + intp_t idx_end) except -1: + """Recursively build the tree. + + Parameters + ---------- + i_node : int + the node for the current step + idx_start, idx_end : int + the bounding indices in the idx_array which define the points that + belong to this node. + """ + cdef intp_t imax + cdef intp_t n_features = self.data.shape[1] + cdef intp_t n_points = idx_end - idx_start + cdef intp_t n_mid = n_points / 2 + cdef intp_t* idx_array = &self.idx_array[idx_start] + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # initialize node data + init_node{{name_suffix}}(self, node_data, i_node, idx_start, idx_end) + + if 2 * i_node + 1 >= self.n_nodes: + node_data[i_node].is_leaf = True + if idx_end - idx_start > 2 * self.leaf_size: + # this shouldn't happen if our memory allocation is correct + # we'll proactively prevent memory errors, but raise a + # warning saying we're doing so. + import warnings + warnings.warn("Internal: memory layout is flawed: " + "not enough nodes allocated") + + elif idx_end - idx_start < 2: + # again, this shouldn't happen if our memory allocation + # is correct. Raise a warning. + import warnings + warnings.warn("Internal: memory layout is flawed: " + "too many nodes allocated") + node_data[i_node].is_leaf = True + + else: + # split node and recursively construct child nodes. 
+ node_data[i_node].is_leaf = False + i_max = find_node_split_dim(data, idx_array, + n_features, n_points) + partition_node_indices(data, idx_array, i_max, n_mid, + n_features, n_points) + self._recursive_build(node_data, 2 * i_node + 1, + idx_start, idx_start + n_mid) + self._recursive_build(node_data, 2 * i_node + 2, + idx_start + n_mid, idx_end) + + def query(self, X, k=1, return_distance=True, + dualtree=False, breadth_first=False, + sort_results=True): + """ + query(X, k=1, return_distance=True, + dualtree=False, breadth_first=False) + + query the tree for the k nearest neighbors + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + k : int, default=1 + The number of nearest neighbors to return + return_distance : bool, default=True + if True, return a tuple (d, i) of distances and indices + if False, return array i + dualtree : bool, default=False + if True, use the dual tree formalism for the query: a tree is + built for the query points, and the pair of trees is used to + efficiently search this space. This can lead to better + performance as the number of points grows large. + breadth_first : bool, default=False + if True, then query the nodes in a breadth-first manner. + Otherwise, query the nodes in a depth-first manner. + sort_results : bool, default=True + if True, then distances and indices of each point are sorted + on return, so that the first column contains the closest points. + Otherwise, neighbors are returned in an arbitrary order. + + Returns + ------- + i : if return_distance == False + (d,i) : if return_distance == True + + d : ndarray of shape X.shape[:-1] + (k,), dtype=double + Each entry gives the list of distances to the neighbors of the + corresponding point. + + i : ndarray of shape X.shape[:-1] + (k,), dtype=int + Each entry gives the list of indices of neighbors of the + corresponding point. + """ + # XXX: we should allow X to be a pre-built tree. 
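# For illustration, a standalone usage sketch (not part of this module),
# assuming the public KDTree built from this template: the single-tree,
# dual-tree and breadth-first strategies return the same neighbors; only
# the order in which nodes are traversed differs.
#
# >>> import numpy as np
# >>> from sklearn.neighbors import KDTree
# >>> rng = np.random.RandomState(0)
# >>> X = rng.random_sample((50, 3))
# >>> tree = KDTree(X, leaf_size=5)
# >>> d0, i0 = tree.query(X[:5], k=3)
# >>> d1, i1 = tree.query(X[:5], k=3, dualtree=True)
# >>> d2, i2 = tree.query(X[:5], k=3, breadth_first=True)
# >>> bool(np.allclose(d0, d1) and np.allclose(d0, d2))
# True
# >>> bool(np.array_equal(i0, i1) and np.array_equal(i0, i2))
# True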
+ X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + if self.data.shape[0] < k: + raise ValueError("k must be less than or equal " + "to the number of training points") + + # flatten X, and save original shape information + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + cdef float64_t reduced_dist_LB + cdef intp_t i + cdef const {{INPUT_DTYPE_t}}* pt + + # initialize heap for neighbors + cdef NeighborsHeap{{name_suffix}} heap = NeighborsHeap{{name_suffix}}(Xarr.shape[0], k) + + # node heap for breadth-first queries + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + + # bounds is needed for the dual tree algorithm + cdef float64_t[::1] bounds + + self.n_trims = 0 + self.n_leaves = 0 + self.n_splits = 0 + + if dualtree: + other = self.__class__(np_Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + if breadth_first: + self._query_dual_breadthfirst(other, heap, nodeheap) + else: + reduced_dist_LB = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + bounds = np.full(other.node_data.shape[0], np.inf) + self._query_dual_depthfirst(0, other, 0, bounds, + heap, reduced_dist_LB) + + else: + pt = &Xarr[0, 0] + if breadth_first: + for i in range(Xarr.shape[0]): + self._query_single_breadthfirst(pt, i, heap, nodeheap) + pt += Xarr.shape[1] + else: + with nogil: + for i in range(Xarr.shape[0]): + reduced_dist_LB = min_rdist{{name_suffix}}(self, 0, pt) + self._query_single_depthfirst(0, pt, i, heap, + reduced_dist_LB) + pt += Xarr.shape[1] + + distances, indices = heap.get_arrays(sort=sort_results) + distances = self.dist_metric.rdist_to_dist(distances) + + # deflatten results + if return_distance: + return (distances.reshape(X.shape[:X.ndim - 1] + (k,)), + indices.reshape(X.shape[:X.ndim - 1] + (k,))) + else: + return indices.reshape(X.shape[:X.ndim - 1] + (k,)) + + def query_radius(self, X, r, int return_distance=False, + int count_only=False, int sort_results=False): + """ + query_radius(X, r, return_distance=False, + count_only=False, sort_results=False) + + query the tree for neighbors within a radius r + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query + r : distance within which neighbors are returned + r can be a single value, or an array of values of shape + x.shape[:-1] if different radii are desired for each point. + return_distance : bool, default=False + if True, return distances to neighbors of each point + if False, return only neighbors + Note that unlike the query() method, setting return_distance=True + here adds to the computation time. Not all distances need to be + calculated explicitly for return_distance=False. Results are + not sorted by default: see ``sort_results`` keyword. + count_only : bool, default=False + if True, return only the count of points within distance r + if False, return the indices of all points within distance r + If return_distance==True, setting count_only=True will + result in an error. + sort_results : bool, default=False + if True, the distances and indices will be sorted before being + returned. If False, the results will not be sorted. If + return_distance == False, setting sort_results = True will + result in an error. 
+ + Returns + ------- + count : if count_only == True + ind : if count_only == False and return_distance == False + (ind, dist) : if count_only == False and return_distance == True + + count : ndarray of shape X.shape[:-1], dtype=int + Each entry gives the number of neighbors within a distance r of the + corresponding point. + + ind : ndarray of shape X.shape[:-1], dtype=object + Each element is a numpy integer array listing the indices of + neighbors of the corresponding point. Note that unlike + the results of a k-neighbors query, the returned neighbors + are not sorted by distance by default. + + dist : ndarray of shape X.shape[:-1], dtype=object + Each element is a numpy double array listing the distances + corresponding to indices in i. + """ + if count_only and return_distance: + raise ValueError("count_only and return_distance " + "cannot both be true") + + if sort_results and not return_distance: + raise ValueError("return_distance must be True " + "if sort_results is True") + + cdef intp_t i, count_i = 0 + cdef intp_t n_features = self.data.shape[1] + cdef {{INPUT_DTYPE_t}}[::1] dist_arr_i + cdef intp_t[::1] idx_arr_i, counts + cdef const {{INPUT_DTYPE_t}}* pt + cdef intp_t** indices = NULL + cdef {{INPUT_DTYPE_t}}** distances = NULL + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = X.reshape((-1, self.data.shape[1])) + + # prepare r for query + r = np.asarray(r, dtype=np.float64, order='C') + r = np.atleast_1d(r) + if r.shape == (1,): + r = np.full(X.shape[:X.ndim - 1], r[0], dtype=np.float64) + else: + if r.shape != X.shape[:X.ndim - 1]: + raise ValueError("r must be broadcastable to X.shape") + + rarr_np = r.reshape(-1) # store explicitly to keep in scope + cdef float64_t[::1] rarr = rarr_np + + if not count_only: + indices = calloc(Xarr.shape[0], sizeof(intp_t*)) + if indices == NULL: + raise MemoryError() + if return_distance: + distances = <{{INPUT_DTYPE_t}}**>calloc(Xarr.shape[0], sizeof({{INPUT_DTYPE_t}}*)) + if distances == NULL: + free(indices) + raise MemoryError() + + np_idx_arr = np.zeros(self.data.shape[0], dtype=np.intp) + idx_arr_i = np_idx_arr + + np_dist_arr = np.zeros(self.data.shape[0], dtype={{INPUT_DTYPE}}) + dist_arr_i = np_dist_arr + + counts_arr = np.zeros(Xarr.shape[0], dtype=np.intp) + counts = counts_arr + + pt = &Xarr[0, 0] + memory_error = False + with nogil: + for i in range(Xarr.shape[0]): + counts[i] = self._query_radius_single(0, pt, rarr[i], + &idx_arr_i[0], + &dist_arr_i[0], + 0, count_only, + return_distance) + pt += n_features + + if count_only: + continue + + if sort_results: + _simultaneous_sort(&dist_arr_i[0], &idx_arr_i[0], + counts[i]) + + # equivalent to: indices[i] = np_idx_arr[:counts[i]].copy() + indices[i] = malloc(counts[i] * sizeof(intp_t)) + if indices[i] == NULL: + memory_error = True + break + memcpy(indices[i], &idx_arr_i[0], counts[i] * sizeof(intp_t)) + + if return_distance: + # equivalent to: distances[i] = np_dist_arr[:counts[i]].copy() + distances[i] = <{{INPUT_DTYPE_t}}*>malloc(counts[i] * sizeof({{INPUT_DTYPE_t}})) + if distances[i] == NULL: + memory_error = True + break + memcpy(distances[i], &dist_arr_i[0], counts[i] * sizeof({{INPUT_DTYPE_t}})) + + try: + if memory_error: + raise MemoryError() + + if count_only: + # deflatten results + return counts_arr.reshape(X.shape[:X.ndim - 1]) + elif 
return_distance: + indices_npy = np.zeros(Xarr.shape[0], dtype='object') + distances_npy = np.zeros(Xarr.shape[0], dtype='object') + for i in range(Xarr.shape[0]): + # make a new numpy array that wraps the existing data + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_INTP, indices[i]) + # make sure the data will be freed when the numpy array is garbage collected + PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA) + # make sure the data is not freed twice + indices[i] = NULL + + # make a new numpy array that wraps the existing data + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + distances_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], {{NPY_TYPE}}, distances[i]) + # make sure the data will be freed when the numpy array is garbage collected + PyArray_ENABLEFLAGS(distances_npy[i], cnp.NPY_ARRAY_OWNDATA) + # make sure the data is not freed twice + distances[i] = NULL + + # deflatten results + return (indices_npy.reshape(X.shape[:X.ndim - 1]), + distances_npy.reshape(X.shape[:X.ndim - 1])) + else: + indices_npy = np.zeros(Xarr.shape[0], dtype='object') + for i in range(Xarr.shape[0]): + # make a new numpy array that wraps the existing data + # TODO: remove the explicit cast to cnp.intp_t* when cython min version >= 3.0 + indices_npy[i] = cnp.PyArray_SimpleNewFromData(1, &counts[i], cnp.NPY_INTP, indices[i]) + # make sure the data will be freed when the numpy array is garbage collected + PyArray_ENABLEFLAGS(indices_npy[i], cnp.NPY_ARRAY_OWNDATA) + # make sure the data is not freed twice + indices[i] = NULL + + # deflatten results + return indices_npy.reshape(X.shape[:X.ndim - 1]) + except MemoryError: + # free any buffer that is not owned by a numpy array + for i in range(Xarr.shape[0]): + free(indices[i]) + if return_distance: + free(distances[i]) + raise + finally: + free(indices) + free(distances) + + def kernel_density(self, X, h, kernel='gaussian', + atol=0, rtol=1E-8, + breadth_first=True, return_log=False): + """ + kernel_density(X, h, kernel='gaussian', atol=0, rtol=1E-8, + breadth_first=True, return_log=False) + + Compute the kernel density estimate at points X with the given kernel, + using the distance metric specified at tree creation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data. + h : float + the bandwidth of the kernel + kernel : str, default="gaussian" + specify the kernel to use. Options are + - 'gaussian' + - 'tophat' + - 'epanechnikov' + - 'exponential' + - 'linear' + - 'cosine' + Default is kernel = 'gaussian' + atol : float, default=0 + Specify the desired absolute tolerance of the result. + If the true result is `K_true`, then the returned result `K_ret` + satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret`` + The default is zero (i.e. machine precision). + rtol : float, default=1e-8 + Specify the desired relative tolerance of the result. + If the true result is `K_true`, then the returned result `K_ret` + satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret`` + The default is `1e-8` (i.e. machine precision). + breadth_first : bool, default=False + If True, use a breadth-first search. If False (default) use a + depth-first search. Breadth-first is generally faster for + compact kernels and/or high tolerances. + return_log : bool, default=False + Return the logarithm of the result. 
This can be more accurate + than returning the result itself for narrow kernels. + + Returns + ------- + density : ndarray of shape X.shape[:-1] + The array of (log)-density evaluations + """ + cdef float64_t h_c = h + cdef float64_t log_atol = log(atol) + cdef float64_t log_rtol = log(rtol) + cdef float64_t log_min_bound, log_max_bound, log_bound_spread + cdef float64_t dist_LB = 0, dist_UB = 0 + + cdef intp_t n_samples = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + cdef KernelType kernel_c + + # validate kernel + if kernel == 'gaussian': + kernel_c = GAUSSIAN_KERNEL + elif kernel == 'tophat': + kernel_c = TOPHAT_KERNEL + elif kernel == 'epanechnikov': + kernel_c = EPANECHNIKOV_KERNEL + elif kernel == 'exponential': + kernel_c = EXPONENTIAL_KERNEL + elif kernel == 'linear': + kernel_c = LINEAR_KERNEL + elif kernel == 'cosine': + kernel_c = COSINE_KERNEL + else: + raise ValueError("kernel = '%s' not recognized" % kernel) + + cdef float64_t log_knorm = _log_kernel_norm(h_c, n_features, kernel_c) + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != n_features: + raise ValueError("query data dimension must " + "match training data dimension") + Xarr_np = X.reshape((-1, n_features)) + cdef const {{INPUT_DTYPE_t}}[:, ::1] Xarr = Xarr_np + + log_density_arr = np.zeros(Xarr.shape[0], dtype={{INPUT_DTYPE}}) + cdef {{INPUT_DTYPE_t}}[::1] log_density = log_density_arr + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + cdef NodeHeap nodeheap + if breadth_first: + nodeheap = NodeHeap(self.data.shape[0] // self.leaf_size) + cdef float64_t[::1] node_log_min_bounds + cdef float64_t[::1] node_bound_widths + # TODO: implement dual tree approach. + # this is difficult because of the need to cache values + # computed between node pairs. + if breadth_first: + node_log_min_bounds_arr = np.full(self.n_nodes, -np.inf) + node_log_min_bounds = node_log_min_bounds_arr + node_bound_widths_arr = np.zeros(self.n_nodes) + node_bound_widths = node_bound_widths_arr + for i in range(Xarr.shape[0]): + log_density[i] = self._kde_single_breadthfirst( + pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + nodeheap, + &node_log_min_bounds[0], + &node_bound_widths[0]) + pt += n_features + else: + for i in range(Xarr.shape[0]): + min_max_dist{{name_suffix}}(self, 0, pt, &dist_LB, &dist_UB) + # compute max & min bounds on density within top node + log_min_bound = (log(self.sum_weight) + + compute_log_kernel(dist_UB, + h_c, kernel_c)) + log_max_bound = (log(self.sum_weight) + + compute_log_kernel(dist_LB, + h_c, kernel_c)) + log_bound_spread = logsubexp(log_max_bound, log_min_bound) + self._kde_single_depthfirst(0, pt, kernel_c, h_c, + log_knorm, log_atol, log_rtol, + log_min_bound, + log_bound_spread, + &log_min_bound, + &log_bound_spread) + log_density[i] = logaddexp(log_min_bound, + log_bound_spread - log(2)) + pt += n_features + + # normalize the results + for i in range(log_density.shape[0]): + log_density[i] += log_knorm + + log_density_arr = log_density_arr.reshape(X.shape[:X.ndim - 1]) + + if return_log: + return log_density_arr + else: + return np.exp(log_density_arr) + + def two_point_correlation(self, X, r, dualtree=False): + """ + two_point_correlation(X, r, dualtree=False) + + Compute the two-point correlation function + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data. 
+ r : array-like + A one-dimensional array of distances + dualtree : bool, default=False + If True, use a dualtree algorithm. Otherwise, use a single-tree + algorithm. Dual tree algorithms can have better scaling for + large N. + + Returns + ------- + counts : ndarray + counts[i] contains the number of pairs of points with distance + less than or equal to r[i] + """ + cdef intp_t n_features = self.data.shape[1] + cdef intp_t i + + # validate X and prepare for query + X = check_array(X, dtype={{INPUT_DTYPE}}, order='C') + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + np_Xarr = X.reshape((-1, self.data.shape[1])) + cdef {{INPUT_DTYPE_t}}[:, ::1] Xarr = np_Xarr + + # prepare r for query + r = np.asarray(r, dtype=np.float64, order='C') + r = np.atleast_1d(r) + if r.ndim != 1: + raise ValueError("r must be a 1-dimensional array") + i_rsort = np.argsort(r) + rarr_np = r[i_rsort] # needed to keep memory in scope + cdef float64_t[::1] rarr = rarr_np + + # create array to hold counts + count = np.zeros(r.shape[0], dtype=np.intp) + cdef intp_t[::1] carr = count + + cdef const {{INPUT_DTYPE_t}}* pt = &Xarr[0, 0] + + if dualtree: + other = self.__class__(Xarr, metric=self.dist_metric, + leaf_size=self.leaf_size) + self._two_point_dual(0, other, 0, &rarr[0], &carr[0], + 0, rarr.shape[0]) + else: + for i in range(Xarr.shape[0]): + self._two_point_single(0, pt, &rarr[0], &carr[0], + 0, rarr.shape[0]) + pt += n_features + + return count + + cdef int _query_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1 nogil: + """Recursive Single-tree k-neighbors query, depth-first approach""" + cdef NodeData_t node_info = self.node_data[i_node] + + cdef float64_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2 + cdef intp_t i, i1, i2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_info.is_leaf: + self.n_leaves += 1 + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. 
Recursively query subnodes + # starting with the closest + else: + self.n_splits += 1 + i1 = 2 * i_node + 1 + i2 = i1 + 1 + reduced_dist_LB_1 = min_rdist{{name_suffix}}(self, i1, pt) + reduced_dist_LB_2 = min_rdist{{name_suffix}}(self, i2, pt) + + # recursively query subnodes + if reduced_dist_LB_1 <= reduced_dist_LB_2: + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + else: + self._query_single_depthfirst(i2, pt, i_pt, heap, + reduced_dist_LB_2) + self._query_single_depthfirst(i1, pt, i_pt, heap, + reduced_dist_LB_1) + return 0 + + cdef int _query_single_breadthfirst( + self, + const {{INPUT_DTYPE_t}}* pt, + intp_t i_pt, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive single-tree k-neighbors query, breadth-first search""" + cdef intp_t i, i_node + cdef float64_t dist_pt, reduced_dist_LB + cdef const NodeData_t* node_data = &self.node_data[0] + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + + # Set up the node heap and push the head node onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node = nodeheap_item.i1 + node_info = node_data[i_node] + + # ------------------------------------------------------------ + # Case 1: query point is outside node radius: + # trim it from the query + if reduced_dist_LB > heap.largest(i_pt): + self.n_trims += 1 + + # ------------------------------------------------------------ + # Case 2: this is a leaf node. Update set of nearby points + elif node_data[i_node].is_leaf: + self.n_leaves += 1 + for i in range(node_data[i_node].idx_start, + node_data[i_node].idx_end): + dist_pt = self.rdist(pt, + &self.data[self.idx_array[i], 0], + self.data.shape[1]) + heap._push(i_pt, dist_pt, self.idx_array[i]) + + # ------------------------------------------------------------ + # Case 3: Node is not a leaf. Add subnodes to the node heap + else: + self.n_splits += 1 + for i in range(2 * i_node + 1, 2 * i_node + 3): + nodeheap_item.i1 = i + nodeheap_item.val = min_rdist{{name_suffix}}(self, i, pt) + nodeheap.push(nodeheap_item) + return 0 + + cdef int _query_dual_depthfirst( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t i_node2, + float64_t[::1] bounds, + NeighborsHeap{{name_suffix}} heap, + float64_t reduced_dist_LB, + ) except -1: + """Recursive dual-tree k-neighbors query, depth-first""" + # note that the array `bounds` is maintained such that + # bounds[i] is the largest distance among any of the + # current neighbors in node i of the other tree. 
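        # In the heap-ordered node layout used here (children of node i are
        # 2*i + 1 and 2*i + 2), a parent's bound can never be tighter than the
        # larger of its children's bounds.  After a pair of leaves is processed
        # in Case 2 below, the leaf's bound is recomputed from the current
        # k-th neighbor distances of its points and the improvement is pushed
        # toward the root, roughly:
        #
        #     while i_node2 > 0:
        #         i_parent = (i_node2 - 1) // 2
        #         bounds[i_parent] = max(bounds[2 * i_parent + 1],
        #                                bounds[2 * i_parent + 2])
        #         i_node2 = i_parent
        #
        # (a sketch of the propagation loop; the real code below stops early as
        # soon as a parent's bound no longer shrinks).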
+ cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + cdef float64_t bound_max, dist_pt, reduced_dist_LB1, reduced_dist_LB2 + cdef intp_t i1, i2, i_pt, i_parent + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = 0 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # update bounds up the tree + while i_node2 > 0: + i_parent = (i_node2 - 1) // 2 + bound_max = fmax(bounds[2 * i_parent + 1], + bounds[2 * i_parent + 2]) + if bound_max < bounds[i_parent]: + bounds[i_parent] = bound_max + i_node2 = i_parent + else: + break + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and node_info2.radius > node_info1.radius): + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 1) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, i_node1, + other, 2 * i_node2 + 2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(i_node1, other, 2 * i_node2 + 1, + bounds, heap, reduced_dist_LB1) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + reduced_dist_LB1 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 1, + other, i_node2) + reduced_dist_LB2 = min_rdist_dual{{name_suffix}}(self, 2 * i_node1 + 2, + other, i_node2) + + if reduced_dist_LB1 < reduced_dist_LB2: + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + else: + self._query_dual_depthfirst(2 * i_node1 + 2, other, i_node2, + bounds, heap, reduced_dist_LB2) + self._query_dual_depthfirst(2 * i_node1 + 1, other, i_node2, + bounds, heap, reduced_dist_LB1) + return 0 + + cdef int _query_dual_breadthfirst( + self, + BinaryTree{{name_suffix}} other, + NeighborsHeap{{name_suffix}} heap, + NodeHeap nodeheap, + ) except -1: + """Non-recursive dual-tree k-neighbors query, breadth-first""" + cdef intp_t i, i1, i2, i_node1, i_node2, i_pt + cdef 
float64_t dist_pt, reduced_dist_LB + cdef float64_t[::1] bounds = np.full(other.node_data.shape[0], np.inf) + cdef const NodeData_t* node_data1 = &self.node_data[0] + cdef const NodeData_t* node_data2 = &other.node_data[0] + cdef NodeData_t node_info1, node_info2 + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t n_features = self.data.shape[1] + + # Set up the node heap and push the head nodes onto it + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, 0, other, 0) + nodeheap_item.i1 = 0 + nodeheap_item.i2 = 0 + nodeheap.push(nodeheap_item) + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + reduced_dist_LB = nodeheap_item.val + i_node1 = nodeheap_item.i1 + i_node2 = nodeheap_item.i2 + + node_info1 = node_data1[i_node1] + node_info2 = node_data2[i_node2] + + # ------------------------------------------------------------ + # Case 1: nodes are further apart than the current bound: + # trim both from the query + if reduced_dist_LB > bounds[i_node2]: + pass + + # ------------------------------------------------------------ + # Case 2: both nodes are leaves: + # do a brute-force search comparing all pairs + elif node_info1.is_leaf and node_info2.is_leaf: + bounds[i_node2] = -1 + + for i2 in range(node_info2.idx_start, node_info2.idx_end): + i_pt = other.idx_array[i2] + + if heap.largest(i_pt) <= reduced_dist_LB: + continue + + for i1 in range(node_info1.idx_start, node_info1.idx_end): + dist_pt = self.rdist( + data1 + n_features * self.idx_array[i1], + data2 + n_features * i_pt, + n_features) + heap._push(i_pt, dist_pt, self.idx_array[i1]) + + # keep track of node bound + bounds[i_node2] = fmax(bounds[i_node2], + heap.largest(i_pt)) + + # ------------------------------------------------------------ + # Case 3a: node 1 is a leaf or is smaller: split node 2 and + # recursively query, starting with the nearest subnode + elif node_info1.is_leaf or (not node_info2.is_leaf + and (node_info2.radius + > node_info1.radius)): + nodeheap_item.i1 = i_node1 + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + nodeheap_item.i2 = i2 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i_node1, + other, i2) + nodeheap.push(nodeheap_item) + + # ------------------------------------------------------------ + # Case 3b: node 2 is a leaf or is smaller: split node 1 and + # recursively query, starting with the nearest subnode + else: + nodeheap_item.i2 = i_node2 + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + nodeheap_item.i1 = i1 + nodeheap_item.val = min_rdist_dual{{name_suffix}}(self, i1, + other, i_node2) + nodeheap.push(nodeheap_item) + return 0 + + cdef intp_t _query_radius_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t r, + intp_t* indices, + {{INPUT_DTYPE_t}}* distances, + intp_t count, + int count_only, + int return_distance, + ) noexcept nogil: + """recursive single-tree radius query, depth-first""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Case 1: all node points are outside distance r. + # prune this branch. 
+ if dist_LB > r: + pass + + # ------------------------------------------------------------ + # Case 2: all node points are within distance r + # add all points to neighbors + elif dist_UB <= r: + if count_only: + count += (node_info.idx_end - node_info.idx_start) + else: + for i in range(node_info.idx_start, node_info.idx_end): + if (count < 0) or (count >= self.data.shape[0]): + return -1 + indices[count] = idx_array[i] + if return_distance: + distances[count] = self.dist(pt, (data + n_features + * idx_array[i]), + n_features) + count += 1 + + # ------------------------------------------------------------ + # Case 3: this is a leaf node. Go through all points to + # determine if they fall within radius + elif node_info.is_leaf: + reduced_r = self.dist_metric._dist_to_rdist(r) + + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.rdist(pt, (data + n_features * idx_array[i]), + n_features) + if dist_pt <= reduced_r: + if (count < 0) or (count >= self.data.shape[0]): + return -1 + if count_only: + pass + else: + indices[count] = idx_array[i] + if return_distance: + distances[count] =\ + self.dist_metric._rdist_to_dist(dist_pt) + count += 1 + + # ------------------------------------------------------------ + # Case 4: Node is not a leaf. Recursively query subnodes + else: + count = self._query_radius_single(2 * i_node + 1, pt, r, + indices, distances, count, + count_only, return_distance) + count = self._query_radius_single(2 * i_node + 2, pt, r, + indices, distances, count, + count_only, return_distance) + + return count + + cdef float64_t _kde_single_breadthfirst( + self, const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + NodeHeap nodeheap, + float64_t* node_log_min_bounds, + float64_t* node_log_bound_spreads, + ): + """non-recursive single-tree kernel density estimation""" + # For the given point, node_log_min_bounds and node_log_bound_spreads + # will encode the current bounds on the density between the point + # and the associated node. + # The variables global_log_min_bound and global_log_bound_spread + # keep track of the global bounds on density. The procedure here is + # to split nodes, updating these bounds, until the bounds are within + # atol & rtol. 
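        # A small pure-NumPy sketch of the log-domain bookkeeping described
        # above (illustration only, assuming an unnormalized Gaussian kernel;
        # the public entry point is e.g. tree.kernel_density(X, h=0.5)):
        # a node holding N points whose distances to the query lie in
        # [dist_LB, dist_UB] contributes between N * K(dist_UB) and
        # N * K(dist_LB) to the unnormalized density, and the final estimate
        # is the midpoint of whatever interval remains once the tolerances
        # are met.
        #
        #     import numpy as np
        #
        #     def log_gauss_kernel(d, h):      # unnormalized Gaussian kernel
        #         return -0.5 * (d / h) ** 2
        #
        #     def logsubexp(a, b):             # log(exp(a) - exp(b)) for a >= b
        #         return a + np.log1p(-np.exp(b - a))
        #
        #     N, h, dist_LB, dist_UB = 40, 0.5, 0.2, 1.3
        #     log_min = np.log(N) + log_gauss_kernel(dist_UB, h)   # lower bound
        #     log_max = np.log(N) + log_gauss_kernel(dist_LB, h)   # upper bound
        #     log_spread = logsubexp(log_max, log_min)
        #     # midpoint of [min, max] in linear space, computed in log space,
        #     # mirroring logaddexp(global_log_min_bound,
        #     #                     global_log_bound_spread - log(2)) below
        #     log_estimate = np.logaddexp(log_min, log_spread - np.log(2))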
+ cdef intp_t i, i1, i2, i_node + cdef float64_t N1, N2 + cdef float64_t global_log_min_bound, global_log_bound_spread + cdef float64_t global_log_max_bound + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef float64_t N + cdef float64_t log_weight + if with_sample_weight: + N = self.sum_weight + else: + N = self.data.shape[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info + cdef float64_t dist_pt, log_density + cdef float64_t dist_LB_1 = 0, dist_LB_2 = 0 + cdef float64_t dist_UB_1 = 0, dist_UB_2 = 0 + + cdef float64_t dist_UB, dist_LB + + # push the top node to the heap + cdef NodeHeapData_t nodeheap_item + nodeheap_item.val = min_dist{{name_suffix}}(self, 0, pt) + nodeheap_item.i1 = 0 + nodeheap.push(nodeheap_item) + + global_log_min_bound = log(N) + compute_log_kernel( + max_dist{{name_suffix}}(self, 0, pt), h, kernel + ) + global_log_max_bound = log(N) + compute_log_kernel(nodeheap_item.val, + h, kernel) + global_log_bound_spread = logsubexp(global_log_max_bound, + global_log_min_bound) + + node_log_min_bounds[0] = global_log_min_bound + node_log_bound_spreads[0] = global_log_bound_spread + + while nodeheap.n > 0: + nodeheap_item = nodeheap.pop() + i_node = nodeheap_item.i1 + + node_info = node_data[i_node] + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + else: + N1 = node_info.idx_end - node_info.idx_start + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within per-point tolerance. + if (log_knorm + node_log_bound_spreads[i_node] - log(N1) + log(N) + <= logaddexp(log_atol, (log_rtol + log_knorm + + node_log_min_bounds[i_node]))): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. + elif (log_knorm + global_log_bound_spread + <= logaddexp(log_atol, + log_rtol + log_knorm + global_log_min_bound)): + break + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound =\ + logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, data + n_features * idx_array[i], + n_features) + log_density = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
+ global_log_min_bound = logaddexp(global_log_min_bound, + log_density + log_weight) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = node_data[i1].idx_end - node_data[i1].idx_start + N2 = node_data[i2].idx_end - node_data[i2].idx_start + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB_1, &dist_UB_1) + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB_2, &dist_UB_2) + + node_log_min_bounds[i1] = (log(N1) + + compute_log_kernel(dist_UB_1, + h, kernel)) + node_log_bound_spreads[i1] = (log(N1) + + compute_log_kernel(dist_LB_1, + h, kernel)) + + node_log_min_bounds[i2] = (log(N2) + + compute_log_kernel(dist_UB_2, + h, kernel)) + node_log_bound_spreads[i2] = (log(N2) + + compute_log_kernel(dist_LB_2, + h, kernel)) + + global_log_min_bound = logsubexp(global_log_min_bound, + node_log_min_bounds[i_node]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i1]) + global_log_min_bound = logaddexp(global_log_min_bound, + node_log_min_bounds[i2]) + + global_log_bound_spread =\ + logsubexp(global_log_bound_spread, + node_log_bound_spreads[i_node]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i1]) + global_log_bound_spread = logaddexp(global_log_bound_spread, + node_log_bound_spreads[i2]) + + # TODO: rank by the spread rather than the distance? + nodeheap_item.val = dist_LB_1 + nodeheap_item.i1 = i1 + nodeheap.push(nodeheap_item) + + nodeheap_item.val = dist_LB_2 + nodeheap_item.i1 = i2 + nodeheap.push(nodeheap_item) + + nodeheap.clear() + return logaddexp(global_log_min_bound, + global_log_bound_spread - log(2)) + + cdef int _kde_single_depthfirst( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + KernelType kernel, + float64_t h, + float64_t log_knorm, + float64_t log_atol, + float64_t log_rtol, + float64_t local_log_min_bound, + float64_t local_log_bound_spread, + float64_t* global_log_min_bound, + float64_t* global_log_bound_spread, + ) except -1: + """recursive single-tree kernel density estimate, depth-first""" + # For the given point, local_min_bound and local_max_bound give the + # minimum and maximum density for the current node, while + # global_min_bound and global_max_bound give the minimum and maximum + # density over the entire tree. We recurse down until global_min_bound + # and global_max_bound are within rtol and atol. 
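        # Concretely, `local_log_min_bound` / `local_log_bound_spread` are this
        # node's current contribution to the global bounds.  When the node is
        # split (Case 4 below) that contribution is removed with logsubexp and
        # replaced, via logaddexp, by the tighter contributions computed for
        # its two children, so the global interval tightens as the recursion
        # proceeds; Cases 1 and 2 stop early once either the local or the
        # global interval is already within the requested tolerances.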
+ cdef intp_t i, i1, i2, iw, start, end + cdef float64_t N1, N2 + + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef const NodeData_t* node_data = &self.node_data[0] + cdef bint with_sample_weight = self.sample_weight is not None + cdef const {{INPUT_DTYPE_t}}* sample_weight + cdef float64_t log_weight + if with_sample_weight: + sample_weight = &self.sample_weight[0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + + cdef NodeData_t node_info = self.node_data[i_node] + cdef float64_t dist_pt, log_dens_contribution + + cdef float64_t child1_log_min_bound, child2_log_min_bound + cdef float64_t child1_log_bound_spread, child2_log_bound_spread + cdef float64_t dist_UB = 0, dist_LB = 0 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i_node) + N2 = self.sum_weight + else: + N1 = (node_info.idx_end - node_info.idx_start) + N2 = self.data.shape[0] + + # ------------------------------------------------------------ + # Case 1: local bounds are equal to within errors. Return + if ( + log_knorm + local_log_bound_spread - log(N1) + log(N2) + <= logaddexp(log_atol, (log_rtol + log_knorm + local_log_min_bound)) + ): + pass + + # ------------------------------------------------------------ + # Case 2: global bounds are within rtol & atol. Return + elif ( + log_knorm + global_log_bound_spread[0] + <= logaddexp(log_atol, (log_rtol + log_knorm + global_log_min_bound[0])) + ): + pass + + # ------------------------------------------------------------ + # Case 3: node is a leaf. Count contributions from all points + elif node_info.is_leaf: + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + log_dens_contribution = compute_log_kernel(dist_pt, h, kernel) + if with_sample_weight: + log_weight = np.log(sample_weight[idx_array[i]]) + else: + log_weight = 0. 
+ global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + (log_dens_contribution + + log_weight)) + + # ------------------------------------------------------------ + # Case 4: split node and query subnodes + else: + i1 = 2 * i_node + 1 + i2 = 2 * i_node + 2 + + if with_sample_weight: + N1 = _total_node_weight(node_data, sample_weight, + idx_array, i1) + N2 = _total_node_weight(node_data, sample_weight, + idx_array, i2) + else: + N1 = (self.node_data[i1].idx_end - self.node_data[i1].idx_start) + N2 = (self.node_data[i2].idx_end - self.node_data[i2].idx_start) + + min_max_dist{{name_suffix}}(self, i1, pt, &dist_LB, &dist_UB) + child1_log_min_bound = log(N1) + compute_log_kernel(dist_UB, h, + kernel) + child1_log_bound_spread = logsubexp(log(N1) + + compute_log_kernel(dist_LB, h, + kernel), + child1_log_min_bound) + + min_max_dist{{name_suffix}}(self, i2, pt, &dist_LB, &dist_UB) + child2_log_min_bound = log(N2) + compute_log_kernel(dist_UB, h, + kernel) + child2_log_bound_spread = logsubexp(log(N2) + + compute_log_kernel(dist_LB, h, + kernel), + child2_log_min_bound) + + global_log_min_bound[0] = logsubexp(global_log_min_bound[0], + local_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child1_log_min_bound) + global_log_min_bound[0] = logaddexp(global_log_min_bound[0], + child2_log_min_bound) + + global_log_bound_spread[0] = logsubexp(global_log_bound_spread[0], + local_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child1_log_bound_spread) + global_log_bound_spread[0] = logaddexp(global_log_bound_spread[0], + child2_log_bound_spread) + + self._kde_single_depthfirst(i1, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child1_log_min_bound, + child1_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + self._kde_single_depthfirst(i2, pt, kernel, h, log_knorm, + log_atol, log_rtol, + child2_log_min_bound, + child2_log_bound_spread, + global_log_min_bound, + global_log_bound_spread) + return 0 + + cdef int _two_point_single( + self, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive single-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data = &self.data[0, 0] + cdef intp_t* idx_array = &self.idx_array[0] + cdef intp_t n_features = self.data.shape[1] + cdef NodeData_t node_info = self.node_data[i_node] + + cdef intp_t i, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + min_max_dist{{name_suffix}}(self, i_node, pt, &dist_LB, &dist_UB) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = (node_info.idx_end - node_info.idx_start) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + # If node is a leaf, go through all points + if node_info.is_leaf: + for i in range(node_info.idx_start, node_info.idx_end): + dist_pt = self.dist(pt, (data + n_features * idx_array[i]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + else: + self._two_point_single(2 * i_node + 1, pt, r, + count, i_min, i_max) + self._two_point_single(2 * i_node + 2, pt, r, + count, i_min, i_max) + return 0 + + cdef int _two_point_dual( + self, + intp_t i_node1, + BinaryTree{{name_suffix}} other, + intp_t 
i_node2, + float64_t* r, + intp_t* count, + intp_t i_min, + intp_t i_max, + ) except -1: + """recursive dual-tree two-point correlation function query""" + cdef const {{INPUT_DTYPE_t}}* data1 = &self.data[0, 0] + cdef const {{INPUT_DTYPE_t}}* data2 = &other.data[0, 0] + cdef intp_t* idx_array1 = &self.idx_array[0] + cdef intp_t* idx_array2 = &other.idx_array[0] + cdef NodeData_t node_info1 = self.node_data[i_node1] + cdef NodeData_t node_info2 = other.node_data[i_node2] + + cdef intp_t n_features = self.data.shape[1] + + cdef intp_t i1, i2, j, Npts + cdef float64_t reduced_r + + cdef float64_t dist_pt, dist_LB = 0, dist_UB = 0 + dist_LB = min_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + dist_UB = max_dist_dual{{name_suffix}}(self, i_node1, other, i_node2) + + # ------------------------------------------------------------ + # Go through bounds and check for cuts + while i_min < i_max: + if dist_LB > r[i_min]: + i_min += 1 + else: + break + + while i_max > i_min: + Npts = ((node_info1.idx_end - node_info1.idx_start) + * (node_info2.idx_end - node_info2.idx_start)) + if dist_UB <= r[i_max - 1]: + count[i_max - 1] += Npts + i_max -= 1 + else: + break + + if i_min < i_max: + if node_info1.is_leaf and node_info2.is_leaf: + # If both nodes are leaves, go through all points + for i1 in range(node_info1.idx_start, node_info1.idx_end): + for i2 in range(node_info2.idx_start, node_info2.idx_end): + dist_pt = self.dist((data1 + n_features + * idx_array1[i1]), + (data2 + n_features + * idx_array2[i2]), + n_features) + j = i_max - 1 + while (j >= i_min) and (dist_pt <= r[j]): + count[j] += 1 + j -= 1 + + elif node_info1.is_leaf: + # If only one is a leaf, split the other + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i_node1, other, i2, + r, count, i_min, i_max) + + elif node_info2.is_leaf: + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + self._two_point_dual(i1, other, i_node2, + r, count, i_min, i_max) + + else: + # neither is a leaf: split & query both + for i1 in range(2 * i_node1 + 1, 2 * i_node1 + 3): + for i2 in range(2 * i_node2 + 1, 2 * i_node2 + 3): + self._two_point_dual(i1, other, i2, + r, count, i_min, i_max) + return 0 + +{{endfor}} + +###################################################################### +# Python functions for benchmarking and testing C implementations + +def simultaneous_sort(float64_t[:, ::1] distances, intp_t[:, ::1] indices): + """In-place simultaneous sort the given row of the arrays + + This python wrapper exists primarily to enable unit testing + of the _simultaneous_sort C routine. 
+ """ + assert distances.shape[0] == indices.shape[0] + assert distances.shape[1] == indices.shape[1] + cdef intp_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + + +def nodeheap_sort(float64_t[::1] vals): + """In-place reverse sort of vals using NodeHeap""" + cdef intp_t[::1] indices = np.zeros(vals.shape[0], dtype=np.intp) + cdef float64_t[::1] vals_sorted = np.zeros_like(vals) + + # use initial size 0 to check corner case + cdef NodeHeap heap = NodeHeap(0) + cdef NodeHeapData_t data + cdef intp_t i + for i in range(vals.shape[0]): + data.val = vals[i] + data.i1 = i + data.i2 = i + 1 + heap.push(data) + + for i in range(vals.shape[0]): + data = heap.pop() + vals_sorted[i] = data.val + indices[i] = data.i1 + + return np.asarray(vals_sorted), np.asarray(indices) + + +cdef inline float64_t _total_node_weight( + const NodeData_t* node_data, + const floating* sample_weight, + const intp_t* idx_array, + intp_t i_node, +): + cdef intp_t i + cdef float64_t N = 0.0 + for i in range(node_data[i_node].idx_start, node_data[i_node].idx_end): + N += sample_weight[idx_array[i]] + return N diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_classification.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..c70b83cb1d3bdbcab4f241bf19416d410cbaf9e4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_classification.py @@ -0,0 +1,919 @@ +"""Nearest Neighbor Classification""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Integral + +import numpy as np + +from sklearn.neighbors._base import _check_precomputed + +from ..base import ClassifierMixin, _fit_context +from ..metrics._pairwise_distances_reduction import ( + ArgKminClassMode, + RadiusNeighborsClassMode, +) +from ..utils._param_validation import StrOptions +from ..utils.arrayfuncs import _all_with_any_reduction_axis_1 +from ..utils.extmath import weighted_mode +from ..utils.fixes import _mode +from ..utils.validation import ( + _is_arraylike, + _num_samples, + check_is_fitted, + validate_data, +) +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +def _adjusted_metric(metric, metric_kwargs, p=None): + metric_kwargs = metric_kwargs or {} + if metric == "minkowski": + metric_kwargs["p"] = p + if p == 2: + metric = "euclidean" + return metric, metric_kwargs + + +class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): + """Classifier implementing the k-nearest neighbors vote. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. 
+ + Refer to the example entitled + :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py` + showing the impact of the `weights` parameter on the decision + boundary. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is equivalent + to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. + For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected + to be positive. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + Doesn't affect :meth:`fit` method. + + Attributes + ---------- + classes_ : array of shape (n_classes,) + Class labels known to the classifier + + effective_metric_ : str or callble + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + outputs_2d_ : bool + False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit + otherwise True. + + See Also + -------- + RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius. 
+ KNeighborsRegressor: Regression based on k-nearest neighbors. + RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius. + NearestNeighbors: Unsupervised learner for implementing neighbor searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances + but different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsClassifier + >>> neigh = KNeighborsClassifier(n_neighbors=3) + >>> neigh.fit(X, y) + KNeighborsClassifier(...) + >>> print(neigh.predict([[1.1]])) + [0] + >>> print(neigh.predict_proba([[0.9]])) + [[0.666 0.333]] + """ + + _parameter_constraints: dict = {**NeighborsBase._parameter_constraints} + _parameter_constraints.pop("radius") + _parameter_constraints.update( + {"weights": [StrOptions({"uniform", "distance"}), callable, None]} + ) + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsClassifier + The fitted k-nearest neighbors classifier. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + check_is_fitted(self, "_fit_method") + if self.weights == "uniform": + if self._fit_method == "brute" and ArgKminClassMode.is_usable_for( + X, self._fit_X, self.metric + ): + probabilities = self.predict_proba(X) + if self.outputs_2d_: + return np.stack( + [ + self.classes_[idx][np.argmax(probas, axis=1)] + for idx, probas in enumerate(probabilities) + ], + axis=1, + ) + return self.classes_[np.argmax(probabilities, axis=1)] + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. 
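        # With uniform weights the prediction is simply the majority label
        # among the k nearest neighbors, so only the neighbor indices are
        # needed.  A rough pure-NumPy equivalent of the generic path below
        # (sketch for the single-output case on hypothetical toy data; `_y`
        # holds the label-encoded targets):
        #
        #     import numpy as np
        #     from sklearn.neighbors import KNeighborsClassifier
        #
        #     X_train = [[0], [1], [2], [3]]
        #     y_train = [0, 0, 1, 1]
        #     clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
        #
        #     neigh_ind = clf.kneighbors([[1.1], [2.6]], return_distance=False)
        #     votes = clf._y[neigh_ind]                 # neighbor labels, encoded
        #     counts = np.apply_along_axis(np.bincount, 1, votes,
        #                                  minlength=len(clf.classes_))
        #     y_pred = clf.classes_[np.argmax(counts, axis=1)]
        #     # -> array([0, 1]), matching clf.predict([[1.1], [2.6]])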
+ neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_queries = _num_samples(self._fit_X if X is None else X) + weights = _get_weights(neigh_dist, self.weights) + if weights is not None and _all_with_any_reduction_axis_1(weights, value=0): + raise ValueError( + "All neighbors of some sample is getting zero weights. " + "Please modify 'weights' to avoid this case if you are " + "using a user-defined function." + ) + + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) + for k, classes_k in enumerate(classes_): + if weights is None: + mode, _ = _mode(_y[neigh_ind, k], axis=1) + else: + mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1) + + mode = np.asarray(mode.ravel(), dtype=np.intp) + y_pred[:, k] = classes_k.take(mode) + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred + + def predict_proba(self, X): + """Return probability estimates for the test data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \ + of such arrays if n_outputs > 1. + The class probabilities of the input samples. Classes are ordered + by lexicographic order. + """ + check_is_fitted(self, "_fit_method") + if self.weights == "uniform": + # TODO: systematize this mapping of metric for + # PairwiseDistancesReductions. + metric, metric_kwargs = _adjusted_metric( + metric=self.metric, metric_kwargs=self.metric_params, p=self.p + ) + if ( + self._fit_method == "brute" + and ArgKminClassMode.is_usable_for(X, self._fit_X, metric) + # TODO: Implement efficient multi-output solution + and not self.outputs_2d_ + ): + if self.metric == "precomputed": + X = _check_precomputed(X) + else: + X = validate_data( + self, X, accept_sparse="csr", reset=False, order="C" + ) + + probabilities = ArgKminClassMode.compute( + X, + self._fit_X, + k=self.n_neighbors, + weights=self.weights, + Y_labels=self._y, + unique_Y_labels=self.classes_, + metric=metric, + metric_kwargs=metric_kwargs, + # `strategy="parallel_on_X"` has in practice be shown + # to be more efficient than `strategy="parallel_on_Y`` + # on many combination of datasets. + # Hence, we choose to enforce it here. + # For more information, see: + # https://github.com/scikit-learn/scikit-learn/pull/24076#issuecomment-1445258342 + # TODO: adapt the heuristic for `strategy="auto"` for + # `ArgKminClassMode` and use `strategy="auto"`. + strategy="parallel_on_X", + ) + return probabilities + + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. 
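        # The generic path below converts the neighbors' labels into per-class
        # votes and normalizes them into probabilities.  A sketch of the same
        # computation for a single output with inverse-distance weighting
        # (hypothetical toy data chosen so that no query coincides with a
        # training point, avoiding division by zero):
        #
        #     import numpy as np
        #     from sklearn.neighbors import KNeighborsClassifier
        #
        #     X_train = [[0.0], [1.0], [2.0], [3.0]]
        #     y_train = [0, 0, 1, 1]
        #     clf = KNeighborsClassifier(n_neighbors=3, weights="distance")
        #     clf.fit(X_train, y_train)
        #
        #     dist, ind = clf.kneighbors([[1.2], [2.4]])
        #     w = 1.0 / dist                            # inverse-distance weights
        #     proba = np.zeros((len(ind), len(clf.classes_)))
        #     for i in range(ind.shape[1]):             # accumulate the votes
        #         proba[np.arange(len(ind)), clf._y[ind[:, i]]] += w[:, i]
        #     proba /= proba.sum(axis=1, keepdims=True) # normalize to [0, 1]
        #     # proba agrees with clf.predict_proba([[1.2], [2.4]])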
+ neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + n_queries = _num_samples(self._fit_X if X is None else X) + + weights = _get_weights(neigh_dist, self.weights) + if weights is None: + weights = np.ones_like(neigh_ind) + elif _all_with_any_reduction_axis_1(weights, value=0): + raise ValueError( + "All neighbors of some sample is getting zero weights. " + "Please modify 'weights' to avoid this case if you are " + "using a user-defined function." + ) + + all_rows = np.arange(n_queries) + probabilities = [] + for k, classes_k in enumerate(classes_): + pred_labels = _y[:, k][neigh_ind] + proba_k = np.zeros((n_queries, classes_k.size)) + + # a simple ':' index doesn't work right + for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) + proba_k[all_rows, idx] += weights[:, i] + + # normalize 'votes' into real [0,1] probabilities + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + proba_k /= normalizer + + probabilities.append(proba_k) + + if not self.outputs_2d_: + probabilities = probabilities[0] + + return probabilities + + # This function is defined here only to modify the parent docstring + # and add information about X=None + def score(self, X, y, sample_weight=None): + """ + Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features), or None + Test samples. If `None`, predictions for all indexed points are + used; in this case, points are not considered their own + neighbors. This means that `knn.fit(X, y).score(None, y)` + implicitly performs a leave-one-out cross-validation procedure + and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` + but typically much faster. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of ``self.predict(X)`` w.r.t. `y`. + """ + return super().score(X, y, sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + tags.input_tags.pairwise = self.metric == "precomputed" + return tags + + +class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase): + """Classifier implementing a vote among neighbors within a given radius. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. 
+ + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + This parameter is expected to be positive. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + outlier_label : {manual label, 'most_frequent'}, default=None + Label for outlier samples (samples with no neighbors in given radius). + + - manual label: str or int label (should be the same type as y) + or list of manual labels if multi-output is used. + - 'most_frequent' : assign the most frequent label of y to outliers. + - None : when any outlier is detected, ValueError will be raised. + + The outlier label should be selected from among the unique 'Y' labels. + If it is specified with a different value a warning will be raised and + all class probabilities of outliers will be assigned to be 0. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Class labels known to the classifier. + + effective_metric_ : str or callable + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. 
Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + outlier_label_ : int or array-like of shape (n_class,) + Label which is given for outlier samples (samples with no neighbors + on given radius). + + outputs_2d_ : bool + False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit + otherwise True. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsRegressor : Regression based on neighbors within a + fixed radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + NearestNeighbors : Unsupervised learner for implementing neighbor + searches. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsClassifier + >>> neigh = RadiusNeighborsClassifier(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsClassifier(...) + >>> print(neigh.predict([[1.5]])) + [0] + >>> print(neigh.predict_proba([[1.0]])) + [[0.66666667 0.33333333]] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + "outlier_label": [Integral, str, "array-like", None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + outlier_label=None, + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + self.outlier_label = outlier_label + + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors classifier from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsClassifier + The fitted radius neighbors classifier. + """ + self._fit(X, y) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + if self.outlier_label is None: + outlier_label_ = None + + elif self.outlier_label == "most_frequent": + outlier_label_ = [] + # iterate over multi-output, get the most frequent label for each + # output. 
+ for k, classes_k in enumerate(classes_): + label_count = np.bincount(_y[:, k]) + outlier_label_.append(classes_k[label_count.argmax()]) + + else: + if _is_arraylike(self.outlier_label) and not isinstance( + self.outlier_label, str + ): + if len(self.outlier_label) != len(classes_): + raise ValueError( + "The length of outlier_label: {} is " + "inconsistent with the output " + "length: {}".format(self.outlier_label, len(classes_)) + ) + outlier_label_ = self.outlier_label + else: + outlier_label_ = [self.outlier_label] * len(classes_) + + for classes, label in zip(classes_, outlier_label_): + if _is_arraylike(label) and not isinstance(label, str): + # ensure the outlier label for each output is a scalar. + raise TypeError( + "The outlier_label of classes {} is " + "supposed to be a scalar, got " + "{}.".format(classes, label) + ) + if np.append(classes, label).dtype != classes.dtype: + # ensure the dtype of outlier label is consistent with y. + raise TypeError( + "The dtype of outlier_label {} is " + "inconsistent with classes {} in " + "y.".format(label, classes) + ) + + self.outlier_label_ = outlier_label_ + + return self + + def predict(self, X): + """Predict the class labels for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs) + Class labels for each data sample. + """ + + probs = self.predict_proba(X) + classes_ = self.classes_ + + if not self.outputs_2d_: + probs = [probs] + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_queries = probs[0].shape[0] + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) + + for k, prob in enumerate(probs): + # iterate over multi-output, assign labels based on probabilities + # of each output. + max_prob_index = prob.argmax(axis=1) + y_pred[:, k] = classes_[k].take(max_prob_index) + + outlier_zero_probs = (prob == 0).all(axis=1) + if outlier_zero_probs.any(): + zero_prob_index = np.flatnonzero(outlier_zero_probs) + y_pred[zero_prob_index, k] = self.outlier_label_[k] + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred + + def predict_proba(self, X): + """Return probability estimates for the test data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + p : ndarray of shape (n_queries, n_classes), or a list of \ + n_outputs of such arrays if n_outputs > 1. + The class probabilities of the input samples. Classes are ordered + by lexicographic order. 
+ """ + check_is_fitted(self, "_fit_method") + n_queries = _num_samples(self._fit_X if X is None else X) + + metric, metric_kwargs = _adjusted_metric( + metric=self.metric, metric_kwargs=self.metric_params, p=self.p + ) + + if ( + self.weights == "uniform" + and self._fit_method == "brute" + and not self.outputs_2d_ + and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric) + ): + probabilities = RadiusNeighborsClassMode.compute( + X=X, + Y=self._fit_X, + radius=self.radius, + weights=self.weights, + Y_labels=self._y, + unique_Y_labels=self.classes_, + outlier_label=self.outlier_label, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="parallel_on_X", + # `strategy="parallel_on_X"` has in practice be shown + # to be more efficient than `strategy="parallel_on_Y`` + # on many combination of datasets. + # Hence, we choose to enforce it here. + # For more information, see: + # https://github.com/scikit-learn/scikit-learn/pull/26828/files#r1282398471 + ) + return probabilities + + neigh_dist, neigh_ind = self.radius_neighbors(X) + outlier_mask = np.zeros(n_queries, dtype=bool) + outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] + outliers = np.flatnonzero(outlier_mask) + inliers = np.flatnonzero(~outlier_mask) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + if self.outlier_label_ is None and outliers.size > 0: + raise ValueError( + "No neighbors found for test samples %r, " + "you can try using larger radius, " + "giving a label for outliers, " + "or considering removing them from your dataset." % outliers + ) + + weights = _get_weights(neigh_dist, self.weights) + if weights is not None: + weights = weights[inliers] + + probabilities = [] + # iterate over multi-output, measure probabilities of the k-th output. + for k, classes_k in enumerate(classes_): + pred_labels = np.zeros(len(neigh_ind), dtype=object) + pred_labels[:] = [_y[ind, k] for ind in neigh_ind] + + proba_k = np.zeros((n_queries, classes_k.size)) + proba_inl = np.zeros((len(inliers), classes_k.size)) + + # samples have different size of neighbors within the same radius + if weights is None: + for i, idx in enumerate(pred_labels[inliers]): + proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size) + else: + for i, idx in enumerate(pred_labels[inliers]): + proba_inl[i, :] = np.bincount( + idx, weights[i], minlength=classes_k.size + ) + proba_k[inliers, :] = proba_inl + + if outliers.size > 0: + _outlier_label = self.outlier_label_[k] + label_index = np.flatnonzero(classes_k == _outlier_label) + if label_index.size == 1: + proba_k[outliers, label_index[0]] = 1.0 + else: + warnings.warn( + "Outlier label {} is not in training " + "classes. All class probabilities of " + "outliers will be assigned with 0." + "".format(self.outlier_label_[k]) + ) + + # normalize 'votes' into real [0,1] probabilities + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba_k /= normalizer + + probabilities.append(proba_k) + + if not self.outputs_2d_: + probabilities = probabilities[0] + + return probabilities + + # This function is defined here only to modify the parent docstring + # and add information about X=None + def score(self, X, y, sample_weight=None): + """ + Return the mean accuracy on the given test data and labels. + + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. 
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features), or None + Test samples. If `None`, predictions for all indexed points are + used; in this case, points are not considered their own + neighbors. This means that `knn.fit(X, y).score(None, y)` + implicitly performs a leave-one-out cross-validation procedure + and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())` + but typically much faster. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for `X`. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + score : float + Mean accuracy of ``self.predict(X)`` w.r.t. `y`. + """ + return super().score(X, y, sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_graph.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..3562fab1fcf01b5487d210a11d83d203bffd7835 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_graph.py @@ -0,0 +1,704 @@ +"""Nearest Neighbors graph functions""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools + +from ..base import ClassNamePrefixFeaturesOutMixin, TransformerMixin, _fit_context +from ..utils._param_validation import ( + Integral, + Interval, + Real, + StrOptions, + validate_params, +) +from ..utils.validation import check_is_fitted +from ._base import VALID_METRICS, KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin +from ._unsupervised import NearestNeighbors + + +def _check_params(X, metric, p, metric_params): + """Check the validity of the input parameters""" + params = zip(["metric", "p", "metric_params"], [metric, p, metric_params]) + est_params = X.get_params() + for param_name, func_param in params: + if func_param != est_params[param_name]: + raise ValueError( + "Got %s for %s, while the estimator has %s for the same parameter." + % (func_param, param_name, est_params[param_name]) + ) + + +def _query_include_self(X, include_self, mode): + """Return the query based on include_self param""" + if include_self == "auto": + include_self = mode == "connectivity" + + # it does not include each sample as its own neighbors + if not include_self: + X = None + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix", KNeighborsMixin], + "n_neighbors": [Interval(Integral, 1, None, closed="left")], + "mode": [StrOptions({"connectivity", "distance"})], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "include_self": ["boolean", StrOptions({"auto"})], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def kneighbors_graph( + X, + n_neighbors, + *, + mode="connectivity", + metric="minkowski", + p=2, + metric_params=None, + include_self=False, + n_jobs=None, +): + """Compute the (weighted) graph of k-Neighbors for points in X. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample data. + + n_neighbors : int + Number of neighbors for each sample. 
+ + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + metric : str, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is equivalent + to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. + For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected + to be positive. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + include_self : bool or 'auto', default=False + Whether or not to mark each sample as the first nearest neighbor to + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that + connects i to j. The matrix is of CSR format. + + See Also + -------- + radius_neighbors_graph: Compute the (weighted) graph of Neighbors for points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import kneighbors_graph + >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 1.], + [1., 0., 1.]]) + """ + if not isinstance(X, KNeighborsMixin): + X = NearestNeighbors( + n_neighbors=n_neighbors, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ).fit(X) + else: + _check_params(X, metric, p, metric_params) + + query = _query_include_self(X._fit_X, include_self, mode) + return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) + + +@validate_params( + { + "X": ["array-like", "sparse matrix", RadiusNeighborsMixin], + "radius": [Interval(Real, 0, None, closed="both")], + "mode": [StrOptions({"connectivity", "distance"})], + "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], + "p": [Interval(Real, 0, None, closed="right"), None], + "metric_params": [dict, None], + "include_self": ["boolean", StrOptions({"auto"})], + "n_jobs": [Integral, None], + }, + prefer_skip_nested_validation=False, # metric is not validated yet +) +def radius_neighbors_graph( + X, + radius, + *, + mode="connectivity", + metric="minkowski", + p=2, + metric_params=None, + include_self=False, + n_jobs=None, +): + """Compute the (weighted) graph of Neighbors for points in X. + + Neighborhoods are restricted the points at a distance lower than + radius. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Sample data. + + radius : float + Radius of neighborhoods. 
+ + mode : {'connectivity', 'distance'}, default='connectivity' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + metric : str, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + include_self : bool or 'auto', default=False + Whether or not to mark each sample as the first nearest neighbor to + itself. If 'auto', then True is used for mode='connectivity' and False + for mode='distance'. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Returns + ------- + A : sparse matrix of shape (n_samples, n_samples) + Graph where A[i, j] is assigned the weight of edge that connects + i to j. The matrix is of CSR format. + + See Also + -------- + kneighbors_graph: Compute the weighted graph of k-neighbors for points in X. + + Examples + -------- + >>> X = [[0], [3], [1]] + >>> from sklearn.neighbors import radius_neighbors_graph + >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity', + ... include_self=True) + >>> A.toarray() + array([[1., 0., 1.], + [0., 1., 0.], + [1., 0., 1.]]) + """ + if not isinstance(X, RadiusNeighborsMixin): + X = NearestNeighbors( + radius=radius, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ).fit(X) + else: + _check_params(X, metric, p, metric_params) + + query = _query_include_self(X._fit_X, include_self, mode) + return X.radius_neighbors_graph(query, radius, mode) + + +class KNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, KNeighborsMixin, TransformerMixin, NeighborsBase +): + """Transform X into a (weighted) graph of k nearest neighbors. + + The transformed data is a sparse graph as returned by kneighbors_graph. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + n_neighbors : int, default=5 + Number of neighbors for each sample in the transformed sparse graph. + For compatibility reasons, as each sample is considered as its own + neighbor, one extra neighbor will be computed when mode == 'distance'. + In this case, the sparse graph contains (n_neighbors + 1) neighbors. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. 
+ - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + p : float, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + This parameter is expected to be positive. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + kneighbors_graph : Compute the weighted graph of k-neighbors for + points in X. + RadiusNeighborsTransformer : Transform X into a weighted graph of + neighbors nearer than a radius. + + Notes + ----- + For an example of using :class:`~sklearn.neighbors.KNeighborsTransformer` + in combination with :class:`~sklearn.manifold.TSNE` see + :ref:`sphx_glr_auto_examples_neighbors_approximate_nearest_neighbors.py`. 
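Editor's aside (a minimal sketch over assumed data, not part of the patch): the intended usage pattern for this transformer is to precompute the sparse neighbors graph once and hand it to a downstream estimator that accepts `metric='precomputed'`, in the spirit of the example referenced in the Notes above. The transformer must store at least as many neighbors as the consumer needs, plus one, because each sample is kept as its own neighbor when `mode='distance'`.

    from sklearn.datasets import load_wine
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsTransformer
    from sklearn.pipeline import make_pipeline

    X, y = load_wine(return_X_y=True)
    pipe = make_pipeline(
        # Stores 10 + 1 neighbors per sample; enough for the 5 used downstream.
        KNeighborsTransformer(n_neighbors=10, mode="distance"),
        KNeighborsClassifier(n_neighbors=5, metric="precomputed"),
    )
    print(cross_val_score(pipe, X, y, cv=3).mean())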
+ + Examples + -------- + >>> from sklearn.datasets import load_wine + >>> from sklearn.neighbors import KNeighborsTransformer + >>> X, _ = load_wine(return_X_y=True) + >>> X.shape + (178, 13) + >>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance') + >>> X_dist_graph = transformer.fit_transform(X) + >>> X_dist_graph.shape + (178, 178) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + *, + mode="distance", + n_neighbors=5, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + radius=None, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the k-nearest neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : KNeighborsTransformer + The fitted k-nearest neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + add_one = self.mode == "distance" + return self.kneighbors_graph( + X, mode=self.mode, n_neighbors=self.n_neighbors + add_one + ) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + return self.fit(X).transform(X) + + +class RadiusNeighborsTransformer( + ClassNamePrefixFeaturesOutMixin, + RadiusNeighborsMixin, + TransformerMixin, + NeighborsBase, +): + """Transform X into a (weighted) graph of neighbors nearer than a radius. + + The transformed data is a sparse graph as returned by + `radius_neighbors_graph`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.22 + + Parameters + ---------- + mode : {'distance', 'connectivity'}, default='distance' + Type of returned matrix: 'connectivity' will return the connectivity + matrix with ones and zeros, and 'distance' will return the distances + between neighbors according to the given metric. + + radius : float, default=1.0 + Radius of neighborhood in the transformed sparse graph. 
+ + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + p : float, default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + This parameter is expected to be positive. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric used. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + kneighbors_graph : Compute the weighted graph of k-neighbors for + points in X. + KNeighborsTransformer : Transform X into a weighted graph of k + nearest neighbors. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import load_wine + >>> from sklearn.cluster import DBSCAN + >>> from sklearn.neighbors import RadiusNeighborsTransformer + >>> from sklearn.pipeline import make_pipeline + >>> X, _ = load_wine(return_X_y=True) + >>> estimator = make_pipeline( + ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), + ... 
DBSCAN(eps=25.0, metric='precomputed')) + >>> X_clustered = estimator.fit_predict(X) + >>> clusters, counts = np.unique(X_clustered, return_counts=True) + >>> print(counts) + [ 29 15 111 11 12] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "mode": [StrOptions({"distance", "connectivity"})], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + *, + mode="distance", + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=None, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.mode = mode + + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the radius neighbors transformer from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : RadiusNeighborsTransformer + The fitted radius neighbors transformer. + """ + self._fit(X) + self._n_features_out = self.n_samples_fit_ + return self + + def transform(self, X): + """Compute the (weighted) graph of Neighbors for points in X. + + Parameters + ---------- + X : array-like of shape (n_samples_transform, n_features) + Sample data. + + Returns + ------- + Xt : sparse matrix of shape (n_samples_transform, n_samples_fit) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. + """ + check_is_fitted(self) + return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True) + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. + + Fits transformer to X and y with optional parameters fit_params + and returns a transformed version of X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training set. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + Xt : sparse matrix of shape (n_samples, n_samples) + Xt[i, j] is assigned the weight of edge that connects i to j. + Only the neighbors have an explicit value. + The diagonal is always explicit. + The matrix is of CSR format. 
+ """ + return self.fit(X).transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kd_tree.pyx.tp b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kd_tree.pyx.tp new file mode 100644 index 0000000000000000000000000000000000000000..d21af05270b9aad33560ea6ff72d55c3fa5c91b4 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kd_tree.pyx.tp @@ -0,0 +1,336 @@ +{{py: + +# Generated file: _kd_tree.pyx + +implementation_specific_values = [ + # The values are arranged as follows: + # + # name_suffix, INPUT_DTYPE_t, INPUT_DTYPE + # + ('64', 'float64_t', 'np.float64'), + ('32', 'float32_t', 'np.float32') +] + +# By Jake Vanderplas (2013) +# written for the scikit-learn project +# SPDX-License-Identifier: BSD-3-Clause + +}} + + +__all__ = ['KDTree', 'KDTree64', 'KDTree32'] + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +DOC_DICT{{name_suffix}} = { + 'BinaryTree': 'KDTree{{name_suffix}}', + 'binary_tree': 'kd_tree{{name_suffix}}', +} + +VALID_METRICS{{name_suffix}} = [ + 'EuclideanDistance{{name_suffix}}', + 'ManhattanDistance{{name_suffix}}', + 'ChebyshevDistance{{name_suffix}}', + 'MinkowskiDistance{{name_suffix}}' +] + +{{endfor}} + +include "_binary_tree.pxi" + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +# Inherit KDTree{{name_suffix}} from BinaryTree{{name_suffix}} +cdef class KDTree{{name_suffix}}(BinaryTree{{name_suffix}}): + __doc__ = CLASS_DOC.format(**DOC_DICT{{name_suffix}}) + pass + +{{endfor}} + + +# ---------------------------------------------------------------------- +# The functions below specialized the Binary Tree as a KD Tree +# +# Note that these functions use the concept of "reduced distance". +# The reduced distance, defined for some metrics, is a quantity which +# is more efficient to compute than the distance, but preserves the +# relative rankings of the true distance. For example, the reduced +# distance for the Euclidean metric is the squared-euclidean distance. +# For some metrics, the reduced distance is simply the distance. + +{{for name_suffix, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}} + +cdef int allocate_data{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t n_nodes, + intp_t n_features, +) except -1: + """Allocate arrays needed for the KD Tree""" + tree.node_bounds = np.zeros((2, n_nodes, n_features), dtype={{INPUT_DTYPE}}) + return 0 + + +cdef int init_node{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + NodeData_t[::1] node_data, + intp_t i_node, + intp_t idx_start, + intp_t idx_end, +) except -1: + """Initialize the node for the dataset stored in tree.data""" + cdef intp_t n_features = tree.data.shape[1] + cdef intp_t i, j + cdef float64_t rad = 0 + + cdef {{INPUT_DTYPE_t}}* lower_bounds = &tree.node_bounds[0, i_node, 0] + cdef {{INPUT_DTYPE_t}}* upper_bounds = &tree.node_bounds[1, i_node, 0] + cdef const {{INPUT_DTYPE_t}}* data = &tree.data[0, 0] + cdef const intp_t* idx_array = &tree.idx_array[0] + + cdef const {{INPUT_DTYPE_t}}* data_row + + # determine Node bounds + for j in range(n_features): + lower_bounds[j] = INF + upper_bounds[j] = -INF + + # Compute the actual data range. At build time, this is slightly + # slower than using the previously-computed bounds of the parent node, + # but leads to more compact trees and thus faster queries. 
+ for i in range(idx_start, idx_end): + data_row = data + idx_array[i] * n_features + for j in range(n_features): + lower_bounds[j] = fmin(lower_bounds[j], data_row[j]) + upper_bounds[j] = fmax(upper_bounds[j], data_row[j]) + + for j in range(n_features): + if tree.dist_metric.p == INF: + rad = fmax(rad, 0.5 * (upper_bounds[j] - lower_bounds[j])) + else: + rad += pow(0.5 * abs(upper_bounds[j] - lower_bounds[j]), + tree.dist_metric.p) + + node_data[i_node].idx_start = idx_start + node_data[i_node].idx_end = idx_end + + # The radius will hold the size of the circumscribed hypersphere measured + # with the specified metric: in querying, this is used as a measure of the + # size of each node when deciding which nodes to split. + node_data[i_node].radius = pow(rad, 1. / tree.dist_metric.p) + return 0 + + +cdef float64_t min_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1 nogil: + """Compute the minimum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + cdef float64_t d, d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + rdist += pow(0.5 * d, tree.dist_metric.p) + + return rdist + + +cdef float64_t min_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the minimum distance between a point and a node""" + if tree.dist_metric.p == INF: + return min_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + min_rdist{{name_suffix}}(tree, i_node, pt), + 1. / tree.dist_metric.p + ) + + +cdef float64_t max_rdist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum reduced-distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d_lo, d_hi, rdist=0.0 + cdef intp_t j + + if tree.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[0, i_node, j])) + rdist = fmax(rdist, fabs(pt[j] - tree.node_bounds[1, i_node, j])) + else: + for j in range(n_features): + d_lo = fabs(pt[j] - tree.node_bounds[0, i_node, j]) + d_hi = fabs(pt[j] - tree.node_bounds[1, i_node, j]) + rdist += pow(fmax(d_lo, d_hi), tree.dist_metric.p) + + return rdist + + +cdef float64_t max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, +) except -1: + """Compute the maximum distance between a point and a node""" + if tree.dist_metric.p == INF: + return max_rdist{{name_suffix}}(tree, i_node, pt) + else: + return pow( + max_rdist{{name_suffix}}(tree, i_node, pt), + 1. 
/ tree.dist_metric.p + ) + + +cdef inline int min_max_dist{{name_suffix}}( + BinaryTree{{name_suffix}} tree, + intp_t i_node, + const {{INPUT_DTYPE_t}}* pt, + float64_t* min_dist, + float64_t* max_dist, +) except -1 nogil: + """Compute the minimum and maximum distance between a point and a node""" + cdef intp_t n_features = tree.data.shape[1] + + cdef float64_t d, d_lo, d_hi + cdef intp_t j + + min_dist[0] = 0.0 + max_dist[0] = 0.0 + + if tree.dist_metric.p == INF: + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] = fmax(min_dist[0], 0.5 * d) + max_dist[0] = fmax(max_dist[0], fabs(d_lo)) + max_dist[0] = fmax(max_dist[0], fabs(d_hi)) + else: + # as above, use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d_lo = tree.node_bounds[0, i_node, j] - pt[j] + d_hi = pt[j] - tree.node_bounds[1, i_node, j] + d = (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)) + min_dist[0] += pow(0.5 * d, tree.dist_metric.p) + max_dist[0] += pow(fmax(fabs(d_lo), fabs(d_hi)), + tree.dist_metric.p) + + min_dist[0] = pow(min_dist[0], 1. / tree.dist_metric.p) + max_dist[0] = pow(max_dist[0], 1. / tree.dist_metric.p) + + return 0 + + +cdef inline float64_t min_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d, d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist = fmax(rdist, 0.5 * d) + else: + # here we'll use the fact that x + abs(x) = 2 * max(x, 0) + for j in range(n_features): + d1 = (tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = (tree2.node_bounds[0, i_node2, j] + - tree1.node_bounds[1, i_node1, j]) + d = (d1 + fabs(d1)) + (d2 + fabs(d2)) + + rdist += pow(0.5 * d, tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t min_dist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the minimum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + min_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + + +cdef inline float64_t max_rdist_dual{{name_suffix}}( + BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum reduced distance between two nodes""" + cdef intp_t n_features = tree1.data.shape[1] + + cdef float64_t d1, d2, rdist=0.0 + cdef intp_t j + + if tree1.dist_metric.p == INF: + for j in range(n_features): + rdist = fmax(rdist, fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j])) + rdist = fmax(rdist, fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j])) + else: + for j in range(n_features): + d1 = fabs(tree1.node_bounds[0, i_node1, j] + - tree2.node_bounds[1, i_node2, j]) + d2 = fabs(tree1.node_bounds[1, i_node1, j] + - tree2.node_bounds[0, i_node2, j]) + rdist += pow(fmax(d1, d2), tree1.dist_metric.p) + + return rdist + + +cdef inline float64_t max_dist_dual{{name_suffix}}( + 
BinaryTree{{name_suffix}} tree1, + intp_t i_node1, + BinaryTree{{name_suffix}} tree2, + intp_t i_node2, +) except -1: + """Compute the maximum distance between two nodes""" + return tree1.dist_metric._rdist_to_dist( + max_rdist_dual{{name_suffix}}(tree1, i_node1, tree2, i_node2) + ) + +{{endfor}} + + +class KDTree(KDTree64): + __doc__ = CLASS_DOC.format(BinaryTree="KDTree") + pass diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kde.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..7661308db2e01665c82cf82985586006b7c39a56 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_kde.py @@ -0,0 +1,359 @@ +""" +Kernel Density Estimation +------------------------- +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +from numbers import Integral, Real + +import numpy as np +from scipy.special import gammainc + +from ..base import BaseEstimator, _fit_context +from ..neighbors._base import VALID_METRICS +from ..utils import check_random_state +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import row_norms +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._ball_tree import BallTree +from ._kd_tree import KDTree + +VALID_KERNELS = [ + "gaussian", + "tophat", + "epanechnikov", + "exponential", + "linear", + "cosine", +] + +TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree} + + +# TODO: implement a brute force version for testing purposes +# TODO: create a density estimation base class? +class KernelDensity(BaseEstimator): + """Kernel Density Estimation. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + bandwidth : float or {"scott", "silverman"}, default=1.0 + The bandwidth of the kernel. If bandwidth is a float, it defines the + bandwidth of the kernel. If bandwidth is a string, one of the estimation + methods is implemented. + + algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto' + The tree algorithm to use. + + kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \ + 'cosine'}, default='gaussian' + The kernel to use. + + metric : str, default='euclidean' + Metric to use for distance computation. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + Not all metrics are valid with all algorithms: refer to the + documentation of :class:`BallTree` and :class:`KDTree`. Note that the + normalization of the density output is correct only for the Euclidean + distance metric. + + atol : float, default=0 + The desired absolute tolerance of the result. A larger tolerance will + generally lead to faster execution. + + rtol : float, default=0 + The desired relative tolerance of the result. A larger tolerance will + generally lead to faster execution. + + breadth_first : bool, default=True + If true (default), use a breadth-first approach to the problem. + Otherwise use a depth-first approach. + + leaf_size : int, default=40 + Specify the leaf size of the underlying tree. See :class:`BallTree` + or :class:`KDTree` for details. + + metric_params : dict, default=None + Additional parameters to be passed to the tree for use with the + metric. For more information, see the documentation of + :class:`BallTree` or :class:`KDTree`. 
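Editor's aside (hedged sketch with assumed synthetic data, not part of the patch): the 'scott' and 'silverman' options described above compute a rule-of-thumb bandwidth from the data shape; a data-driven alternative is to cross-validate the `bandwidth` parameter, since `score` returns the total log-likelihood.

    import numpy as np
    from sklearn.model_selection import GridSearchCV
    from sklearn.neighbors import KernelDensity

    rng = np.random.RandomState(0)
    X = np.concatenate([rng.normal(0, 1, 300), rng.normal(5, 1, 300)])[:, None]

    kde = KernelDensity(kernel="gaussian", bandwidth="silverman").fit(X)
    print(kde.bandwidth_)  # bandwidth chosen by Silverman's rule of thumb

    # GridSearchCV maximizes KernelDensity.score (total log-likelihood).
    search = GridSearchCV(KernelDensity(), {"bandwidth": np.logspace(-1, 1, 20)}, cv=5)
    search.fit(X)
    print(search.best_params_["bandwidth"])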
+ + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + tree_ : ``BinaryTree`` instance + The tree algorithm for fast generalized N-point problems. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + bandwidth_ : float + Value of the bandwidth, given directly by the bandwidth parameter or + estimated using the 'scott' or 'silverman' method. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point + problems. + sklearn.neighbors.BallTree : Ball tree for fast generalized N-point + problems. + + Examples + -------- + Compute a gaussian kernel density estimate with a fixed bandwidth. + + >>> from sklearn.neighbors import KernelDensity + >>> import numpy as np + >>> rng = np.random.RandomState(42) + >>> X = rng.random_sample((100, 3)) + >>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X) + >>> log_density = kde.score_samples(X[:3]) + >>> log_density + array([-1.52955942, -1.51462041, -1.60244657]) + """ + + _parameter_constraints: dict = { + "bandwidth": [ + Interval(Real, 0, None, closed="neither"), + StrOptions({"scott", "silverman"}), + ], + "algorithm": [StrOptions(set(TREE_DICT.keys()) | {"auto"})], + "kernel": [StrOptions(set(VALID_KERNELS))], + "metric": [ + StrOptions( + set(itertools.chain(*[VALID_METRICS[alg] for alg in TREE_DICT.keys()])) + ) + ], + "atol": [Interval(Real, 0, None, closed="left")], + "rtol": [Interval(Real, 0, None, closed="left")], + "breadth_first": ["boolean"], + "leaf_size": [Interval(Integral, 1, None, closed="left")], + "metric_params": [None, dict], + } + + def __init__( + self, + *, + bandwidth=1.0, + algorithm="auto", + kernel="gaussian", + metric="euclidean", + atol=0, + rtol=0, + breadth_first=True, + leaf_size=40, + metric_params=None, + ): + self.algorithm = algorithm + self.bandwidth = bandwidth + self.kernel = kernel + self.metric = metric + self.atol = atol + self.rtol = rtol + self.breadth_first = breadth_first + self.leaf_size = leaf_size + self.metric_params = metric_params + + def _choose_algorithm(self, algorithm, metric): + # given the algorithm string + metric string, choose the optimal + # algorithm to compute the result. + if algorithm == "auto": + # use KD Tree if possible + if metric in KDTree.valid_metrics: + return "kd_tree" + elif metric in BallTree.valid_metrics: + return "ball_tree" + else: # kd_tree or ball_tree + if metric not in TREE_DICT[algorithm].valid_metrics: + raise ValueError( + "invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric) + ) + return algorithm + + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None, sample_weight=None): + """Fit the Kernel Density model on the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : array-like of shape (n_samples,), default=None + List of sample weights attached to the data X. + + .. versionadded:: 0.20 + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + algorithm = self._choose_algorithm(self.algorithm, self.metric) + + if isinstance(self.bandwidth, str): + if self.bandwidth == "scott": + self.bandwidth_ = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif self.bandwidth == "silverman": + self.bandwidth_ = (X.shape[0] * (X.shape[1] + 2) / 4) ** ( + -1 / (X.shape[1] + 4) + ) + else: + self.bandwidth_ = self.bandwidth + + X = validate_data(self, X, order="C", dtype=np.float64) + + if sample_weight is not None: + sample_weight = _check_sample_weight( + sample_weight, X, dtype=np.float64, ensure_non_negative=True + ) + + kwargs = self.metric_params + if kwargs is None: + kwargs = {} + self.tree_ = TREE_DICT[algorithm]( + X, + metric=self.metric, + leaf_size=self.leaf_size, + sample_weight=sample_weight, + **kwargs, + ) + return self + + def score_samples(self, X): + """Compute the log-likelihood of each sample under the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + An array of points to query. Last dimension should match dimension + of training data (n_features). + + Returns + ------- + density : ndarray of shape (n_samples,) + Log-likelihood of each sample in `X`. These are normalized to be + probability densities, so values will be low for high-dimensional + data. + """ + check_is_fitted(self) + # The returned density is normalized to the number of points. + # For it to be a probability, we must scale it. For this reason + # we'll also scale atol. + X = validate_data(self, X, order="C", dtype=np.float64, reset=False) + if self.tree_.sample_weight is None: + N = self.tree_.data.shape[0] + else: + N = self.tree_.sum_weight + atol_N = self.atol * N + log_density = self.tree_.kernel_density( + X, + h=self.bandwidth_, + kernel=self.kernel, + atol=atol_N, + rtol=self.rtol, + breadth_first=self.breadth_first, + return_log=True, + ) + log_density -= np.log(N) + return log_density + + def score(self, X, y=None): + """Compute the total log-likelihood under the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + List of n_features-dimensional data points. Each row + corresponds to a single data point. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + logprob : float + Total log-likelihood of the data in X. This is normalized to be a + probability density, so the value will be low for high-dimensional + data. + """ + return np.sum(self.score_samples(X)) + + def sample(self, n_samples=1, random_state=None): + """Generate random samples from the model. + + Currently, this is implemented only for gaussian and tophat kernels. + + Parameters + ---------- + n_samples : int, default=1 + Number of samples to generate. + + random_state : int, RandomState instance or None, default=None + Determines random number generation used to generate + random samples. Pass an int for reproducible results + across multiple function calls. + See :term:`Glossary `. + + Returns + ------- + X : array-like of shape (n_samples, n_features) + List of samples. 
+ """ + check_is_fitted(self) + # TODO: implement sampling for other valid kernel shapes + if self.kernel not in ["gaussian", "tophat"]: + raise NotImplementedError() + + data = np.asarray(self.tree_.data) + + rng = check_random_state(random_state) + u = rng.uniform(0, 1, size=n_samples) + if self.tree_.sample_weight is None: + i = (u * data.shape[0]).astype(np.int64) + else: + cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight)) + sum_weight = cumsum_weight[-1] + i = np.searchsorted(cumsum_weight, u * sum_weight) + if self.kernel == "gaussian": + return np.atleast_2d(rng.normal(data[i], self.bandwidth_)) + + elif self.kernel == "tophat": + # we first draw points from a d-dimensional normal distribution, + # then use an incomplete gamma function to map them to a uniform + # d-dimensional tophat distribution. + dim = data.shape[1] + X = rng.normal(size=(n_samples, dim)) + s_sq = row_norms(X, squared=True) + correction = ( + gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim) + * self.bandwidth_ + / np.sqrt(s_sq) + ) + return data[i] + X * correction[:, np.newaxis] diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_lof.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_lof.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f00be42570e2841e5445b5fd68e1dec5413c6a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_lof.py @@ -0,0 +1,518 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np + +from ..base import OutlierMixin, _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ..utils.metaestimators import available_if +from ..utils.validation import check_is_fitted +from ._base import KNeighborsMixin, NeighborsBase + +__all__ = ["LocalOutlierFactor"] + + +class LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase): + """Unsupervised Outlier Detection using the Local Outlier Factor (LOF). + + The anomaly score of each sample is called the Local Outlier Factor. + It measures the local deviation of the density of a given sample with respect + to its neighbors. + It is local in that the anomaly score depends on how isolated the object + is with respect to the surrounding neighborhood. + More precisely, locality is given by k-nearest neighbors, whose distance + is used to estimate the local density. + By comparing the local density of a sample to the local densities of its + neighbors, one can identify samples that have a substantially lower density + than their neighbors. These are considered outliers. + + .. versionadded:: 0.19 + + Parameters + ---------- + n_neighbors : int, default=20 + Number of neighbors to use by default for :meth:`kneighbors` queries. + If n_neighbors is larger than the number of samples provided, + all samples will be used. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf is size passed to :class:`BallTree` or :class:`KDTree`. 
This can + affect the speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float, default=2 + Parameter for the Minkowski metric from + :func:`sklearn.metrics.pairwise_distances`. When p = 1, this + is equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + contamination : 'auto' or float, default='auto' + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. When fitting this is used to define the + threshold on the scores of the samples. + + - if 'auto', the threshold is determined as in the + original paper, + - if a float, the contamination should be in the range (0, 0.5]. + + .. versionchanged:: 0.22 + The default value of ``contamination`` changed from 0.1 + to ``'auto'``. + + novelty : bool, default=False + By default, LocalOutlierFactor is only meant to be used for outlier + detection (novelty=False). Set novelty to True if you want to use + LocalOutlierFactor for novelty detection. In this case be aware that + you should only use predict, decision_function and score_samples + on new unseen data and not on the training set; and note that the + results obtained this way may differ from the standard LOF results. + + .. versionadded:: 0.20 + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + negative_outlier_factor_ : ndarray of shape (n_samples,) + The opposite LOF of the training samples. The higher, the more normal. + Inliers tend to have a LOF score close to 1 + (``negative_outlier_factor_`` close to -1), while outliers tend to have + a larger LOF score. + + The local outlier factor (LOF) of a sample captures its + supposed 'degree of abnormality'. + It is the average of the ratio of the local reachability density of + a sample and those of its k-nearest neighbors. + + n_neighbors_ : int + The actual number of neighbors used for :meth:`kneighbors` queries. + + offset_ : float + Offset used to obtain binary labels from the raw scores. + Observations having a negative_outlier_factor smaller than `offset_` + are detected as abnormal. + The offset is set to -1.5 (inliers score around -1), except when a + contamination parameter different than "auto" is provided. 
In that + case, the offset is defined in such a way we obtain the expected + number of outliers in training. + + .. versionadded:: 0.20 + + effective_metric_ : str + The effective metric used for the distance computation. + + effective_metric_params_ : dict + The effective additional keyword arguments for the metric function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + It is the number of samples in the fitted data. + + See Also + -------- + sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using + Support Vector Machine. + + References + ---------- + .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). + LOF: identifying density-based local outliers. In ACM sigmod record. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import LocalOutlierFactor + >>> X = [[-1.1], [0.2], [101.1], [0.3]] + >>> clf = LocalOutlierFactor(n_neighbors=2) + >>> clf.fit_predict(X) + array([ 1, 1, -1, 1]) + >>> clf.negative_outlier_factor_ + array([ -0.9821, -1.0370, -73.3697, -0.9821]) + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "contamination": [ + StrOptions({"auto"}), + Interval(Real, 0, 0.5, closed="right"), + ], + "novelty": ["boolean"], + } + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=20, + *, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + contamination="auto", + novelty=False, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.contamination = contamination + self.novelty = novelty + + def _check_novelty_fit_predict(self): + if self.novelty: + msg = ( + "fit_predict is not available when novelty=True. Use " + "novelty=False if you want to predict on the training set." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_fit_predict) + def fit_predict(self, X, y=None): + """Fit the model to the training set X and return the labels. + + **Not available for novelty detection (when novelty is set to True).** + Label is 1 for an inlier and -1 for an outlier according to the LOF + score and the contamination parameter. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and 1 for inliers. + """ + + # As fit_predict would be different from fit.predict, fit_predict is + # only available for outlier detection (novelty=False) + + return self.fit(X)._predict() + + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the local outlier factor detector from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. 
+ + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : LocalOutlierFactor + The fitted local outlier factor detector. + """ + self._fit(X) + + n_samples = self.n_samples_fit_ + if self.n_neighbors > n_samples: + warnings.warn( + "n_neighbors (%s) is greater than the " + "total number of samples (%s). n_neighbors " + "will be set to (n_samples - 1) for estimation." + % (self.n_neighbors, n_samples) + ) + self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) + + self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors( + n_neighbors=self.n_neighbors_ + ) + + if self._fit_X.dtype == np.float32: + self._distances_fit_X_ = self._distances_fit_X_.astype( + self._fit_X.dtype, + copy=False, + ) + + self._lrd = self._local_reachability_density( + self._distances_fit_X_, _neighbors_indices_fit_X_ + ) + + # Compute lof score over training samples to define offset_: + lrd_ratios_array = ( + self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis] + ) + + self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1) + + if self.contamination == "auto": + # inliers score around -1 (the higher, the less abnormal). + self.offset_ = -1.5 + else: + self.offset_ = np.percentile( + self.negative_outlier_factor_, 100.0 * self.contamination + ) + + # Verify if negative_outlier_factor_ values are within acceptable range. + # Novelty must also be false to detect outliers + if np.min(self.negative_outlier_factor_) < -1e7 and not self.novelty: + warnings.warn( + "Duplicate values are leading to incorrect results. " + "Increase the number of neighbors for more accurate results." + ) + + return self + + def _check_novelty_predict(self): + if not self.novelty: + msg = ( + "predict is not available when novelty=False, use " + "fit_predict if you want to predict on training data. Use " + "novelty=True if you want to use LOF for novelty detection " + "and predict on new unseen data." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_predict) + def predict(self, X=None): + """Predict the labels (1 inlier, -1 outlier) of X according to LOF. + + **Only available for novelty detection (when novelty is set to True).** + This method allows to generalize prediction to *new observations* (not + in the training set). Note that the result of ``clf.fit(X)`` then + ``clf.predict(X)`` with ``novelty=True`` may differ from the result + obtained by ``clf.fit_predict(X)`` with ``novelty=False``. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and +1 for inliers. + """ + return self._predict(X) + + def _predict(self, X=None): + """Predict the labels (1 inlier, -1 outlier) of X according to LOF. + + If X is None, returns the same as fit_predict(X_train). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. If None, makes prediction on the + training data without considering them as their own neighbors. + + Returns + ------- + is_inlier : ndarray of shape (n_samples,) + Returns -1 for anomalies/outliers and +1 for inliers. 
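# Illustrative sketch (not part of the vendored file): the quantities that
# fit() above derives from the k-nearest-neighbor distances, reproduced with
# plain NumPy on a toy dataset (k plays the role of n_neighbors_).
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(size=(30, 2)), [[8.0, 8.0]]])   # one obvious outlier
k = 5

# kneighbors() on the training set excludes each point from its own neighborhood
dist, ind = NearestNeighbors(n_neighbors=k).fit(X).kneighbors()

# local reachability density, as in _local_reachability_density further down
dist_k = dist[ind, -1]                    # k-distance of each neighbor
reach = np.maximum(dist, dist_k)
lrd = 1.0 / (reach.mean(axis=1) + 1e-10)

# negative_outlier_factor_: minus the mean ratio of neighbor densities to own density
neg_lof = -np.mean(lrd[ind] / lrd[:, np.newaxis], axis=1)
print(neg_lof.argmin())                   # 30, the injected outlier
# offset_ would be -1.5 for contamination='auto', otherwise
# np.percentile(neg_lof, 100.0 * contamination)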
+ """ + check_is_fitted(self) + + if X is not None: + shifted_opposite_lof_scores = self.decision_function(X) + is_inlier = np.ones(shifted_opposite_lof_scores.shape[0], dtype=int) + is_inlier[shifted_opposite_lof_scores < 0] = -1 + else: + is_inlier = np.ones(self.n_samples_fit_, dtype=int) + is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 + + return is_inlier + + def _check_novelty_decision_function(self): + if not self.novelty: + msg = ( + "decision_function is not available when novelty=False. " + "Use novelty=True if you want to use LOF for novelty " + "detection and compute decision_function for new unseen " + "data. Note that the opposite LOF of the training samples " + "is always available by considering the " + "negative_outlier_factor_ attribute." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_decision_function) + def decision_function(self, X): + """Shifted opposite of the Local Outlier Factor of X. + + Bigger is better, i.e. large values correspond to inliers. + + **Only available for novelty detection (when novelty is set to True).** + The shift offset allows a zero threshold for being an outlier. + The argument X is supposed to contain *new data*: if X contains a + point from training, it considers the later in its own neighborhood. + Also, the samples in X are not considered in the neighborhood of any + point. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + Returns + ------- + shifted_opposite_lof_scores : ndarray of shape (n_samples,) + The shifted opposite of the Local Outlier Factor of each input + samples. The lower, the more abnormal. Negative scores represent + outliers, positive scores represent inliers. + """ + return self.score_samples(X) - self.offset_ + + def _check_novelty_score_samples(self): + if not self.novelty: + msg = ( + "score_samples is not available when novelty=False. The " + "scores of the training samples are always available " + "through the negative_outlier_factor_ attribute. Use " + "novelty=True if you want to use LOF for novelty detection " + "and compute score_samples for new unseen data." + ) + raise AttributeError(msg) + return True + + @available_if(_check_novelty_score_samples) + def score_samples(self, X): + """Opposite of the Local Outlier Factor of X. + + It is the opposite as bigger is better, i.e. large values correspond + to inliers. + + **Only available for novelty detection (when novelty is set to True).** + The argument X is supposed to contain *new data*: if X contains a + point from training, it considers the later in its own neighborhood. + Also, the samples in X are not considered in the neighborhood of any + point. Because of this, the scores obtained via ``score_samples`` may + differ from the standard LOF scores. + The standard LOF scores for the training data is available via the + ``negative_outlier_factor_`` attribute. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The query sample or samples to compute the Local Outlier Factor + w.r.t. the training samples. + + Returns + ------- + opposite_lof_scores : ndarray of shape (n_samples,) + The opposite of the Local Outlier Factor of each input samples. + The lower, the more abnormal. 
+ """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + distances_X, neighbors_indices_X = self.kneighbors( + X, n_neighbors=self.n_neighbors_ + ) + + if X.dtype == np.float32: + distances_X = distances_X.astype(X.dtype, copy=False) + + X_lrd = self._local_reachability_density( + distances_X, + neighbors_indices_X, + ) + + lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] + + # as bigger is better: + return -np.mean(lrd_ratios_array, axis=1) + + def _local_reachability_density(self, distances_X, neighbors_indices): + """The local reachability density (LRD) + + The LRD of a sample is the inverse of the average reachability + distance of its k-nearest neighbors. + + Parameters + ---------- + distances_X : ndarray of shape (n_queries, self.n_neighbors) + Distances to the neighbors (in the training samples `self._fit_X`) + of each query point to compute the LRD. + + neighbors_indices : ndarray of shape (n_queries, self.n_neighbors) + Neighbors indices (of each query point) among training samples + self._fit_X. + + Returns + ------- + local_reachability_density : ndarray of shape (n_queries,) + The local reachability density of each sample. + """ + dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1] + reach_dist_array = np.maximum(distances_X, dist_k) + + # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: + return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nca.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nca.py new file mode 100644 index 0000000000000000000000000000000000000000..8383f95338932cd4a5a88fda6e5e5b9211b9ca0a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nca.py @@ -0,0 +1,534 @@ +""" +Neighborhood Component Analysis +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import sys +import time +from numbers import Integral, Real +from warnings import warn + +import numpy as np +from scipy.optimize import minimize + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..decomposition import PCA +from ..exceptions import ConvergenceWarning +from ..metrics import pairwise_distances +from ..preprocessing import LabelEncoder +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import softmax +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.multiclass import check_classification_targets +from ..utils.random import check_random_state +from ..utils.validation import check_array, check_is_fitted, validate_data + + +class NeighborhoodComponentsAnalysis( + ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator +): + """Neighborhood Components Analysis. + + Neighborhood Component Analysis (NCA) is a machine learning algorithm for + metric learning. It learns a linear transformation in a supervised fashion + to improve the classification accuracy of a stochastic nearest neighbors + rule in the transformed space. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=None + Preferred dimensionality of the projected space. + If None it will be set to `n_features`. + + init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \ + (n_features_a, n_features_b), default='auto' + Initialization of the linear transformation. 
Possible options are + `'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy + array of shape `(n_features_a, n_features_b)`. + + - `'auto'` + Depending on `n_components`, the most reasonable initialization + is chosen. If `n_components <= min(n_features, n_classes - 1)` + we use `'lda'`, as it uses labels information. If not, but + `n_components < min(n_features, n_samples)`, we use `'pca'`, as + it projects data in meaningful directions (those of higher + variance). Otherwise, we just use `'identity'`. + + - `'pca'` + `n_components` principal components of the inputs passed + to :meth:`fit` will be used to initialize the transformation. + (See :class:`~sklearn.decomposition.PCA`) + + - `'lda'` + `min(n_components, n_classes)` most discriminative + components of the inputs passed to :meth:`fit` will be used to + initialize the transformation. (If `n_components > n_classes`, + the rest of the components will be zero.) (See + :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`) + + - `'identity'` + If `n_components` is strictly smaller than the + dimensionality of the inputs passed to :meth:`fit`, the identity + matrix will be truncated to the first `n_components` rows. + + - `'random'` + The initial transformation will be a random array of shape + `(n_components, n_features)`. Each value is sampled from the + standard normal distribution. + + - numpy array + `n_features_b` must match the dimensionality of the inputs passed + to :meth:`fit` and n_features_a must be less than or equal to that. + If `n_components` is not `None`, `n_features_a` must match it. + + warm_start : bool, default=False + If `True` and :meth:`fit` has been called before, the solution of the + previous call to :meth:`fit` is used as the initial linear + transformation (`n_components` and `init` will be ignored). + + max_iter : int, default=50 + Maximum number of iterations in the optimization. + + tol : float, default=1e-5 + Convergence tolerance for the optimization. + + callback : callable, default=None + If not `None`, this function is called after every iteration of the + optimizer, taking as arguments the current solution (flattened + transformation matrix) and the number of iterations. This might be + useful in case one wants to examine or store the transformation + found after each iteration. + + verbose : int, default=0 + If 0, no progress messages will be printed. + If 1, progress messages will be printed to stdout. + If > 1, progress messages will be printed and the `disp` + parameter of :func:`scipy.optimize.minimize` will be set to + `verbose - 2`. + + random_state : int or numpy.RandomState, default=None + A pseudo random number generator object or a seed for it if int. If + `init='random'`, `random_state` is used to initialize the random + transformation. If `init='pca'`, `random_state` is passed as an + argument to PCA when initializing the transformation. Pass an int + for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + components_ : ndarray of shape (n_components, n_features) + The linear transformation learned during fitting. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_iter_ : int + Counts the number of iterations performed by the optimizer. + + random_state_ : numpy.RandomState + Pseudo random number generator object used during initialization. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. 
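# Illustrative sketch (not part of the vendored file): the init='auto'
# selection rule described above and implemented in _initialize further down.
# resolve_auto_init is a hypothetical helper name used only for illustration.
def resolve_auto_init(n_components, n_features, n_samples, n_classes):
    if n_components <= min(n_features, n_classes - 1):
        return "lda"        # supervised initialization fits in this dimensionality
    if n_components < min(n_features, n_samples):
        return "pca"        # fall back to the directions of highest variance
    return "identity"

print(resolve_auto_init(2, 4, 150, 3))   # -> 'lda'
print(resolve_auto_init(3, 4, 150, 3))   # -> 'pca'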
Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear + Discriminant Analysis. + sklearn.decomposition.PCA : Principal component analysis (PCA). + + References + ---------- + .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov. + "Neighbourhood Components Analysis". Advances in Neural Information + Processing Systems. 17, 513-520, 2005. + http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf + + .. [2] Wikipedia entry on Neighborhood Components Analysis + https://en.wikipedia.org/wiki/Neighbourhood_components_analysis + + Examples + -------- + >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis + >>> from sklearn.neighbors import KNeighborsClassifier + >>> from sklearn.datasets import load_iris + >>> from sklearn.model_selection import train_test_split + >>> X, y = load_iris(return_X_y=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... stratify=y, test_size=0.7, random_state=42) + >>> nca = NeighborhoodComponentsAnalysis(random_state=42) + >>> nca.fit(X_train, y_train) + NeighborhoodComponentsAnalysis(...) + >>> knn = KNeighborsClassifier(n_neighbors=3) + >>> knn.fit(X_train, y_train) + KNeighborsClassifier(...) + >>> print(knn.score(X_test, y_test)) + 0.933333... + >>> knn.fit(nca.transform(X_train), y_train) + KNeighborsClassifier(...) + >>> print(knn.score(nca.transform(X_test), y_test)) + 0.961904... + """ + + _parameter_constraints: dict = { + "n_components": [ + Interval(Integral, 1, None, closed="left"), + None, + ], + "init": [ + StrOptions({"auto", "pca", "lda", "identity", "random"}), + np.ndarray, + ], + "warm_start": ["boolean"], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "tol": [Interval(Real, 0, None, closed="left")], + "callback": [callable, None], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=None, + *, + init="auto", + warm_start=False, + max_iter=50, + tol=1e-5, + callback=None, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.init = init + self.warm_start = warm_start + self.max_iter = max_iter + self.tol = tol + self.callback = callback + self.verbose = verbose + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. + + y : array-like of shape (n_samples,) + The corresponding training labels. + + Returns + ------- + self : object + Fitted estimator. + """ + # Validate the inputs X and y, and converts y to numerical classes. + X, y = validate_data(self, X, y, ensure_min_samples=2) + check_classification_targets(y) + y = LabelEncoder().fit_transform(y) + + # Check the preferred dimensionality of the projected space + if self.n_components is not None and self.n_components > X.shape[1]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) cannot " + "be greater than the given data " + f"dimensionality ({X.shape[1]})!" 
+ ) + # If warm_start is enabled, check that the inputs are consistent + if ( + self.warm_start + and hasattr(self, "components_") + and self.components_.shape[1] != X.shape[1] + ): + raise ValueError( + f"The new inputs dimensionality ({X.shape[1]}) does not " + "match the input dimensionality of the " + f"previously learned transformation ({self.components_.shape[1]})." + ) + # Check how the linear transformation should be initialized + init = self.init + if isinstance(init, np.ndarray): + init = check_array(init) + # Assert that init.shape[1] = X.shape[1] + if init.shape[1] != X.shape[1]: + raise ValueError( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + # Assert that init.shape[0] <= init.shape[1] + if init.shape[0] > init.shape[1]: + raise ValueError( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + # Assert that self.n_components = init.shape[0] + if self.n_components is not None and self.n_components != init.shape[0]: + raise ValueError( + "The preferred dimensionality of the " + f"projected space `n_components` ({self.n_components}) does" + " not match the output dimensionality of " + "the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + + # Initialize the random generator + self.random_state_ = check_random_state(self.random_state) + + # Measure the total training time + t_train = time.time() + + # Compute a mask that stays fixed during optimization: + same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + # (n_samples, n_samples) + + # Initialize the transformation + transformation = np.ravel(self._initialize(X, y, init)) + + # Create a dictionary of parameters to be passed to the optimizer + disp = self.verbose - 2 if self.verbose > 1 else -1 + optimizer_params = { + "method": "L-BFGS-B", + "fun": self._loss_grad_lbfgs, + "args": (X, same_class_mask, -1.0), + "jac": True, + "x0": transformation, + "tol": self.tol, + "options": dict( + maxiter=self.max_iter, + **_get_additional_lbfgs_options_dict("disp", disp), + ), + "callback": self._callback, + } + + # Call the optimizer + self.n_iter_ = 0 + opt_result = minimize(**optimizer_params) + + # Reshape the solution found by the optimizer + self.components_ = opt_result.x.reshape(-1, X.shape[1]) + + # Stop timer + t_train = time.time() - t_train + if self.verbose: + cls_name = self.__class__.__name__ + + # Warn the user if the algorithm did not converge + if not opt_result.success: + warn( + "[{}] NCA did not converge: {}".format( + cls_name, opt_result.message + ), + ConvergenceWarning, + ) + + print("[{}] Training took {:8.2f}s.".format(cls_name, t_train)) + + return self + + def transform(self, X): + """Apply the learned transformation to the given data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data samples. + + Returns + ------- + X_embedded: ndarray of shape (n_samples, n_components) + The data samples transformed. + + Raises + ------ + NotFittedError + If :meth:`fit` has not been called before. + """ + + check_is_fitted(self) + X = validate_data(self, X, reset=False) + + return np.dot(X, self.components_.T) + + def _initialize(self, X, y, init): + """Initialize the transformation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training samples. 
+ + y : array-like of shape (n_samples,) + The training labels. + + init : str or ndarray of shape (n_features_a, n_features_b) + The validated initialization of the linear transformation. + + Returns + ------- + transformation : ndarray of shape (n_components, n_features) + The initialized linear transformation. + + """ + + transformation = init + if self.warm_start and hasattr(self, "components_"): + transformation = self.components_ + elif isinstance(init, np.ndarray): + pass + else: + n_samples, n_features = X.shape + n_components = self.n_components or n_features + if init == "auto": + n_classes = len(np.unique(y)) + if n_components <= min(n_features, n_classes - 1): + init = "lda" + elif n_components < min(n_features, n_samples): + init = "pca" + else: + init = "identity" + if init == "identity": + transformation = np.eye(n_components, X.shape[1]) + elif init == "random": + transformation = self.random_state_.standard_normal( + size=(n_components, X.shape[1]) + ) + elif init in {"pca", "lda"}: + init_time = time.time() + if init == "pca": + pca = PCA( + n_components=n_components, random_state=self.random_state_ + ) + if self.verbose: + print("Finding principal components... ", end="") + sys.stdout.flush() + pca.fit(X) + transformation = pca.components_ + elif init == "lda": + from ..discriminant_analysis import LinearDiscriminantAnalysis + + lda = LinearDiscriminantAnalysis(n_components=n_components) + if self.verbose: + print("Finding most discriminative components... ", end="") + sys.stdout.flush() + lda.fit(X, y) + transformation = lda.scalings_.T[:n_components] + if self.verbose: + print("done in {:5.2f}s".format(time.time() - init_time)) + return transformation + + def _callback(self, transformation): + """Called after each iteration of the optimizer. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The solution computed by the optimizer in this iteration. + """ + if self.callback is not None: + self.callback(transformation, self.n_iter_) + + self.n_iter_ += 1 + + def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): + """Compute the loss and the loss gradient w.r.t. `transformation`. + + Parameters + ---------- + transformation : ndarray of shape (n_components * n_features,) + The raveled linear transformation on which to compute loss and + evaluate gradient. + + X : ndarray of shape (n_samples, n_features) + The training samples. + + same_class_mask : ndarray of shape (n_samples, n_samples) + A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong + to the same class, and `0` otherwise. + + Returns + ------- + loss : float + The loss computed for the given transformation. + + gradient : ndarray of shape (n_components * n_features,) + The new (flattened) gradient of the loss. 
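# Illustrative sketch (not part of the vendored file): the NCA objective that
# _loss_grad_lbfgs evaluates below.  p_ij is the probability that sample i
# picks sample j as its neighbor under a softmax over negative squared
# distances in the embedded space; the score to maximize is the expected
# number of correctly classified samples sum_i sum_{j in class(i)} p_ij
# (fit passes sign=-1.0 so the optimizer minimizes its negative).
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.special import softmax

def nca_score(L, X, y):
    X_embedded = X @ L.T
    d_sq = squareform(pdist(X_embedded, "sqeuclidean"))
    np.fill_diagonal(d_sq, np.inf)          # a point never picks itself
    p_ij = softmax(-d_sq, axis=1)
    same_class = y[:, None] == y[None, :]
    return np.sum(p_ij * same_class)

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = rng.integers(0, 2, size=20)
print(nca_score(np.eye(3), X, y))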
+ """ + + if self.n_iter_ == 0: + self.n_iter_ += 1 + if self.verbose: + header_fields = ["Iteration", "Objective Value", "Time(s)"] + header_fmt = "{:>10} {:>20} {:>10}" + header = header_fmt.format(*header_fields) + cls_name = self.__class__.__name__ + print("[{}]".format(cls_name)) + print( + "[{}] {}\n[{}] {}".format( + cls_name, header, cls_name, "-" * len(header) + ) + ) + + t_funcall = time.time() + + transformation = transformation.reshape(-1, X.shape[1]) + X_embedded = np.dot(X, transformation.T) # (n_samples, n_components) + + # Compute softmax distances + p_ij = pairwise_distances(X_embedded, squared=True) + np.fill_diagonal(p_ij, np.inf) + p_ij = softmax(-p_ij) # (n_samples, n_samples) + + # Compute loss + masked_p_ij = p_ij * same_class_mask + p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1) + loss = np.sum(p) + + # Compute gradient of loss w.r.t. `transform` + weighted_p_ij = masked_p_ij - p_ij * p + weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T + np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0)) + gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X) + # time complexity of the gradient: O(n_components x n_samples x ( + # n_samples + n_features)) + + if self.verbose: + t_funcall = time.time() - t_funcall + values_fmt = "[{}] {:>10} {:>20.6e} {:>10.2f}" + print( + values_fmt.format( + self.__class__.__name__, self.n_iter_, loss, t_funcall + ) + ) + sys.stdout.flush() + + return sign * loss, sign * gradient.ravel() + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.components_.shape[0] diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nearest_centroid.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nearest_centroid.py new file mode 100644 index 0000000000000000000000000000000000000000..a780c27587792478fcef0965127310d35238040d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_nearest_centroid.py @@ -0,0 +1,359 @@ +""" +Nearest Centroid Classification +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from numbers import Real + +import numpy as np +from scipy import sparse as sp + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..discriminant_analysis import DiscriminantAnalysisPredictionMixin +from ..metrics.pairwise import ( + pairwise_distances, + pairwise_distances_argmin, +) +from ..preprocessing import LabelEncoder +from ..utils import get_tags +from ..utils._available_if import available_if +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.sparsefuncs import csc_median_axis_0 +from ..utils.validation import check_is_fitted, validate_data + + +class NearestCentroid( + DiscriminantAnalysisPredictionMixin, ClassifierMixin, BaseEstimator +): + """Nearest centroid classifier. + + Each class is represented by its centroid, with test samples classified to + the class with the nearest centroid. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + metric : {"euclidean", "manhattan"}, default="euclidean" + Metric to use for distance computation. + + If `metric="euclidean"`, the centroid for the samples corresponding to each + class is the arithmetic mean, which minimizes the sum of squared L1 distances. 
+ If `metric="manhattan"`, the centroid is the feature-wise median, which + minimizes the sum of L1 distances. + + .. versionchanged:: 1.5 + All metrics but `"euclidean"` and `"manhattan"` were deprecated and + now raise an error. + + .. versionchanged:: 0.19 + `metric='precomputed'` was deprecated and now raises an error + + shrink_threshold : float, default=None + Threshold for shrinking centroids to remove features. + + priors : {"uniform", "empirical"} or array-like of shape (n_classes,), \ + default="uniform" + The class prior probabilities. By default, the class proportions are + inferred from the training data. + + .. versionadded:: 1.6 + + Attributes + ---------- + centroids_ : array-like of shape (n_classes, n_features) + Centroid of each class. + + classes_ : array of shape (n_classes,) + The unique classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + deviations_ : ndarray of shape (n_classes, n_features) + Deviations (or shrinkages) of the centroids of each class from the + overall centroid. Equal to eq. (18.4) if `shrink_threshold=None`, + else (18.5) p. 653 of [2]. Can be used to identify features used + for classification. + + .. versionadded:: 1.6 + + within_class_std_dev_ : ndarray of shape (n_features,) + Pooled or within-class standard deviation of input data. + + .. versionadded:: 1.6 + + class_prior_ : ndarray of shape (n_classes,) + The class prior probabilities. + + .. versionadded:: 1.6 + + See Also + -------- + KNeighborsClassifier : Nearest neighbors classifier. + + Notes + ----- + When used for text classification with tf-idf vectors, this classifier is + also known as the Rocchio classifier. + + References + ---------- + [1] Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of + multiple cancer types by shrunken centroids of gene expression. Proceedings + of the National Academy of Sciences of the United States of America, + 99(10), 6567-6572. The National Academy of Sciences. + + [2] Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical + Learning Data Mining, Inference, and Prediction. 2nd Edition. New York, Springer. + + Examples + -------- + >>> from sklearn.neighbors import NearestCentroid + >>> import numpy as np + >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) + >>> y = np.array([1, 1, 1, 2, 2, 2]) + >>> clf = NearestCentroid() + >>> clf.fit(X, y) + NearestCentroid() + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + + _parameter_constraints: dict = { + "metric": [StrOptions({"manhattan", "euclidean"})], + "shrink_threshold": [Interval(Real, 0, None, closed="neither"), None], + "priors": ["array-like", StrOptions({"empirical", "uniform"})], + } + + def __init__( + self, + metric="euclidean", + *, + shrink_threshold=None, + priors="uniform", + ): + self.metric = metric + self.shrink_threshold = shrink_threshold + self.priors = priors + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """ + Fit the NearestCentroid model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. 
+ Note that centroid shrinking cannot be used with sparse matrices. + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : object + Fitted estimator. + """ + # If X is sparse and the metric is "manhattan", store it in a csc + # format is easier to calculate the median. + if self.metric == "manhattan": + X, y = validate_data(self, X, y, accept_sparse=["csc"]) + else: + ensure_all_finite = ( + "allow-nan" if get_tags(self).input_tags.allow_nan else True + ) + X, y = validate_data( + self, + X, + y, + ensure_all_finite=ensure_all_finite, + accept_sparse=["csr", "csc"], + ) + is_X_sparse = sp.issparse(X) + check_classification_targets(y) + + n_samples, n_features = X.shape + le = LabelEncoder() + y_ind = le.fit_transform(y) + self.classes_ = classes = le.classes_ + n_classes = classes.size + if n_classes < 2: + raise ValueError( + "The number of classes has to be greater than one; got %d class" + % (n_classes) + ) + + if self.priors == "empirical": # estimate priors from sample + _, class_counts = np.unique(y, return_inverse=True) # non-negative ints + self.class_prior_ = np.bincount(class_counts) / float(len(y)) + elif self.priors == "uniform": + self.class_prior_ = np.asarray([1 / n_classes] * n_classes) + else: + self.class_prior_ = np.asarray(self.priors) + + if (self.class_prior_ < 0).any(): + raise ValueError("priors must be non-negative") + if not np.isclose(self.class_prior_.sum(), 1.0): + warnings.warn( + "The priors do not sum to 1. Normalizing such that it sums to one.", + UserWarning, + ) + self.class_prior_ = self.class_prior_ / self.class_prior_.sum() + + # Mask mapping each class to its members. + self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64) + + # Number of clusters in each class. + nk = np.zeros(n_classes) + + for cur_class in range(n_classes): + center_mask = y_ind == cur_class + nk[cur_class] = np.sum(center_mask) + if is_X_sparse: + center_mask = np.where(center_mask)[0] + + if self.metric == "manhattan": + # NumPy does not calculate median of sparse matrices. + if not is_X_sparse: + self.centroids_[cur_class] = np.median(X[center_mask], axis=0) + else: + self.centroids_[cur_class] = csc_median_axis_0(X[center_mask]) + else: # metric == "euclidean" + self.centroids_[cur_class] = X[center_mask].mean(axis=0) + + # Compute within-class std_dev with unshrunked centroids + variance = np.array(X - self.centroids_[y_ind], copy=False) ** 2 + self.within_class_std_dev_ = np.array( + np.sqrt(variance.sum(axis=0) / (n_samples - n_classes)), copy=False + ) + if any(self.within_class_std_dev_ == 0): + warnings.warn( + "self.within_class_std_dev_ has at least 1 zero standard deviation." + "Inputs within the same classes for at least 1 feature are identical." + ) + + err_msg = "All features have zero variance. Division by zero." + if is_X_sparse and np.all((X.max(axis=0) - X.min(axis=0)).toarray() == 0): + raise ValueError(err_msg) + elif not is_X_sparse and np.all(np.ptp(X, axis=0) == 0): + raise ValueError(err_msg) + + dataset_centroid_ = X.mean(axis=0) + # m parameter for determining deviation + m = np.sqrt((1.0 / nk) - (1.0 / n_samples)) + # Calculate deviation using the standard deviation of centroids. + # To deter outliers from affecting the results. + s = self.within_class_std_dev_ + np.median(self.within_class_std_dev_) + mm = m.reshape(len(m), 1) # Reshape to allow broadcasting. 
+ ms = mm * s + self.deviations_ = np.array( + (self.centroids_ - dataset_centroid_) / ms, copy=False + ) + # Soft thresholding: if the deviation crosses 0 during shrinking, + # it becomes zero. + if self.shrink_threshold: + signs = np.sign(self.deviations_) + self.deviations_ = np.abs(self.deviations_) - self.shrink_threshold + np.clip(self.deviations_, 0, None, out=self.deviations_) + self.deviations_ *= signs + # Now adjust the centroids using the deviation + msd = ms * self.deviations_ + self.centroids_ = np.array(dataset_centroid_ + msd, copy=False) + return self + + def predict(self, X): + """Perform classification on an array of test vectors `X`. + + The predicted class `C` for each sample in `X` is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted classes. + """ + check_is_fitted(self) + if np.isclose(self.class_prior_, 1 / len(self.classes_)).all(): + # `validate_data` is called here since we are not calling `super()` + ensure_all_finite = ( + "allow-nan" if get_tags(self).input_tags.allow_nan else True + ) + X = validate_data( + self, + X, + ensure_all_finite=ensure_all_finite, + accept_sparse="csr", + reset=False, + ) + return self.classes_[ + pairwise_distances_argmin(X, self.centroids_, metric=self.metric) + ] + else: + return super().predict(X) + + def _decision_function(self, X): + # return discriminant scores, see eq. (18.2) p. 652 of the ESL. + check_is_fitted(self, "centroids_") + + X_normalized = validate_data( + self, X, copy=True, reset=False, accept_sparse="csr", dtype=np.float64 + ) + + discriminant_score = np.empty( + (X_normalized.shape[0], self.classes_.size), dtype=np.float64 + ) + + mask = self.within_class_std_dev_ != 0 + X_normalized[:, mask] /= self.within_class_std_dev_[mask] + centroids_normalized = self.centroids_.copy() + centroids_normalized[:, mask] /= self.within_class_std_dev_[mask] + + for class_idx in range(self.classes_.size): + distances = pairwise_distances( + X_normalized, centroids_normalized[[class_idx]], metric=self.metric + ).ravel() + distances **= 2 + discriminant_score[:, class_idx] = np.squeeze( + -distances + 2.0 * np.log(self.class_prior_[class_idx]) + ) + + return discriminant_score + + def _check_euclidean_metric(self): + return self.metric == "euclidean" + + decision_function = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.decision_function + ) + + predict_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_proba + ) + + predict_log_proba = available_if(_check_euclidean_metric)( + DiscriminantAnalysisPredictionMixin.predict_log_proba + ) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = self.metric == "nan_euclidean" + tags.input_tags.sparse = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..fae6a33eb2cb132a68f87bfab8e7b3803fd61f70 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pxd 
b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pxd new file mode 100644 index 0000000000000000000000000000000000000000..bd2160cc3b26f4eaf0821735aeb278fd3a16eb15 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pxd @@ -0,0 +1,10 @@ +from cython cimport floating +from ..utils._typedefs cimport float64_t, intp_t + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pyx b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pyx new file mode 100644 index 0000000000000000000000000000000000000000..111353c49a22becb74cf2d3d609241d208784508 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_partition_nodes.pyx @@ -0,0 +1,122 @@ +# BinaryTrees rely on partial sorts to partition their nodes during their +# initialisation. +# +# The C++ std library exposes nth_element, an efficient partial sort for this +# situation which has a linear time complexity as well as the best performances. +# +# To use std::algorithm::nth_element, a few fixture are defined using Cython: +# - partition_node_indices, a Cython function used in BinaryTrees, that calls +# - partition_node_indices_inner, a C++ function that wraps nth_element and uses +# - an IndexComparator to state how to compare KDTrees' indices +# +# IndexComparator has been defined so that partial sorts are stable with +# respect to the nodes initial indices. +# +# See for reference: +# - https://en.cppreference.com/w/cpp/algorithm/nth_element. +# - https://github.com/scikit-learn/scikit-learn/pull/11103 +# - https://github.com/scikit-learn/scikit-learn/pull/19473 +from cython cimport floating + + +cdef extern from *: + """ + #include + + template + class IndexComparator { + private: + const D *data; + I split_dim, n_features; + public: + IndexComparator(const D *data, const I &split_dim, const I &n_features): + data(data), split_dim(split_dim), n_features(n_features) {} + + bool operator()(const I &a, const I &b) const { + D a_value = data[a * n_features + split_dim]; + D b_value = data[b * n_features + split_dim]; + return a_value == b_value ? a < b : a_value < b_value; + } + }; + + template + void partition_node_indices_inner( + const D *data, + I *node_indices, + const I &split_dim, + const I &split_index, + const I &n_features, + const I &n_points) { + IndexComparator index_comparator(data, split_dim, n_features); + std::nth_element( + node_indices, + node_indices + split_index, + node_indices + n_points, + index_comparator); + } + """ + void partition_node_indices_inner[D, I]( + const D *data, + I *node_indices, + I split_dim, + I split_index, + I n_features, + I n_points) except + + + +cdef int partition_node_indices( + const floating *data, + intp_t *node_indices, + intp_t split_dim, + intp_t split_index, + intp_t n_features, + intp_t n_points) except -1: + """Partition points in the node into two equal-sized groups. + + Upon return, the values in node_indices will be rearranged such that + (assuming numpy-style indexing): + + data[node_indices[0:split_index], split_dim] + <= data[node_indices[split_index], split_dim] + + and + + data[node_indices[split_index], split_dim] + <= data[node_indices[split_index:n_points], split_dim] + + The algorithm is essentially a partial in-place quicksort around a + set pivot. 
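# Illustrative NumPy analogue (not part of the vendored file): the same
# post-condition that partition_node_indices establishes can be reproduced
# with np.argpartition along the split dimension -- indices left of
# split_index point at values <= the pivot, indices to the right at values >=.
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(10, 3))
node_indices = np.arange(10)
split_dim, split_index = 1, 4

order = np.argpartition(data[node_indices, split_dim], split_index)
node_indices = node_indices[order]

pivot = data[node_indices[split_index], split_dim]
assert np.all(data[node_indices[:split_index], split_dim] <= pivot)
assert np.all(data[node_indices[split_index:], split_dim] >= pivot)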
+ + Parameters + ---------- + data : double pointer + Pointer to a 2D array of the training data, of shape [N, n_features]. + N must be greater than any of the values in node_indices. + node_indices : int pointer + Pointer to a 1D array of length n_points. This lists the indices of + each of the points within the current node. This will be modified + in-place. + split_dim : int + the dimension on which to split. This will usually be computed via + the routine ``find_node_split_dim``. + split_index : int + the index within node_indices around which to split the points. + n_features: int + the number of features (i.e columns) in the 2D array pointed by data. + n_points : int + the length of node_indices. This is also the number of points in + the original dataset. + Returns + ------- + status : int + integer exit status. On return, the contents of node_indices are + modified as noted above. + """ + partition_node_indices_inner( + data, + node_indices, + split_dim, + split_index, + n_features, + n_points) + return 0 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pxd b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pxd new file mode 100644 index 0000000000000000000000000000000000000000..e7e817902f103fe6e42f37516e56ad273884c507 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pxd @@ -0,0 +1,92 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# See quad_tree.pyx for details. + +cimport numpy as cnp +from ..utils._typedefs cimport float32_t, intp_t + +# This is effectively an ifdef statement in Cython +# It allows us to write printf debugging lines +# and remove them at compile time +cdef enum: + DEBUGFLAG = 0 + +cdef float EPSILON = 1e-6 + +# XXX: Careful to not change the order of the arguments. It is important to +# have is_leaf and max_width consecutive as it permits to avoid padding by +# the compiler and keep the size coherent for both C and numpy data structures. +cdef struct Cell: + # Base storage structure for cells in a QuadTree object + + # Tree structure + intp_t parent # Parent cell of this cell + intp_t[8] children # Array pointing to children of this cell + + # Cell description + intp_t cell_id # Id of the cell in the cells array in the Tree + intp_t point_index # Index of the point at this cell (only defined + # # in non empty leaf) + bint is_leaf # Does this cell have children? + float32_t squared_max_width # Squared value of the maximum width w + intp_t depth # Depth of the cell in the tree + intp_t cumulative_size # Number of points included in the subtree with + # # this cell as a root. + + # Internal constants + float32_t[3] center # Store the center for quick split of cells + float32_t[3] barycenter # Keep track of the center of mass of the cell + + # Cell boundaries + float32_t[3] min_bounds # Inferior boundaries of this cell (inclusive) + float32_t[3] max_bounds # Superior boundaries of this cell (exclusive) + + +cdef class _QuadTree: + # The QuadTree object is a quad tree structure constructed by inserting + # recursively points in the tree and splitting cells in 4 so that each + # leaf cell contains at most one point. + # This structure also handle 3D data, inserted in trees with 8 children + # for each node. + + # Parameters of the tree + cdef public int n_dimensions # Number of dimensions in X + cdef public int verbose # Verbosity of the output + cdef intp_t n_cells_per_cell # Number of children per node. 
(2 ** n_dimension) + + # Tree inner structure + cdef public intp_t max_depth # Max depth of the tree + cdef public intp_t cell_count # Counter for node IDs + cdef public intp_t capacity # Capacity of tree, in terms of nodes + cdef public intp_t n_points # Total number of points + cdef Cell* cells # Array of nodes + + # Point insertion methods + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=*) except -1 nogil + cdef intp_t _insert_point_in_new_child(self, float32_t[3] point, Cell* cell, + intp_t point_index, intp_t size=* + ) noexcept nogil + cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil + cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil + + # Create a summary of the Tree compare to a query point + cdef long summarize(self, float32_t[3] point, float32_t* results, + float squared_theta=*, intp_t cell_id=*, long idx=* + ) noexcept nogil + + # Internal cell initialization methods + cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil + cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds + ) noexcept nogil + + # Private methods + cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell + ) except -1 nogil + + # Private array manipulation to manage the ``cells`` array + cdef int _resize(self, intp_t capacity) except -1 nogil + cdef int _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=*) except -1 nogil + cdef Cell[:] _get_cell_ndarray(self) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pyx b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pyx new file mode 100644 index 0000000000000000000000000000000000000000..aec79da505f52b9620568b3dd7c329a144259a76 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_quad_tree.pyx @@ -0,0 +1,609 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +from cpython cimport Py_INCREF, PyObject, PyTypeObject + +from libc.math cimport fabsf +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.stdio cimport printf +from libc.stdint cimport SIZE_MAX + +from ..tree._utils cimport safe_realloc + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +cdef extern from "numpy/arrayobject.h": + object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, + int nd, cnp.npy_intp* dims, + cnp.npy_intp* strides, + void* data, int flags, object obj) + int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + +# Build the corresponding numpy dtype for Cell. +# This works by casting `dummy` to an array of Cell of length 1, which numpy +# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 +# for a more detailed explanation. +cdef Cell dummy +CELL_DTYPE = np.asarray((&dummy)).dtype + +assert CELL_DTYPE.itemsize == sizeof(Cell) + + +cdef class _QuadTree: + """Array-based representation of a QuadTree. + + This class is currently working for indexing 2D data (regular QuadTree) and + for indexing 3D data (OcTree). It is planned to split the 2 implementations + using `Cython.Tempita` to save some memory for QuadTree. + + Note that this code is currently internally used only by the Barnes-Hut + method in `sklearn.manifold.TSNE`. It is planned to be refactored and + generalized in the future to be compatible with nearest neighbors API of + `sklearn.neighbors` with 2D and 3D data. 
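# Illustrative sketch (not part of the vendored file): the 2**n_dimensions
# child-indexing scheme used by _select_child and _insert_point_in_new_child
# below -- one bit per dimension, set when the point lies at or above the
# cell center along that axis.
def child_index(point, center):
    idx = 0
    for p, c in zip(point, center):
        idx = 2 * idx + (1 if p >= c else 0)
    return idx

# 2D cell centred at the origin: four quadrants map to indices 0..3
print(child_index((-1.0, -1.0), (0.0, 0.0)))   # 0
print(child_index((-1.0,  1.0), (0.0, 0.0)))   # 1
print(child_index(( 1.0,  1.0), (0.0, 0.0)))   # 3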
+ """ + def __cinit__(self, int n_dimensions, int verbose): + """Constructor.""" + # Parameters of the tree + self.n_dimensions = n_dimensions + self.verbose = verbose + self.n_cells_per_cell = (2 ** self.n_dimensions) + + # Inner structures + self.max_depth = 0 + self.cell_count = 0 + self.capacity = 0 + self.n_points = 0 + self.cells = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.cells) + + @property + def cumulative_size(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['cumulative_size'][:self.cell_count] + + @property + def leafs(self): + cdef Cell[:] cell_mem_view = self._get_cell_ndarray() + return cell_mem_view.base['is_leaf'][:self.cell_count] + + def build_tree(self, X): + """Build a tree from an array of points X.""" + cdef: + int i + float32_t[3] pt + float32_t[3] min_bounds, max_bounds + + # validate X and prepare for query + # X = check_array(X, dtype=float32_t, order='C') + n_samples = X.shape[0] + + capacity = 100 + self._resize(capacity) + m = np.min(X, axis=0) + M = np.max(X, axis=0) + # Scale the maximum to get all points strictly in the tree bounding box + # The 3 bounds are for positive, negative and small values + M = np.maximum(M * (1. + 1e-3 * np.sign(M)), M + 1e-3) + for i in range(self.n_dimensions): + min_bounds[i] = m[i] + max_bounds[i] = M[i] + + if self.verbose > 10: + printf("[QuadTree] bounding box axis %i : [%f, %f]\n", + i, min_bounds[i], max_bounds[i]) + + # Create the initial node with boundaries from the dataset + self._init_root(min_bounds, max_bounds) + + for i in range(n_samples): + for j in range(self.n_dimensions): + pt[j] = X[i, j] + self.insert_point(pt, i) + + # Shrink the cells array to reduce memory usage + self._resize(capacity=self.cell_count) + + cdef int insert_point(self, float32_t[3] point, intp_t point_index, + intp_t cell_id=0) except -1 nogil: + """Insert a point in the QuadTree.""" + cdef int ax + cdef intp_t selected_child + cdef Cell* cell = &self.cells[cell_id] + cdef intp_t n_point = cell.cumulative_size + + if self.verbose > 10: + printf("[QuadTree] Inserting depth %li\n", cell.depth) + + # Assert that the point is in the right range + if DEBUGFLAG: + self._check_point_in_cell(point, cell) + + # If the cell is an empty leaf, insert the point in it + if cell.cumulative_size == 0: + cell.cumulative_size = 1 + self.n_points += 1 + for i in range(self.n_dimensions): + cell.barycenter[i] = point[i] + cell.point_index = point_index + if self.verbose > 10: + printf("[QuadTree] inserted point %li in cell %li\n", + point_index, cell_id) + return cell_id + + # If the cell is not a leaf, update cell internals and + # recurse in selected child + if not cell.is_leaf: + for ax in range(self.n_dimensions): + # barycenter update using a weighted mean + cell.barycenter[ax] = ( + n_point * cell.barycenter[ax] + point[ax]) / (n_point + 1) + + # Increase the size of the subtree starting from this cell + cell.cumulative_size += 1 + + # Insert child in the correct subtree + selected_child = self._select_child(point, cell) + if self.verbose > 49: + printf("[QuadTree] selected child %li\n", selected_child) + if selected_child == -1: + self.n_points += 1 + return self._insert_point_in_new_child(point, cell, point_index) + return self.insert_point(point, point_index, selected_child) + + # Finally, if the cell is a leaf with a point already inserted, + # split the cell in n_cells_per_cell if the point is not a duplicate. 
+ # If it is a duplicate, increase the size of the leaf and return. + if self._is_duplicate(point, cell.barycenter): + if self.verbose > 10: + printf("[QuadTree] found a duplicate!\n") + cell.cumulative_size += 1 + self.n_points += 1 + return cell_id + + # In a leaf, the barycenter correspond to the only point included + # in it. + self._insert_point_in_new_child(cell.barycenter, cell, cell.point_index, + cell.cumulative_size) + return self.insert_point(point, point_index, cell_id) + + # XXX: This operation is not Thread safe + cdef intp_t _insert_point_in_new_child( + self, float32_t[3] point, Cell* cell, intp_t point_index, intp_t size=1 + ) noexcept nogil: + """Create a child of cell which will contain point.""" + + # Local variable definition + cdef: + intp_t cell_id, cell_child_id, parent_id + float32_t[3] save_point + float32_t width + Cell* child + int i + + # If the maximal capacity of the Tree have been reached, double the capacity + # We need to save the current cell id and the current point to retrieve them + # in case the reallocation + if self.cell_count + 1 > self.capacity: + parent_id = cell.cell_id + for i in range(self.n_dimensions): + save_point[i] = point[i] + self._resize(SIZE_MAX) + cell = &self.cells[parent_id] + point = save_point + + # Get an empty cell and initialize it + cell_id = self.cell_count + self.cell_count += 1 + child = &self.cells[cell_id] + + self._init_cell(child, cell.cell_id, cell.depth + 1) + child.cell_id = cell_id + + # Set the cell as an inner cell of the Tree + cell.is_leaf = False + cell.point_index = -1 + + # Set the correct boundary for the cell, store the point in the cell + # and compute its index in the children array. + cell_child_id = 0 + for i in range(self.n_dimensions): + cell_child_id *= 2 + if point[i] >= cell.center[i]: + cell_child_id += 1 + child.min_bounds[i] = cell.center[i] + child.max_bounds[i] = cell.max_bounds[i] + else: + child.min_bounds[i] = cell.min_bounds[i] + child.max_bounds[i] = cell.center[i] + child.center[i] = (child.min_bounds[i] + child.max_bounds[i]) / 2. + width = child.max_bounds[i] - child.min_bounds[i] + + child.barycenter[i] = point[i] + child.squared_max_width = max(child.squared_max_width, width*width) + + # Store the point info and the size to account for duplicated points + child.point_index = point_index + child.cumulative_size = size + + # Store the child cell in the correct place in children + cell.children[cell_child_id] = child.cell_id + + if DEBUGFLAG: + # Assert that the point is in the right range + self._check_point_in_cell(point, child) + if self.verbose > 10: + printf("[QuadTree] inserted point %li in new child %li\n", + point_index, cell_id) + + return cell_id + + cdef bint _is_duplicate(self, float32_t[3] point1, float32_t[3] point2) noexcept nogil: + """Check if the two given points are equals.""" + cdef int i + cdef bint res = True + for i in range(self.n_dimensions): + # Use EPSILON to avoid numerical error that would overgrow the tree + res &= fabsf(point1[i] - point2[i]) <= EPSILON + return res + + cdef intp_t _select_child(self, float32_t[3] point, Cell* cell) noexcept nogil: + """Select the child of cell which contains the given query point.""" + cdef: + int i + intp_t selected_child = 0 + + for i in range(self.n_dimensions): + # Select the correct child cell to insert the point by comparing + # it to the borders of the cells using precomputed center. 
+ selected_child *= 2 + if point[i] >= cell.center[i]: + selected_child += 1 + return cell.children[selected_child] + + cdef void _init_cell(self, Cell* cell, intp_t parent, intp_t depth) noexcept nogil: + """Initialize a cell structure with some constants.""" + cell.parent = parent + cell.is_leaf = True + cell.depth = depth + cell.squared_max_width = 0 + cell.cumulative_size = 0 + for i in range(self.n_cells_per_cell): + cell.children[i] = SIZE_MAX + + cdef void _init_root(self, float32_t[3] min_bounds, float32_t[3] max_bounds + ) noexcept nogil: + """Initialize the root node with the given space boundaries""" + cdef: + int i + float32_t width + Cell* root = &self.cells[0] + + self._init_cell(root, -1, 0) + for i in range(self.n_dimensions): + root.min_bounds[i] = min_bounds[i] + root.max_bounds[i] = max_bounds[i] + root.center[i] = (max_bounds[i] + min_bounds[i]) / 2. + width = max_bounds[i] - min_bounds[i] + root.squared_max_width = max(root.squared_max_width, width*width) + root.cell_id = 0 + + self.cell_count += 1 + + cdef int _check_point_in_cell(self, float32_t[3] point, Cell* cell + ) except -1 nogil: + """Check that the given point is in the cell boundaries.""" + + if self.verbose >= 50: + if self.n_dimensions == 3: + printf("[QuadTree] Checking point (%f, %f, %f) in cell %li " + "([%f/%f, %f/%f, %f/%f], size %li)\n", + point[0], point[1], point[2], cell.cell_id, + cell.min_bounds[0], cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.min_bounds[2], cell.max_bounds[2], + cell.cumulative_size) + else: + printf("[QuadTree] Checking point (%f, %f) in cell %li " + "([%f/%f, %f/%f], size %li)\n", + point[0], point[1], cell.cell_id, cell.min_bounds[0], + cell.max_bounds[0], cell.min_bounds[1], + cell.max_bounds[1], cell.cumulative_size) + + for i in range(self.n_dimensions): + if (cell.min_bounds[i] > point[i] or + cell.max_bounds[i] <= point[i]): + with gil: + msg = "[QuadTree] InsertionError: point out of cell " + msg += "boundary.\nAxis %li: cell [%f, %f]; point %f\n" + + msg %= i, cell.min_bounds[i], cell.max_bounds[i], point[i] + raise ValueError(msg) + + def _check_coherence(self): + """Check the coherence of the cells of the tree. + + Check that the info stored in each cell is compatible with the info + stored in descendent and sibling cells. Raise a ValueError if this + fails. + """ + for cell in self.cells[:self.cell_count]: + # Check that the barycenter of inserted point is within the cell + # boundaries + self._check_point_in_cell(cell.barycenter, &cell) + + if not cell.is_leaf: + # Compute the number of point in children and compare with + # its cummulative_size. + n_points = 0 + for idx in range(self.n_cells_per_cell): + child_id = cell.children[idx] + if child_id != -1: + child = self.cells[child_id] + n_points += child.cumulative_size + assert child.cell_id == child_id, ( + "Cell id not correctly initialized.") + if n_points != cell.cumulative_size: + raise ValueError( + "Cell {} is incoherent. Size={} but found {} points " + "in children. ({})" + .format(cell.cell_id, cell.cumulative_size, + n_points, cell.children)) + + # Make sure that the number of point in the tree correspond to the + # cumulative size in root cell. + if self.n_points != self.cells[0].cumulative_size: + raise ValueError( + "QuadTree is incoherent. Size={} but found {} points " + "in children." 
+ .format(self.n_points, self.cells[0].cumulative_size)) + + cdef long summarize(self, float32_t[3] point, float32_t* results, + float squared_theta=.5, intp_t cell_id=0, long idx=0 + ) noexcept nogil: + """Summarize the tree compared to a query point. + + Input arguments + --------------- + point : array (n_dimensions) + query point to construct the summary. + cell_id : integer, optional (default: 0) + current cell of the tree summarized. This should be set to 0 for + external calls. + idx : integer, optional (default: 0) + current index in the result array. This should be set to 0 for + external calls + squared_theta: float, optional (default: .5) + threshold to decide whether the node is sufficiently far + from the query point to be a good summary. The formula is such that + the node is a summary if + node_width^2 / dist_node_point^2 < squared_theta. + Note that the argument should be passed as theta^2 to avoid + computing square roots of the distances. + + Output arguments + ---------------- + results : array (n_samples * (n_dimensions+2)) + result will contain a summary of the tree information compared to + the query point: + - results[idx:idx+n_dimensions] contains the coordinate-wise + difference between the query point and the summary cell idx. + This is useful in t-SNE to compute the negative forces. + - result[idx+n_dimensions+1] contains the squared euclidean + distance to the summary cell idx. + - result[idx+n_dimensions+2] contains the number of point of the + tree contained in the summary cell idx. + + Return + ------ + idx : integer + number of elements in the results array. + """ + cdef: + int i, idx_d = idx + self.n_dimensions + bint duplicate = True + Cell* cell = &self.cells[cell_id] + + results[idx_d] = 0. + for i in range(self.n_dimensions): + results[idx + i] = point[i] - cell.barycenter[i] + results[idx_d] += results[idx + i] * results[idx + i] + duplicate &= fabsf(results[idx + i]) <= EPSILON + + # Do not compute self interactions + if duplicate and cell.is_leaf: + return idx + + # Check whether we can use this node as a summary + # It's a summary node if the angular size as measured from the point + # is relatively small (w.r.t. theta) or if it is a leaf node. + # If it can be summarized, we use the cell center of mass + # Otherwise, we go a higher level of resolution and into the leaves. + if cell.is_leaf or ( + (cell.squared_max_width / results[idx_d]) < squared_theta): + results[idx_d + 1] = cell.cumulative_size + return idx + self.n_dimensions + 2 + + else: + # Recursively compute the summary in nodes + for c in range(self.n_cells_per_cell): + child_id = cell.children[c] + if child_id != -1: + idx = self.summarize(point, results, squared_theta, + child_id, idx) + + return idx + + def get_cell(self, point): + """return the id of the cell containing the query point or raise + ValueError if the point is not in the tree + """ + cdef float32_t[3] query_pt + cdef int i + + assert len(point) == self.n_dimensions, ( + "Query point should be a point in dimension {}." + .format(self.n_dimensions)) + + for i in range(self.n_dimensions): + query_pt[i] = point[i] + + return self._get_cell(query_pt, 0) + + cdef int _get_cell(self, float32_t[3] point, intp_t cell_id=0 + ) except -1 nogil: + """guts of get_cell. 
+ + Return the id of the cell containing the query point or raise ValueError + if the point is not in the tree""" + cdef: + intp_t selected_child + Cell* cell = &self.cells[cell_id] + + if cell.is_leaf: + if self._is_duplicate(cell.barycenter, point): + if self.verbose > 99: + printf("[QuadTree] Found point in cell: %li\n", + cell.cell_id) + return cell_id + with gil: + raise ValueError("Query point not in the Tree.") + + selected_child = self._select_child(point, cell) + if selected_child > 0: + if self.verbose > 99: + printf("[QuadTree] Selected_child: %li\n", selected_child) + return self._get_cell(point, selected_child) + with gil: + raise ValueError("Query point not in the Tree.") + + # Pickling primitives + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (_QuadTree, (self.n_dimensions, self.verbose), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["cell_count"] = self.cell_count + d["capacity"] = self.capacity + d["n_points"] = self.n_points + d["cells"] = self._get_cell_ndarray().base + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.cell_count = d["cell_count"] + self.capacity = d["capacity"] + self.n_points = d["n_points"] + + if 'cells' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + cell_ndarray = d['cells'] + + if (cell_ndarray.ndim != 1 or + cell_ndarray.dtype != CELL_DTYPE or + not cell_ndarray.flags.c_contiguous): + raise ValueError('Did not recognise loaded array layout') + + self.capacity = cell_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + cdef Cell[:] cell_mem_view = cell_ndarray + memcpy( + pto=self.cells, + pfrom=&cell_mem_view[0], + size=self.capacity * sizeof(Cell), + ) + + # Array manipulation methods, to convert it to numpy or to resize + # self.cells array + + cdef Cell[:] _get_cell_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.cell_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Cell) + cdef Cell[:] arr + Py_INCREF(CELL_DTYPE) + arr = PyArray_NewFromDescr( + subtype= np.ndarray, + descr=CELL_DTYPE, + nd=1, + dims=shape, + strides=strides, + data= self.cells, + flags=cnp.NPY_ARRAY_DEFAULT, + obj=None, + ) + Py_INCREF(self) + if PyArray_SetBaseObject(arr.base, self) < 0: + raise ValueError("Can't initialize array!") + return arr + + cdef int _resize(self, intp_t capacity) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + cdef int _resize_c(self, intp_t capacity=SIZE_MAX) except -1 nogil: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
+ """ + if capacity == self.capacity and self.cells != NULL: + return 0 + + if capacity == SIZE_MAX: + if self.capacity == 0: + capacity = 9 # default initial value to min + else: + capacity = 2 * self.capacity + + safe_realloc(&self.cells, capacity) + + # if capacity smaller than cell_count, adjust the counter + if capacity < self.cell_count: + self.cell_count = capacity + + self.capacity = capacity + return 0 + + def _py_summarize(self, float32_t[:] query_pt, float32_t[:, :] X, float angle): + # Used for testing summarize + cdef: + float32_t[:] summary + int n_samples + + n_samples = X.shape[0] + summary = np.empty(4 * n_samples, dtype=np.float32) + + idx = self.summarize(&query_pt[0], &summary[0], angle * angle) + return idx, summary diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_regression.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee0a340b8153b632fb8174785d53d018545f8ce --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_regression.py @@ -0,0 +1,513 @@ +"""Nearest Neighbor Regression.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings + +import numpy as np + +from ..base import RegressorMixin, _fit_context +from ..metrics import DistanceMetric +from ..utils._param_validation import StrOptions +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights + + +class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on k-nearest neighbors. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + See the following example for a demonstration of the impact of + different weighting schemes on predictions: + :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. 
When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str, DistanceMetric object or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + If metric is a DistanceMetric object, it will be passed directly to + the underlying computation routines. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + Doesn't affect :meth:`fit` method. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius. + KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote. + RadiusNeighborsClassifier : Classifier implementing + a vote among neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + .. warning:: + + Regarding the Nearest Neighbors algorithms, if it is found that two + neighbors, neighbor `k+1` and `k`, have identical distances but + different labels, the results will depend on the ordering of the + training data. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import KNeighborsRegressor + >>> neigh = KNeighborsRegressor(n_neighbors=2) + >>> neigh.fit(X, y) + KNeighborsRegressor(...) 
+ >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints["metric"].append(DistanceMetric) + _parameter_constraints.pop("radius") + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # For cross-validation routines to split data correctly + tags.input_tags.pairwise = self.metric == "precomputed" + return tags + + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the k-nearest neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : KNeighborsRegressor + The fitted k-nearest neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int + Target values. + """ + if self.weights == "uniform": + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + if weights is None: + y_pred = np.mean(_y[neigh_ind], axis=1) + else: + y_pred = np.empty((neigh_dist.shape[0], _y.shape[1]), dtype=np.float64) + denom = np.sum(weights, axis=1) + + for j in range(_y.shape[1]): + num = np.sum(_y[neigh_ind, j] * weights, axis=1) + y_pred[:, j] = num / denom + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred + + +class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase): + """Regression based on neighbors within a fixed radius. + + The target is predicted by local interpolation of the targets + associated of the nearest neighbors in the training set. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + weights : {'uniform', 'distance'}, callable or None, default='uniform' + Weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. 
+ in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + p : float, default=2 + Power parameter for the Minkowski metric. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str or callable + The distance metric to use. It will be same as the `metric` parameter + or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to + 'minkowski' and `p` parameter set to 2. + + effective_metric_params_ : dict + Additional keyword arguments for the metric function. For most metrics + will be same with `metric_params` parameter, but may also contain the + `p` parameter value if the `effective_metric_` attribute is set to + 'minkowski'. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + NearestNeighbors : Unsupervised learner for implementing neighbor searches. + KNeighborsRegressor : Regression based on k-nearest neighbors. + KNeighborsClassifier : Classifier based on the k-nearest neighbors. 
+ RadiusNeighborsClassifier : Classifier based on neighbors within a given radius. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm + + Examples + -------- + >>> X = [[0], [1], [2], [3]] + >>> y = [0, 0, 1, 1] + >>> from sklearn.neighbors import RadiusNeighborsRegressor + >>> neigh = RadiusNeighborsRegressor(radius=1.0) + >>> neigh.fit(X, y) + RadiusNeighborsRegressor(...) + >>> print(neigh.predict([[1.5]])) + [0.5] + """ + + _parameter_constraints: dict = { + **NeighborsBase._parameter_constraints, + "weights": [StrOptions({"uniform", "distance"}), callable, None], + } + _parameter_constraints.pop("n_neighbors") + + def __init__( + self, + radius=1.0, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): + super().__init__( + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + p=p, + metric=metric, + metric_params=metric_params, + n_jobs=n_jobs, + ) + self.weights = weights + + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y): + """Fit the radius neighbors regressor from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : {array-like, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_outputs) + Target values. + + Returns + ------- + self : RadiusNeighborsRegressor + The fitted radius neighbors regressor. + """ + return self._fit(X, y) + + def predict(self, X): + """Predict the target for the provided data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', or None + Test samples. If `None`, predictions for all indexed points are + returned; in this case, points are not considered their own + neighbors. + + Returns + ------- + y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \ + dtype=double + Target values. + """ + neigh_dist, neigh_ind = self.radius_neighbors(X) + + weights = _get_weights(neigh_dist, self.weights) + + _y = self._y + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + empty_obs = np.full_like(_y[0], np.nan) + + if weights is None: + y_pred = np.array( + [ + np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs + for (i, ind) in enumerate(neigh_ind) + ] + ) + + else: + y_pred = np.array( + [ + ( + np.average(_y[ind, :], axis=0, weights=weights[i]) + if len(ind) + else empty_obs + ) + for (i, ind) in enumerate(neigh_ind) + ] + ) + + if np.any(np.isnan(y_pred)): + empty_warning_msg = ( + "One or more samples have no neighbors " + "within specified radius; predicting NaN." 
+ ) + warnings.warn(empty_warning_msg) + + if self._y.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/_unsupervised.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_unsupervised.py new file mode 100644 index 0000000000000000000000000000000000000000..8888fe18483c6ae5f7008d78b0d6ff97d096a419 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/_unsupervised.py @@ -0,0 +1,179 @@ +"""Unsupervised nearest neighbors learner""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..base import _fit_context +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin + + +class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): + """Unsupervised learner for implementing neighbor searches. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.9 + + Parameters + ---------- + n_neighbors : int, default=5 + Number of neighbors to use by default for :meth:`kneighbors` queries. + + radius : float, default=1.0 + Range of parameter space to use by default for :meth:`radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use :class:`BallTree` + - 'kd_tree' will use :class:`KDTree` + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, default=30 + Leaf size passed to BallTree or KDTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : str or callable, default='minkowski' + Metric to use for distance computation. Default is "minkowski", which + results in the standard Euclidean distance when p = 2. See the + documentation of `scipy.spatial.distance + `_ and + the metrics listed in + :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric + values. + + If metric is "precomputed", X is assumed to be a distance matrix and + must be square during fit. X may be a :term:`sparse graph`, in which + case only "nonzero" elements may be considered neighbors. + + If metric is a callable function, it takes two arrays representing 1D + vectors as inputs and must return one value indicating the distance + between those vectors. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + p : float (positive), default=2 + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + + metric_params : dict, default=None + Additional keyword arguments for the metric function. + + n_jobs : int, default=None + The number of parallel jobs to run for neighbors search. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + effective_metric_ : str + Metric used to compute distances to neighbors. + + effective_metric_params_ : dict + Parameters for the metric used to compute distances to neighbors. 
+ + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_fit_ : int + Number of samples in the fitted data. + + See Also + -------- + KNeighborsClassifier : Classifier implementing the k-nearest neighbors + vote. + RadiusNeighborsClassifier : Classifier implementing a vote among neighbors + within a given radius. + KNeighborsRegressor : Regression based on k-nearest neighbors. + RadiusNeighborsRegressor : Regression based on neighbors within a fixed + radius. + BallTree : Space partitioning data structure for organizing points in a + multi-dimensional space, used for nearest neighbor search. + + Notes + ----- + See :ref:`Nearest Neighbors ` in the online documentation + for a discussion of the choice of ``algorithm`` and ``leaf_size``. + + https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm + + Examples + -------- + >>> import numpy as np + >>> from sklearn.neighbors import NearestNeighbors + >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] + >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4) + >>> neigh.fit(samples) + NearestNeighbors(...) + >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False) + array([[2, 0]]...) + >>> nbrs = neigh.radius_neighbors( + ... [[0, 0, 1.3]], 0.4, return_distance=False + ... ) + >>> np.asarray(nbrs[0][0]) + array(2) + """ + + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): + super().__init__( + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y=None): + """Fit the nearest neighbors estimator from the training dataset. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) if metric='precomputed' + Training data. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : NearestNeighbors + The fitted nearest neighbors estimator. + """ + return self._fit(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/meson.build b/.venv/lib/python3.12/site-packages/sklearn/neighbors/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..7993421896218d3a4c9db8055d2dfd9528ac3746 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/meson.build @@ -0,0 +1,53 @@ +_binary_tree_pxi = custom_target( + '_binary_tree_pxi', + output: '_binary_tree.pxi', + input: '_binary_tree.pxi.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], +) + +# .pyx is generated so this is needed to make Cython compilation work. The pxi +# file is included avoid "missing dependency paths" with ninja -t missindeps +neighbors_cython_tree = [ + fs.copyfile('__init__.py'), + fs.copyfile('_partition_nodes.pxd'), + _binary_tree_pxi, +] + +name_list = ['_ball_tree', '_kd_tree'] + +foreach name: name_list + pyx = custom_target( + name + '_pyx', + output: name + '.pyx', + input: name + '.pyx.tp', + command: [tempita, '@INPUT@', '-o', '@OUTDIR@'], + # TODO in principle this should go in py.exension_module below. 
This is + # temporary work-around for dependency issue with .pyx.tp files. For more + # details, see https://github.com/mesonbuild/meson/issues/13212 + depends: [neighbors_cython_tree, utils_cython_tree, metrics_cython_tree], + ) + py.extension_module( + name, + cython_gen.process(pyx), + dependencies: [np_dep], + subdir: 'sklearn/neighbors', + install: true +) +endforeach + +neighbors_extension_metadata = { + '_partition_nodes': + {'sources': [cython_gen_cpp.process('_partition_nodes.pyx')], + 'dependencies': [np_dep]}, + '_quad_tree': {'sources': [cython_gen.process('_quad_tree.pyx')], 'dependencies': [np_dep]}, +} + +foreach ext_name, ext_dict : neighbors_extension_metadata + py.extension_module( + ext_name, + [ext_dict.get('sources'), utils_cython_tree], + dependencies: ext_dict.get('dependencies'), + subdir: 'sklearn/neighbors', + install: true + ) +endforeach diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_ball_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_ball_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..5263f201f320b17ced98fb223e7aaaf624d9271d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_ball_tree.py @@ -0,0 +1,200 @@ +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_equal + +from sklearn.neighbors._ball_tree import BallTree, BallTree32, BallTree64 +from sklearn.utils import check_random_state +from sklearn.utils._testing import _convert_container +from sklearn.utils.validation import check_array + +rng = np.random.RandomState(10) +V_mahalanobis = rng.rand(3, 3) +V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) + +DIMENSION = 3 + +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, +} + +DISCRETE_METRICS = ["hamming", "canberra", "braycurtis"] + +BOOLEAN_METRICS = [ + "jaccard", + "dice", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] + +BALL_TREE_CLASSES = [ + BallTree64, + BallTree32, +] + + +def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + + X, Y = check_array(X), check_array(Y) + D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) + ind = np.argsort(D, axis=1)[:, :k] + dist = D[np.arange(Y.shape[0])[:, None], ind] + return dist, ind + + +def test_BallTree_is_BallTree64_subclass(): + assert issubclass(BallTree, BallTree64) + + +@pytest.mark.parametrize("metric", itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) +@pytest.mark.parametrize("array_type", ["list", "array"]) +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_ball_tree_query_metrics(metric, array_type, BallTreeImplementation): + rng = check_random_state(0) + if metric in BOOLEAN_METRICS: + X = rng.random_sample((40, 10)).round(0) + Y = rng.random_sample((10, 10)).round(0) + elif metric in DISCRETE_METRICS: + X = (4 * rng.random_sample((40, 10))).round(0) + Y = (4 * rng.random_sample((10, 10))).round(0) + X = _convert_container(X, array_type) + Y = _convert_container(Y, array_type) + + k = 5 + + bt = BallTreeImplementation(X, leaf_size=1, metric=metric) + dist1, ind1 = bt.query(Y, 
k) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric) + assert_array_almost_equal(dist1, dist2) + + +@pytest.mark.parametrize( + "BallTreeImplementation, decimal_tol", zip(BALL_TREE_CLASSES, [6, 5]) +) +def test_query_haversine(BallTreeImplementation, decimal_tol): + rng = check_random_state(0) + X = 2 * np.pi * rng.random_sample((40, 2)) + bt = BallTreeImplementation(X, leaf_size=1, metric="haversine") + dist1, ind1 = bt.query(X, k=5) + dist2, ind2 = brute_force_neighbors(X, X, k=5, metric="haversine") + + assert_array_almost_equal(dist1, dist2, decimal=decimal_tol) + assert_array_almost_equal(ind1, ind2) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_array_object_type(BallTreeImplementation): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BallTreeImplementation(X) + + +@pytest.mark.parametrize("BallTreeImplementation", BALL_TREE_CLASSES) +def test_bad_pyfunc_metric(BallTreeImplementation): + def wrong_returned_value(x, y): + return "1" + + def one_arg_func(x): + return 1.0 # pragma: no cover + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors and return a float." + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=wrong_returned_value) + + msg = "takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): + BallTreeImplementation(X, metric=one_arg_func) + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_ball_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 5 + dist_64, ind_64 = bt_64.query(Y_64, k=k) + dist_32, ind_32 = bt_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = bt_64.query_radius(Y_64, r=r) + ind_32 = bt_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = bt_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = bt_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", itertools.chain(METRICS, BOOLEAN_METRICS)) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + bt_64 = BallTree64(X_64, leaf_size=1, metric=metric, **metric_params) + bt_32 = BallTree32(X_32, leaf_size=1, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = bt_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = bt_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 + + +def test_two_point_correlation_numerical_consistency(global_random_seed): + # Test consistency with respect to the `two_point_correlation` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + bt_64 = BallTree64(X_64, leaf_size=10) + bt_32 = BallTree32(X_32, leaf_size=10) + + r = np.linspace(0, 1, 10) + + counts_64 = bt_64.two_point_correlation(Y_64, r=r, dualtree=True) + counts_32 = bt_32.two_point_correlation(Y_32, r=r, dualtree=True) + assert_allclose(counts_64, counts_32) + + +def get_dataset_for_binary_tree(random_seed, features=3): + rng = np.random.RandomState(random_seed) + _X = rng.rand(100, features) + _Y = rng.rand(5, features) + + X_64 = _X.astype(dtype=np.float64, copy=False) + Y_64 = _Y.astype(dtype=np.float64, copy=False) + + X_32 = _X.astype(dtype=np.float32, copy=False) + Y_32 = _Y.astype(dtype=np.float32, copy=False) + + return X_64, X_32, Y_64, Y_32 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_graph.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..fb593485d17a8155f784ef881b3868338348e1a8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_graph.py @@ -0,0 +1,101 @@ +import numpy as np +import pytest + +from sklearn.metrics import euclidean_distances +from 
sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer +from sklearn.neighbors._base import _is_sorted_by_data +from sklearn.utils._testing import assert_array_equal + + +def test_transformer_result(): + # Test the number of neighbors returned + n_neighbors = 5 + n_samples_fit = 20 + n_queries = 18 + n_features = 10 + + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_queries, n_features) + radius = np.percentile(euclidean_distances(X), 10) + + # with n_neighbors + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + # with radius + for mode in ["distance", "connectivity"]: + add_one = mode == "distance" + nnt = RadiusNeighborsTransformer(radius=radius, mode=mode) + Xt = nnt.fit_transform(X) + assert Xt.shape == (n_samples_fit, n_samples_fit) + assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one),) + assert Xt.format == "csr" + assert _is_sorted_by_data(Xt) + + X2t = nnt.transform(X2) + assert X2t.shape == (n_queries, n_samples_fit) + assert not X2t.data.shape == (n_queries * (n_neighbors + add_one),) + assert X2t.format == "csr" + assert _is_sorted_by_data(X2t) + + +def _has_explicit_diagonal(X): + """Return True if the diagonal is explicitly stored""" + X = X.tocoo() + explicit = X.row[X.row == X.col] + return len(explicit) == X.shape[0] + + +def test_explicit_diagonal(): + # Test that the diagonal is explicitly stored in the sparse graph + n_neighbors = 5 + n_samples_fit, n_samples_transform, n_features = 20, 18, 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + X2 = rng.randn(n_samples_transform, n_features) + + nnt = KNeighborsTransformer(n_neighbors=n_neighbors) + Xt = nnt.fit_transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + Xt = nnt.transform(X) + assert _has_explicit_diagonal(Xt) + assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0) + + # Using transform on new data should not always have zero diagonal + X2t = nnt.transform(X2) + assert not _has_explicit_diagonal(X2t) + + +@pytest.mark.parametrize("Klass", [KNeighborsTransformer, RadiusNeighborsTransformer]) +def test_graph_feature_names_out(Klass): + """Check `get_feature_names_out` for transformers defined in `_graph.py`.""" + + n_samples_fit = 20 + n_features = 10 + rng = np.random.RandomState(42) + X = rng.randn(n_samples_fit, n_features) + + est = Klass().fit(X) + names_out = est.get_feature_names_out() + + class_name_lower = Klass.__name__.lower() + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(est.n_samples_fit_)], + dtype=object, + ) + assert_array_equal(names_out, expected_names_out) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kd_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kd_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..749601baaf66fdbf96e8396ca1df45c5bdab4a1e --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kd_tree.py @@ -0,0 +1,100 @@ +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_equal + +from sklearn.neighbors._kd_tree import KDTree, KDTree32, KDTree64 +from sklearn.neighbors.tests.test_ball_tree import get_dataset_for_binary_tree +from sklearn.utils.parallel import Parallel, delayed + +DIMENSION = 3 + +METRICS = {"euclidean": {}, "manhattan": {}, "chebyshev": {}, "minkowski": dict(p=3)} + +KD_TREE_CLASSES = [ + KDTree64, + KDTree32, +] + + +def test_KDTree_is_KDTree64_subclass(): + assert issubclass(KDTree, KDTree64) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_array_object_type(BinarySearchTree): + """Check that we do not accept object dtype array.""" + X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) + with pytest.raises(ValueError, match="setting an array element with a sequence"): + BinarySearchTree(X) + + +@pytest.mark.parametrize("BinarySearchTree", KD_TREE_CLASSES) +def test_kdtree_picklable_with_joblib(BinarySearchTree): + """Make sure that KDTree queries work when joblib memmaps. + + Non-regression test for #21685 and #21228.""" + rng = np.random.RandomState(0) + X = rng.random_sample((10, 3)) + tree = BinarySearchTree(X, leaf_size=2) + + # Call Parallel with max_nbytes=1 to trigger readonly memory mapping that + # use to raise "ValueError: buffer source array is read-only" in a previous + # version of the Cython code. + Parallel(n_jobs=2, max_nbytes=1)(delayed(tree.query)(data) for data in 2 * [X]) + + +@pytest.mark.parametrize("metric", METRICS) +def test_kd_tree_numerical_consistency(global_random_seed, metric): + # Results on float64 and float32 versions of a dataset must be + # numerically close. 
+ X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree( + random_seed=global_random_seed, features=50 + ) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + # Test consistency with respect to the `query` method + k = 4 + dist_64, ind_64 = kd_64.query(Y_64, k=k) + dist_32, ind_32 = kd_32.query(Y_32, k=k) + assert_allclose(dist_64, dist_32, rtol=1e-5) + assert_equal(ind_64, ind_32) + assert dist_64.dtype == np.float64 + assert dist_32.dtype == np.float32 + + # Test consistency with respect to the `query_radius` method + r = 2.38 + ind_64 = kd_64.query_radius(Y_64, r=r) + ind_32 = kd_32.query_radius(Y_32, r=r) + for _ind64, _ind32 in zip(ind_64, ind_32): + assert_equal(_ind64, _ind32) + + # Test consistency with respect to the `query_radius` method + # with return distances being true + ind_64, dist_64 = kd_64.query_radius(Y_64, r=r, return_distance=True) + ind_32, dist_32 = kd_32.query_radius(Y_32, r=r, return_distance=True) + for _ind64, _ind32, _dist_64, _dist_32 in zip(ind_64, ind_32, dist_64, dist_32): + assert_equal(_ind64, _ind32) + assert_allclose(_dist_64, _dist_32, rtol=1e-5) + assert _dist_64.dtype == np.float64 + assert _dist_32.dtype == np.float32 + + +@pytest.mark.parametrize("metric", METRICS) +def test_kernel_density_numerical_consistency(global_random_seed, metric): + # Test consistency with respect to the `kernel_density` method + X_64, X_32, Y_64, Y_32 = get_dataset_for_binary_tree(random_seed=global_random_seed) + + metric_params = METRICS.get(metric, {}) + kd_64 = KDTree64(X_64, leaf_size=2, metric=metric, **metric_params) + kd_32 = KDTree32(X_32, leaf_size=2, metric=metric, **metric_params) + + kernel = "gaussian" + h = 0.1 + density64 = kd_64.kernel_density(Y_64, h=h, kernel=kernel, breadth_first=True) + density32 = kd_32.kernel_density(Y_32, h=h, kernel=kernel, breadth_first=True) + assert_allclose(density64, density32, rtol=1e-5) + assert density64.dtype == np.float64 + assert density32.dtype == np.float32 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kde.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kde.py new file mode 100644 index 0000000000000000000000000000000000000000..b6bf09d01b672b7ad5a3abf3506443b0ac620915 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_kde.py @@ -0,0 +1,252 @@ +import joblib +import numpy as np +import pytest + +from sklearn.datasets import make_blobs +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KDTree, KernelDensity, NearestNeighbors +from sklearn.neighbors._ball_tree import kernel_norm +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.utils._testing import assert_allclose + + +# XXX Duplicated in test_neighbors_tree, test_kde +def compute_kernel_slow(Y, X, kernel, h): + if h == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif h == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + + d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1)) + norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0] + + if kernel == "gaussian": + return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1) + elif kernel == "tophat": + return norm * (d < h).sum(-1) + elif kernel == "epanechnikov": + return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1) + elif kernel 
== "exponential": + return norm * (np.exp(-d / h)).sum(-1) + elif kernel == "linear": + return norm * ((1 - d / h) * (d < h)).sum(-1) + elif kernel == "cosine": + return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1) + else: + raise ValueError("kernel not recognized") + + +def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true): + kde = KernelDensity(kernel=kernel, bandwidth=bandwidth, atol=atol, rtol=rtol) + log_dens = kde.fit(X).score_samples(Y) + assert_allclose(np.exp(log_dens), dens_true, atol=atol, rtol=max(1e-7, rtol)) + assert_allclose( + np.exp(kde.score(Y)), np.prod(dens_true), atol=atol, rtol=max(1e-7, rtol) + ) + + +@pytest.mark.parametrize( + "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] +) +@pytest.mark.parametrize("bandwidth", [0.01, 0.1, 1, "scott", "silverman"]) +def test_kernel_density(kernel, bandwidth): + n_samples, n_features = (100, 3) + + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + Y = rng.randn(n_samples, n_features) + + dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) + + for rtol in [0, 1e-5]: + for atol in [1e-6, 1e-2]: + for breadth_first in (True, False): + check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true) + + +def test_kernel_density_sampling(n_samples=100, n_features=3): + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + + bandwidth = 0.2 + + for kernel in ["gaussian", "tophat"]: + # draw a tophat sample + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + samp = kde.sample(100) + assert X.shape == samp.shape + + # check that samples are in the right range + nbrs = NearestNeighbors(n_neighbors=1).fit(X) + dist, ind = nbrs.kneighbors(X, return_distance=True) + + if kernel == "tophat": + assert np.all(dist < bandwidth) + elif kernel == "gaussian": + # 5 standard deviations is safe for 100 samples, but there's a + # very small chance this test could fail. + assert np.all(dist < 5 * bandwidth) + + # check unsupported kernels + for kernel in ["epanechnikov", "exponential", "linear", "cosine"]: + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) + with pytest.raises(NotImplementedError): + kde.sample(100) + + # non-regression test: used to return a scalar + X = rng.randn(4, 1) + kde = KernelDensity(kernel="gaussian").fit(X) + assert kde.sample().shape == (1, 1) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree"]) +@pytest.mark.parametrize( + "metric", ["euclidean", "minkowski", "manhattan", "chebyshev", "haversine"] +) +def test_kde_algorithm_metric_choice(algorithm, metric): + # Smoke test for various metrics and algorithms + rng = np.random.RandomState(0) + X = rng.randn(10, 2) # 2 features required for haversine dist. 
+ Y = rng.randn(10, 2) + + kde = KernelDensity(algorithm=algorithm, metric=metric) + + if algorithm == "kd_tree" and metric not in KDTree.valid_metrics: + with pytest.raises(ValueError, match="invalid metric"): + kde.fit(X) + else: + kde.fit(X) + y_dens = kde.score_samples(Y) + assert y_dens.shape == Y.shape[:1] + + +def test_kde_score(n_samples=100, n_features=3): + pass + # FIXME + # rng = np.random.RandomState(0) + # X = rng.random_sample((n_samples, n_features)) + # Y = rng.random_sample((n_samples, n_features)) + + +def test_kde_sample_weights_error(): + kde = KernelDensity() + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=np.random.random((200, 10))) + with pytest.raises(ValueError): + kde.fit(np.random.random((200, 10)), sample_weight=-np.random.random(200)) + + +def test_kde_pipeline_gridsearch(): + # test that kde plays nice in pipelines and grid-searches + X, _ = make_blobs(cluster_std=0.1, random_state=1, centers=[[0, 1], [1, 0], [0, 0]]) + pipe1 = make_pipeline( + StandardScaler(with_mean=False, with_std=False), + KernelDensity(kernel="gaussian"), + ) + params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10]) + search = GridSearchCV(pipe1, param_grid=params) + search.fit(X) + assert search.best_params_["kerneldensity__bandwidth"] == 0.1 + + +def test_kde_sample_weights(): + n_samples = 400 + size_test = 20 + weights_neutral = np.full(n_samples, 3.0) + for d in [1, 2, 10]: + rng = np.random.RandomState(0) + X = rng.rand(n_samples, d) + weights = 1 + (10 * X.sum(axis=1)).astype(np.int8) + X_repetitions = np.repeat(X, weights, axis=0) + n_samples_test = size_test // d + test_points = rng.rand(n_samples_test, d) + for algorithm in ["auto", "ball_tree", "kd_tree"]: + for metric in ["euclidean", "minkowski", "manhattan", "chebyshev"]: + if algorithm != "kd_tree" or metric in KDTree.valid_metrics: + kde = KernelDensity(algorithm=algorithm, metric=metric) + + # Test that adding a constant sample weight has no effect + kde.fit(X, sample_weight=weights_neutral) + scores_const_weight = kde.score_samples(test_points) + sample_const_weight = kde.sample(random_state=1234) + kde.fit(X) + scores_no_weight = kde.score_samples(test_points) + sample_no_weight = kde.sample(random_state=1234) + assert_allclose(scores_const_weight, scores_no_weight) + assert_allclose(sample_const_weight, sample_no_weight) + + # Test equivalence between sampling and (integer) weights + kde.fit(X, sample_weight=weights) + scores_weight = kde.score_samples(test_points) + sample_weight = kde.sample(random_state=1234) + kde.fit(X_repetitions) + scores_ref_sampling = kde.score_samples(test_points) + sample_ref_sampling = kde.sample(random_state=1234) + assert_allclose(scores_weight, scores_ref_sampling) + assert_allclose(sample_weight, sample_ref_sampling) + + # Test that sample weights has a non-trivial effect + diff = np.max(np.abs(scores_no_weight - scores_weight)) + assert diff > 0.001 + + # Test invariance with respect to arbitrary scaling + scale_factor = rng.rand() + kde.fit(X, sample_weight=(scale_factor * weights)) + scores_scaled_weight = kde.score_samples(test_points) + assert_allclose(scores_scaled_weight, scores_weight) + + +@pytest.mark.parametrize("sample_weight", [None, [0.1, 0.2, 0.3]]) +def test_pickling(tmpdir, sample_weight): + # Make sure that predictions are the same before and after pickling. Used + # to be a bug because sample_weights wasn't pickled and the resulting tree + # would miss some info. 
+ + kde = KernelDensity() + data = np.reshape([1.0, 2.0, 3.0], (-1, 1)) + kde.fit(data, sample_weight=sample_weight) + + X = np.reshape([1.1, 2.1], (-1, 1)) + scores = kde.score_samples(X) + + file_path = str(tmpdir.join("dump.pkl")) + joblib.dump(kde, file_path) + kde = joblib.load(file_path) + scores_pickled = kde.score_samples(X) + + assert_allclose(scores, scores_pickled) + + +@pytest.mark.parametrize("method", ["score_samples", "sample"]) +def test_check_is_fitted(method): + # Check that predict raises an exception in an unfitted estimator. + # Unfitted estimators should raise a NotFittedError. + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + kde = KernelDensity() + + with pytest.raises(NotFittedError): + getattr(kde, method)(X) + + +@pytest.mark.parametrize("bandwidth", ["scott", "silverman", 0.1]) +def test_bandwidth(bandwidth): + n_samples, n_features = (100, 3) + rng = np.random.RandomState(0) + X = rng.randn(n_samples, n_features) + kde = KernelDensity(bandwidth=bandwidth).fit(X) + samp = kde.sample(100) + kde_sc = kde.score_samples(X) + assert X.shape == samp.shape + assert kde_sc.shape == (n_samples,) + + # Test that the attribute self.bandwidth_ has the expected value + if bandwidth == "scott": + h = X.shape[0] ** (-1 / (X.shape[1] + 4)) + elif bandwidth == "silverman": + h = (X.shape[0] * (X.shape[1] + 2) / 4) ** (-1 / (X.shape[1] + 4)) + else: + h = bandwidth + assert kde.bandwidth_ == pytest.approx(h) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_lof.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_lof.py new file mode 100644 index 0000000000000000000000000000000000000000..140d0d9ba6dff1ba15acf54fe769cd526e832c3d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_lof.py @@ -0,0 +1,394 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +from math import sqrt + +import numpy as np +import pytest + +from sklearn import metrics, neighbors +from sklearn.datasets import load_iris +from sklearn.metrics import roc_auc_score +from sklearn.utils import check_random_state +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.estimator_checks import ( + check_outlier_corruption, + parametrize_with_checks, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +# load the iris dataset +# and randomly permute it +rng = check_random_state(0) +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +def test_lof(global_dtype): + # Toy sample (the last two samples are outliers): + X = np.asarray( + [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]], + dtype=global_dtype, + ) + + # Test LocalOutlierFactor: + clf = neighbors.LocalOutlierFactor(n_neighbors=5) + score = clf.fit(X).negative_outlier_factor_ + assert_array_equal(clf._fit_X, X) + + # Assert largest outlier score is smaller than smallest inlier score: + assert np.min(score[:-2]) > np.max(score[-2:]) + + # Assert predict() works: + clf = neighbors.LocalOutlierFactor(contamination=0.25, n_neighbors=5).fit(X) + expected_predictions = 6 * [1] + 2 * [-1] + assert_array_equal(clf._predict(), expected_predictions) + assert_array_equal(clf.fit_predict(X), expected_predictions) + + +def test_lof_performance(global_dtype): + # Generate train/test data + rng = check_random_state(2) + X = 0.3 * rng.randn(120, 2).astype(global_dtype, copy=False) + X_train = X[:100] + + # Generate 
some abnormal novel observations + X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)).astype( + global_dtype, copy=False + ) + X_test = np.r_[X[100:], X_outliers] + y_test = np.array([0] * 20 + [1] * 20) + + # fit the model for novelty detection + clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train) + + # predict scores (the lower, the more normal) + y_pred = -clf.decision_function(X_test) + + # check that roc_auc is good + assert roc_auc_score(y_test, y_pred) > 0.99 + + +def test_lof_values(global_dtype): + # toy samples: + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + s_0 = 2.0 * sqrt(2.0) / (1.0 + sqrt(2.0)) + s_1 = (1.0 + sqrt(2)) * (1.0 / (4.0 * sqrt(2.0)) + 1.0 / (2.0 + 2.0 * sqrt(2))) + # check predict() + assert_allclose(-clf1.negative_outlier_factor_, [s_0, s_1, s_1]) + assert_allclose(-clf2.negative_outlier_factor_, [s_0, s_1, s_1]) + # check predict(one sample not in train) + assert_allclose(-clf1.score_samples([[2.0, 2.0]]), [s_0]) + assert_allclose(-clf2.score_samples([[2.0, 2.0]]), [s_0]) + # check predict(one sample already in train) + assert_allclose(-clf1.score_samples([[1.0, 1.0]]), [s_1]) + assert_allclose(-clf2.score_samples([[1.0, 1.0]]), [s_1]) + + +def test_lof_precomputed(global_dtype, random_state=42): + """Tests LOF with a distance matrix.""" + # Note: smaller samples may result in spurious test success + rng = np.random.RandomState(random_state) + X = rng.random_sample((10, 4)).astype(global_dtype, copy=False) + Y = rng.random_sample((3, 4)).astype(global_dtype, copy=False) + DXX = metrics.pairwise_distances(X, metric="euclidean") + DYX = metrics.pairwise_distances(Y, X, metric="euclidean") + # As a feature matrix (n_samples by n_features) + lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True) + lof_X.fit(X) + pred_X_X = lof_X._predict() + pred_X_Y = lof_X.predict(Y) + + # As a dense distance matrix (n_samples by n_samples) + lof_D = neighbors.LocalOutlierFactor( + n_neighbors=3, algorithm="brute", metric="precomputed", novelty=True + ) + lof_D.fit(DXX) + pred_D_X = lof_D._predict() + pred_D_Y = lof_D.predict(DYX) + + assert_allclose(pred_X_X, pred_D_X) + assert_allclose(pred_X_Y, pred_D_Y) + + +def test_n_neighbors_attribute(): + X = iris.data + clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + clf = neighbors.LocalOutlierFactor(n_neighbors=500) + msg = "n_neighbors will be set to (n_samples - 1)" + with pytest.warns(UserWarning, match=re.escape(msg)): + clf.fit(X) + assert clf.n_neighbors_ == X.shape[0] - 1 + + +def test_score_samples(global_dtype): + X_train = np.asarray([[1, 1], [1, 2], [2, 1]], dtype=global_dtype) + X_test = np.asarray([[2.0, 2.0]], dtype=global_dtype) + clf1 = neighbors.LocalOutlierFactor( + n_neighbors=2, contamination=0.1, novelty=True + ).fit(X_train) + clf2 = neighbors.LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_train) + + clf1_scores = clf1.score_samples(X_test) + clf1_decisions = clf1.decision_function(X_test) + + clf2_scores = clf2.score_samples(X_test) + clf2_decisions = clf2.decision_function(X_test) + + assert_allclose( + clf1_scores, + clf1_decisions + clf1.offset_, + ) + assert_allclose( + clf2_scores, + clf2_decisions + clf2.offset_, + ) + assert_allclose(clf1_scores, clf2_scores) + + +def test_novelty_errors(): + X = iris.data + + 
# check errors for novelty=False + clf = neighbors.LocalOutlierFactor() + clf.fit(X) + # predict, decision_function and score_samples raise ValueError + for method in ["predict", "decision_function", "score_samples"]: + outer_msg = f"'LocalOutlierFactor' has no attribute '{method}'" + inner_msg = "{} is not available when novelty=False".format(method) + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, method) + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + # check errors for novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + + outer_msg = "'LocalOutlierFactor' has no attribute 'fit_predict'" + inner_msg = "fit_predict is not available when novelty=True" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + getattr(clf, "fit_predict") + + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +def test_novelty_training_scores(global_dtype): + # check that the scores of the training samples are still accessible + # when novelty=True through the negative_outlier_factor_ attribute + X = iris.data.astype(global_dtype) + + # fit with novelty=False + clf_1 = neighbors.LocalOutlierFactor() + clf_1.fit(X) + scores_1 = clf_1.negative_outlier_factor_ + + # fit with novelty=True + clf_2 = neighbors.LocalOutlierFactor(novelty=True) + clf_2.fit(X) + scores_2 = clf_2.negative_outlier_factor_ + + assert_allclose(scores_1, scores_2) + + +def test_hasattr_prediction(): + # check availability of prediction methods depending on novelty value. + X = [[1, 1], [1, 2], [2, 1]] + + # when novelty=True + clf = neighbors.LocalOutlierFactor(novelty=True) + clf.fit(X) + assert hasattr(clf, "predict") + assert hasattr(clf, "decision_function") + assert hasattr(clf, "score_samples") + assert not hasattr(clf, "fit_predict") + + # when novelty=False + clf = neighbors.LocalOutlierFactor(novelty=False) + clf.fit(X) + assert hasattr(clf, "fit_predict") + assert not hasattr(clf, "predict") + assert not hasattr(clf, "decision_function") + assert not hasattr(clf, "score_samples") + + +@parametrize_with_checks([neighbors.LocalOutlierFactor(novelty=True)]) +def test_novelty_true_common_tests(estimator, check): + # the common tests are run for the default LOF (novelty=False). + # here we run these common tests for LOF when novelty=True + check(estimator) + + +@pytest.mark.parametrize("expected_outliers", [30, 53]) +def test_predicted_outlier_number(expected_outliers): + # the number of predicted outliers should be equal to the number of + # expected outliers unless there are ties in the abnormality scores. 
+ X = iris.data + n_samples = X.shape[0] + contamination = float(expected_outliers) / n_samples + + clf = neighbors.LocalOutlierFactor(contamination=contamination) + y_pred = clf.fit_predict(X) + + num_outliers = np.sum(y_pred != 1) + if num_outliers != expected_outliers: + y_dec = clf.negative_outlier_factor_ + check_outlier_corruption(num_outliers, expected_outliers, y_dec) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse(csr_container): + # LocalOutlierFactor must support CSR inputs + # TODO: compare results on dense and sparse data as proposed in: + # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186 + X = csr_container(iris.data) + + lof = neighbors.LocalOutlierFactor(novelty=True) + lof.fit(X) + lof.predict(X) + lof.score_samples(X) + lof.decision_function(X) + + lof = neighbors.LocalOutlierFactor(novelty=False) + lof.fit_predict(X) + + +def test_lof_error_n_neighbors_too_large(): + """Check that we raise a proper error message when n_neighbors == n_samples. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/17207 + """ + X = np.ones((7, 7)) + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 1, " + "n_samples_fit = 1, n_samples = 1" + ) + with pytest.raises(ValueError, match=msg): + lof = neighbors.LocalOutlierFactor(n_neighbors=1).fit(X[:1]) + + lof = neighbors.LocalOutlierFactor(n_neighbors=2).fit(X[:2]) + assert lof.n_samples_fit_ == 2 + + msg = ( + "Expected n_neighbors < n_samples_fit, but n_neighbors = 2, " + "n_samples_fit = 2, n_samples = 2" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(None, n_neighbors=2) + + distances, indices = lof.kneighbors(None, n_neighbors=1) + assert distances.shape == (2, 1) + assert indices.shape == (2, 1) + + msg = ( + "Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, " + "n_samples_fit = 2, n_samples = 7" + ) + with pytest.raises(ValueError, match=msg): + lof.kneighbors(X, n_neighbors=3) + + ( + distances, + indices, + ) = lof.kneighbors(X, n_neighbors=2) + assert distances.shape == (7, 2) + assert indices.shape == (7, 2) + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_input_dtype_preservation(global_dtype, algorithm, contamination, novelty): + """Check that the fitted attributes are stored using the data type of X.""" + X = iris.data.astype(global_dtype, copy=False) + + iso = neighbors.LocalOutlierFactor( + n_neighbors=5, algorithm=algorithm, contamination=contamination, novelty=novelty + ) + iso.fit(X) + + assert iso.negative_outlier_factor_.dtype == global_dtype + + for method in ("score_samples", "decision_function"): + if hasattr(iso, method): + y_pred = getattr(iso, method)(X) + assert y_pred.dtype == global_dtype + + +@pytest.mark.parametrize("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]) +@pytest.mark.parametrize("novelty", [True, False]) +@pytest.mark.parametrize("contamination", [0.5, "auto"]) +def test_lof_dtype_equivalence(algorithm, novelty, contamination): + """Check the equivalence of the results with 32 and 64 bits input.""" + + inliers = iris.data[:50] # setosa iris are really distinct from others + outliers = iris.data[-5:] # virginica will be considered as outliers + # lower the precision of the input data to check that we have an equivalence when + # making the computation in 32 and 64 bits. 
+ X = np.concatenate([inliers, outliers], axis=0).astype(np.float32) + + lof_32 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_32 = X.astype(np.float32, copy=True) + lof_32.fit(X_32) + + lof_64 = neighbors.LocalOutlierFactor( + algorithm=algorithm, novelty=novelty, contamination=contamination + ) + X_64 = X.astype(np.float64, copy=True) + lof_64.fit(X_64) + + assert_allclose(lof_32.negative_outlier_factor_, lof_64.negative_outlier_factor_) + + for method in ("score_samples", "decision_function", "predict", "fit_predict"): + if hasattr(lof_32, method): + y_pred_32 = getattr(lof_32, method)(X_32) + y_pred_64 = getattr(lof_64, method)(X_64) + assert_allclose(y_pred_32, y_pred_64, atol=0.0002) + + +def test_lof_duplicate_samples(): + """ + Check that LocalOutlierFactor raises a warning when duplicate values + in the training data cause inaccurate results. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27839 + """ + + rng = np.random.default_rng(0) + + x = rng.permutation( + np.hstack( + [ + [0.1] * 1000, # constant values + np.linspace(0.1, 0.3, num=3000), + rng.random(500) * 100, # the clear outliers + ] + ) + ) + X = x.reshape(-1, 1) + + error_msg = ( + "Duplicate values are leading to incorrect results. " + "Increase the number of neighbors for more accurate results." + ) + + lof = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1) + + # Catch the warning + with pytest.warns(UserWarning, match=re.escape(error_msg)): + lof.fit_predict(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nca.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nca.py new file mode 100644 index 0000000000000000000000000000000000000000..ebfb01d12e3acbbb31d79a3a0573f39884cac6bb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nca.py @@ -0,0 +1,563 @@ +""" +Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal +from scipy.optimize import check_grad + +from sklearn import clone +from sklearn.datasets import load_iris, make_blobs, make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics import pairwise_distances +from sklearn.neighbors import NeighborhoodComponentsAnalysis +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_random_state +from sklearn.utils.validation import validate_data + +rng = check_random_state(0) +# Load and shuffle the iris dataset. +iris = load_iris() +perm = rng.permutation(iris.target.size) +iris_data = iris.data[perm] +iris_target = iris.target[perm] +# Avoid having test data introducing dependencies between tests. +iris_data.flags.writeable = False +iris_target.flags.writeable = False +EPS = np.finfo(float).eps + + +def test_simple_example(): + """Test on a simple example. + + Puts four points in the input space where the opposite labels points are + next to each other. After transform the samples from the same class + should be next to each other. 
+ + """ + X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]]) + y = np.array([1, 0, 1, 0]) + nca = NeighborhoodComponentsAnalysis( + n_components=2, init="identity", random_state=42 + ) + nca.fit(X, y) + X_t = nca.transform(X) + assert_array_equal(pairwise_distances(X_t).argsort()[:, 1], np.array([2, 3, 0, 1])) + + +def test_toy_example_collapse_points(): + """Test on a toy example of three points that should collapse + + We build a simple example: two points from the same class and a point from + a different class in the middle of them. On this simple example, the new + (transformed) points should all collapse into one single point. Indeed, the + objective is 2/(1 + exp(d/2)), with d the euclidean distance between the + two samples from the same class. This is maximized for d=0 (because d>=0), + with an objective equal to 1 (loss=-1.). + + """ + rng = np.random.RandomState(42) + input_dim = 5 + two_points = rng.randn(2, input_dim) + X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]]) + y = [0, 0, 1] + + class LossStorer: + def __init__(self, X, y): + self.loss = np.inf # initialize the loss to very high + # Initialize a fake NCA and variables needed to compute the loss: + self.fake_nca = NeighborhoodComponentsAnalysis() + self.fake_nca.n_iter_ = np.inf + self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2) + y = LabelEncoder().fit_transform(y) + self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + + def callback(self, transformation, n_iter): + """Stores the last value of the loss function""" + self.loss, _ = self.fake_nca._loss_grad_lbfgs( + transformation, self.X, self.same_class_mask, -1.0 + ) + + loss_storer = LossStorer(X, y) + nca = NeighborhoodComponentsAnalysis(random_state=42, callback=loss_storer.callback) + X_t = nca.fit_transform(X, y) + print(X_t) + # test that points are collapsed into one point + assert_array_almost_equal(X_t - X_t[0], 0.0) + assert abs(loss_storer.loss + 1) < 1e-10 + + +def test_finite_differences(global_random_seed): + """Test gradient of loss function + + Assert that the gradient is almost equal to its finite differences + approximation. + """ + # Initialize the transformation `M`, as well as `X` and `y` and `NCA` + rng = np.random.RandomState(global_random_seed) + X, y = make_classification(random_state=global_random_seed) + M = rng.randn(rng.randint(1, X.shape[1] + 1), X.shape[1]) + nca = NeighborhoodComponentsAnalysis() + nca.n_iter_ = 0 + mask = y[:, np.newaxis] == y[np.newaxis, :] + + def fun(M): + return nca._loss_grad_lbfgs(M, X, mask)[0] + + def grad(M): + return nca._loss_grad_lbfgs(M, X, mask)[1] + + # compare the gradient to a finite difference approximation + diff = check_grad(fun, grad, M.ravel()) + assert diff == pytest.approx(0.0, abs=1e-4) + + +def test_params_validation(): + # Test that invalid parameters raise value error + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + NCA = NeighborhoodComponentsAnalysis + rng = np.random.RandomState(42) + + init = rng.rand(5, 3) + msg = ( + f"The output dimensionality ({init.shape[0]}) " + "of the given linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(init=init).fit(X, y) + n_components = 10 + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater " + f"than the given data dimensionality ({X.shape[1]})!" 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + NCA(n_components=n_components).fit(X, y) + + +def test_transformation_dimensions(): + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + + # Fail if transformation input dimension does not match inputs dimensions + transformation = np.array([[1, 2], [3, 4]]) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) + + # Fail if transformation output dimension is larger than + # transformation input dimension + transformation = np.array([[1, 2], [3, 4], [5, 6]]) + # len(transformation) > len(transformation[0]) + with pytest.raises(ValueError): + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) + + # Pass otherwise + transformation = np.arange(9).reshape(3, 3) + NeighborhoodComponentsAnalysis(init=transformation).fit(X, y) + + +def test_n_components(): + rng = np.random.RandomState(42) + X = np.arange(12).reshape(4, 3) + y = [1, 1, 2, 2] + + init = rng.rand(X.shape[1] - 1, 3) + + # n_components = X.shape[1] != transformation.shape[0] + n_components = X.shape[1] + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) does not match the output " + "dimensionality of the given linear transformation " + f"`init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # n_components > X.shape[1] + n_components = X.shape[1] + 2 + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + msg = ( + "The preferred dimensionality of the projected space " + f"`n_components` ({n_components}) cannot be greater than " + f"the given data dimensionality ({X.shape[1]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # n_components < X.shape[1] + nca = NeighborhoodComponentsAnalysis(n_components=2, init="identity") + nca.fit(X, y) + + +def test_init_transformation(): + rng = np.random.RandomState(42) + X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) + + # Start learning from scratch + nca = NeighborhoodComponentsAnalysis(init="identity") + nca.fit(X, y) + + # Initialize with random + nca_random = NeighborhoodComponentsAnalysis(init="random") + nca_random.fit(X, y) + + # Initialize with auto + nca_auto = NeighborhoodComponentsAnalysis(init="auto") + nca_auto.fit(X, y) + + # Initialize with PCA + nca_pca = NeighborhoodComponentsAnalysis(init="pca") + nca_pca.fit(X, y) + + # Initialize with LDA + nca_lda = NeighborhoodComponentsAnalysis(init="lda") + nca_lda.fit(X, y) + + init = rng.rand(X.shape[1], X.shape[1]) + nca = NeighborhoodComponentsAnalysis(init=init) + nca.fit(X, y) + + # init.shape[1] must match X.shape[1] + init = rng.rand(X.shape[1], X.shape[1] + 1) + nca = NeighborhoodComponentsAnalysis(init=init) + msg = ( + f"The input dimensionality ({init.shape[1]}) of the given " + "linear transformation `init` must match the " + f"dimensionality of the given inputs `X` ({X.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # init.shape[0] must be <= init.shape[1] + init = rng.rand(X.shape[1] + 1, X.shape[1]) + nca = NeighborhoodComponentsAnalysis(init=init) + msg = ( + f"The output dimensionality ({init.shape[0]}) of the given " + "linear transformation `init` cannot be " + f"greater than its input dimensionality ({init.shape[1]})." 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + # init.shape[0] must match n_components + init = rng.rand(X.shape[1], X.shape[1]) + n_components = X.shape[1] - 2 + nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components) + msg = ( + "The preferred dimensionality of the " + f"projected space `n_components` ({n_components}) " + "does not match the output dimensionality of the given " + f"linear transformation `init` ({init.shape[0]})!" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X, y) + + +@pytest.mark.parametrize("n_samples", [3, 5, 7, 11]) +@pytest.mark.parametrize("n_features", [3, 5, 7, 11]) +@pytest.mark.parametrize("n_classes", [5, 7, 11]) +@pytest.mark.parametrize("n_components", [3, 5, 7, 11]) +def test_auto_init(n_samples, n_features, n_classes, n_components): + # Test that auto choose the init as expected with every configuration + # of order of n_samples, n_features, n_classes and n_components. + rng = np.random.RandomState(42) + nca_base = NeighborhoodComponentsAnalysis( + init="auto", n_components=n_components, max_iter=1, random_state=rng + ) + if n_classes >= n_samples: + pass + # n_classes > n_samples is impossible, and n_classes == n_samples + # throws an error from lda but is an absurd case + else: + X = rng.randn(n_samples, n_features) + y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples] + if n_components > n_features: + # this would return a ValueError, which is already tested in + # test_params_validation + pass + else: + nca = clone(nca_base) + nca.fit(X, y) + if n_components <= min(n_classes - 1, n_features): + nca_other = clone(nca_base).set_params(init="lda") + elif n_components < min(n_features, n_samples): + nca_other = clone(nca_base).set_params(init="pca") + else: + nca_other = clone(nca_base).set_params(init="identity") + nca_other.fit(X, y) + assert_array_almost_equal(nca.components_, nca_other.components_) + + +def test_warm_start_validation(): + X, y = make_classification( + n_samples=30, + n_features=5, + n_classes=4, + n_redundant=0, + n_informative=5, + random_state=0, + ) + + nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5) + nca.fit(X, y) + + X_less_features, y = make_classification( + n_samples=30, + n_features=4, + n_classes=4, + n_redundant=0, + n_informative=4, + random_state=0, + ) + msg = ( + f"The new inputs dimensionality ({X_less_features.shape[1]}) " + "does not match the input dimensionality of the previously learned " + f"transformation ({nca.components_.shape[1]})." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + nca.fit(X_less_features, y) + + +def test_warm_start_effectiveness(): + # A 1-iteration second fit on same data should give almost same result + # with warm starting, and quite different result without warm starting. 
+ + nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, random_state=0) + nca_warm.fit(iris_data, iris_target) + transformation_warm = nca_warm.components_ + nca_warm.max_iter = 1 + nca_warm.fit(iris_data, iris_target) + transformation_warm_plus_one = nca_warm.components_ + + nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, random_state=0) + nca_cold.fit(iris_data, iris_target) + transformation_cold = nca_cold.components_ + nca_cold.max_iter = 1 + nca_cold.fit(iris_data, iris_target) + transformation_cold_plus_one = nca_cold.components_ + + diff_warm = np.sum(np.abs(transformation_warm_plus_one - transformation_warm)) + diff_cold = np.sum(np.abs(transformation_cold_plus_one - transformation_cold)) + assert diff_warm < 3.0, ( + "Transformer changed significantly after one " + "iteration even though it was warm-started." + ) + + assert diff_cold > diff_warm, ( + "Cold-started transformer changed less " + "significantly than warm-started " + "transformer after one iteration." + ) + + +@pytest.mark.parametrize( + "init_name", ["pca", "lda", "identity", "random", "precomputed"] +) +def test_verbose(init_name, capsys): + # assert there is proper output when verbose = 1, for every initialization + # except auto because auto will call one of the others + rng = np.random.RandomState(42) + X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0) + regexp_init = r"... done in \ *\d+\.\d{2}s" + msgs = { + "pca": "Finding principal components" + regexp_init, + "lda": "Finding most discriminative components" + regexp_init, + } + if init_name == "precomputed": + init = rng.randn(X.shape[1], X.shape[1]) + else: + init = init_name + nca = NeighborhoodComponentsAnalysis(verbose=1, init=init) + nca.fit(X, y) + out, _ = capsys.readouterr() + + # check output + lines = re.split("\n+", out) + # if pca or lda init, an additional line is printed, so we test + # it and remove it to test the rest equally among initializations + if init_name in ["pca", "lda"]: + assert re.match(msgs[init_name], lines[0]) + lines = lines[1:] + assert lines[0] == "[NeighborhoodComponentsAnalysis]" + header = "{:>10} {:>20} {:>10}".format("Iteration", "Objective Value", "Time(s)") + assert lines[1] == "[NeighborhoodComponentsAnalysis] {}".format(header) + assert lines[2] == "[NeighborhoodComponentsAnalysis] {}".format("-" * len(header)) + for line in lines[3:-2]: + # The following regex will match for instance: + # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01' + assert re.match( + r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e" + r"[+|-]\d+\ *\d+\.\d{2}", + line, + ) + assert re.match( + r"\[NeighborhoodComponentsAnalysis\] Training took\ *\d+\.\d{2}s\.", + lines[-2], + ) + assert lines[-1] == "" + + +def test_no_verbose(capsys): + # assert by default there is no output (verbose=0) + nca = NeighborhoodComponentsAnalysis() + nca.fit(iris_data, iris_target) + out, _ = capsys.readouterr() + # check output + assert out == "" + + +def test_singleton_class(): + X = iris_data.copy() + y = iris_target.copy() + + # one singleton class + singleton_class = 1 + (ind_singleton,) = np.where(y == singleton_class) + y[ind_singleton] = 2 + y[ind_singleton[0]] = singleton_class + + nca = NeighborhoodComponentsAnalysis(max_iter=30) + nca.fit(X, y) + + # One non-singleton class + (ind_1,) = np.where(y == 1) + (ind_2,) = np.where(y == 2) + y[ind_1] = 0 + y[ind_1[0]] = 1 + y[ind_2] = 0 + y[ind_2[0]] = 2 + + nca = NeighborhoodComponentsAnalysis(max_iter=30) + nca.fit(X, y) + + # Only singleton classes + 
(ind_0,) = np.where(y == 0) + (ind_1,) = np.where(y == 1) + (ind_2,) = np.where(y == 2) + X = X[[ind_0[0], ind_1[0], ind_2[0]]] + y = y[[ind_0[0], ind_1[0], ind_2[0]]] + + nca = NeighborhoodComponentsAnalysis(init="identity", max_iter=30) + nca.fit(X, y) + assert_array_equal(X, nca.transform(X)) + + +def test_one_class(): + X = iris_data[iris_target == 0] + y = iris_target[iris_target == 0] + + nca = NeighborhoodComponentsAnalysis( + max_iter=30, n_components=X.shape[1], init="identity" + ) + nca.fit(X, y) + assert_array_equal(X, nca.transform(X)) + + +def test_callback(capsys): + max_iter = 10 + + def my_cb(transformation, n_iter): + assert transformation.shape == (iris_data.shape[1] ** 2,) + rem_iter = max_iter - n_iter + print("{} iterations remaining...".format(rem_iter)) + + # assert that my_cb is called + nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb, verbose=1) + nca.fit(iris_data, iris_target) + out, _ = capsys.readouterr() + + # check output + assert "{} iterations remaining...".format(max_iter - 1) in out + + +def test_expected_transformation_shape(): + """Test that the transformation has the expected shape.""" + X = iris_data + y = iris_target + + class TransformationStorer: + def __init__(self, X, y): + # Initialize a fake NCA and variables needed to call the loss + # function: + self.fake_nca = NeighborhoodComponentsAnalysis() + self.fake_nca.n_iter_ = np.inf + self.X, y = validate_data(self.fake_nca, X, y, ensure_min_samples=2) + y = LabelEncoder().fit_transform(y) + self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :] + + def callback(self, transformation, n_iter): + """Stores the last value of the transformation taken as input by + the optimizer""" + self.transformation = transformation + + transformation_storer = TransformationStorer(X, y) + cb = transformation_storer.callback + nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=cb) + nca.fit(X, y) + assert transformation_storer.transformation.size == X.shape[1] ** 2 + + +def test_convergence_warning(): + nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1) + cls_name = nca.__class__.__name__ + msg = "[{}] NCA did not converge".format(cls_name) + with pytest.warns(ConvergenceWarning, match=re.escape(msg)): + nca.fit(iris_data, iris_target) + + +@pytest.mark.parametrize( + "param, value", + [ + ("n_components", np.int32(3)), + ("max_iter", np.int32(100)), + ("tol", np.float32(0.0001)), + ], +) +def test_parameters_valid_types(param, value): + # check that no error is raised when parameters have numpy integer or + # floating types. + nca = NeighborhoodComponentsAnalysis(**{param: value}) + + X = iris_data + y = iris_target + + nca.fit(X, y) + + +@pytest.mark.parametrize("n_components", [None, 2]) +def test_nca_feature_names_out(n_components): + """Check `get_feature_names_out` for `NeighborhoodComponentsAnalysis`. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28293 + """ + + X = iris_data + y = iris_target + + est = NeighborhoodComponentsAnalysis(n_components=n_components).fit(X, y) + names_out = est.get_feature_names_out() + + class_name_lower = est.__class__.__name__.lower() + + if n_components is not None: + expected_n_features = n_components + else: + expected_n_features = X.shape[1] + + expected_names_out = np.array( + [f"{class_name_lower}{i}" for i in range(expected_n_features)], + dtype=object, + ) + + assert_array_equal(names_out, expected_names_out) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nearest_centroid.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nearest_centroid.py new file mode 100644 index 0000000000000000000000000000000000000000..1aa9274cd28a89be3744f56b6c3f31b80c2252ed --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_nearest_centroid.py @@ -0,0 +1,237 @@ +""" +Testing for the nearest centroid module. +""" + +import numpy as np +import pytest + +from sklearn import datasets +from sklearn.neighbors import NearestCentroid +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] +true_result_prior1 = [-1, 1, 1] + +true_discriminant_scores = [-32, 64, 80] +true_proba = [[1, 1.26642e-14], [1.60381e-28, 1], [1.80485e-35, 1]] + + +# also load the iris dataset +# and randomly permute it +iris = datasets.load_iris() +rng = np.random.RandomState(1) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_classification_toy(csr_container): + # Check classification on a toy dataset, including sparse versions. + X_csr = csr_container(X) + T_csr = csr_container(T) + + # Check classification on a toy dataset, including sparse versions. + clf = NearestCentroid() + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores) + assert_array_almost_equal(clf.predict_proba(T), true_proba) + + # Test uniform priors + clf = NearestCentroid(priors="uniform") + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores) + assert_array_almost_equal(clf.predict_proba(T), true_proba) + + clf = NearestCentroid(priors="empirical") + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.decision_function(T), true_discriminant_scores) + assert_array_almost_equal(clf.predict_proba(T), true_proba) + + # Test custom priors + clf = NearestCentroid(priors=[0.25, 0.75]) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result_prior1) + + # Same test, but with a sparse matrix to fit and test. 
+ clf = NearestCentroid() + clf.fit(X_csr, y) + assert_array_equal(clf.predict(T_csr), true_result) + + # Fit with sparse, test with non-sparse + clf = NearestCentroid() + clf.fit(X_csr, y) + assert_array_equal(clf.predict(T), true_result) + + # Fit with non-sparse, test with sparse + clf = NearestCentroid() + clf.fit(X, y) + assert_array_equal(clf.predict(T_csr), true_result) + + # Fit and predict with non-CSR sparse matrices + clf = NearestCentroid() + clf.fit(X_csr.tocoo(), y) + assert_array_equal(clf.predict(T_csr.tolil()), true_result) + + +def test_iris(): + # Check consistency on dataset iris. + for metric in ("euclidean", "manhattan"): + clf = NearestCentroid(metric=metric).fit(iris.data, iris.target) + score = np.mean(clf.predict(iris.data) == iris.target) + assert score > 0.9, "Failed with score = " + str(score) + + +def test_iris_shrinkage(): + # Check consistency on dataset iris, when using shrinkage. + for metric in ("euclidean", "manhattan"): + for shrink_threshold in [None, 0.1, 0.5]: + clf = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold) + clf = clf.fit(iris.data, iris.target) + score = np.mean(clf.predict(iris.data) == iris.target) + assert score > 0.8, "Failed with score = " + str(score) + + +def test_pickle(): + import pickle + + # classification + obj = NearestCentroid() + obj.fit(iris.data, iris.target) + score = obj.score(iris.data, iris.target) + s = pickle.dumps(obj) + + obj2 = pickle.loads(s) + assert type(obj2) == obj.__class__ + score2 = obj2.score(iris.data, iris.target) + assert_array_equal( + score, + score2, + "Failed to generate same score after pickling (classification).", + ) + + +def test_shrinkage_correct(): + # Ensure that the shrinking is correct. + # The expected result is calculated by R (pamr), + # which is implemented by the author of the original paper. + # (One need to modify the code to output the new centroid in pamr.predict) + + X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]]) + y = np.array([1, 1, 2, 2, 2]) + clf = NearestCentroid(shrink_threshold=0.1) + clf.fit(X, y) + expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]]) + np.testing.assert_array_almost_equal(clf.centroids_, expected_result) + + +def test_shrinkage_threshold_decoded_y(): + clf = NearestCentroid(shrink_threshold=0.01) + y_ind = np.asarray(y) + y_ind[y_ind == -1] = 0 + clf.fit(X, y_ind) + centroid_encoded = clf.centroids_ + clf.fit(X, y) + assert_array_equal(centroid_encoded, clf.centroids_) + + +def test_predict_translated_data(): + # Test that NearestCentroid gives same results on translated data + + rng = np.random.RandomState(0) + X = rng.rand(50, 50) + y = rng.randint(0, 3, 50) + noise = rng.rand(50) + clf = NearestCentroid(shrink_threshold=0.1) + clf.fit(X, y) + y_init = clf.predict(X) + clf = NearestCentroid(shrink_threshold=0.1) + X_noise = X + noise + clf.fit(X_noise, y) + y_translate = clf.predict(X_noise) + assert_array_equal(y_init, y_translate) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_manhattan_metric(csr_container): + # Test the manhattan metric. 
+ X_csr = csr_container(X) + + clf = NearestCentroid(metric="manhattan") + clf.fit(X, y) + dense_centroid = clf.centroids_ + clf.fit(X_csr, y) + assert_array_equal(clf.centroids_, dense_centroid) + assert_array_equal(dense_centroid, [[-1, -1], [1, 1]]) + + +def test_features_zero_var(): + # Test that features with 0 variance throw error + + X = np.empty((10, 2)) + X[:, 0] = -0.13725701 + X[:, 1] = -0.9853293 + y = np.zeros((10)) + y[0] = 1 + + clf = NearestCentroid(shrink_threshold=0.1) + with pytest.raises(ValueError): + clf.fit(X, y) + + +def test_negative_priors_error(): + """Check that we raise an error when the user-defined priors are negative.""" + clf = NearestCentroid(priors=[-2, 4]) + with pytest.raises(ValueError, match="priors must be non-negative"): + clf.fit(X, y) + + +def test_warn_non_normalized_priors(): + """Check that we raise a warning and normalize the user-defined priors when they + don't sum to 1. + """ + priors = [2, 4] + clf = NearestCentroid(priors=priors) + with pytest.warns( + UserWarning, + match="The priors do not sum to 1. Normalizing such that it sums to one.", + ): + clf.fit(X, y) + + assert_allclose(clf.class_prior_, np.asarray(priors) / np.asarray(priors).sum()) + + +@pytest.mark.parametrize( + "response_method", ["decision_function", "predict_proba", "predict_log_proba"] +) +def test_method_not_available_with_manhattan(response_method): + """Check that we raise an AttributeError with Manhattan metric when trying + to call a non-thresholded response method. + """ + clf = NearestCentroid(metric="manhattan").fit(X, y) + with pytest.raises(AttributeError): + getattr(clf, response_method)(T) + + +@pytest.mark.parametrize("array_constructor", [np.array] + CSR_CONTAINERS) +def test_error_zero_variances(array_constructor): + """Check that we raise an error when the variance for all features is zero.""" + X = np.ones((len(y), 2)) + X[:, 1] *= 2 + X = array_constructor(X) + + clf = NearestCentroid() + with pytest.raises(ValueError, match="All features have zero variance"): + clf.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors.py new file mode 100644 index 0000000000000000000000000000000000000000..ae589b30dd74369cb8ef242fb86a11e0c75a09a2 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors.py @@ -0,0 +1,2503 @@ +import re +import warnings +from itertools import product + +import joblib +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import ( + config_context, + datasets, + metrics, + neighbors, +) +from sklearn.base import clone +from sklearn.exceptions import EfficiencyWarning, NotFittedError +from sklearn.metrics._dist_metrics import ( + DistanceMetric, +) +from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS, pairwise_distances +from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS +from sklearn.metrics.tests.test_pairwise_distances_reduction import ( + assert_compatible_argkmin_results, + assert_compatible_radius_results, +) +from sklearn.model_selection import ( + LeaveOneOut, + cross_val_predict, + cross_val_score, + train_test_split, +) +from sklearn.neighbors import ( + VALID_METRICS_SPARSE, + KNeighborsRegressor, +) +from sklearn.neighbors._base import ( + KNeighborsMixin, + _check_precomputed, + _is_sorted_by_data, + sort_graph_by_row_values, +) +from sklearn.pipeline import make_pipeline +from sklearn.utils._testing import ( + 
assert_allclose, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DIA_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) +from sklearn.utils.validation import check_random_state + +rng = np.random.RandomState(0) +# load and shuffle iris dataset +iris = datasets.load_iris() +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +# load and shuffle digits +digits = datasets.load_digits() +perm = rng.permutation(digits.target.size) +digits.data = digits.data[perm] +digits.target = digits.target[perm] + +SPARSE_TYPES = tuple( + BSR_CONTAINERS + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS +) +SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) + +ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") +COMMON_VALID_METRICS = sorted( + set.intersection(*map(set, neighbors.VALID_METRICS.values())) +) + +P = (1, 2, 3, 4, np.inf) + +# Filter deprecation warnings. +neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph) +neighbors.radius_neighbors_graph = ignore_warnings(neighbors.radius_neighbors_graph) + +# A list containing metrics where the string specifies the use of the +# DistanceMetric object directly (as resolved in _parse_metric) +DISTANCE_METRIC_OBJS = ["DM_euclidean"] + + +def _parse_metric(metric: str, dtype=None): + """ + Helper function for properly building a type-specialized DistanceMetric instances. + + Constructs a type-specialized DistanceMetric instance from a string + beginning with "DM_" while allowing a pass-through for other metric-specifying + strings. This is necessary since we wish to parameterize dtype independent of + metric, yet DistanceMetric requires it for construction. + + """ + if metric[:3] == "DM_": + return DistanceMetric.get_metric(metric[3:], dtype=dtype) + return metric + + +def _generate_test_params_for(metric: str, n_features: int): + """Return list of DistanceMetric kwargs for tests.""" + + # Distinguishing on cases not to compute unneeded datastructures. + rng = np.random.RandomState(1) + + if metric == "minkowski": + return [ + dict(p=1.5), + dict(p=2), + dict(p=3), + dict(p=np.inf), + dict(p=3, w=rng.rand(n_features)), + ] + + if metric == "seuclidean": + return [dict(V=rng.rand(n_features))] + + if metric == "mahalanobis": + A = rng.rand(n_features, n_features) + # Make the matrix symmetric positive definite + VI = A + A.T + 3 * np.eye(n_features) + return [dict(VI=VI)] + + # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. + # In those cases, no kwargs are needed. + return [{}] + + +def _weight_func(dist): + """Weight function to replace lambda d: d ** -2. 
+ The lambda function is not valid because: + if d==0 then 0^-2 is not valid.""" + + # Dist could be multidimensional, flatten it so all values + # can be looped + with np.errstate(divide="ignore"): + retval = 1.0 / dist + return retval**2 + + +WEIGHTS = ["uniform", "distance", _weight_func] + + +@pytest.mark.parametrize( + "n_samples, n_features, n_query_pts, n_neighbors", + [ + (100, 100, 10, 100), + (1000, 5, 100, 1), + ], +) +@pytest.mark.parametrize("query_is_train", [False, True]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) +def test_unsupervised_kneighbors( + global_dtype, + n_samples, + n_features, + n_query_pts, + n_neighbors, + query_is_train, + metric, +): + # The different algorithms must return identical results + # on their common metrics, with and without returning + # distances + + metric = _parse_metric(metric, global_dtype) + + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + + query = ( + X + if query_is_train + else local_rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + ) + + results_nodist = [] + results = [] + + for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, algorithm=algorithm, metric=metric + ) + neigh.fit(X) + + results_nodist.append(neigh.kneighbors(query, return_distance=False)) + results.append(neigh.kneighbors(query, return_distance=True)) + + for i in range(len(results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + indices_no_dist = results_nodist[i] + distances, next_distances = results[i][0], results[i + 1][0] + indices, next_indices = results[i][1], results[i + 1][1] + assert_array_equal( + indices_no_dist, + indices, + err_msg=( + f"The '{algorithm}' algorithm returns different" + "indices depending on 'return_distances'." + ), + ) + assert_array_equal( + indices, + next_indices, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different indices." + ), + ) + assert_allclose( + distances, + next_distances, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different distances." + ), + atol=1e-6, + ) + + +@pytest.mark.parametrize( + "n_samples, n_features, n_query_pts", + [ + (100, 100, 10), + (1000, 5, 100), + ], +) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS + DISTANCE_METRIC_OBJS) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) +@pytest.mark.parametrize( + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neigh_predictions_algorithm_agnosticity( + global_dtype, + n_samples, + n_features, + n_query_pts, + metric, + n_neighbors, + radius, + NeighborsMixinSubclass, +): + # The different algorithms must return identical predictions results + # on their common metrics. + + metric = _parse_metric(metric, global_dtype) + if isinstance(metric, DistanceMetric): + if "Classifier" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " classifiers." 
+ ) + if "Radius" in NeighborsMixinSubclass.__name__: + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for" + " radius-neighbor estimators." + ) + + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + y = local_rng.randint(3, size=n_samples) + + query = local_rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + predict_results = [] + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + for algorithm in ALGORITHMS: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." + ) + neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) + neigh.fit(X, y) + + predict_results.append(neigh.predict(query)) + + for i in range(len(predict_results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + predictions, next_predictions = predict_results[i], predict_results[i + 1] + + assert_allclose( + predictions, + next_predictions, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different predictions." + ), + ) + + +@pytest.mark.parametrize( + "KNeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.NearestNeighbors, + ], +) +def test_unsupervised_inputs(global_dtype, KNeighborsMixinSubclass): + # Test unsupervised inputs for neighbors estimators + + X = rng.random_sample((10, 3)).astype(global_dtype, copy=False) + y = rng.randint(3, size=10) + nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1) + nbrs_fid.fit(X) + + dist1, ind1 = nbrs_fid.kneighbors(X) + + nbrs = KNeighborsMixinSubclass(n_neighbors=1) + + for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): + nbrs.fit(data, y) + + dist2, ind2 = nbrs.kneighbors(X) + + assert_allclose(dist1, dist2) + assert_array_equal(ind1, ind2) + + +def test_not_fitted_error_gets_raised(): + X = [[1]] + neighbors_ = neighbors.NearestNeighbors() + with pytest.raises(NotFittedError): + neighbors_.kneighbors_graph(X) + with pytest.raises(NotFittedError): + neighbors_.radius_neighbors_graph(X) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +def check_precomputed(make_train_test, estimators): + """Tests unsupervised NearestNeighbors with a distance matrix.""" + # Note: smaller samples may result in spurious test success + rng = np.random.RandomState(42) + X = rng.random_sample((10, 4)) + Y = rng.random_sample((3, 4)) + DXX, DYX = make_train_test(X, Y) + for method in [ + "kneighbors", + ]: + # TODO: also test radius_neighbors, but requires different assertion + + # As a feature matrix (n_samples by n_features) + nbrs_X = neighbors.NearestNeighbors(n_neighbors=3) + nbrs_X.fit(X) + dist_X, ind_X = getattr(nbrs_X, method)(Y) + + # As a dense distance matrix (n_samples by n_samples) + nbrs_D = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric="precomputed" + ) + nbrs_D.fit(DXX) + dist_D, ind_D = getattr(nbrs_D, method)(DYX) + assert_allclose(dist_X, dist_D) + assert_array_equal(ind_X, ind_D) + + # Check auto works too + nbrs_D = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric="precomputed" + ) + nbrs_D.fit(DXX) + dist_D, ind_D = getattr(nbrs_D, method)(DYX) + assert_allclose(dist_X, dist_D) + assert_array_equal(ind_X, 
ind_D) + + # Check X=None in prediction + dist_X, ind_X = getattr(nbrs_X, method)(None) + dist_D, ind_D = getattr(nbrs_D, method)(None) + assert_allclose(dist_X, dist_D) + assert_array_equal(ind_X, ind_D) + + # Must raise a ValueError if the matrix is not of correct shape + with pytest.raises(ValueError): + getattr(nbrs_D, method)(X) + + target = np.arange(X.shape[0]) + for Est in estimators: + est = Est(metric="euclidean") + est.radius = est.n_neighbors = 1 + pred_X = est.fit(X, target).predict(Y) + est.metric = "precomputed" + pred_D = est.fit(DXX, target).predict(DYX) + assert_allclose(pred_X, pred_D) + + +def test_precomputed_dense(): + def make_train_test(X_train, X_test): + return ( + metrics.pairwise_distances(X_train), + metrics.pairwise_distances(X_test, X_train), + ) + + estimators = [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize("fmt", ["csr", "lil"]) +def test_precomputed_sparse_knn(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(n_neighbors=3 + 1).fit(X_train) + return ( + nn.kneighbors_graph(X_train, mode="distance").asformat(fmt), + nn.kneighbors_graph(X_test, mode="distance").asformat(fmt), + ) + + # We do not test RadiusNeighborsClassifier and RadiusNeighborsRegressor + # since the precomputed neighbors graph is built with k neighbors only. + estimators = [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize("fmt", ["csr", "lil"]) +def test_precomputed_sparse_radius(fmt): + def make_train_test(X_train, X_test): + nn = neighbors.NearestNeighbors(radius=1).fit(X_train) + return ( + nn.radius_neighbors_graph(X_train, mode="distance").asformat(fmt), + nn.radius_neighbors_graph(X_test, mode="distance").asformat(fmt), + ) + + # We do not test KNeighborsClassifier and KNeighborsRegressor + # since the precomputed neighbors graph is built with a radius. + estimators = [ + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ] + check_precomputed(make_train_test, estimators) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_is_sorted_by_data(csr_container): + # Test that _is_sorted_by_data works as expected. In CSR sparse matrix, + # entries in each row can be sorted by indices, by data, or unsorted. + # _is_sorted_by_data should return True when entries are sorted by data, + # and False in all other cases. 
+ + # Test with sorted single row sparse array + X = csr_container(np.arange(10).reshape(1, 10)) + assert _is_sorted_by_data(X) + # Test with unsorted 1D array + X[0, 2] = 5 + assert not _is_sorted_by_data(X) + + # Test when the data is sorted in each sample, but not necessarily + # between samples + X = csr_container([[0, 1, 2], [3, 0, 0], [3, 4, 0], [1, 0, 2]]) + assert _is_sorted_by_data(X) + + # Test with duplicates entries in X.indptr + data, indices, indptr = [0, 4, 2, 2], [0, 1, 1, 1], [0, 2, 2, 4] + X = csr_container((data, indices, indptr), shape=(3, 3)) + assert _is_sorted_by_data(X) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +@pytest.mark.parametrize("function", [sort_graph_by_row_values, _check_precomputed]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sort_graph_by_row_values(function, csr_container): + # Test that sort_graph_by_row_values returns a graph sorted by row values + X = csr_container(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X) + Xt = function(X) + assert _is_sorted_by_data(Xt) + + # test with a different number of nonzero entries for each sample + mask = np.random.RandomState(42).randint(2, size=(10, 10)) + X = X.toarray() + X[mask == 1] = 0 + X = csr_container(X) + assert not _is_sorted_by_data(X) + Xt = function(X) + assert _is_sorted_by_data(Xt) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sort_graph_by_row_values_copy(csr_container): + # Test if the sorting is done inplace if X is CSR, so that Xt is X. + X_ = csr_container(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X_) + + # sort_graph_by_row_values is done inplace if copy=False + X = X_.copy() + assert sort_graph_by_row_values(X).data is X.data + + X = X_.copy() + assert sort_graph_by_row_values(X, copy=False).data is X.data + + X = X_.copy() + assert sort_graph_by_row_values(X, copy=True).data is not X.data + + # _check_precomputed is never done inplace + X = X_.copy() + assert _check_precomputed(X).data is not X.data + + # do not raise if X is not CSR and copy=True + sort_graph_by_row_values(X.tocsc(), copy=True) + + # raise if X is not CSR and copy=False + with pytest.raises(ValueError, match="Use copy=True to allow the conversion"): + sort_graph_by_row_values(X.tocsc(), copy=False) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sort_graph_by_row_values_warning(csr_container): + # Test that the parameter warn_when_not_sorted works as expected. 
+ X = csr_container(np.abs(np.random.RandomState(42).randn(10, 10))) + assert not _is_sorted_by_data(X) + + # warning + with pytest.warns(EfficiencyWarning, match="was not sorted by row values"): + sort_graph_by_row_values(X, copy=True) + with pytest.warns(EfficiencyWarning, match="was not sorted by row values"): + sort_graph_by_row_values(X, copy=True, warn_when_not_sorted=True) + with pytest.warns(EfficiencyWarning, match="was not sorted by row values"): + _check_precomputed(X) + + # no warning + with warnings.catch_warnings(): + warnings.simplefilter("error") + sort_graph_by_row_values(X, copy=True, warn_when_not_sorted=False) + + +@pytest.mark.parametrize( + "sparse_container", DOK_CONTAINERS + BSR_CONTAINERS + DIA_CONTAINERS +) +def test_sort_graph_by_row_values_bad_sparse_format(sparse_container): + # Test that sort_graph_by_row_values and _check_precomputed error on bad formats + X = sparse_container(np.abs(np.random.RandomState(42).randn(10, 10))) + with pytest.raises(TypeError, match="format is not supported"): + sort_graph_by_row_values(X) + with pytest.raises(TypeError, match="format is not supported"): + _check_precomputed(X) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_precomputed_sparse_invalid(csr_container): + dist = np.array([[0.0, 2.0, 1.0], [2.0, 0.0, 3.0], [1.0, 3.0, 0.0]]) + dist_csr = csr_container(dist) + neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed") + neigh.fit(dist_csr) + neigh.kneighbors(None, n_neighbors=1) + neigh.kneighbors(np.array([[0.0, 0.0, 0.0]]), n_neighbors=2) + + # Ensures enough number of nearest neighbors + dist = np.array([[0.0, 2.0, 0.0], [2.0, 0.0, 3.0], [0.0, 3.0, 0.0]]) + dist_csr = csr_container(dist) + neigh.fit(dist_csr) + msg = "2 neighbors per samples are required, but some samples have only 1" + with pytest.raises(ValueError, match=msg): + neigh.kneighbors(None, n_neighbors=1) + + # Checks error with inconsistent distance matrix + dist = np.array([[5.0, 2.0, 1.0], [-2.0, 0.0, 3.0], [1.0, 3.0, 0.0]]) + dist_csr = csr_container(dist) + msg = "Negative values in data passed to precomputed distance matrix." 
+ with pytest.raises(ValueError, match=msg): + neigh.kneighbors(dist_csr, n_neighbors=1) + + +def test_precomputed_cross_validation(): + # Ensure array is split correctly + rng = np.random.RandomState(0) + X = rng.rand(20, 2) + D = pairwise_distances(X, metric="euclidean") + y = rng.randint(3, size=20) + for Est in ( + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ): + metric_score = cross_val_score(Est(), X, y) + precomp_score = cross_val_score(Est(metric="precomputed"), D, y) + assert_array_equal(metric_score, precomp_score) + + +def test_unsupervised_radius_neighbors( + global_dtype, n_samples=20, n_features=5, n_query_pts=2, radius=0.5, random_state=0 +): + # Test unsupervised radius-based query + rng = np.random.RandomState(random_state) + + X = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + + test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + for p in P: + results = [] + + for algorithm in ALGORITHMS: + neigh = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm, p=p) + neigh.fit(X) + + ind1 = neigh.radius_neighbors(test, return_distance=False) + + # sort the results: this is not done automatically for + # radius searches + dist, ind = neigh.radius_neighbors(test, return_distance=True) + for d, i, i1 in zip(dist, ind, ind1): + j = d.argsort() + d[:] = d[j] + i[:] = i[j] + i1[:] = i1[j] + results.append((dist, ind)) + + assert_allclose(np.concatenate(list(ind)), np.concatenate(list(ind1))) + + for i in range(len(results) - 1): + assert_allclose( + np.concatenate(list(results[i][0])), + np.concatenate(list(results[i + 1][0])), + ) + assert_allclose( + np.concatenate(list(results[i][1])), + np.concatenate(list(results[i + 1][1])), + ) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +def test_kneighbors_classifier( + global_dtype, + algorithm, + weights, + n_samples=40, + n_features=5, + n_test_pts=10, + n_neighbors=5, + random_state=0, +): + # Test k-neighbors classification + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features).astype(global_dtype, copy=False) - 1 + y = ((X**2).sum(axis=1) < 0.5).astype(int) + y_str = y.astype(str) + + knn = neighbors.KNeighborsClassifier( + n_neighbors=n_neighbors, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y[:n_test_pts]) + # Test prediction with y_str + knn.fit(X, y_str) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y_str[:n_test_pts]) + + +def test_kneighbors_classifier_float_labels( + global_dtype, + n_samples=40, + n_features=5, + n_test_pts=10, + n_neighbors=5, + random_state=0, +): + # Test k-neighbors classification + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features).astype(global_dtype, copy=False) - 1 + y = ((X**2).sum(axis=1) < 0.5).astype(int) + + knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors) + knn.fit(X, y.astype(float)) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y[:n_test_pts]) + + +def test_kneighbors_classifier_predict_proba(global_dtype): + # Test KNeighborsClassifier.predict_proba() method + X = np.array( + [[0, 2, 0], [0, 2, 1], [2, 0, 0], [2, 2, 0], [0, 0, 2], [0, 0, 1]] + 
).astype(global_dtype, copy=False) + y = np.array([4, 4, 5, 5, 1, 1]) + cls = neighbors.KNeighborsClassifier(n_neighbors=3, p=1) # cityblock dist + cls.fit(X, y) + y_prob = cls.predict_proba(X) + real_prob = ( + np.array( + [ + [0, 2, 1], + [1, 2, 0], + [1, 0, 2], + [0, 1, 2], + [2, 1, 0], + [2, 1, 0], + ] + ) + / 3.0 + ) + assert_array_equal(real_prob, y_prob) + # Check that it also works with non integer labels + cls.fit(X, y.astype(str)) + y_prob = cls.predict_proba(X) + assert_array_equal(real_prob, y_prob) + # Check that it works with weights='distance' + cls = neighbors.KNeighborsClassifier(n_neighbors=2, p=1, weights="distance") + cls.fit(X, y) + y_prob = cls.predict_proba(np.array([[0, 2, 0], [2, 2, 2]])) + real_prob = np.array([[0, 1, 0], [0, 0.4, 0.6]]) + assert_allclose(real_prob, y_prob) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +def test_radius_neighbors_classifier( + global_dtype, + algorithm, + weights, + n_samples=40, + n_features=5, + n_test_pts=10, + radius=0.5, + random_state=0, +): + # Test radius-based classification + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features).astype(global_dtype, copy=False) - 1 + y = ((X**2).sum(axis=1) < radius).astype(int) + y_str = y.astype(str) + + neigh = neighbors.RadiusNeighborsClassifier( + radius=radius, weights=weights, algorithm=algorithm + ) + neigh.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = neigh.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y[:n_test_pts]) + neigh.fit(X, y_str) + y_pred = neigh.predict(X[:n_test_pts] + epsilon) + assert_array_equal(y_pred, y_str[:n_test_pts]) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +@pytest.mark.parametrize("outlier_label", [0, -1, None]) +def test_radius_neighbors_classifier_when_no_neighbors( + global_dtype, algorithm, weights, outlier_label +): + # Test radius-based classifier when no neighbors found. + # In this case it should rise an informative exception + + X = np.array([[1.0, 1.0], [2.0, 2.0]], dtype=global_dtype) + y = np.array([1, 2]) + radius = 0.1 + + # no outliers + z1 = np.array([[1.01, 1.01], [2.01, 2.01]], dtype=global_dtype) + + # one outlier + z2 = np.array([[1.01, 1.01], [1.4, 1.4]], dtype=global_dtype) + + rnc = neighbors.RadiusNeighborsClassifier + clf = rnc( + radius=radius, + weights=weights, + algorithm=algorithm, + outlier_label=outlier_label, + ) + clf.fit(X, y) + assert_array_equal(np.array([1, 2]), clf.predict(z1)) + if outlier_label is None: + with pytest.raises(ValueError): + clf.predict(z2) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weights", WEIGHTS) +def test_radius_neighbors_classifier_outlier_labeling(global_dtype, algorithm, weights): + # Test radius-based classifier when no neighbors found and outliers + # are labeled. 
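The outlier-labeling behaviour checked around here can be summarised in a few lines: queries with no training neighbors inside the radius receive `outlier_label` instead of raising. A minimal sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier

X = np.array([[1.0], [1.1], [5.0]])
y = np.array([0, 0, 1])
clf = RadiusNeighborsClassifier(radius=0.5, outlier_label=-1).fit(X, y)
# The second query has no training point within radius 0.5, so it gets the
# outlier label (and a UserWarning, since -1 is not among the training classes).
clf.predict(np.array([[1.05], [3.0]]))  # -> array([ 0, -1])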
+ + X = np.array( + [[1.0, 1.0], [2.0, 2.0], [0.99, 0.99], [0.98, 0.98], [2.01, 2.01]], + dtype=global_dtype, + ) + y = np.array([1, 2, 1, 1, 2]) + radius = 0.1 + + # no outliers + z1 = np.array([[1.01, 1.01], [2.01, 2.01]], dtype=global_dtype) + + # one outlier + z2 = np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]], dtype=global_dtype) + + correct_labels1 = np.array([1, 2]) + correct_labels2 = np.array([-1, 1, 2]) + outlier_proba = np.array([0, 0]) + + clf = neighbors.RadiusNeighborsClassifier( + radius=radius, weights=weights, algorithm=algorithm, outlier_label=-1 + ) + clf.fit(X, y) + assert_array_equal(correct_labels1, clf.predict(z1)) + with pytest.warns(UserWarning, match="Outlier label -1 is not in training classes"): + assert_array_equal(correct_labels2, clf.predict(z2)) + with pytest.warns(UserWarning, match="Outlier label -1 is not in training classes"): + assert_allclose(outlier_proba, clf.predict_proba(z2)[0]) + + # test outlier_labeling of using predict_proba() + RNC = neighbors.RadiusNeighborsClassifier + X = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]], dtype=global_dtype) + y = np.array([0, 2, 2, 1, 1, 1, 3, 3, 3, 3]) + + # test outlier_label scalar verification + def check_array_exception(): + clf = RNC(radius=1, outlier_label=[[5]]) + clf.fit(X, y) + + with pytest.raises(TypeError): + check_array_exception() + + # test invalid outlier_label dtype + def check_dtype_exception(): + clf = RNC(radius=1, outlier_label="a") + clf.fit(X, y) + + with pytest.raises(TypeError): + check_dtype_exception() + + # test most frequent + clf = RNC(radius=1, outlier_label="most_frequent") + clf.fit(X, y) + proba = clf.predict_proba([[1], [15]]) + assert_array_equal(proba[1, :], [0, 0, 0, 1]) + + # test manual label in y + clf = RNC(radius=1, outlier_label=1) + clf.fit(X, y) + proba = clf.predict_proba([[1], [15]]) + assert_array_equal(proba[1, :], [0, 1, 0, 0]) + pred = clf.predict([[1], [15]]) + assert_array_equal(pred, [2, 1]) + + # test manual label out of y warning + def check_warning(): + clf = RNC(radius=1, outlier_label=4) + clf.fit(X, y) + clf.predict_proba([[1], [15]]) + + with pytest.warns(UserWarning): + check_warning() + + # test multi output same outlier label + y_multi = [ + [0, 1], + [2, 1], + [2, 2], + [1, 2], + [1, 2], + [1, 3], + [3, 3], + [3, 3], + [3, 0], + [3, 0], + ] + clf = RNC(radius=1, outlier_label=1) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) + pred = clf.predict([[7], [15]]) + assert_array_equal(pred[1, :], [1, 1]) + + # test multi output different outlier label + y_multi = [ + [0, 0], + [2, 2], + [2, 2], + [1, 1], + [1, 1], + [1, 1], + [3, 3], + [3, 3], + [3, 3], + [3, 3], + ] + clf = RNC(radius=1, outlier_label=[0, 1]) + clf.fit(X, y_multi) + proba = clf.predict_proba([[7], [15]]) + assert_array_equal(proba[0][1, :], [1, 0, 0, 0]) + assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) + pred = clf.predict([[7], [15]]) + assert_array_equal(pred[1, :], [0, 1]) + + # test inconsistent outlier label list length + def check_exception(): + clf = RNC(radius=1, outlier_label=[0, 1, 2]) + clf.fit(X, y_multi) + + with pytest.raises(ValueError): + check_exception() + + +def test_radius_neighbors_classifier_zero_distance(): + # Test radius-based classifier, when distance to a sample is zero. 
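Several tests in this file pass a callable as `weights` (via the `_weight_func` helper defined earlier in the module, not shown here). The general contract is sketched below with a hypothetical `inverse_square` callable: it receives the distance array and must return weights of the same shape, handling zero distances itself.

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def inverse_square(dist):
    # dist has shape (n_queries, n_neighbors); return weights of the same shape
    with np.errstate(divide="ignore"):
        weights = 1.0 / dist**2
    weights[np.isinf(weights)] = 1e6  # cap the weight of zero-distance neighbors
    return weights

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.0, 1.0, 2.0, 3.0])
reg = KNeighborsRegressor(n_neighbors=2, weights=inverse_square).fit(X, y)
reg.predict([[1.2]])  # weighted average of the two nearest targets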
+ + X = np.array([[1.0, 1.0], [2.0, 2.0]]) + y = np.array([1, 2]) + radius = 0.1 + + z1 = np.array([[1.01, 1.01], [2.0, 2.0]]) + correct_labels1 = np.array([1, 2]) + + weight_func = _weight_func + + for algorithm in ALGORITHMS: + for weights in ["uniform", "distance", weight_func]: + clf = neighbors.RadiusNeighborsClassifier( + radius=radius, weights=weights, algorithm=algorithm + ) + clf.fit(X, y) + with np.errstate(invalid="ignore"): + # Ignore the warning raised in _weight_func when making + # predictions with null distances resulting in np.inf values. + assert_array_equal(correct_labels1, clf.predict(z1)) + + +def test_neighbors_regressors_zero_distance(): + # Test radius-based regressor, when distance to a sample is zero. + + X = np.array([[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.5, 2.5]]) + y = np.array([1.0, 1.5, 2.0, 0.0]) + radius = 0.2 + z = np.array([[1.1, 1.1], [2.0, 2.0]]) + + rnn_correct_labels = np.array([1.25, 2.0]) + + knn_correct_unif = np.array([1.25, 1.0]) + knn_correct_dist = np.array([1.25, 2.0]) + + for algorithm in ALGORITHMS: + # we don't test for weights=_weight_func since user will be expected + # to handle zero distances themselves in the function. + for weights in ["uniform", "distance"]: + rnn = neighbors.RadiusNeighborsRegressor( + radius=radius, weights=weights, algorithm=algorithm + ) + rnn.fit(X, y) + assert_allclose(rnn_correct_labels, rnn.predict(z)) + + for weights, corr_labels in zip( + ["uniform", "distance"], [knn_correct_unif, knn_correct_dist] + ): + knn = neighbors.KNeighborsRegressor( + n_neighbors=2, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + assert_allclose(corr_labels, knn.predict(z)) + + +def test_radius_neighbors_boundary_handling(): + """Test whether points lying on boundary are handled consistently + + Also ensures that even with only one query point, an object array + is returned rather than a 2d array. 
+ """ + + X = np.array([[1.5], [3.0], [3.01]]) + radius = 3.0 + + for algorithm in ALGORITHMS: + nbrs = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm).fit(X) + results = nbrs.radius_neighbors([[0.0]], return_distance=False) + assert results.shape == (1,) + assert results.dtype == object + assert_array_equal(results[0], [0, 1]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_radius_neighbors_returns_array_of_objects(csr_container): + # check that we can pass precomputed distances to + # NearestNeighbors.radius_neighbors() + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/issues/16036 + X = csr_container(np.ones((4, 4))) + X.setdiag([0, 0, 0, 0]) + + nbrs = neighbors.NearestNeighbors( + radius=0.5, algorithm="auto", leaf_size=30, metric="precomputed" + ).fit(X) + neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True) + + expected_dist = np.empty(X.shape[0], dtype=object) + expected_dist[:] = [np.array([0]), np.array([0]), np.array([0]), np.array([0])] + expected_ind = np.empty(X.shape[0], dtype=object) + expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), np.array([3])] + + assert_array_equal(neigh_dist, expected_dist) + assert_array_equal(neigh_ind, expected_ind) + + +@pytest.mark.parametrize("algorithm", ["ball_tree", "kd_tree", "brute"]) +def test_query_equidistant_kth_nn(algorithm): + # For several candidates for the k-th nearest neighbor position, + # the first candidate should be chosen + query_point = np.array([[0, 0]]) + equidistant_points = np.array([[1, 0], [0, 1], [-1, 0], [0, -1]]) + # The 3rd and 4th points should not replace the 2nd point + # for the 2th nearest neighbor position + k = 2 + knn_indices = np.array([[0, 1]]) + nn = neighbors.NearestNeighbors(algorithm=algorithm).fit(equidistant_points) + indices = np.sort(nn.kneighbors(query_point, n_neighbors=k, return_distance=False)) + assert_array_equal(indices, knn_indices) + + +@pytest.mark.parametrize( + ["algorithm", "metric"], + list( + product( + ("kd_tree", "ball_tree", "brute"), + ("euclidean", *DISTANCE_METRIC_OBJS), + ) + ) + + [ + ("brute", "euclidean"), + ("brute", "precomputed"), + ], +) +def test_radius_neighbors_sort_results(algorithm, metric): + # Test radius_neighbors[_graph] output when sort_result is True + + metric = _parse_metric(metric, np.float64) + if isinstance(metric, DistanceMetric): + pytest.skip( + "Metrics of type `DistanceMetric` are not yet supported for radius-neighbor" + " estimators." 
+ ) + n_samples = 10 + rng = np.random.RandomState(42) + X = rng.random_sample((n_samples, 4)) + + if metric == "precomputed": + X = neighbors.radius_neighbors_graph(X, radius=np.inf, mode="distance") + model = neighbors.NearestNeighbors(algorithm=algorithm, metric=metric) + model.fit(X) + + # self.radius_neighbors + distances, indices = model.radius_neighbors(X=X, radius=np.inf, sort_results=True) + for ii in range(n_samples): + assert_array_equal(distances[ii], np.sort(distances[ii])) + + # sort_results=True and return_distance=False + if metric != "precomputed": # no need to raise with precomputed graph + with pytest.raises(ValueError, match="return_distance must be True"): + model.radius_neighbors( + X=X, radius=np.inf, sort_results=True, return_distance=False + ) + + # self.radius_neighbors_graph + graph = model.radius_neighbors_graph( + X=X, radius=np.inf, mode="distance", sort_results=True + ) + assert _is_sorted_by_data(graph) + + +def test_RadiusNeighborsClassifier_multioutput(): + # Test k-NN classifier on multioutput data + rng = check_random_state(0) + n_features = 2 + n_samples = 40 + n_output = 3 + + X = rng.rand(n_samples, n_features) + y = rng.randint(0, 3, (n_samples, n_output)) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + weights = [None, "uniform", "distance", _weight_func] + + for algorithm, weights in product(ALGORITHMS, weights): + # Stack single output prediction + y_pred_so = [] + for o in range(n_output): + rnn = neighbors.RadiusNeighborsClassifier( + weights=weights, algorithm=algorithm + ) + rnn.fit(X_train, y_train[:, o]) + y_pred_so.append(rnn.predict(X_test)) + + y_pred_so = np.vstack(y_pred_so).T + assert y_pred_so.shape == y_test.shape + + # Multioutput prediction + rnn_mo = neighbors.RadiusNeighborsClassifier( + weights=weights, algorithm=algorithm + ) + rnn_mo.fit(X_train, y_train) + y_pred_mo = rnn_mo.predict(X_test) + + assert y_pred_mo.shape == y_test.shape + assert_array_equal(y_pred_mo, y_pred_so) + + +def test_kneighbors_classifier_sparse( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 +): + # Test k-NN classifier on sparse matrices + # Like the above, but with various types of sparse matrices + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + X *= X > 0.2 + y = ((X**2).sum(axis=1) < 0.5).astype(int) + + for sparsemat in SPARSE_TYPES: + knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm="auto") + knn.fit(sparsemat(X), y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + for sparsev in SPARSE_TYPES + (np.asarray,): + X_eps = sparsev(X[:n_test_pts] + epsilon) + y_pred = knn.predict(X_eps) + assert_array_equal(y_pred, y[:n_test_pts]) + + +def test_KNeighborsClassifier_multioutput(): + # Test k-NN classifier on multioutput data + rng = check_random_state(0) + n_features = 5 + n_samples = 50 + n_output = 3 + + X = rng.rand(n_samples, n_features) + y = rng.randint(0, 3, (n_samples, n_output)) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + weights = [None, "uniform", "distance", _weight_func] + + for algorithm, weights in product(ALGORITHMS, weights): + # Stack single output prediction + y_pred_so = [] + y_pred_proba_so = [] + for o in range(n_output): + knn = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) + knn.fit(X_train, y_train[:, o]) + y_pred_so.append(knn.predict(X_test)) + y_pred_proba_so.append(knn.predict_proba(X_test)) + + y_pred_so = np.vstack(y_pred_so).T + 
assert y_pred_so.shape == y_test.shape + assert len(y_pred_proba_so) == n_output + + # Multioutput prediction + knn_mo = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) + knn_mo.fit(X_train, y_train) + y_pred_mo = knn_mo.predict(X_test) + + assert y_pred_mo.shape == y_test.shape + assert_array_equal(y_pred_mo, y_pred_so) + + # Check proba + y_pred_proba_mo = knn_mo.predict_proba(X_test) + assert len(y_pred_proba_mo) == n_output + + for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so): + assert_array_equal(proba_mo, proba_so) + + +def test_kneighbors_regressor( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0 +): + # Test k-neighbors regression + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + + y_target = y[:n_test_pts] + + weight_func = _weight_func + + for algorithm in ALGORITHMS: + for weights in ["uniform", "distance", weight_func]: + knn = neighbors.KNeighborsRegressor( + n_neighbors=n_neighbors, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert np.all(abs(y_pred - y_target) < 0.3) + + +def test_KNeighborsRegressor_multioutput_uniform_weight(): + # Test k-neighbors in multi-output regression with uniform weight + rng = check_random_state(0) + n_features = 5 + n_samples = 40 + n_output = 4 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples, n_output) + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + for algorithm, weights in product(ALGORITHMS, [None, "uniform"]): + knn = neighbors.KNeighborsRegressor(weights=weights, algorithm=algorithm) + knn.fit(X_train, y_train) + + neigh_idx = knn.kneighbors(X_test, return_distance=False) + y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) + + y_pred = knn.predict(X_test) + + assert y_pred.shape == y_test.shape + assert y_pred_idx.shape == y_test.shape + assert_allclose(y_pred, y_pred_idx) + + +def test_kneighbors_regressor_multioutput( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0 +): + # Test k-neighbors in multi-output regression + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + y = np.vstack([y, y]).T + + y_target = y[:n_test_pts] + + weights = ["uniform", "distance", _weight_func] + for algorithm, weights in product(ALGORITHMS, weights): + knn = neighbors.KNeighborsRegressor( + n_neighbors=n_neighbors, weights=weights, algorithm=algorithm + ) + knn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = knn.predict(X[:n_test_pts] + epsilon) + assert y_pred.shape == y_target.shape + + assert np.all(np.abs(y_pred - y_target) < 0.3) + + +def test_radius_neighbors_regressor( + n_samples=40, n_features=3, n_test_pts=10, radius=0.5, random_state=0 +): + # Test radius-based neighbors regression + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + + y_target = y[:n_test_pts] + + weight_func = _weight_func + + for algorithm in ALGORITHMS: + for weights in ["uniform", "distance", weight_func]: + neigh = neighbors.RadiusNeighborsRegressor( + radius=radius, weights=weights, algorithm=algorithm + ) + neigh.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = neigh.predict(X[:n_test_pts] + epsilon) 
+ assert np.all(abs(y_pred - y_target) < radius / 2) + + # test that nan is returned when no nearby observations + for weights in ["uniform", "distance"]: + neigh = neighbors.RadiusNeighborsRegressor( + radius=radius, weights=weights, algorithm="auto" + ) + neigh.fit(X, y) + X_test_nan = np.full((1, n_features), -1.0) + empty_warning_msg = ( + "One or more samples have no neighbors " + "within specified radius; predicting NaN." + ) + with pytest.warns(UserWarning, match=re.escape(empty_warning_msg)): + pred = neigh.predict(X_test_nan) + assert np.all(np.isnan(pred)) + + +def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(): + # Test radius neighbors in multi-output regression (uniform weight) + + rng = check_random_state(0) + n_features = 5 + n_samples = 40 + n_output = 4 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples, n_output) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + for algorithm, weights in product(ALGORITHMS, [None, "uniform"]): + rnn = neighbors.RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) + rnn.fit(X_train, y_train) + + neigh_idx = rnn.radius_neighbors(X_test, return_distance=False) + y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) + + y_pred_idx = np.array(y_pred_idx) + y_pred = rnn.predict(X_test) + + assert y_pred_idx.shape == y_test.shape + assert y_pred.shape == y_test.shape + assert_allclose(y_pred, y_pred_idx) + + +def test_RadiusNeighborsRegressor_multioutput( + n_samples=40, n_features=5, n_test_pts=10, random_state=0 +): + # Test k-neighbors in multi-output regression with various weight + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = np.sqrt((X**2).sum(1)) + y /= y.max() + y = np.vstack([y, y]).T + + y_target = y[:n_test_pts] + weights = ["uniform", "distance", _weight_func] + + for algorithm, weights in product(ALGORITHMS, weights): + rnn = neighbors.RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) + rnn.fit(X, y) + epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) + y_pred = rnn.predict(X[:n_test_pts] + epsilon) + + assert y_pred.shape == y_target.shape + assert np.all(np.abs(y_pred - y_target) < 0.3) + + +@pytest.mark.filterwarnings("ignore:EfficiencyWarning") +def test_kneighbors_regressor_sparse( + n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0 +): + # Test radius-based regression on sparse matrices + # Like the above, but with various types of sparse matrices + rng = np.random.RandomState(random_state) + X = 2 * rng.rand(n_samples, n_features) - 1 + y = ((X**2).sum(axis=1) < 0.25).astype(int) + + for sparsemat in SPARSE_TYPES: + knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, algorithm="auto") + knn.fit(sparsemat(X), y) + + knn_pre = neighbors.KNeighborsRegressor( + n_neighbors=n_neighbors, metric="precomputed" + ) + knn_pre.fit(pairwise_distances(X, metric="euclidean"), y) + + for sparsev in SPARSE_OR_DENSE: + X2 = sparsev(X) + assert np.mean(knn.predict(X2).round() == y) > 0.95 + + X2_pre = sparsev(pairwise_distances(X, metric="euclidean")) + if sparsev in DOK_CONTAINERS + BSR_CONTAINERS: + msg = "not supported due to its handling of explicit zeros" + with pytest.raises(TypeError, match=msg): + knn_pre.predict(X2_pre) + else: + assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 + + +def test_neighbors_iris(): + # Sanity checks on the iris dataset + # Puts three points of each label in the plane and performs a + # nearest neighbor query on 
points near the decision boundary. + + for algorithm in ALGORITHMS: + clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm=algorithm) + clf.fit(iris.data, iris.target) + assert_array_equal(clf.predict(iris.data), iris.target) + + clf.set_params(n_neighbors=9, algorithm=algorithm) + clf.fit(iris.data, iris.target) + assert np.mean(clf.predict(iris.data) == iris.target) > 0.95 + + rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm) + rgs.fit(iris.data, iris.target) + assert np.mean(rgs.predict(iris.data).round() == iris.target) > 0.95 + + +def test_neighbors_digits(): + # Sanity check on the digits dataset + # the 'brute' algorithm has been observed to fail if the input + # dtype is uint8 due to overflow in distance calculations. + + X = digits.data.astype("uint8") + Y = digits.target + (n_samples, n_features) = X.shape + train_test_boundary = int(n_samples * 0.8) + train = np.arange(0, train_test_boundary) + test = np.arange(train_test_boundary, n_samples) + (X_train, Y_train, X_test, Y_test) = X[train], Y[train], X[test], Y[test] + + clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm="brute") + score_uint8 = clf.fit(X_train, Y_train).score(X_test, Y_test) + score_float = clf.fit(X_train.astype(float, copy=False), Y_train).score( + X_test.astype(float, copy=False), Y_test + ) + assert score_uint8 == score_float + + +def test_kneighbors_graph(): + # Test kneighbors_graph to build the k-Nearest Neighbor graph. + X = np.array([[0, 1], [1.01, 1.0], [2, 0]]) + + # n_neighbors = 1 + A = neighbors.kneighbors_graph(X, 1, mode="connectivity", include_self=True) + assert_array_equal(A.toarray(), np.eye(A.shape[0])) + + A = neighbors.kneighbors_graph(X, 1, mode="distance") + assert_allclose( + A.toarray(), [[0.00, 1.01, 0.0], [1.01, 0.0, 0.0], [0.00, 1.40716026, 0.0]] + ) + + # n_neighbors = 2 + A = neighbors.kneighbors_graph(X, 2, mode="connectivity", include_self=True) + assert_array_equal(A.toarray(), [[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 1.0, 1.0]]) + + A = neighbors.kneighbors_graph(X, 2, mode="distance") + assert_allclose( + A.toarray(), + [ + [0.0, 1.01, 2.23606798], + [1.01, 0.0, 1.40716026], + [2.23606798, 1.40716026, 0.0], + ], + ) + + # n_neighbors = 3 + A = neighbors.kneighbors_graph(X, 3, mode="connectivity", include_self=True) + assert_allclose(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) + + +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_kneighbors_graph_sparse(n_neighbors, mode, csr_container, seed=36): + # Test kneighbors_graph to build the k-Nearest Neighbor graph + # for sparse input. + rng = np.random.RandomState(seed) + X = rng.randn(10, 10) + Xcsr = csr_container(X) + + assert_allclose( + neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) + + +def test_radius_neighbors_graph(): + # Test radius_neighbors_graph to build the Nearest Neighbor graph. 
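The two graph modes exercised above behave as follows; a tiny sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.array([[0.0], [1.0], [3.0]])
A_conn = kneighbors_graph(X, n_neighbors=1, mode="connectivity", include_self=True)
A_dist = kneighbors_graph(X, n_neighbors=1, mode="distance")  # self excluded by default
A_conn.toarray()  # 0/1 adjacency; with include_self=True each sample marks itself
A_dist.toarray()  # distance from each sample to its nearest *other* sample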
+ X = np.array([[0, 1], [1.01, 1.0], [2, 0]]) + + A = neighbors.radius_neighbors_graph(X, 1.5, mode="connectivity", include_self=True) + assert_array_equal(A.toarray(), [[1.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0]]) + + A = neighbors.radius_neighbors_graph(X, 1.5, mode="distance") + assert_allclose( + A.toarray(), [[0.0, 1.01, 0.0], [1.01, 0.0, 1.40716026], [0.0, 1.40716026, 0.0]] + ) + + +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_radius_neighbors_graph_sparse(n_neighbors, mode, csr_container, seed=36): + # Test radius_neighbors_graph to build the Nearest Neighbor graph + # for sparse input. + rng = np.random.RandomState(seed) + X = rng.randn(10, 10) + Xcsr = csr_container(X) + + assert_allclose( + neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) + + +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_neighbors_validate_parameters(Estimator, csr_container): + """Additional parameter validation for *Neighbors* estimators not covered by common + validation.""" + X = rng.random_sample((10, 2)) + Xsparse = csr_container(X) + X3 = rng.random_sample((10, 3)) + y = np.ones(10) + + nbrs = Estimator(algorithm="ball_tree", metric="haversine") + msg = "instance is not fitted yet" + with pytest.raises(ValueError, match=msg): + nbrs.predict(X) + msg = "Metric 'haversine' not valid for sparse input." + with pytest.raises(ValueError, match=msg): + ignore_warnings(nbrs.fit(Xsparse, y)) + + nbrs = Estimator(metric="haversine", algorithm="brute") + nbrs.fit(X3, y) + msg = "Haversine distance only valid in 2 dimensions" + with pytest.raises(ValueError, match=msg): + nbrs.predict(X3) + + nbrs = Estimator() + msg = re.escape("Found array with 0 sample(s)") + with pytest.raises(ValueError, match=msg): + nbrs.fit(np.ones((0, 2)), np.ones(0)) + + msg = "Found array with dim 3" + with pytest.raises(ValueError, match=msg): + nbrs.fit(X[:, :, None], y) + nbrs.fit(X, y) + + msg = re.escape("Found array with 0 feature(s)") + with pytest.raises(ValueError, match=msg): + nbrs.predict([[]]) + + +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +@pytest.mark.parametrize("n_features", [2, 100]) +@pytest.mark.parametrize("algorithm", ["auto", "brute"]) +def test_neighbors_minkowski_semimetric_algo_warn(Estimator, n_features, algorithm): + """ + Validation of all classes extending NeighborsBase with + Minkowski semi-metrics (i.e. when 0 < p < 1). That proper + Warning is raised for `algorithm="auto"` and "brute". + """ + X = rng.random_sample((10, n_features)) + y = np.ones(10) + + model = Estimator(p=0.1, algorithm=algorithm) + msg = ( + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metrics. Continuing the execution with `algorithm='brute'`." 
+ ) + with pytest.warns(UserWarning, match=msg): + model.fit(X, y) + + assert model._fit_method == "brute" + + +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +@pytest.mark.parametrize("n_features", [2, 100]) +@pytest.mark.parametrize("algorithm", ["kd_tree", "ball_tree"]) +def test_neighbors_minkowski_semimetric_algo_error(Estimator, n_features, algorithm): + """Check that we raise a proper error if `algorithm!='brute'` and `p<1`.""" + X = rng.random_sample((10, 2)) + y = np.ones(10) + + model = Estimator(algorithm=algorithm, p=0.1) + msg = ( + f'algorithm="{algorithm}" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algorithm="brute".' + ) + with pytest.raises(ValueError, match=msg): + model.fit(X, y) + + +# TODO: remove when NearestNeighbors methods uses parameter validation mechanism +def test_nearest_neighbors_validate_params(): + """Validate parameter of NearestNeighbors.""" + X = rng.random_sample((10, 2)) + + nbrs = neighbors.NearestNeighbors().fit(X) + msg = ( + 'Unsupported mode, must be one of "connectivity", or "distance" but got "blah"' + " instead" + ) + with pytest.raises(ValueError, match=msg): + nbrs.kneighbors_graph(X, mode="blah") + with pytest.raises(ValueError, match=msg): + nbrs.radius_neighbors_graph(X, mode="blah") + + +@pytest.mark.parametrize( + "metric", + sorted( + set(neighbors.VALID_METRICS["ball_tree"]).intersection( + neighbors.VALID_METRICS["brute"] + ) + - set(["pyfunc", *BOOL_METRICS]) + ) + + DISTANCE_METRIC_OBJS, +) +def test_neighbors_metrics( + global_dtype, + global_random_seed, + metric, + n_samples=20, + n_features=3, + n_query_pts=2, + n_neighbors=5, +): + rng = np.random.RandomState(global_random_seed) + + metric = _parse_metric(metric, global_dtype) + + # Test computing the neighbors for various metrics + algorithms = ["brute", "ball_tree", "kd_tree"] + X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + metric_params_list = _generate_test_params_for(metric, n_features) + + for metric_params in metric_params_list: + # Some metric (e.g. Weighted minkowski) are not supported by KDTree + exclude_kd_tree = ( + False + if isinstance(metric, DistanceMetric) + else metric not in neighbors.VALID_METRICS["kd_tree"] + or ("minkowski" in metric and "w" in metric_params) + ) + results = {} + p = metric_params.pop("p", 2) + for algorithm in algorithms: + if isinstance(metric, DistanceMetric) and global_dtype == np.float32: + if "tree" in algorithm: # pragma: nocover + pytest.skip( + "Neither KDTree nor BallTree support 32-bit distance metric" + " objects." 
+ ) + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm=algorithm, + metric=metric, + p=p, + metric_params=metric_params, + ) + + if exclude_kd_tree and algorithm == "kd_tree": + with pytest.raises(ValueError): + neigh.fit(X_train) + continue + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + neigh.fit(X_train) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) + + brute_dst, brute_idx = results["brute"] + ball_tree_dst, ball_tree_idx = results["ball_tree"] + + # The returned distances are always in float64 regardless of the input dtype + # We need to adjust the tolerance w.r.t the input dtype + rtol = 1e-7 if global_dtype == np.float64 else 1e-4 + + assert_allclose(brute_dst, ball_tree_dst, rtol=rtol) + assert_array_equal(brute_idx, ball_tree_idx) + + if not exclude_kd_tree: + kd_tree_dst, kd_tree_idx = results["kd_tree"] + assert_allclose(brute_dst, kd_tree_dst, rtol=rtol) + assert_array_equal(brute_idx, kd_tree_idx) + + assert_allclose(ball_tree_dst, kd_tree_dst, rtol=rtol) + assert_array_equal(ball_tree_idx, kd_tree_idx) + + +# TODO: Remove ignore_warnings when minimum supported SciPy version is 1.17 +# Some scipy metrics are deprecated (depending on the scipy version) but we +# still want to test them. +@ignore_warnings(category=DeprecationWarning) +@pytest.mark.parametrize( + "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"])) +) +def test_kneighbors_brute_backend( + metric, + global_dtype, + global_random_seed, + n_samples=2000, + n_features=30, + n_query_pts=5, + n_neighbors=5, +): + rng = np.random.RandomState(global_random_seed) + # Both backend for the 'brute' algorithm of kneighbors must give identical results. 
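The agreement check used above follows a simple pattern: fit the same data with each tree or brute-force algorithm and compare the `kneighbors` output. A condensed sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(30, 3)
results = {}
for algorithm in ("brute", "ball_tree", "kd_tree"):
    nn = NearestNeighbors(n_neighbors=4, algorithm=algorithm).fit(X)
    results[algorithm] = nn.kneighbors(X[:5])
np.testing.assert_allclose(results["brute"][0], results["kd_tree"][0])
np.testing.assert_array_equal(results["brute"][1], results["ball_tree"][1])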
+ X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + if metric in PAIRWISE_BOOLEAN_FUNCTIONS: + X_train = X_train > 0.5 + X_test = X_test > 0.5 + + metric_params_list = _generate_test_params_for(metric, n_features) + + for metric_params in metric_params_list: + p = metric_params.pop("p", 2) + + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm="brute", + metric=metric, + p=p, + metric_params=metric_params, + ) + + neigh.fit(X_train) + + with config_context(enable_cython_pairwise_dist=False): + # Use the legacy backend for brute + legacy_brute_dst, legacy_brute_idx = neigh.kneighbors( + X_test, return_distance=True + ) + with config_context(enable_cython_pairwise_dist=True): + # Use the pairwise-distances reduction backend for brute + pdr_brute_dst, pdr_brute_idx = neigh.kneighbors( + X_test, return_distance=True + ) + + assert_compatible_argkmin_results( + legacy_brute_dst, pdr_brute_dst, legacy_brute_idx, pdr_brute_idx + ) + + +def test_callable_metric(): + def custom_metric(x1, x2): + return np.sqrt(np.sum(x1**2 + x2**2)) + + X = np.random.RandomState(42).rand(20, 2) + nbrs1 = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=custom_metric + ) + nbrs2 = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric=custom_metric + ) + + nbrs1.fit(X) + nbrs2.fit(X) + + dist1, ind1 = nbrs1.kneighbors(X) + dist2, ind2 = nbrs2.kneighbors(X) + + assert_allclose(dist1, dist2) + + +@pytest.mark.parametrize( + "metric", neighbors.VALID_METRICS["brute"] + DISTANCE_METRIC_OBJS +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_valid_brute_metric_for_auto_algorithm( + global_dtype, metric, csr_container, n_samples=20, n_features=12 +): + metric = _parse_metric(metric, global_dtype) + + X = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + Xcsr = csr_container(X) + + metric_params_list = _generate_test_params_for(metric, n_features) + + if metric == "precomputed": + X_precomputed = rng.random_sample((10, 4)) + Y_precomputed = rng.random_sample((3, 4)) + DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") + DYX = metrics.pairwise_distances( + Y_precomputed, X_precomputed, metric="euclidean" + ) + nb_p = neighbors.NearestNeighbors(n_neighbors=3, metric="precomputed") + nb_p.fit(DXX) + nb_p.kneighbors(DYX) + + else: + for metric_params in metric_params_list: + nn = neighbors.NearestNeighbors( + n_neighbors=3, + algorithm="auto", + metric=metric, + metric_params=metric_params, + ) + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X = np.ascontiguousarray(X[:, feature_sl]) + + nn.fit(X) + nn.kneighbors(X) + + if metric in VALID_METRICS_SPARSE["brute"]: + nn = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=metric + ).fit(Xcsr) + nn.kneighbors(Xcsr) + + +def test_metric_params_interface(): + X = rng.rand(5, 5) + y = rng.randint(0, 2, 5) + est = neighbors.KNeighborsClassifier(metric_params={"p": 3}) + with pytest.warns(SyntaxWarning): + est.fit(X, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_predict_sparse_ball_kd_tree(csr_container): + rng = np.random.RandomState(0) + X = 
rng.rand(5, 5) + y = rng.randint(0, 2, 5) + nbrs1 = neighbors.KNeighborsClassifier(1, algorithm="kd_tree") + nbrs2 = neighbors.KNeighborsRegressor(1, algorithm="ball_tree") + for model in [nbrs1, nbrs2]: + model.fit(X, y) + with pytest.raises(ValueError): + model.predict(csr_container(X)) + + +def test_non_euclidean_kneighbors(): + rng = np.random.RandomState(0) + X = rng.rand(5, 5) + + # Find a reasonable radius. + dist_array = pairwise_distances(X).flatten() + np.sort(dist_array) + radius = dist_array[15] + + # Test kneighbors_graph + for metric in ["manhattan", "chebyshev"]: + nbrs_graph = neighbors.kneighbors_graph( + X, 3, metric=metric, mode="connectivity", include_self=True + ).toarray() + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X) + assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) + + # Test radiusneighbors_graph + for metric in ["manhattan", "chebyshev"]: + nbrs_graph = neighbors.radius_neighbors_graph( + X, radius, metric=metric, mode="connectivity", include_self=True + ).toarray() + nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X) + assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).toarray()) + + # Raise error when wrong parameters are supplied, + X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric="manhattan") + X_nbrs.fit(X) + with pytest.raises(ValueError): + neighbors.kneighbors_graph(X_nbrs, 3, metric="euclidean") + X_nbrs = neighbors.NearestNeighbors(radius=radius, metric="manhattan") + X_nbrs.fit(X) + with pytest.raises(ValueError): + neighbors.radius_neighbors_graph(X_nbrs, radius, metric="euclidean") + + +def check_object_arrays(nparray, list_check): + for ind, ele in enumerate(nparray): + assert_array_equal(ele, list_check[ind]) + + +def test_k_and_radius_neighbors_train_is_not_query(): + # Test kneighbors et.al when query is not training data + + for algorithm in ALGORITHMS: + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + + X = [[0], [1]] + nn.fit(X) + test_data = [[2], [1]] + + # Test neighbors. + dist, ind = nn.kneighbors(test_data) + assert_array_equal(dist, [[1], [0]]) + assert_array_equal(ind, [[1], [1]]) + dist, ind = nn.radius_neighbors([[2], [1]], radius=1.5) + check_object_arrays(dist, [[1], [1, 0]]) + check_object_arrays(ind, [[1], [0, 1]]) + + # Test the graph variants. + assert_array_equal( + nn.kneighbors_graph(test_data).toarray(), [[0.0, 1.0], [0.0, 1.0]] + ) + assert_array_equal( + nn.kneighbors_graph([[2], [1]], mode="distance").toarray(), + np.array([[0.0, 1.0], [0.0, 0.0]]), + ) + rng = nn.radius_neighbors_graph([[2], [1]], radius=1.5) + assert_array_equal(rng.toarray(), [[0, 1], [1, 1]]) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_X_None(algorithm): + # Test kneighbors et.al when query is None + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + + X = [[0], [1]] + nn.fit(X) + + dist, ind = nn.kneighbors() + assert_array_equal(dist, [[1], [1]]) + assert_array_equal(ind, [[1], [0]]) + dist, ind = nn.radius_neighbors(None, radius=1.5) + check_object_arrays(dist, [[1], [1]]) + check_object_arrays(ind, [[1], [0]]) + + # Test the graph variants. 
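The `X=None` query convention tested here is easy to miss: calling `kneighbors()` without a query returns, for each training point, its neighbors among the other training points. A minimal sketch with hypothetical data:

import numpy as np
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=1).fit(np.array([[0.0], [1.0], [10.0]]))
dist, ind = nn.kneighbors()  # no query -> neighbors of each training point, excluding itself
# ind is [[1], [0], [1]]: the sample at 10.0 is closest to the one at 1.0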
+ rng = nn.radius_neighbors_graph(None, radius=1.5) + kng = nn.kneighbors_graph(None) + for graph in [rng, kng]: + assert_array_equal(graph.toarray(), [[0, 1], [1, 0]]) + assert_array_equal(graph.data, [1, 1]) + assert_array_equal(graph.indices, [1, 0]) + + X = [[0, 1], [0, 1], [1, 1]] + nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) + nn.fit(X) + assert_array_equal( + nn.kneighbors_graph().toarray(), + np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), + ) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_duplicates(algorithm): + # Test behavior of kneighbors when duplicates are present in query + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + duplicates = [[0], [1], [3]] + + nn.fit(duplicates) + + # Do not do anything special to duplicates. + kng = nn.kneighbors_graph(duplicates, mode="distance") + assert_allclose( + kng.toarray(), np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + ) + assert_allclose(kng.data, [0.0, 0.0, 0.0]) + assert_allclose(kng.indices, [0, 1, 2]) + + dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) + check_object_arrays(dist, [[0, 1], [1, 0]]) + check_object_arrays(ind, [[0, 1], [0, 1]]) + + rng = nn.radius_neighbors_graph(duplicates, radius=1.5) + assert_allclose( + rng.toarray(), np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + ) + + rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") + rng.sort_indices() + assert_allclose(rng.toarray(), [[0, 1, 0], [1, 0, 0]]) + assert_allclose(rng.indices, [0, 1, 0, 1]) + assert_allclose(rng.data, [0, 1, 1, 0]) + + # Mask the first duplicates when n_duplicates > n_neighbors. + X = np.ones((3, 1)) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") + nn.fit(X) + dist, ind = nn.kneighbors() + assert_allclose(dist, np.zeros((3, 1))) + assert_allclose(ind, [[1], [0], [1]]) + + # Test that zeros are explicitly marked in kneighbors_graph. 
+ kng = nn.kneighbors_graph(mode="distance") + assert_allclose(kng.toarray(), np.zeros((3, 3))) + assert_allclose(kng.data, np.zeros(3)) + assert_allclose(kng.indices, [1, 0, 1]) + assert_allclose( + nn.kneighbors_graph().toarray(), + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + ) + + +def test_include_self_neighbors_graph(): + # Test include_self parameter in neighbors_graph + X = [[2, 3], [4, 5]] + kng = neighbors.kneighbors_graph(X, 1, include_self=True).toarray() + kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).toarray() + assert_array_equal(kng, [[1.0, 0.0], [0.0, 1.0]]) + assert_array_equal(kng_not_self, [[0.0, 1.0], [1.0, 0.0]]) + + rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).toarray() + rng_not_self = neighbors.radius_neighbors_graph( + X, 5.0, include_self=False + ).toarray() + assert_array_equal(rng, [[1.0, 1.0], [1.0, 1.0]]) + assert_array_equal(rng_not_self, [[0.0, 1.0], [1.0, 0.0]]) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_same_knn_parallel(algorithm): + X, y = datasets.make_classification( + n_samples=30, n_features=5, n_redundant=0, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y) + + clf = neighbors.KNeighborsClassifier(n_neighbors=3, algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.kneighbors(X_test) + graph = clf.kneighbors_graph(X_test, mode="distance").toarray() + + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.kneighbors(X_test) + graph_parallel = clf.kneighbors_graph(X_test, mode="distance").toarray() + + assert_array_equal(y, y_parallel) + assert_allclose(dist, dist_parallel) + assert_array_equal(ind, ind_parallel) + assert_allclose(graph, graph_parallel) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_same_radius_neighbors_parallel(algorithm): + X, y = datasets.make_classification( + n_samples=30, n_features=5, n_redundant=0, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y) + + clf = neighbors.RadiusNeighborsClassifier(radius=10, algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.radius_neighbors(X_test) + graph = clf.radius_neighbors_graph(X_test, mode="distance").toarray() + + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.radius_neighbors(X_test) + graph_parallel = clf.radius_neighbors_graph(X_test, mode="distance").toarray() + + assert_array_equal(y, y_parallel) + for i in range(len(dist)): + assert_allclose(dist[i], dist_parallel[i]) + assert_array_equal(ind[i], ind_parallel[i]) + assert_allclose(graph, graph_parallel) + + +@pytest.mark.parametrize("backend", ["threading", "loky"]) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_knn_forcing_backend(backend, algorithm): + # Non-regression test which ensures the knn methods are properly working + # even when forcing the global joblib backend. 
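The parallelism tests above assert that results do not depend on `n_jobs`; the same check in miniature, with hypothetical data:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.random.RandomState(0).rand(50, 4)
y = (X[:, 0] > 0.5).astype(int)
pred_serial = KNeighborsClassifier(n_neighbors=3).fit(X, y).predict(X)
pred_parallel = KNeighborsClassifier(n_neighbors=3, n_jobs=2).fit(X, y).predict(X)
np.testing.assert_array_equal(pred_serial, pred_parallel)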
+ with joblib.parallel_backend(backend): + X, y = datasets.make_classification( + n_samples=30, n_features=5, n_redundant=0, random_state=0 + ) + X_train, X_test, y_train, y_test = train_test_split(X, y) + + clf = neighbors.KNeighborsClassifier( + n_neighbors=3, algorithm=algorithm, n_jobs=2 + ) + clf.fit(X_train, y_train) + clf.predict(X_test) + clf.kneighbors(X_test) + clf.kneighbors_graph(X_test, mode="distance") + + +def test_dtype_convert(): + classifier = neighbors.KNeighborsClassifier(n_neighbors=1) + CLASSES = 15 + X = np.eye(CLASSES) + y = [ch for ch in "ABCDEFGHIJKLMNOPQRSTU"[:CLASSES]] + + result = classifier.fit(X, y).predict(X) + assert_array_equal(result, y) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_metric_callable(csr_container): + def sparse_metric(x, y): # Metric accepting sparse matrix input (only) + assert issparse(x) and issparse(y) + return x.dot(y.T).toarray().item() + + X = csr_container( + [[1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [0, 0, 1, 0, 0]] # Population matrix + ) + + Y = csr_container([[1, 1, 0, 1, 1], [1, 0, 0, 1, 1]]) # Query matrix + + nn = neighbors.NearestNeighbors( + algorithm="brute", n_neighbors=2, metric=sparse_metric + ).fit(X) + N = nn.kneighbors(Y, return_distance=False) + + # GS indices of nearest neighbours in `X` for `sparse_metric` + gold_standard_nn = np.array([[2, 1], [2, 1]]) + + assert_array_equal(N, gold_standard_nn) + + +# ignore conversion to boolean in pairwise_distances +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.DataConversionWarning") +def test_pairwise_boolean_distance(): + # Non-regression test for #4523 + # 'brute': uses scipy.spatial.distance through pairwise_distances + # 'ball_tree': uses sklearn.neighbors._dist_metrics + rng = np.random.RandomState(0) + X = rng.uniform(size=(6, 5)) + NN = neighbors.NearestNeighbors + + nn1 = NN(metric="jaccard", algorithm="brute").fit(X) + nn2 = NN(metric="jaccard", algorithm="ball_tree").fit(X) + assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) + + +def test_radius_neighbors_predict_proba(): + for seed in range(5): + X, y = datasets.make_classification( + n_samples=50, + n_features=5, + n_informative=3, + n_redundant=0, + n_classes=3, + random_state=seed, + ) + X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0) + outlier_label = int(2 - seed) + clf = neighbors.RadiusNeighborsClassifier(radius=2, outlier_label=outlier_label) + clf.fit(X_tr, y_tr) + pred = clf.predict(X_te) + proba = clf.predict_proba(X_te) + proba_label = proba.argmax(axis=1) + proba_label = np.where(proba.sum(axis=1) == 0, outlier_label, proba_label) + assert_array_equal(pred, proba_label) + + +def test_pipeline_with_nearest_neighbors_transformer(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. 
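The transformer-plus-precomputed chaining compared by the pipeline test starting here looks like the following in user code; a sketch with hypothetical data, where the transformer precomputes more neighbors than the downstream estimator needs:

import numpy as np
from sklearn.neighbors import KNeighborsRegressor, KNeighborsTransformer
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
X, y = rng.rand(40, 5), rng.rand(40)
chained = make_pipeline(
    KNeighborsTransformer(n_neighbors=10, mode="distance"),
    KNeighborsRegressor(n_neighbors=5, metric="precomputed"),
)
chained.fit(X, y).predict(X[:3])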
+ factor = 2 + + k_trans = neighbors.KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance") + k_trans_factor = neighbors.KNeighborsTransformer( + n_neighbors=int(n_neighbors * factor), mode="distance" + ) + + r_trans = neighbors.RadiusNeighborsTransformer(radius=radius, mode="distance") + r_trans_factor = neighbors.RadiusNeighborsTransformer( + radius=int(radius * factor), mode="distance" + ) + + k_reg = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = neighbors.RadiusNeighborsRegressor(radius=radius) + + test_list = [ + (k_trans, k_reg), + (k_trans_factor, r_reg), + (r_trans, r_reg), + (r_trans_factor, k_reg), + ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric="precomputed") + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_allclose(y_pred_chain, y_pred_compact) + + +@pytest.mark.parametrize( + "X, metric, metric_params, expected_algo", + [ + (np.random.randint(10, size=(10, 10)), "precomputed", None, "brute"), + (np.random.randn(10, 20), "euclidean", None, "brute"), + (np.random.randn(8, 5), "euclidean", None, "brute"), + (np.random.randn(10, 5), "euclidean", None, "kd_tree"), + (np.random.randn(10, 5), "seuclidean", {"V": [2] * 5}, "ball_tree"), + (np.random.randn(10, 5), "correlation", None, "brute"), + ], +) +def test_auto_algorithm(X, metric, metric_params, expected_algo): + model = neighbors.NearestNeighbors( + n_neighbors=4, algorithm="auto", metric=metric, metric_params=metric_params + ) + model.fit(X) + assert model._fit_method == expected_algo + + +# TODO: Remove ignore_warnings when minimum supported SciPy version is 1.17 +# Some scipy metrics are deprecated (depending on the scipy version) but we +# still want to test them. +@ignore_warnings(category=DeprecationWarning) +@pytest.mark.parametrize( + "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"])) +) +def test_radius_neighbors_brute_backend( + metric, + global_random_seed, + global_dtype, + n_samples=2000, + n_features=30, + n_query_pts=5, + radius=1.0, +): + rng = np.random.RandomState(global_random_seed) + # Both backends for the 'brute' algorithm of radius_neighbors + # must give identical results. 
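The brute-force backend equivalence tests in this file toggle the implementation through the global config; a compressed sketch of that mechanism (the `enable_cython_pairwise_dist` flag is the same internal option these tests use), with hypothetical data:

import numpy as np
from sklearn import config_context
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(100, 10)
nn = NearestNeighbors(n_neighbors=5, algorithm="brute").fit(X)
with config_context(enable_cython_pairwise_dist=False):   # legacy backend
    dist_legacy, _ = nn.kneighbors(X[:3])
with config_context(enable_cython_pairwise_dist=True):    # pairwise-distances reduction backend
    dist_pdr, _ = nn.kneighbors(X[:3])
np.testing.assert_allclose(dist_legacy, dist_pdr)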
+ X_train = rng.rand(n_samples, n_features).astype(global_dtype, copy=False) + X_test = rng.rand(n_query_pts, n_features).astype(global_dtype, copy=False) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + metric_params_list = _generate_test_params_for(metric, n_features) + + for metric_params in metric_params_list: + p = metric_params.pop("p", 2) + + neigh = neighbors.NearestNeighbors( + radius=radius, + algorithm="brute", + metric=metric, + p=p, + metric_params=metric_params, + ) + + neigh.fit(X_train) + + with config_context(enable_cython_pairwise_dist=False): + # Use the legacy backend for brute + legacy_brute_dst, legacy_brute_idx = neigh.radius_neighbors( + X_test, return_distance=True + ) + with config_context(enable_cython_pairwise_dist=True): + # Use the pairwise-distances reduction backend for brute + pdr_brute_dst, pdr_brute_idx = neigh.radius_neighbors( + X_test, return_distance=True + ) + + assert_compatible_radius_results( + legacy_brute_dst, + pdr_brute_dst, + legacy_brute_idx, + pdr_brute_idx, + radius=radius, + check_sorted=False, + ) + + +def test_valid_metrics_has_no_duplicate(): + for val in neighbors.VALID_METRICS.values(): + assert len(val) == len(set(val)) + + +def test_regressor_predict_on_arraylikes(): + """Ensures that `predict` works for array-likes when `weights` is a callable. + + Non-regression test for #22687. + """ + X = [[5, 1], [3, 1], [4, 3], [0, 3]] + y = [2, 3, 5, 6] + + def _weights(dist): + return np.ones_like(dist) + + est = KNeighborsRegressor(n_neighbors=1, algorithm="brute", weights=_weights) + est.fit(X, y) + assert_allclose(est.predict([[0, 2.5]]), [6]) + + +@pytest.mark.parametrize( + "Estimator, params", + [ + (neighbors.KNeighborsClassifier, {"n_neighbors": 2}), + (neighbors.KNeighborsRegressor, {"n_neighbors": 2}), + (neighbors.RadiusNeighborsRegressor, {}), + (neighbors.RadiusNeighborsClassifier, {}), + (neighbors.KNeighborsTransformer, {"n_neighbors": 2}), + (neighbors.RadiusNeighborsTransformer, {"radius": 1.5}), + (neighbors.LocalOutlierFactor, {"n_neighbors": 1}), + ], +) +def test_nan_euclidean_support(Estimator, params): + """Check that the different neighbor estimators are lenient towards `nan` + values if using `metric="nan_euclidean"`. + """ + + X = [[0, 1], [1, np.nan], [2, 3], [3, 5]] + y = [0, 0, 1, 1] + + params.update({"metric": "nan_euclidean"}) + estimator = Estimator().set_params(**params).fit(X, y) + + for response_method in ("kneighbors", "predict", "transform", "fit_predict"): + if hasattr(estimator, response_method): + output = getattr(estimator, response_method)(X) + if hasattr(output, "toarray"): + assert not np.isnan(output.data).any() + else: + assert not np.isnan(output).any() + + +def test_predict_dataframe(): + """Check that KNN predict works with dataframes + + non-regression test for issue #26768 + """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), columns=["a", "b"]) + y = np.array([1, 2, 3, 4]) + + knn = neighbors.KNeighborsClassifier(n_neighbors=2).fit(X, y) + knn.predict(X) + + +def test_nearest_neighbours_works_with_p_less_than_1(): + """Check that NearestNeighbors works with :math:`p \\in (0,1)` when `algorithm` + is `"auto"` or `"brute"` regardless of the dtype of X. 
+ + Non-regression test for issue #26548 + """ + X = np.array([[1.0, 0.0], [0.0, 0.0], [0.0, 1.0]]) + neigh = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="brute", metric_params={"p": 0.5} + ) + neigh.fit(X) + + y = neigh.radius_neighbors(X[0].reshape(1, -1), radius=4, return_distance=False) + assert_allclose(y[0], [0, 1, 2]) + + y = neigh.kneighbors(X[0].reshape(1, -1), return_distance=False) + assert_allclose(y[0], [0, 1, 2]) + + +def test_KNeighborsClassifier_raise_on_all_zero_weights(): + """Check that `predict` and `predict_proba` raises on sample of all zeros weights. + + Related to Issue #25854. + """ + X = [[0, 1], [1, 2], [2, 3], [3, 4]] + y = [0, 0, 1, 1] + + def _weights(dist): + return np.vectorize(lambda x: 0 if x > 0.5 else 1)(dist) + + est = neighbors.KNeighborsClassifier(n_neighbors=3, weights=_weights) + est.fit(X, y) + + msg = ( + "All neighbors of some sample is getting zero weights. " + "Please modify 'weights' to avoid this case if you are " + "using a user-defined function." + ) + + with pytest.raises(ValueError, match=msg): + est.predict([[1.1, 1.1]]) + + with pytest.raises(ValueError, match=msg): + est.predict_proba([[1.1, 1.1]]) + + +@pytest.mark.parametrize( + "nn_model", + [ + neighbors.KNeighborsClassifier(n_neighbors=10), + neighbors.RadiusNeighborsClassifier(), + ], +) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_neighbor_classifiers_loocv(nn_model, algorithm): + """Check that `predict` and related functions work fine with X=None + + Calling predict with X=None computes a prediction for each training point + from the labels of its neighbors (without the label of the data point being + predicted upon). This is therefore mathematically equivalent to + leave-one-out cross-validation without having do any retraining (rebuilding + a KD-tree or Ball-tree index) or any data reshuffling. + """ + X, y = datasets.make_blobs(n_samples=15, centers=5, n_features=2, random_state=0) + + nn_model = clone(nn_model).set_params(algorithm=algorithm) + + # Set the radius for RadiusNeighborsRegressor to some percentile of the + # empirical pairwise distances to avoid trivial test cases and warnings for + # predictions with no neighbors within the radius. + if "radius" in nn_model.get_params(): + dists = pairwise_distances(X).ravel() + dists = dists[dists > 0] + nn_model.set_params(radius=np.percentile(dists, 80)) + + loocv = cross_val_score(nn_model, X, y, cv=LeaveOneOut()) + nn_model.fit(X, y) + + assert_allclose(loocv, nn_model.predict(None) == y) + assert np.mean(loocv) == pytest.approx(nn_model.score(None, y)) + + # Evaluating `nn_model` on its "training" set should lead to a higher + # accuracy value than leaving out each data point in turn because the + # former can overfit while the latter cannot by construction. + assert nn_model.score(None, y) < nn_model.score(X, y) + + +@pytest.mark.parametrize( + "nn_model", + [ + neighbors.KNeighborsRegressor(n_neighbors=10), + neighbors.RadiusNeighborsRegressor(), + ], +) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_neighbor_regressors_loocv(nn_model, algorithm): + """Check that `predict` and related functions work fine with X=None""" + X, y = datasets.make_regression(n_samples=15, n_features=2, random_state=0) + + # Only checking cross_val_predict and not cross_val_score because + # cross_val_score does not work with LeaveOneOut() for a regressor: the + # default score method implements R2 score which is not well defined for a + # single data point. 
+ # + # TODO: if score is refactored to evaluate models for other scoring + # functions, then this test can be extended to check cross_val_score as + # well. + nn_model = clone(nn_model).set_params(algorithm=algorithm) + + # Set the radius for RadiusNeighborsRegressor to some percentile of the + # empirical pairwise distances to avoid trivial test cases and warnings for + # predictions with no neighbors within the radius. + if "radius" in nn_model.get_params(): + dists = pairwise_distances(X).ravel() + dists = dists[dists > 0] + nn_model.set_params(radius=np.percentile(dists, 80)) + + loocv = cross_val_predict(nn_model, X, y, cv=LeaveOneOut()) + nn_model.fit(X, y) + assert_allclose(loocv, nn_model.predict(None)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_pipeline.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..6ad78824489cada3ad56ccff34d806ba6cf1278a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_pipeline.py @@ -0,0 +1,256 @@ +""" +This is testing the equivalence between some estimators with internal nearest +neighbors computations, and the corresponding pipeline versions with +KNeighborsTransformer or RadiusNeighborsTransformer to precompute the +neighbors. +""" + +import numpy as np + +from sklearn.base import clone +from sklearn.cluster import DBSCAN, SpectralClustering +from sklearn.cluster.tests.common import generate_clustered_data +from sklearn.datasets import make_blobs +from sklearn.manifold import TSNE, Isomap, SpectralEmbedding +from sklearn.neighbors import ( + KNeighborsRegressor, + KNeighborsTransformer, + LocalOutlierFactor, + RadiusNeighborsRegressor, + RadiusNeighborsTransformer, +) +from sklearn.pipeline import make_pipeline +from sklearn.utils._testing import assert_array_almost_equal + + +def test_spectral_clustering(): + # Test chaining KNeighborsTransformer and SpectralClustering + n_neighbors = 5 + X, _ = make_blobs(random_state=0) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"), + SpectralClustering( + n_neighbors=n_neighbors, affinity="precomputed", random_state=42 + ), + ) + est_compact = SpectralClustering( + n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42 + ) + labels_compact = est_compact.fit_predict(X) + labels_chain = est_chain.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_spectral_embedding(): + # Test chaining KNeighborsTransformer and SpectralEmbedding + n_neighbors = 5 + + n_samples = 1000 + centers = np.array( + [ + [0.0, 5.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 4.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 5.0, 1.0], + ] + ) + S, true_labels = make_blobs( + n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 + ) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="connectivity"), + SpectralEmbedding( + n_neighbors=n_neighbors, affinity="precomputed", random_state=42 + ), + ) + est_compact = SpectralEmbedding( + n_neighbors=n_neighbors, affinity="nearest_neighbors", random_state=42 + ) + St_compact = est_compact.fit_transform(S) + St_chain = est_chain.fit_transform(S) + assert_array_almost_equal(St_chain, St_compact) + + +def test_dbscan(): + # Test chaining RadiusNeighborsTransformer and DBSCAN + 
radius = 0.3 + n_clusters = 3 + X = generate_clustered_data(n_clusters=n_clusters) + + # compare the chained version and the compact version + est_chain = make_pipeline( + RadiusNeighborsTransformer(radius=radius, mode="distance"), + DBSCAN(metric="precomputed", eps=radius), + ) + est_compact = DBSCAN(eps=radius) + + labels_chain = est_chain.fit_predict(X) + labels_compact = est_compact.fit_predict(X) + assert_array_almost_equal(labels_chain, labels_compact) + + +def test_isomap(): + # Test chaining KNeighborsTransformer and Isomap with + # neighbors_algorithm='precomputed' + algorithm = "auto" + n_neighbors = 10 + + X, _ = make_blobs(random_state=0) + X2, _ = make_blobs(random_state=1) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer( + n_neighbors=n_neighbors, algorithm=algorithm, mode="distance" + ), + Isomap(n_neighbors=n_neighbors, metric="precomputed"), + ) + est_compact = Isomap(n_neighbors=n_neighbors, neighbors_algorithm=algorithm) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + Xt_chain = est_chain.transform(X2) + Xt_compact = est_compact.transform(X2) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_tsne(): + # Test chaining KNeighborsTransformer and TSNE + max_iter = 250 + perplexity = 5 + n_neighbors = int(3.0 * perplexity + 1) + + rng = np.random.RandomState(0) + X = rng.randn(20, 2) + + for metric in ["minkowski", "sqeuclidean"]: + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer( + n_neighbors=n_neighbors, mode="distance", metric=metric + ), + TSNE( + init="random", + metric="precomputed", + perplexity=perplexity, + method="barnes_hut", + random_state=42, + max_iter=max_iter, + ), + ) + est_compact = TSNE( + init="random", + metric=metric, + perplexity=perplexity, + max_iter=max_iter, + method="barnes_hut", + random_state=42, + ) + + Xt_chain = est_chain.fit_transform(X) + Xt_compact = est_compact.fit_transform(X) + assert_array_almost_equal(Xt_chain, Xt_compact) + + +def test_lof_novelty_false(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"), + LocalOutlierFactor( + metric="precomputed", + n_neighbors=n_neighbors, + novelty=False, + contamination="auto", + ), + ) + est_compact = LocalOutlierFactor( + n_neighbors=n_neighbors, novelty=False, contamination="auto" + ) + + pred_chain = est_chain.fit_predict(X) + pred_compact = est_compact.fit_predict(X) + assert_array_almost_equal(pred_chain, pred_compact) + + +def test_lof_novelty_true(): + # Test chaining KNeighborsTransformer and LocalOutlierFactor + n_neighbors = 4 + + rng = np.random.RandomState(0) + X1 = rng.randn(40, 2) + X2 = rng.randn(40, 2) + + # compare the chained version and the compact version + est_chain = make_pipeline( + KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"), + LocalOutlierFactor( + metric="precomputed", + n_neighbors=n_neighbors, + novelty=True, + contamination="auto", + ), + ) + est_compact = LocalOutlierFactor( + n_neighbors=n_neighbors, novelty=True, contamination="auto" + ) + + pred_chain = est_chain.fit(X1).predict(X2) + pred_compact = est_compact.fit(X1).predict(X2) + assert_array_almost_equal(pred_chain, 
pred_compact) + + +def test_kneighbors_regressor(): + # Test chaining KNeighborsTransformer and classifiers/regressors + rng = np.random.RandomState(0) + X = 2 * rng.rand(40, 5) - 1 + X2 = 2 * rng.rand(40, 5) - 1 + y = rng.rand(40, 1) + + n_neighbors = 12 + radius = 1.5 + # We precompute more neighbors than necessary, to have equivalence between + # k-neighbors estimator after radius-neighbors transformer, and vice-versa. + factor = 2 + + k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance") + k_trans_factor = KNeighborsTransformer( + n_neighbors=int(n_neighbors * factor), mode="distance" + ) + + r_trans = RadiusNeighborsTransformer(radius=radius, mode="distance") + r_trans_factor = RadiusNeighborsTransformer( + radius=int(radius * factor), mode="distance" + ) + + k_reg = KNeighborsRegressor(n_neighbors=n_neighbors) + r_reg = RadiusNeighborsRegressor(radius=radius) + + test_list = [ + (k_trans, k_reg), + (k_trans_factor, r_reg), + (r_trans, r_reg), + (r_trans_factor, k_reg), + ] + + for trans, reg in test_list: + # compare the chained version and the compact version + reg_compact = clone(reg) + reg_precomp = clone(reg) + reg_precomp.set_params(metric="precomputed") + + reg_chain = make_pipeline(clone(trans), reg_precomp) + + y_pred_chain = reg_chain.fit(X, y).predict(X2) + y_pred_compact = reg_compact.fit(X, y).predict(X2) + assert_array_almost_equal(y_pred_chain, y_pred_compact) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..de19152e8b7f236d0a524f756ca9c40d48023edb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_neighbors_tree.py @@ -0,0 +1,296 @@ +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import pickle + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal + +from sklearn.metrics import DistanceMetric +from sklearn.neighbors._ball_tree import ( + BallTree, + kernel_norm, +) +from sklearn.neighbors._ball_tree import ( + NeighborsHeap64 as NeighborsHeapBT, +) +from sklearn.neighbors._ball_tree import ( + nodeheap_sort as nodeheap_sort_bt, +) +from sklearn.neighbors._ball_tree import ( + simultaneous_sort as simultaneous_sort_bt, +) +from sklearn.neighbors._kd_tree import ( + KDTree, +) +from sklearn.neighbors._kd_tree import ( + NeighborsHeap64 as NeighborsHeapKDT, +) +from sklearn.neighbors._kd_tree import ( + nodeheap_sort as nodeheap_sort_kdt, +) +from sklearn.neighbors._kd_tree import ( + simultaneous_sort as simultaneous_sort_kdt, +) +from sklearn.utils import check_random_state + +rng = np.random.RandomState(42) +V_mahalanobis = rng.rand(3, 3) +V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T) + +DIMENSION = 3 + +METRICS = { + "euclidean": {}, + "manhattan": {}, + "minkowski": dict(p=3), + "chebyshev": {}, + "seuclidean": dict(V=rng.random_sample(DIMENSION)), + "mahalanobis": dict(V=V_mahalanobis), +} + +KD_TREE_METRICS = ["euclidean", "manhattan", "chebyshev", "minkowski"] +BALL_TREE_METRICS = list(METRICS) + + +def dist_func(x1, x2, p): + return np.sum((x1 - x2) ** p) ** (1.0 / p) + + +def compute_kernel_slow(Y, X, kernel, h): + d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1)) + norm = kernel_norm(h, X.shape[1], kernel) + + if kernel == "gaussian": + return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1) + elif kernel == "tophat": + return norm * (d < 
h).sum(-1) + elif kernel == "epanechnikov": + return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1) + elif kernel == "exponential": + return norm * (np.exp(-d / h)).sum(-1) + elif kernel == "linear": + return norm * ((1 - d / h) * (d < h)).sum(-1) + elif kernel == "cosine": + return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1) + else: + raise ValueError("kernel not recognized") + + +def brute_force_neighbors(X, Y, k, metric, **kwargs): + D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) + ind = np.argsort(D, axis=1)[:, :k] + dist = D[np.arange(Y.shape[0])[:, None], ind] + return dist, ind + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +@pytest.mark.parametrize( + "kernel", ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] +) +@pytest.mark.parametrize("h", [0.01, 0.1, 1]) +@pytest.mark.parametrize("rtol", [0, 1e-5]) +@pytest.mark.parametrize("atol", [1e-6, 1e-2]) +@pytest.mark.parametrize("breadth_first", [True, False]) +def test_kernel_density( + Cls, kernel, h, rtol, atol, breadth_first, n_samples=100, n_features=3 +): + rng = check_random_state(1) + X = rng.random_sample((n_samples, n_features)) + Y = rng.random_sample((n_samples, n_features)) + dens_true = compute_kernel_slow(Y, X, kernel, h) + + tree = Cls(X, leaf_size=10) + dens = tree.kernel_density( + Y, h, atol=atol, rtol=rtol, kernel=kernel, breadth_first=breadth_first + ) + assert_allclose(dens, dens_true, atol=atol, rtol=max(rtol, 1e-7)) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +def test_neighbor_tree_query_radius(Cls, n_samples=100, n_features=10): + rng = check_random_state(0) + X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1 + query_pt = np.zeros(n_features, dtype=float) + + eps = 1e-15 # roundoff error can cause test to fail + tree = Cls(X, leaf_size=5) + rad = np.sqrt(((X - query_pt) ** 2).sum(1)) + + for r in np.linspace(rad[0], rad[-1], 100): + ind = tree.query_radius([query_pt], r + eps)[0] + i = np.where(rad <= r + eps)[0] + + ind.sort() + i.sort() + + assert_array_almost_equal(i, ind) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +def test_neighbor_tree_query_radius_distance(Cls, n_samples=100, n_features=10): + rng = check_random_state(0) + X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1 + query_pt = np.zeros(n_features, dtype=float) + + eps = 1e-15 # roundoff error can cause test to fail + tree = Cls(X, leaf_size=5) + rad = np.sqrt(((X - query_pt) ** 2).sum(1)) + + for r in np.linspace(rad[0], rad[-1], 100): + ind, dist = tree.query_radius([query_pt], r + eps, return_distance=True) + + ind = ind[0] + dist = dist[0] + + d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1)) + + assert_array_almost_equal(d, dist) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +@pytest.mark.parametrize("dualtree", (True, False)) +def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3): + rng = check_random_state(0) + X = rng.random_sample((n_samples, n_features)) + Y = rng.random_sample((n_samples, n_features)) + r = np.linspace(0, 1, 10) + tree = Cls(X, leaf_size=10) + + D = DistanceMetric.get_metric("euclidean").pairwise(Y, X) + counts_true = [(D <= ri).sum() for ri in r] + + counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree) + assert_array_almost_equal(counts, counts_true) + + +@pytest.mark.parametrize("NeighborsHeap", [NeighborsHeapBT, NeighborsHeapKDT]) +def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10): + heap = NeighborsHeap(n_pts, n_nbrs) + rng = check_random_state(0) + + 
for row in range(n_pts): + d_in = rng.random_sample(2 * n_nbrs).astype(np.float64, copy=False) + i_in = np.arange(2 * n_nbrs, dtype=np.intp) + for d, i in zip(d_in, i_in): + heap.push(row, d, i) + + ind = np.argsort(d_in) + d_in = d_in[ind] + i_in = i_in[ind] + + d_heap, i_heap = heap.get_arrays(sort=True) + + assert_array_almost_equal(d_in[:n_nbrs], d_heap[row]) + assert_array_almost_equal(i_in[:n_nbrs], i_heap[row]) + + +@pytest.mark.parametrize("nodeheap_sort", [nodeheap_sort_bt, nodeheap_sort_kdt]) +def test_node_heap(nodeheap_sort, n_nodes=50): + rng = check_random_state(0) + vals = rng.random_sample(n_nodes).astype(np.float64, copy=False) + + i1 = np.argsort(vals) + vals2, i2 = nodeheap_sort(vals) + + assert_array_almost_equal(i1, i2) + assert_array_almost_equal(vals[i1], vals2) + + +@pytest.mark.parametrize( + "simultaneous_sort", [simultaneous_sort_bt, simultaneous_sort_kdt] +) +def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201): + rng = check_random_state(0) + dist = rng.random_sample((n_rows, n_pts)).astype(np.float64, copy=False) + ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(np.intp, copy=False) + + dist2 = dist.copy() + ind2 = ind.copy() + + # simultaneous sort rows using function + simultaneous_sort(dist, ind) + + # simultaneous sort rows using numpy + i = np.argsort(dist2, axis=1) + row_ind = np.arange(n_rows)[:, None] + dist2 = dist2[row_ind, i] + ind2 = ind2[row_ind, i] + + assert_array_almost_equal(dist, dist2) + assert_array_almost_equal(ind, ind2) + + +@pytest.mark.parametrize("Cls", [KDTree, BallTree]) +def test_gaussian_kde(Cls, n_samples=1000): + # Compare gaussian KDE results to scipy.stats.gaussian_kde + from scipy.stats import gaussian_kde + + rng = check_random_state(0) + x_in = rng.normal(0, 1, n_samples) + x_out = np.linspace(-5, 5, 30) + + for h in [0.01, 0.1, 1]: + tree = Cls(x_in[:, None]) + gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in)) + + dens_tree = tree.kernel_density(x_out[:, None], h) / n_samples + dens_gkde = gkde.evaluate(x_out) + + assert_array_almost_equal(dens_tree, dens_gkde, decimal=3) + + +@pytest.mark.parametrize( + "Cls, metric", + itertools.chain( + [(KDTree, metric) for metric in KD_TREE_METRICS], + [(BallTree, metric) for metric in BALL_TREE_METRICS], + ), +) +@pytest.mark.parametrize("k", (1, 3, 5)) +@pytest.mark.parametrize("dualtree", (True, False)) +@pytest.mark.parametrize("breadth_first", (True, False)) +def test_nn_tree_query(Cls, metric, k, dualtree, breadth_first): + rng = check_random_state(0) + X = rng.random_sample((40, DIMENSION)) + Y = rng.random_sample((10, DIMENSION)) + + kwargs = METRICS[metric] + + kdt = Cls(X, leaf_size=1, metric=metric, **kwargs) + dist1, ind1 = kdt.query(Y, k, dualtree=dualtree, breadth_first=breadth_first) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs) + + # don't check indices here: if there are any duplicate distances, + # the indices may not match. Distances should not have this problem. 
+ assert_array_almost_equal(dist1, dist2) + + +@pytest.mark.parametrize( + "Cls, metric", + [(KDTree, "euclidean"), (BallTree, "euclidean"), (BallTree, dist_func)], +) +@pytest.mark.parametrize("protocol", (0, 1, 2)) +def test_pickle(Cls, metric, protocol): + rng = check_random_state(0) + X = rng.random_sample((10, 3)) + + if hasattr(metric, "__call__"): + kwargs = {"p": 2} + else: + kwargs = {} + + tree1 = Cls(X, leaf_size=1, metric=metric, **kwargs) + + ind1, dist1 = tree1.query(X) + + s = pickle.dumps(tree1, protocol=protocol) + tree2 = pickle.loads(s) + + ind2, dist2 = tree2.query(X) + + assert_array_almost_equal(ind1, ind2) + assert_array_almost_equal(dist1, dist2) + + assert isinstance(tree2, Cls) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_quad_tree.py b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_quad_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..be9a4c5fe549d32a130f9c6a55f6675fa0e42f20 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neighbors/tests/test_quad_tree.py @@ -0,0 +1,144 @@ +import pickle + +import numpy as np +import pytest + +from sklearn.neighbors._quad_tree import _QuadTree +from sklearn.utils import check_random_state + + +def test_quadtree_boundary_computation(): + # Introduce a point into a quad tree with boundaries not easy to compute. + Xs = [] + + # check a random case + Xs.append(np.array([[-1, 1], [-4, -1]], dtype=np.float32)) + # check the case where only 0 are inserted + Xs.append(np.array([[0, 0], [0, 0]], dtype=np.float32)) + # check the case where only negative are inserted + Xs.append(np.array([[-1, -2], [-4, 0]], dtype=np.float32)) + # check the case where only small numbers are inserted + Xs.append(np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]], dtype=np.float32)) + + for X in Xs: + tree = _QuadTree(n_dimensions=2, verbose=0) + tree.build_tree(X) + tree._check_coherence() + + +def test_quadtree_similar_point(): + # Introduce a point into a quad tree where a similar point already exists. + # Test will hang if it doesn't complete. 
+ Xs = [] + + # check the case where points are actually different + Xs.append(np.array([[1, 2], [3, 4]], dtype=np.float32)) + # check the case where points are the same on X axis + Xs.append(np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32)) + # check the case where points are arbitrarily close on X axis + Xs.append(np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32)) + # check the case where points are the same on Y axis + Xs.append(np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32)) + # check the case where points are arbitrarily close on Y axis + Xs.append(np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32)) + # check the case where points are arbitrarily close on both axes + Xs.append(np.array([[1.00001, 2.00001], [1.00002, 2.00002]], dtype=np.float32)) + + # check the case where points are arbitrarily close on both axes + # close to machine epsilon - x axis + Xs.append(np.array([[1, 0.0003817754041], [2, 0.0003817753750]], dtype=np.float32)) + + # check the case where points are arbitrarily close on both axes + # close to machine epsilon - y axis + Xs.append( + np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]], dtype=np.float32) + ) + + for X in Xs: + tree = _QuadTree(n_dimensions=2, verbose=0) + tree.build_tree(X) + tree._check_coherence() + + +@pytest.mark.parametrize("n_dimensions", (2, 3)) +@pytest.mark.parametrize("protocol", (0, 1, 2)) +def test_quad_tree_pickle(n_dimensions, protocol): + rng = check_random_state(0) + + X = rng.random_sample((10, n_dimensions)) + + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(X) + + s = pickle.dumps(tree, protocol=protocol) + bt2 = pickle.loads(s) + + for x in X: + cell_x_tree = tree.get_cell(x) + cell_x_bt2 = bt2.get_cell(x) + assert cell_x_tree == cell_x_bt2 + + +@pytest.mark.parametrize("n_dimensions", (2, 3)) +def test_qt_insert_duplicate(n_dimensions): + rng = check_random_state(0) + + X = rng.random_sample((10, n_dimensions)) + Xd = np.r_[X, X[:5]] + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(Xd) + + cumulative_size = tree.cumulative_size + leafs = tree.leafs + + # Assert that the first 5 are indeed duplicated and that the next + # ones are single point leaf + for i, x in enumerate(X): + cell_id = tree.get_cell(x) + assert leafs[cell_id] + assert cumulative_size[cell_id] == 1 + (i < 5) + + +def test_summarize(): + # Simple check for quad tree's summarize + + angle = 0.9 + X = np.array( + [[-10.0, -10.0], [9.0, 10.0], [10.0, 9.0], [10.0, 10.0]], dtype=np.float32 + ) + query_pt = X[0, :] + n_dimensions = X.shape[1] + offset = n_dimensions + 2 + + qt = _QuadTree(n_dimensions, verbose=0) + qt.build_tree(X) + + idx, summary = qt._py_summarize(query_pt, X, angle) + + node_dist = summary[n_dimensions] + node_size = summary[n_dimensions + 1] + + # Summary should contain only 1 node with size 3 and distance to + # X[1:] barycenter + barycenter = X[1:].mean(axis=0) + ds2c = ((X[0] - barycenter) ** 2).sum() + + assert idx == offset + assert node_size == 3, "summary size = {}".format(node_size) + assert np.isclose(node_dist, ds2c) + + # Summary should contain all 3 node with size 1 and distance to + # each point in X[1:] for ``angle=0`` + idx, summary = qt._py_summarize(query_pt, X, 0.0) + barycenter = X[1:].mean(axis=0) + ds2c = ((X[0] - barycenter) ** 2).sum() + + assert idx == 3 * (offset) + for i in range(3): + node_dist = summary[i * offset + n_dimensions] + node_size = summary[i * offset + n_dimensions + 1] + + ds2c = ((X[0] - X[i + 1]) ** 2).sum() + + 
assert node_size == 1, "summary size = {}".format(node_size) + assert np.isclose(node_dist, ds2c) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fa5980ce24f5c778f8c1cb505c9e5218b5f30a27 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py @@ -0,0 +1,9 @@ +"""Models based on neural networks.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._multilayer_perceptron import MLPClassifier, MLPRegressor +from ._rbm import BernoulliRBM + +__all__ = ["BernoulliRBM", "MLPClassifier", "MLPRegressor"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..25f0b0a18512b71147e292caf5891cf5620fccb6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py @@ -0,0 +1,287 @@ +"""Utilities for the neural network modules""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +from scipy.special import expit as logistic_sigmoid +from scipy.special import xlogy + + +def inplace_identity(X): + """Simply leave the input array unchanged. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data, where `n_samples` is the number of samples + and `n_features` is the number of features. + """ + # Nothing to do + + +def inplace_exp(X): + """Compute the exponential inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + np.exp(X, out=X) + + +def inplace_logistic(X): + """Compute the logistic function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + logistic_sigmoid(X, out=X) + + +def inplace_tanh(X): + """Compute the hyperbolic tan function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + np.tanh(X, out=X) + + +def inplace_relu(X): + """Compute the rectified linear unit function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + np.maximum(X, 0, out=X) + + +def inplace_softmax(X): + """Compute the K-way softmax function inplace. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + """ + tmp = X - X.max(axis=1)[:, np.newaxis] + np.exp(tmp, out=X) + X /= X.sum(axis=1)[:, np.newaxis] + + +ACTIVATIONS = { + "identity": inplace_identity, + "exp": inplace_exp, + "tanh": inplace_tanh, + "logistic": inplace_logistic, + "relu": inplace_relu, + "softmax": inplace_softmax, +} + + +def inplace_identity_derivative(Z, delta): + """Apply the derivative of the identity function: do nothing. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the identity activation function during + the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + # Nothing to do + + +def inplace_logistic_derivative(Z, delta): + """Apply the derivative of the logistic sigmoid function. 
+ + It exploits the fact that the derivative is a simple function of the output + value from logistic function. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the logistic activation function during + the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= Z + delta *= 1 - Z + + +def inplace_tanh_derivative(Z, delta): + """Apply the derivative of the hyperbolic tanh function. + + It exploits the fact that the derivative is a simple function of the output + value from hyperbolic tangent. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the hyperbolic tangent activation + function during the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= 1 - Z**2 + + +def inplace_relu_derivative(Z, delta): + """Apply the derivative of the relu function. + + It exploits the fact that the derivative is a simple function of the output + value from rectified linear units activation function. + + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the rectified linear units activation + function during the forward pass. + + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta[Z == 0] = 0 + + +DERIVATIVES = { + "identity": inplace_identity_derivative, + "tanh": inplace_tanh_derivative, + "logistic": inplace_logistic_derivative, + "relu": inplace_relu_derivative, +} + + +def squared_loss(y_true, y_pred, sample_weight=None): + """Compute the squared loss for regression. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) values. + + y_pred : array-like or label indicator matrix + Predicted values, as returned by a regression estimator. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + return ( + 0.5 * np.average((y_true - y_pred) ** 2, weights=sample_weight, axis=0).mean() + ) + + +def poisson_loss(y_true, y_pred, sample_weight=None): + """Compute (half of the) Poisson deviance loss for regression. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + + y_pred : array-like or label indicator matrix + Predicted values, as returned by a regression estimator. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + # TODO: Decide what to do with the term `xlogy(y_true, y_true) - y_true`. For now, + # it is included. But the _loss module doesn't use it (for performance reasons) and + # only adds it as return of constant_to_optimal_zero (mainly for testing). + return np.average( + xlogy(y_true, y_true / y_pred) - y_true + y_pred, weights=sample_weight, axis=0 + ).sum() + + +def log_loss(y_true, y_prob, sample_weight=None): + """Compute Logistic loss for classification. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. 
+ + y_prob : array-like of float, shape = (n_samples, n_classes) + Predicted probabilities, as returned by a classifier's + predict_proba method. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) + if y_prob.shape[1] == 1: + y_prob = np.append(1 - y_prob, y_prob, axis=1) + + if y_true.shape[1] == 1: + y_true = np.append(1 - y_true, y_true, axis=1) + + return -np.average(xlogy(y_true, y_prob), weights=sample_weight, axis=0).sum() + + +def binary_log_loss(y_true, y_prob, sample_weight=None): + """Compute binary logistic loss for classification. + + This is identical to log_loss in binary classification case, + but is kept for its use in multilabel case. + + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + + y_prob : array-like of float, shape = (n_samples, 1) + Predicted probabilities, as returned by a classifier's + predict_proba method. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) + return -np.average( + xlogy(y_true, y_prob) + xlogy(1 - y_true, 1 - y_prob), + weights=sample_weight, + axis=0, + ).sum() + + +LOSS_FUNCTIONS = { + "squared_error": squared_loss, + "poisson": poisson_loss, + "log_loss": log_loss, + "binary_log_loss": binary_log_loss, +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py new file mode 100644 index 0000000000000000000000000000000000000000..e8260164202e648385618ff32bd9f3a1e5f21617 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py @@ -0,0 +1,1797 @@ +"""Multi-layer Perceptron""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABC, abstractmethod +from itertools import chain, pairwise +from numbers import Integral, Real + +import numpy as np +import scipy.optimize + +from ..base import ( + BaseEstimator, + ClassifierMixin, + RegressorMixin, + _fit_context, + is_classifier, +) +from ..exceptions import ConvergenceWarning +from ..metrics import accuracy_score, r2_score +from ..model_selection import train_test_split +from ..preprocessing import LabelBinarizer +from ..utils import ( + _safe_indexing, + check_random_state, + column_or_1d, + gen_batches, + shuffle, +) +from ..utils._param_validation import Interval, Options, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.fixes import _get_additional_lbfgs_options_dict +from ..utils.metaestimators import available_if +from ..utils.multiclass import ( + _check_partial_fit_first_call, + type_of_target, + unique_labels, +) +from ..utils.optimize import _check_optimize_result +from ..utils.validation import _check_sample_weight, check_is_fitted, validate_data +from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS +from ._stochastic_optimizers import AdamOptimizer, SGDOptimizer + +_STOCHASTIC_SOLVERS = ["sgd", "adam"] + + +def _pack(coefs_, intercepts_): + """Pack the parameters into a single vector.""" + return np.hstack([l.ravel() for l in coefs_ + intercepts_]) + 
+ +class BaseMultilayerPerceptron(BaseEstimator, ABC): + """Base class for MLP classification and regression. + + Warning: This class should not be used directly. + Use derived classes instead. + + .. versionadded:: 0.18 + """ + + _parameter_constraints: dict = { + "hidden_layer_sizes": [ + "array-like", + Interval(Integral, 1, None, closed="left"), + ], + "activation": [StrOptions({"identity", "logistic", "tanh", "relu"})], + "solver": [StrOptions({"lbfgs", "sgd", "adam"})], + "alpha": [Interval(Real, 0, None, closed="left")], + "batch_size": [ + StrOptions({"auto"}), + Interval(Integral, 1, None, closed="left"), + ], + "learning_rate": [StrOptions({"constant", "invscaling", "adaptive"})], + "learning_rate_init": [Interval(Real, 0, None, closed="neither")], + "power_t": [Interval(Real, 0, None, closed="left")], + "max_iter": [Interval(Integral, 1, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "tol": [Interval(Real, 0, None, closed="left")], + "verbose": ["verbose"], + "warm_start": ["boolean"], + "momentum": [Interval(Real, 0, 1, closed="both")], + "nesterovs_momentum": ["boolean"], + "early_stopping": ["boolean"], + "validation_fraction": [Interval(Real, 0, 1, closed="left")], + "beta_1": [Interval(Real, 0, 1, closed="left")], + "beta_2": [Interval(Real, 0, 1, closed="left")], + "epsilon": [Interval(Real, 0, None, closed="neither")], + "n_iter_no_change": [ + Interval(Integral, 1, None, closed="left"), + Options(Real, {np.inf}), + ], + "max_fun": [Interval(Integral, 1, None, closed="left")], + } + + @abstractmethod + def __init__( + self, + hidden_layer_sizes, + activation, + solver, + alpha, + batch_size, + learning_rate, + learning_rate_init, + power_t, + max_iter, + loss, + shuffle, + random_state, + tol, + verbose, + warm_start, + momentum, + nesterovs_momentum, + early_stopping, + validation_fraction, + beta_1, + beta_2, + epsilon, + n_iter_no_change, + max_fun, + ): + self.activation = activation + self.solver = solver + self.alpha = alpha + self.batch_size = batch_size + self.learning_rate = learning_rate + self.learning_rate_init = learning_rate_init + self.power_t = power_t + self.max_iter = max_iter + self.loss = loss + self.hidden_layer_sizes = hidden_layer_sizes + self.shuffle = shuffle + self.random_state = random_state + self.tol = tol + self.verbose = verbose + self.warm_start = warm_start + self.momentum = momentum + self.nesterovs_momentum = nesterovs_momentum + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.n_iter_no_change = n_iter_no_change + self.max_fun = max_fun + + def _unpack(self, packed_parameters): + """Extract the coefficients and intercepts from packed_parameters.""" + for i in range(self.n_layers_ - 1): + start, end, shape = self._coef_indptr[i] + self.coefs_[i] = np.reshape(packed_parameters[start:end], shape) + + start, end = self._intercept_indptr[i] + self.intercepts_[i] = packed_parameters[start:end] + + def _forward_pass(self, activations): + """Perform a forward pass on the network by computing the values + of the neurons in the hidden layers and the output layer. + + Parameters + ---------- + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. 
+ """ + hidden_activation = ACTIVATIONS[self.activation] + # Iterate over the hidden layers + for i in range(self.n_layers_ - 1): + activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i]) + activations[i + 1] += self.intercepts_[i] + + # For the hidden layers + if (i + 1) != (self.n_layers_ - 1): + hidden_activation(activations[i + 1]) + + # For the last layer + output_activation = ACTIVATIONS[self.out_activation_] + output_activation(activations[i + 1]) + + return activations + + def _forward_pass_fast(self, X, check_input=True): + """Predict using the trained model + + This is the same as _forward_pass but does not record the activations + of all layers and only returns the last layer's activation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + check_input : bool, default=True + Perform input data validation or not. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The decision function of the samples for each class in the model. + """ + if check_input: + X = validate_data(self, X, accept_sparse=["csr", "csc"], reset=False) + + # Initialize first layer + activation = X + + # Forward propagate + hidden_activation = ACTIVATIONS[self.activation] + for i in range(self.n_layers_ - 1): + activation = safe_sparse_dot(activation, self.coefs_[i]) + activation += self.intercepts_[i] + if i != self.n_layers_ - 2: + hidden_activation(activation) + output_activation = ACTIVATIONS[self.out_activation_] + output_activation(activation) + + return activation + + def _compute_loss_grad( + self, layer, sw_sum, activations, deltas, coef_grads, intercept_grads + ): + """Compute the gradient of loss with respect to coefs and intercept for + specified layer. + + This function does backpropagation for the specified one layer. + """ + coef_grads[layer] = safe_sparse_dot(activations[layer].T, deltas[layer]) + coef_grads[layer] += self.alpha * self.coefs_[layer] + coef_grads[layer] /= sw_sum + + intercept_grads[layer] = np.sum(deltas[layer], axis=0) / sw_sum + + def _loss_grad_lbfgs( + self, + packed_coef_inter, + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + ): + """Compute the MLP loss function and its corresponding derivatives + with respect to the different parameters given in the initialization. + + Returned gradients are packed in a single vector so it can be used + in lbfgs + + Parameters + ---------- + packed_coef_inter : ndarray + A vector comprising the flattened coefficients and intercepts. + + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + + deltas : list, length = n_layers - 1 + The ith element of the list holds the difference between the + activations of the i + 1 layer and the backpropagated error. + More specifically, deltas are gradients of loss with respect to z + in each layer, where z = wx + b is the value of a particular layer + before passing through the activation function + + coef_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + coefficient parameters of the ith layer in an iteration. 
+ + intercept_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + intercept parameters of the ith layer in an iteration. + + Returns + ------- + loss : float + grad : array-like, shape (number of nodes of all layers,) + """ + self._unpack(packed_coef_inter) + loss, coef_grads, intercept_grads = self._backprop( + X, y, sample_weight, activations, deltas, coef_grads, intercept_grads + ) + grad = _pack(coef_grads, intercept_grads) + return loss, grad + + def _backprop( + self, X, y, sample_weight, activations, deltas, coef_grads, intercept_grads + ): + """Compute the MLP loss function and its corresponding derivatives + with respect to each parameter: weights and bias vectors. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + + deltas : list, length = n_layers - 1 + The ith element of the list holds the difference between the + activations of the i + 1 layer and the backpropagated error. + More specifically, deltas are gradients of loss with respect to z + in each layer, where z = wx + b is the value of a particular layer + before passing through the activation function + + coef_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + coefficient parameters of the ith layer in an iteration. + + intercept_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + intercept parameters of the ith layer in an iteration. + + Returns + ------- + loss : float + coef_grads : list, length = n_layers - 1 + intercept_grads : list, length = n_layers - 1 + """ + n_samples = X.shape[0] + + # Forward propagate + activations = self._forward_pass(activations) + + # Get loss + loss_func_name = self.loss + if loss_func_name == "log_loss" and self.out_activation_ == "logistic": + loss_func_name = "binary_log_loss" + loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1], sample_weight) + # Add L2 regularization term to loss + values = 0 + for s in self.coefs_: + s = s.ravel() + values += np.dot(s, s) + if sample_weight is None: + sw_sum = n_samples + else: + sw_sum = sample_weight.sum() + loss += (0.5 * self.alpha) * values / sw_sum + + # Backward propagate + last = self.n_layers_ - 2 + + # The calculation of delta[last] is as follows: + # delta[last] = d/dz loss(y, act(z)) = act(z) - y + # with z=x@w + b being the output of the last layer before passing through the + # output activation, act(z) = activations[-1]. 
+ # The simple formula for delta[last] here works with following (canonical + # loss-link) combinations of output activation and loss function: + # sigmoid and binary cross entropy, softmax and categorical cross + # entropy, and identity with squared loss + deltas[last] = activations[-1] - y + if sample_weight is not None: + deltas[last] *= sample_weight.reshape(-1, 1) + + # Compute gradient for the last layer + self._compute_loss_grad( + last, sw_sum, activations, deltas, coef_grads, intercept_grads + ) + + inplace_derivative = DERIVATIVES[self.activation] + # Iterate over the hidden layers + for i in range(last, 0, -1): + deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) + inplace_derivative(activations[i], deltas[i - 1]) + + self._compute_loss_grad( + i - 1, sw_sum, activations, deltas, coef_grads, intercept_grads + ) + + return loss, coef_grads, intercept_grads + + def _initialize(self, y, layer_units, dtype): + # set all attributes, allocate weights etc. for first call + # Initialize parameters + self.n_iter_ = 0 + self.t_ = 0 + self.n_outputs_ = y.shape[1] + + # Compute the number of layers + self.n_layers_ = len(layer_units) + + # Output for regression + if not is_classifier(self): + if self.loss == "poisson": + self.out_activation_ = "exp" + else: + # loss = "squared_error" + self.out_activation_ = "identity" + # Output for multi class + elif self._label_binarizer.y_type_ == "multiclass": + self.out_activation_ = "softmax" + # Output for binary class and multi-label + else: + self.out_activation_ = "logistic" + + # Initialize coefficient and intercept layers + self.coefs_ = [] + self.intercepts_ = [] + + for i in range(self.n_layers_ - 1): + coef_init, intercept_init = self._init_coef( + layer_units[i], layer_units[i + 1], dtype + ) + self.coefs_.append(coef_init) + self.intercepts_.append(intercept_init) + + self._best_coefs = [c.copy() for c in self.coefs_] + self._best_intercepts = [i.copy() for i in self.intercepts_] + + if self.solver in _STOCHASTIC_SOLVERS: + self.loss_curve_ = [] + self._no_improvement_count = 0 + if self.early_stopping: + self.validation_scores_ = [] + self.best_validation_score_ = -np.inf + self.best_loss_ = None + else: + self.best_loss_ = np.inf + self.validation_scores_ = None + self.best_validation_score_ = None + + def _init_coef(self, fan_in, fan_out, dtype): + # Use the initialization method recommended by + # Glorot et al. + factor = 6.0 + if self.activation == "logistic": + factor = 2.0 + init_bound = np.sqrt(factor / (fan_in + fan_out)) + + # Generate weights and bias: + coef_init = self._random_state.uniform( + -init_bound, init_bound, (fan_in, fan_out) + ) + intercept_init = self._random_state.uniform(-init_bound, init_bound, fan_out) + coef_init = coef_init.astype(dtype, copy=False) + intercept_init = intercept_init.astype(dtype, copy=False) + return coef_init, intercept_init + + def _fit(self, X, y, sample_weight=None, incremental=False): + # Make sure self.hidden_layer_sizes is a list + hidden_layer_sizes = self.hidden_layer_sizes + if not hasattr(hidden_layer_sizes, "__iter__"): + hidden_layer_sizes = [hidden_layer_sizes] + hidden_layer_sizes = list(hidden_layer_sizes) + + if np.any(np.array(hidden_layer_sizes) <= 0): + raise ValueError( + "hidden_layer_sizes must be > 0, got %s." 
% hidden_layer_sizes + ) + first_pass = not hasattr(self, "coefs_") or ( + not self.warm_start and not incremental + ) + + X, y = self._validate_input(X, y, incremental, reset=first_pass) + n_samples, n_features = X.shape + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + # Ensure y is 2D + if y.ndim == 1: + y = y.reshape((-1, 1)) + + self.n_outputs_ = y.shape[1] + + layer_units = [n_features] + hidden_layer_sizes + [self.n_outputs_] + + # check random state + self._random_state = check_random_state(self.random_state) + + if first_pass: + # First time training the model + self._initialize(y, layer_units, X.dtype) + + # Initialize lists + activations = [X] + [None] * (len(layer_units) - 1) + deltas = [None] * (len(activations) - 1) + + coef_grads = [ + np.empty((n_fan_in_, n_fan_out_), dtype=X.dtype) + for n_fan_in_, n_fan_out_ in pairwise(layer_units) + ] + + intercept_grads = [ + np.empty(n_fan_out_, dtype=X.dtype) for n_fan_out_ in layer_units[1:] + ] + + # Run the Stochastic optimization solver + if self.solver in _STOCHASTIC_SOLVERS: + self._fit_stochastic( + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + incremental, + ) + + # Run the LBFGS solver + elif self.solver == "lbfgs": + self._fit_lbfgs( + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + ) + + # validate parameter weights + weights = chain(self.coefs_, self.intercepts_) + if not all(np.isfinite(w).all() for w in weights): + raise ValueError( + "Solver produced non-finite parameter weights. The input data may" + " contain large values and need to be preprocessed." + ) + + return self + + def _fit_lbfgs( + self, + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + ): + # Store meta information for the parameters + self._coef_indptr = [] + self._intercept_indptr = [] + start = 0 + + # Save sizes and indices of coefficients for faster unpacking + for i in range(self.n_layers_ - 1): + n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] + + end = start + (n_fan_in * n_fan_out) + self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) + start = end + + # Save sizes and indices of intercepts for faster unpacking + for i in range(self.n_layers_ - 1): + end = start + layer_units[i + 1] + self._intercept_indptr.append((start, end)) + start = end + + # Run LBFGS + packed_coef_inter = _pack(self.coefs_, self.intercepts_) + + if self.verbose is True or self.verbose >= 1: + iprint = 1 + else: + iprint = -1 + + opt_res = scipy.optimize.minimize( + self._loss_grad_lbfgs, + packed_coef_inter, + method="L-BFGS-B", + jac=True, + options={ + "maxfun": self.max_fun, + "maxiter": self.max_iter, + "gtol": self.tol, + **_get_additional_lbfgs_options_dict("iprint", iprint), + }, + args=( + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + ), + ) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter) + self.loss_ = opt_res.fun + self._unpack(opt_res.x) + + def _fit_stochastic( + self, + X, + y, + sample_weight, + activations, + deltas, + coef_grads, + intercept_grads, + layer_units, + incremental, + ): + params = self.coefs_ + self.intercepts_ + if not incremental or not hasattr(self, "_optimizer"): + if self.solver == "sgd": + self._optimizer = SGDOptimizer( + params, + self.learning_rate_init, + self.learning_rate, + self.momentum, + self.nesterovs_momentum, + self.power_t, + ) + elif self.solver 
== "adam": + self._optimizer = AdamOptimizer( + params, + self.learning_rate_init, + self.beta_1, + self.beta_2, + self.epsilon, + ) + + # early_stopping in partial_fit doesn't make sense + if self.early_stopping and incremental: + raise ValueError("partial_fit does not support early_stopping=True") + early_stopping = self.early_stopping + if early_stopping: + # don't stratify in multilabel classification + should_stratify = is_classifier(self) and self.n_outputs_ == 1 + stratify = y if should_stratify else None + if sample_weight is None: + X_train, X_val, y_train, y_val = train_test_split( + X, + y, + random_state=self._random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) + sample_weight_train = sample_weight_val = None + else: + # TODO: incorporate sample_weight in sampling here. + ( + X_train, + X_val, + y_train, + y_val, + sample_weight_train, + sample_weight_val, + ) = train_test_split( + X, + y, + sample_weight, + random_state=self._random_state, + test_size=self.validation_fraction, + stratify=stratify, + ) + if X_val.shape[0] < 2: + raise ValueError( + "The validation set is too small. Increase 'validation_fraction' " + "or the size of your dataset." + ) + + if is_classifier(self): + y_val = self._label_binarizer.inverse_transform(y_val) + else: + X_train, y_train, sample_weight_train = X, y, sample_weight + X_val = y_val = sample_weight_val = None + + n_samples = X_train.shape[0] + sample_idx = np.arange(n_samples, dtype=int) + + if self.batch_size == "auto": + batch_size = min(200, n_samples) + else: + if self.batch_size > n_samples: + warnings.warn( + "Got `batch_size` less than 1 or larger than " + "sample size. It is going to be clipped" + ) + batch_size = np.clip(self.batch_size, 1, n_samples) + + try: + self.n_iter_ = 0 + for it in range(self.max_iter): + if self.shuffle: + # Only shuffle the sample indices instead of X and y to + # reduce the memory footprint. These indices will be used + # to slice the X and y. 
+ sample_idx = shuffle(sample_idx, random_state=self._random_state) + + accumulated_loss = 0.0 + for batch_slice in gen_batches(n_samples, batch_size): + if self.shuffle: + batch_idx = sample_idx[batch_slice] + X_batch = _safe_indexing(X_train, batch_idx) + else: + batch_idx = batch_slice + X_batch = X_train[batch_idx] + y_batch = y_train[batch_idx] + if sample_weight is None: + sample_weight_batch = None + else: + sample_weight_batch = sample_weight_train[batch_idx] + + activations[0] = X_batch + batch_loss, coef_grads, intercept_grads = self._backprop( + X_batch, + y_batch, + sample_weight_batch, + activations, + deltas, + coef_grads, + intercept_grads, + ) + accumulated_loss += batch_loss * ( + batch_slice.stop - batch_slice.start + ) + + # update weights + grads = coef_grads + intercept_grads + self._optimizer.update_params(params, grads) + + self.n_iter_ += 1 + self.loss_ = accumulated_loss / X_train.shape[0] + + self.t_ += n_samples + self.loss_curve_.append(self.loss_) + if self.verbose: + print("Iteration %d, loss = %.8f" % (self.n_iter_, self.loss_)) + + # update no_improvement_count based on training loss or + # validation score according to early_stopping + self._update_no_improvement_count( + early_stopping, X_val, y_val, sample_weight_val + ) + + # for learning rate that needs to be updated at iteration end + self._optimizer.iteration_ends(self.t_) + + if self._no_improvement_count > self.n_iter_no_change: + # not better than last `n_iter_no_change` iterations by tol + # stop or decrease learning rate + if early_stopping: + msg = ( + "Validation score did not improve more than " + "tol=%f for %d consecutive epochs." + % (self.tol, self.n_iter_no_change) + ) + else: + msg = ( + "Training loss did not improve more than tol=%f" + " for %d consecutive epochs." + % (self.tol, self.n_iter_no_change) + ) + + is_stopping = self._optimizer.trigger_stopping(msg, self.verbose) + if is_stopping: + break + else: + self._no_improvement_count = 0 + + if incremental: + break + + if self.n_iter_ == self.max_iter: + warnings.warn( + "Stochastic Optimizer: Maximum iterations (%d) " + "reached and the optimization hasn't converged yet." 
+ % self.max_iter, + ConvergenceWarning, + ) + except KeyboardInterrupt: + warnings.warn("Training interrupted by user.") + + if early_stopping: + # restore best weights + self.coefs_ = self._best_coefs + self.intercepts_ = self._best_intercepts + + def _update_no_improvement_count(self, early_stopping, X, y, sample_weight): + if early_stopping: + # compute validation score (can be NaN), use that for stopping + val_score = self._score(X, y, sample_weight=sample_weight) + + self.validation_scores_.append(val_score) + + if self.verbose: + print("Validation score: %f" % self.validation_scores_[-1]) + # update best parameters + # use validation_scores_, not loss_curve_ + # let's hope no-one overloads .score with mse + last_valid_score = self.validation_scores_[-1] + + if last_valid_score < (self.best_validation_score_ + self.tol): + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + + if last_valid_score > self.best_validation_score_: + self.best_validation_score_ = last_valid_score + self._best_coefs = [c.copy() for c in self.coefs_] + self._best_intercepts = [i.copy() for i in self.intercepts_] + else: + if self.loss_curve_[-1] > self.best_loss_ - self.tol: + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + if self.loss_curve_[-1] < self.best_loss_: + self.best_loss_ = self.loss_curve_[-1] + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model to data matrix X and target(s) y. + + Parameters + ---------- + X : ndarray or sparse matrix of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 1.7 + + Returns + ------- + self : object + Returns a trained MLP model. + """ + return self._fit(X, y, sample_weight=sample_weight, incremental=False) + + def _check_solver(self): + if self.solver not in _STOCHASTIC_SOLVERS: + raise AttributeError( + "partial_fit is only available for stochastic" + " optimizers. %s is not stochastic." % self.solver + ) + return True + + def _score_with_function(self, X, y, sample_weight, score_function): + """Private score method without input validation.""" + # Input validation would remove feature names, so we disable it + y_pred = self._predict(X, check_input=False) + + if np.isnan(y_pred).any() or np.isinf(y_pred).any(): + return np.nan + + return score_function(y, y_pred, sample_weight=sample_weight) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron classifier. + + This model optimizes the log-loss function using LBFGS or stochastic + gradient descent. + + .. versionadded:: 0.18 + + Parameters + ---------- + hidden_layer_sizes : array-like of shape(n_layers - 2,), default=(100,) + The ith element represents the number of neurons in the ith + hidden layer. + + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' + Activation function for the hidden layer. + + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). 
+ + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' + The solver for weight optimization. + + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + + - 'sgd' refers to stochastic gradient descent. + + - 'adam' refers to a stochastic gradient-based optimizer proposed + by Kingma, Diederik, and Jimmy Ba + + For a comparison between Adam optimizer and SGD, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`. + + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + + alpha : float, default=0.0001 + Strength of the L2 regularization term. The L2 regularization term + is divided by the sample size when added to the loss. + + For an example usage and visualization of varying regularization, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py`. + + batch_size : int, default='auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the classifier will not use minibatch. + When set to "auto", `batch_size=min(200, n_samples)`. + + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' + Learning rate schedule for weight updates. + + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + + - 'invscaling' gradually decreases the learning rate at each + time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + + Only used when ``solver='sgd'``. + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + + max_iter : int, default=200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + + shuffle : bool, default=True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. + + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + tol : float, default=1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. 
+ + verbose : bool, default=False + Whether to print progress messages to stdout. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + momentum : float, default=0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + + nesterovs_momentum : bool, default=True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to true, it will automatically set + aside 10% of training data as validation and terminate training when + validation score is not improving by at least ``tol`` for + ``n_iter_no_change`` consecutive epochs. The split is stratified, + except in a multilabel setting. + If early stopping is False, then the training stops when the training + loss does not improve by more than tol for n_iter_no_change consecutive + passes over the training set. + Only effective when solver='sgd' or 'adam'. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + epsilon : float, default=1e-8 + Value for numerical stability in adam. Only used when solver='adam'. + + n_iter_no_change : int, default=10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam'. + + .. versionadded:: 0.20 + + max_fun : int, default=15000 + Only used when solver='lbfgs'. Maximum number of loss function calls. + The solver iterates until convergence (determined by 'tol'), number + of iterations reaches max_iter, or this number of loss function calls. + Note that number of loss function calls will be greater than or equal + to the number of iterations for the `MLPClassifier`. + + .. versionadded:: 0.22 + + Attributes + ---------- + classes_ : ndarray or list of ndarray of shape (n_classes,) + Class labels for each output. + + loss_ : float + The current loss computed with the loss function. + + best_loss_ : float or None + The minimum loss reached by the solver throughout fitting. + If `early_stopping=True`, this attribute is set to `None`. Refer to + the `best_validation_score_` fitted attribute instead. + + loss_curve_ : list of shape (`n_iter_`,) + The ith element in the list represents the loss at the ith iteration. + + validation_scores_ : list of shape (`n_iter_`,) or None + The score at each iteration on a held-out validation set. The score + reported is the accuracy score. Only available if `early_stopping=True`, + otherwise the attribute is set to `None`. + + best_validation_score_ : float or None + The best validation score (i.e. accuracy score) that triggered the + early stopping. Only available if `early_stopping=True`, otherwise the + attribute is set to `None`. + + t_ : int + The number of training samples seen by the solver during fitting. 
+ + coefs_ : list of shape (n_layers - 1,) + The ith element in the list represents the weight matrix corresponding + to layer i. + + intercepts_ : list of shape (n_layers - 1,) + The ith element in the list represents the bias vector corresponding to + layer i + 1. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of iterations the solver has run. + + n_layers_ : int + Number of layers. + + n_outputs_ : int + Number of outputs. + + out_activation_ : str + Name of the output activation function. + + See Also + -------- + MLPRegressor : Multi-layer Perceptron regressor. + BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM). + + Notes + ----- + MLPClassifier trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + + This implementation works with data represented as dense numpy arrays or + sparse scipy arrays of floating point values. + + References + ---------- + Hinton, Geoffrey E. "Connectionist learning procedures." + Artificial intelligence 40.1 (1989): 185-234. + + Glorot, Xavier, and Yoshua Bengio. + "Understanding the difficulty of training deep feedforward neural networks." + International Conference on Artificial Intelligence and Statistics. 2010. + + :arxiv:`He, Kaiming, et al (2015). "Delving deep into rectifiers: + Surpassing human-level performance on imagenet classification." <1502.01852>` + + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) + "Adam: A method for stochastic optimization." <1412.6980>` + + Examples + -------- + >>> from sklearn.neural_network import MLPClassifier + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_classification(n_samples=100, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, + ... random_state=1) + >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train) + >>> clf.predict_proba(X_test[:1]) + array([[0.0383, 0.961]]) + >>> clf.predict(X_test[:5, :]) + array([1, 0, 1, 0, 1]) + >>> clf.score(X_test, y_test) + 0.8... 
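    An editorial sketch (not part of the upstream docstring), reusing `clf` and
    `X_test` from the example above: outside the multilabel case, `predict`
    agrees with taking the argmax of `predict_proba` over the columns, which are
    ordered as in `classes_`.

        import numpy as np

        proba = clf.predict_proba(X_test)            # columns follow clf.classes_
        labels = clf.classes_[proba.argmax(axis=1)]
        assert (labels == clf.predict(X_test)).all()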
+ """ + + def __init__( + self, + hidden_layer_sizes=(100,), + activation="relu", + *, + solver="adam", + alpha=0.0001, + batch_size="auto", + learning_rate="constant", + learning_rate_init=0.001, + power_t=0.5, + max_iter=200, + shuffle=True, + random_state=None, + tol=1e-4, + verbose=False, + warm_start=False, + momentum=0.9, + nesterovs_momentum=True, + early_stopping=False, + validation_fraction=0.1, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-8, + n_iter_no_change=10, + max_fun=15000, + ): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, + solver=solver, + alpha=alpha, + batch_size=batch_size, + learning_rate=learning_rate, + learning_rate_init=learning_rate_init, + power_t=power_t, + max_iter=max_iter, + loss="log_loss", + shuffle=shuffle, + random_state=random_state, + tol=tol, + verbose=verbose, + warm_start=warm_start, + momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + n_iter_no_change=n_iter_no_change, + max_fun=max_fun, + ) + + def _validate_input(self, X, y, incremental, reset): + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + multi_output=True, + dtype=(np.float64, np.float32), + reset=reset, + ) + if y.ndim == 2 and y.shape[1] == 1: + y = column_or_1d(y, warn=True) + + # Matrix of actions to be taken under the possible combinations: + # The case that incremental == True and classes_ not defined is + # already checked by _check_partial_fit_first_call that is called + # in _partial_fit below. + # The cases are already grouped into the respective if blocks below. + # + # incremental warm_start classes_ def action + # 0 0 0 define classes_ + # 0 1 0 define classes_ + # 0 0 1 redefine classes_ + # + # 0 1 1 check compat warm_start + # 1 1 1 check compat warm_start + # + # 1 0 1 check compat last fit + # + # Note the reliance on short-circuiting here, so that the second + # or part implies that classes_ is defined. + if (not hasattr(self, "classes_")) or (not self.warm_start and not incremental): + self._label_binarizer = LabelBinarizer() + self._label_binarizer.fit(y) + self.classes_ = self._label_binarizer.classes_ + else: + classes = unique_labels(y) + if self.warm_start: + if set(classes) != set(self.classes_): + raise ValueError( + "warm_start can only be used where `y` has the same " + "classes as in the previous call to fit. Previously " + f"got {self.classes_}, `y` has {classes}" + ) + elif len(np.setdiff1d(classes, self.classes_, assume_unique=True)): + raise ValueError( + "`y` has classes not in `self.classes_`. " + f"`self.classes_` has {self.classes_}. 'y' has {classes}." + ) + + # This downcast to bool is to prevent upcasting when working with + # float32 data + y = self._label_binarizer.transform(y).astype(bool) + return X, y + + def predict(self, X): + """Predict using the multi-layer perceptron classifier. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + Returns + ------- + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + The predicted classes. 
+ """ + check_is_fitted(self) + return self._predict(X) + + def _predict(self, X, check_input=True): + """Private predict method with optional input validation""" + y_pred = self._forward_pass_fast(X, check_input=check_input) + + if self.n_outputs_ == 1: + y_pred = y_pred.ravel() + + return self._label_binarizer.inverse_transform(y_pred) + + def _score(self, X, y, sample_weight=None): + return super()._score_with_function( + X, y, sample_weight=sample_weight, score_function=accuracy_score + ) + + @available_if(lambda est: est._check_solver()) + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update the model with a single iteration over the given data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : array-like of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 1.7 + + classes : array of shape (n_classes,), default=None + Classes across all calls to partial_fit. + Can be obtained via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + + Returns + ------- + self : object + Trained MLP model. + """ + if _check_partial_fit_first_call(self, classes): + self._label_binarizer = LabelBinarizer() + if type_of_target(y).startswith("multilabel"): + self._label_binarizer.fit(y) + else: + self._label_binarizer.fit(classes) + + return self._fit(X, y, sample_weight=sample_weight, incremental=True) + + def predict_log_proba(self, X): + """Return the log of probability estimates. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The input data. + + Returns + ------- + log_y_prob : ndarray of shape (n_samples, n_classes) + The predicted log-probability of the sample for each class + in the model, where classes are ordered as they are in + `self.classes_`. Equivalent to `log(predict_proba(X))`. + """ + y_prob = self.predict_proba(X) + return np.log(y_prob, out=y_prob) + + def predict_proba(self, X): + """Probability estimates. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + Returns + ------- + y_prob : ndarray of shape (n_samples, n_classes) + The predicted probability of the sample for each class in the + model, where classes are ordered as they are in `self.classes_`. + """ + check_is_fitted(self) + y_pred = self._forward_pass_fast(X) + + if self.n_outputs_ == 1: + y_pred = y_pred.ravel() + + if y_pred.ndim == 1: + return np.vstack([1 - y_pred, y_pred]).T + else: + return y_pred + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.classifier_tags.multi_label = True + return tags + + +class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron regressor. + + This model optimizes the squared error using LBFGS or stochastic gradient + descent. + + .. versionadded:: 0.18 + + Parameters + ---------- + loss : {'squared_error', 'poisson'}, default='squared_error' + The loss function to use when training the weights. Note that the + "squared error" and "poisson" losses actually implement + "half squares error" and "half poisson deviance" to simplify the + computation of the gradient. 
Furthermore, the "poisson" loss internally uses + a log-link (exponential as the output activation function) and requires + ``y >= 0``. + + .. versionchanged:: 1.7 + Added parameter `loss` and option 'poisson'. + + hidden_layer_sizes : array-like of shape(n_layers - 2,), default=(100,) + The ith element represents the number of neurons in the ith + hidden layer. + + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' + Activation function for the hidden layer. + + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). + + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' + The solver for weight optimization. + + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + + - 'sgd' refers to stochastic gradient descent. + + - 'adam' refers to a stochastic gradient-based optimizer proposed by + Kingma, Diederik, and Jimmy Ba + + For a comparison between Adam optimizer and SGD, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`. + + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + + alpha : float, default=0.0001 + Strength of the L2 regularization term. The L2 regularization term + is divided by the sample size when added to the loss. + + batch_size : int, default='auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the regressor will not use minibatch. + When set to "auto", `batch_size=min(200, n_samples)`. + + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' + Learning rate schedule for weight updates. + + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + + - 'invscaling' gradually decreases the learning rate ``learning_rate_`` + at each time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + + Only used when solver='sgd'. + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + + power_t : float, default=0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + + max_iter : int, default=200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + + shuffle : bool, default=True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. 
+ + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + tol : float, default=1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. + + verbose : bool, default=False + Whether to print progress messages to stdout. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + + momentum : float, default=0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + + nesterovs_momentum : bool, default=True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set + aside ``validation_fraction`` of training data as validation and + terminate training when validation score is not improving by at + least ``tol`` for ``n_iter_no_change`` consecutive epochs. + Only effective when solver='sgd' or 'adam'. + + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True. + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam'. + + epsilon : float, default=1e-8 + Value for numerical stability in adam. Only used when solver='adam'. + + n_iter_no_change : int, default=10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam'. + + .. versionadded:: 0.20 + + max_fun : int, default=15000 + Only used when solver='lbfgs'. Maximum number of function calls. + The solver iterates until convergence (determined by ``tol``), number + of iterations reaches max_iter, or this number of function calls. + Note that number of function calls will be greater than or equal to + the number of iterations for the MLPRegressor. + + .. versionadded:: 0.22 + + Attributes + ---------- + loss_ : float + The current loss computed with the loss function. + + best_loss_ : float + The minimum loss reached by the solver throughout fitting. + If `early_stopping=True`, this attribute is set to `None`. Refer to + the `best_validation_score_` fitted attribute instead. + Only accessible when solver='sgd' or 'adam'. + + loss_curve_ : list of shape (`n_iter_`,) + Loss value evaluated at the end of each training step. + The ith element in the list represents the loss at the ith iteration. + Only accessible when solver='sgd' or 'adam'. + + validation_scores_ : list of shape (`n_iter_`,) or None + The score at each iteration on a held-out validation set. The score + reported is the R2 score. 
Only available if `early_stopping=True`, + otherwise the attribute is set to `None`. + Only accessible when solver='sgd' or 'adam'. + + best_validation_score_ : float or None + The best validation score (i.e. R2 score) that triggered the + early stopping. Only available if `early_stopping=True`, otherwise the + attribute is set to `None`. + Only accessible when solver='sgd' or 'adam'. + + t_ : int + The number of training samples seen by the solver during fitting. + Mathematically equals `n_iters * X.shape[0]`, it means + `time_step` and it is used by optimizer's learning rate scheduler. + + coefs_ : list of shape (n_layers - 1,) + The ith element in the list represents the weight matrix corresponding + to layer i. + + intercepts_ : list of shape (n_layers - 1,) + The ith element in the list represents the bias vector corresponding to + layer i + 1. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of iterations the solver has run. + + n_layers_ : int + Number of layers. + + n_outputs_ : int + Number of outputs. + + out_activation_ : str + Name of the output activation function. + + See Also + -------- + BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM). + MLPClassifier : Multi-layer Perceptron classifier. + sklearn.linear_model.SGDRegressor : Linear model fitted by minimizing + a regularized empirical loss with SGD. + + Notes + ----- + MLPRegressor trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + + This implementation works with data represented as dense and sparse numpy + arrays of floating point values. + + References + ---------- + Hinton, Geoffrey E. "Connectionist learning procedures." + Artificial intelligence 40.1 (1989): 185-234. + + Glorot, Xavier, and Yoshua Bengio. + "Understanding the difficulty of training deep feedforward neural networks." + International Conference on Artificial Intelligence and Statistics. 2010. + + :arxiv:`He, Kaiming, et al (2015). "Delving deep into rectifiers: + Surpassing human-level performance on imagenet classification." <1502.01852>` + + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) + "Adam: A method for stochastic optimization." <1412.6980>` + + Examples + -------- + >>> from sklearn.neural_network import MLPRegressor + >>> from sklearn.datasets import make_regression + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_regression(n_samples=200, n_features=20, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... 
random_state=1) + >>> regr = MLPRegressor(random_state=1, max_iter=2000, tol=0.1) + >>> regr.fit(X_train, y_train) + MLPRegressor(max_iter=2000, random_state=1, tol=0.1) + >>> regr.predict(X_test[:2]) + array([ 28.98, -291]) + >>> regr.score(X_test, y_test) + 0.98 + """ + + _parameter_constraints: dict = { + **BaseMultilayerPerceptron._parameter_constraints, + "loss": [StrOptions({"squared_error", "poisson"})], + } + + def __init__( + self, + loss="squared_error", + hidden_layer_sizes=(100,), + activation="relu", + *, + solver="adam", + alpha=0.0001, + batch_size="auto", + learning_rate="constant", + learning_rate_init=0.001, + power_t=0.5, + max_iter=200, + shuffle=True, + random_state=None, + tol=1e-4, + verbose=False, + warm_start=False, + momentum=0.9, + nesterovs_momentum=True, + early_stopping=False, + validation_fraction=0.1, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-8, + n_iter_no_change=10, + max_fun=15000, + ): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, + solver=solver, + alpha=alpha, + batch_size=batch_size, + learning_rate=learning_rate, + learning_rate_init=learning_rate_init, + power_t=power_t, + max_iter=max_iter, + loss=loss, + shuffle=shuffle, + random_state=random_state, + tol=tol, + verbose=verbose, + warm_start=warm_start, + momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, + n_iter_no_change=n_iter_no_change, + max_fun=max_fun, + ) + + def predict(self, X): + """Predict using the multi-layer perceptron model. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + Returns + ------- + y : ndarray of shape (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + return self._predict(X) + + def _predict(self, X, check_input=True): + """Private predict method with optional input validation""" + y_pred = self._forward_pass_fast(X, check_input=check_input) + if y_pred.shape[1] == 1: + return y_pred.ravel() + return y_pred + + def _score(self, X, y, sample_weight=None): + return super()._score_with_function( + X, y, sample_weight=sample_weight, score_function=r2_score + ) + + def _validate_input(self, X, y, incremental, reset): + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + multi_output=True, + y_numeric=True, + dtype=(np.float64, np.float32), + reset=reset, + ) + if y.ndim == 2 and y.shape[1] == 1: + y = column_or_1d(y, warn=True) + return X, y + + @available_if(lambda est: est._check_solver) + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y, sample_weight=None): + """Update the model with a single iteration over the given data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + + y : ndarray of shape (n_samples,) + The target values. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + + .. versionadded:: 1.6 + + Returns + ------- + self : object + Trained MLP model. 
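        An editorial usage sketch (toy data made up purely for illustration):
        `partial_fit` performs a single pass over each supplied chunk, so a model
        can be trained incrementally without holding all data in memory. It
        requires a stochastic solver ('sgd' or 'adam', the default).

            import numpy as np
            from sklearn.neural_network import MLPRegressor

            rng = np.random.RandomState(0)
            X = rng.rand(1000, 20)
            y = X.sum(axis=1)                      # toy target
            reg = MLPRegressor(hidden_layer_sizes=(50,), random_state=0)
            for X_chunk, y_chunk in zip(np.array_split(X, 10), np.array_split(y, 10)):
                reg.partial_fit(X_chunk, y_chunk)  # one pass over this chunk per call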
+ """ + return self._fit(X, y, sample_weight=sample_weight, incremental=True) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1d3c2e11b7cd8a43b57aefeda4a93903698264 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py @@ -0,0 +1,445 @@ +"""Restricted Boltzmann Machine""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import time +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp +from scipy.special import expit # logistic function + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, + _fit_context, +) +from ..utils import check_random_state, gen_even_slices +from ..utils._param_validation import Interval +from ..utils.extmath import safe_sparse_dot +from ..utils.validation import check_is_fitted, validate_data + + +class BernoulliRBM(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + """Bernoulli Restricted Boltzmann Machine (RBM). + + A Restricted Boltzmann Machine with binary visible units and + binary hidden units. Parameters are estimated using Stochastic Maximum + Likelihood (SML), also known as Persistent Contrastive Divergence (PCD) + [2]. + + The time complexity of this implementation is ``O(d ** 2)`` assuming + d ~ n_features ~ n_components. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_components : int, default=256 + Number of binary hidden units. + + learning_rate : float, default=0.1 + The learning rate for weight updates. It is *highly* recommended + to tune this hyper-parameter. Reasonable values are in the + 10**[0., -3.] range. + + batch_size : int, default=10 + Number of examples per minibatch. + + n_iter : int, default=10 + Number of iterations/sweeps over the training dataset to perform + during training. + + verbose : int, default=0 + The verbosity level. The default, zero, means silent mode. Range + of values is [0, inf]. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for: + + - Gibbs sampling from visible and hidden layers. + + - Initializing components, sampling from layers during fit. + + - Corrupting the data when scoring samples. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + intercept_hidden_ : array-like of shape (n_components,) + Biases of the hidden units. + + intercept_visible_ : array-like of shape (n_features,) + Biases of the visible units. + + components_ : array-like of shape (n_components, n_features) + Weight matrix, where `n_features` is the number of + visible units and `n_components` is the number of hidden units. + + h_samples_ : array-like of shape (batch_size, n_components) + Hidden Activation sampled from the model distribution, + where `batch_size` is the number of examples per minibatch and + `n_components` is the number of hidden units. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.neural_network.MLPRegressor : Multi-layer Perceptron regressor. 
+ sklearn.neural_network.MLPClassifier : Multi-layer Perceptron classifier. + sklearn.decomposition.PCA : An unsupervised linear dimensionality + reduction model. + + References + ---------- + + [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for + deep belief nets. Neural Computation 18, pp 1527-1554. + https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf + + [2] Tieleman, T. Training Restricted Boltzmann Machines using + Approximations to the Likelihood Gradient. International Conference + on Machine Learning (ICML) 2008 + + Examples + -------- + + >>> import numpy as np + >>> from sklearn.neural_network import BernoulliRBM + >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) + >>> model = BernoulliRBM(n_components=2) + >>> model.fit(X) + BernoulliRBM(n_components=2) + + For a more detailed example usage, see + :ref:`sphx_glr_auto_examples_neural_networks_plot_rbm_logistic_classification.py`. + """ + + _parameter_constraints: dict = { + "n_components": [Interval(Integral, 1, None, closed="left")], + "learning_rate": [Interval(Real, 0, None, closed="neither")], + "batch_size": [Interval(Integral, 1, None, closed="left")], + "n_iter": [Interval(Integral, 0, None, closed="left")], + "verbose": ["verbose"], + "random_state": ["random_state"], + } + + def __init__( + self, + n_components=256, + *, + learning_rate=0.1, + batch_size=10, + n_iter=10, + verbose=0, + random_state=None, + ): + self.n_components = n_components + self.learning_rate = learning_rate + self.batch_size = batch_size + self.n_iter = n_iter + self.verbose = verbose + self.random_state = random_state + + def transform(self, X): + """Compute the hidden layer activation probabilities, P(h=1|v=X). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to be transformed. + + Returns + ------- + h : ndarray of shape (n_samples, n_components) + Latent representations of the data. + """ + check_is_fitted(self) + + X = validate_data( + self, X, accept_sparse="csr", reset=False, dtype=(np.float64, np.float32) + ) + return self._mean_hiddens(X) + + def _mean_hiddens(self, v): + """Computes the probabilities P(h=1|v). + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer. + + Returns + ------- + h : ndarray of shape (n_samples, n_components) + Corresponding mean field values for the hidden layer. + """ + p = safe_sparse_dot(v, self.components_.T) + p += self.intercept_hidden_ + return expit(p, out=p) + + def _sample_hiddens(self, v, rng): + """Sample from the distribution P(h|v). + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer to sample from. + + rng : RandomState instance + Random number generator to use. + + Returns + ------- + h : ndarray of shape (n_samples, n_components) + Values of the hidden layer. + """ + p = self._mean_hiddens(v) + return rng.uniform(size=p.shape) < p + + def _sample_visibles(self, h, rng): + """Sample from the distribution P(v|h). + + Parameters + ---------- + h : ndarray of shape (n_samples, n_components) + Values of the hidden layer to sample from. + + rng : RandomState instance + Random number generator to use. + + Returns + ------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer. 
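        Notes (editorial): as implemented in the body below, the visible
        probabilities are P(v=1|h) = expit(h @ components_ + intercept_visible_)
        and the returned values are element-wise Bernoulli draws, i.e.
        `rng.uniform(size=p.shape) < p`.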
+ """ + p = np.dot(h, self.components_) + p += self.intercept_visible_ + expit(p, out=p) + return rng.uniform(size=p.shape) < p + + def _free_energy(self, v): + """Computes the free energy F(v) = - log sum_h exp(-E(v,h)). + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer. + + Returns + ------- + free_energy : ndarray of shape (n_samples,) + The value of the free energy. + """ + return -safe_sparse_dot(v, self.intercept_visible_) - np.logaddexp( + 0, safe_sparse_dot(v, self.components_.T) + self.intercept_hidden_ + ).sum(axis=1) + + def gibbs(self, v): + """Perform one Gibbs sampling step. + + Parameters + ---------- + v : ndarray of shape (n_samples, n_features) + Values of the visible layer to start from. + + Returns + ------- + v_new : ndarray of shape (n_samples, n_features) + Values of the visible layer after one Gibbs step. + """ + check_is_fitted(self) + if not hasattr(self, "random_state_"): + self.random_state_ = check_random_state(self.random_state) + h_ = self._sample_hiddens(v, self.random_state_) + v_ = self._sample_visibles(h_, self.random_state_) + + return v_ + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Fit the model to the partial segment of the data X. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None + Target values (None for unsupervised transformations). + + Returns + ------- + self : BernoulliRBM + The fitted model. + """ + first_pass = not hasattr(self, "components_") + X = validate_data( + self, X, accept_sparse="csr", dtype=np.float64, reset=first_pass + ) + if not hasattr(self, "random_state_"): + self.random_state_ = check_random_state(self.random_state) + if not hasattr(self, "components_"): + self.components_ = np.asarray( + self.random_state_.normal(0, 0.01, (self.n_components, X.shape[1])), + order="F", + ) + self._n_features_out = self.components_.shape[0] + if not hasattr(self, "intercept_hidden_"): + self.intercept_hidden_ = np.zeros( + self.n_components, + ) + if not hasattr(self, "intercept_visible_"): + self.intercept_visible_ = np.zeros( + X.shape[1], + ) + if not hasattr(self, "h_samples_"): + self.h_samples_ = np.zeros((self.batch_size, self.n_components)) + + self._fit(X, self.random_state_) + + def _fit(self, v_pos, rng): + """Inner fit for one mini-batch. + + Adjust the parameters to maximize the likelihood of v using + Stochastic Maximum Likelihood (SML). + + Parameters + ---------- + v_pos : ndarray of shape (n_samples, n_features) + The data to use for training. + + rng : RandomState instance + Random number generator to use for sampling. + """ + h_pos = self._mean_hiddens(v_pos) + v_neg = self._sample_visibles(self.h_samples_, rng) + h_neg = self._mean_hiddens(v_neg) + + lr = float(self.learning_rate) / v_pos.shape[0] + update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T + update -= np.dot(h_neg.T, v_neg) + self.components_ += lr * update + self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0)) + self.intercept_visible_ += lr * ( + np.asarray(v_pos.sum(axis=0)).squeeze() - v_neg.sum(axis=0) + ) + + h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0 # sample binomial + self.h_samples_ = np.floor(h_neg, h_neg) + + def score_samples(self, X): + """Compute the pseudo-likelihood of X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Values of the visible layer. Must be all-boolean (not checked). + + Returns + ------- + pseudo_likelihood : ndarray of shape (n_samples,) + Value of the pseudo-likelihood (proxy for likelihood). + + Notes + ----- + This method is not deterministic: it computes a quantity called the + free energy on X, then on a randomly corrupted version of X, and + returns the log of the logistic function of the difference. + """ + check_is_fitted(self) + + v = validate_data(self, X, accept_sparse="csr", reset=False) + rng = check_random_state(self.random_state) + + # Randomly corrupt one feature in each sample in v. + ind = (np.arange(v.shape[0]), rng.randint(0, v.shape[1], v.shape[0])) + if sp.issparse(v): + data = -2 * v[ind] + 1 + if isinstance(data, np.matrix): # v is a sparse matrix + v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape) + else: # v is a sparse array + v_ = v + sp.csr_array((data.ravel(), ind), shape=v.shape) + else: + v_ = v.copy() + v_[ind] = 1 - v_[ind] + + fe = self._free_energy(v) + fe_ = self._free_energy(v_) + # log(expit(x)) = log(1 / (1 + exp(-x)) = -np.logaddexp(0, -x) + return -v.shape[1] * np.logaddexp(0, -(fe_ - fe)) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Fit the model to the data X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None + Target values (None for unsupervised transformations). + + Returns + ------- + self : BernoulliRBM + The fitted model. + """ + X = validate_data(self, X, accept_sparse="csr", dtype=(np.float64, np.float32)) + n_samples = X.shape[0] + rng = check_random_state(self.random_state) + + self.components_ = np.asarray( + rng.normal(0, 0.01, (self.n_components, X.shape[1])), + order="F", + dtype=X.dtype, + ) + self._n_features_out = self.components_.shape[0] + self.intercept_hidden_ = np.zeros(self.n_components, dtype=X.dtype) + self.intercept_visible_ = np.zeros(X.shape[1], dtype=X.dtype) + self.h_samples_ = np.zeros((self.batch_size, self.n_components), dtype=X.dtype) + + n_batches = int(np.ceil(float(n_samples) / self.batch_size)) + batch_slices = list( + gen_even_slices(n_batches * self.batch_size, n_batches, n_samples=n_samples) + ) + verbose = self.verbose + begin = time.time() + for iteration in range(1, self.n_iter + 1): + for batch_slice in batch_slices: + self._fit(X[batch_slice], rng) + + if verbose: + end = time.time() + print( + "[%s] Iteration %d, pseudo-likelihood = %.2f, time = %.2fs" + % ( + type(self).__name__, + iteration, + self.score_samples(X).mean(), + end - begin, + ) + ) + begin = end + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..52641a91ce4d396dfbd1ab65116f7b8a937ff3e9 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py @@ -0,0 +1,287 @@ +"""Stochastic optimization methods for MLP""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numpy as np + 
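Editorial overview (not part of the module): the optimizer classes below expose a
small interface driven by the MLP training loop — `params` is the concatenated
list `coefs_ + intercepts_`, `grads` is an aligned list of gradients, and
`update_params` mutates the arrays in place. A minimal sketch under that
assumption (the import path is a private, unstable module and is shown purely for
illustration):

    import numpy as np
    from sklearn.neural_network._stochastic_optimizers import SGDOptimizer

    params = [np.zeros((3, 2)), np.zeros(2)]          # stand-ins for coefs_ + intercepts_
    grads = [np.full((3, 2), 0.5), np.full(2, 0.5)]   # gradients aligned with params
    opt = SGDOptimizer(params, learning_rate_init=0.1, momentum=0.9, nesterov=False)
    opt.update_params(params, grads)                  # in place: param += momentum*velocity - lr*grad
    print(params[1])                                  # [-0.05 -0.05] after the first update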
+ +class BaseOptimizer: + """Base (Stochastic) gradient descent optimizer + + Parameters + ---------- + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + Attributes + ---------- + learning_rate : float + the current learning rate + """ + + def __init__(self, learning_rate_init=0.1): + self.learning_rate_init = learning_rate_init + self.learning_rate = float(learning_rate_init) + + def update_params(self, params, grads): + """Update parameters with given gradients + + Parameters + ---------- + params : list of length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP + model. Used for initializing velocities and updating params + + grads : list of length = len(params) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + """ + updates = self._get_updates(grads) + for param, update in zip((p for p in params), updates): + param += update + + def iteration_ends(self, time_step): + """Perform update to learning rate and potentially other states at the + end of an iteration + """ + pass + + def trigger_stopping(self, msg, verbose): + """Decides whether it is time to stop training + + Parameters + ---------- + msg : str + Message passed in for verbose output + + verbose : bool + Print message to stdin if True + + Returns + ------- + is_stopping : bool + True if training needs to stop + """ + if verbose: + print(msg + " Stopping.") + return True + + +class SGDOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with momentum + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' + Learning rate schedule for weight updates. + + -'constant', is a constant learning rate given by + 'learning_rate_init'. + + -'invscaling' gradually decreases the learning rate 'learning_rate_' at + each time step 't' using an inverse scaling exponent of 'power_t'. + learning_rate_ = learning_rate_init / pow(t, power_t) + + -'adaptive', keeps the learning rate constant to + 'learning_rate_init' as long as the training keeps decreasing. + Each time 2 consecutive epochs fail to decrease the training loss by + tol, or fail to increase validation score by tol if 'early_stopping' + is on, the current learning rate is divided by 5. + + momentum : float, default=0.9 + Value of momentum used, must be larger than or equal to 0 + + nesterov : bool, default=True + Whether to use nesterov's momentum or not. Use nesterov's if True + + power_t : float, default=0.5 + Power of time step 't' in inverse scaling. See `lr_schedule` for + more details. 
+ + Attributes + ---------- + learning_rate : float + the current learning rate + + velocities : list, length = len(params) + velocities that are used to update params + """ + + def __init__( + self, + params, + learning_rate_init=0.1, + lr_schedule="constant", + momentum=0.9, + nesterov=True, + power_t=0.5, + ): + super().__init__(learning_rate_init) + + self.lr_schedule = lr_schedule + self.momentum = momentum + self.nesterov = nesterov + self.power_t = power_t + self.velocities = [np.zeros_like(param) for param in params] + + def iteration_ends(self, time_step): + """Perform updates to learning rate and potential other states at the + end of an iteration + + Parameters + ---------- + time_step : int + number of training samples trained on so far, used to update + learning rate for 'invscaling' + """ + if self.lr_schedule == "invscaling": + self.learning_rate = ( + float(self.learning_rate_init) / (time_step + 1) ** self.power_t + ) + + def trigger_stopping(self, msg, verbose): + if self.lr_schedule != "adaptive": + if verbose: + print(msg + " Stopping.") + return True + + if self.learning_rate <= 1e-6: + if verbose: + print(msg + " Learning rate too small. Stopping.") + return True + + self.learning_rate /= 5.0 + if verbose: + print(msg + " Setting learning rate to %f" % self.learning_rate) + return False + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + self.velocities = updates + + if self.nesterov: + updates = [ + self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads) + ] + + return updates + + +class AdamOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with Adam + + Note: All default values are from the original Adam paper + + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size in updating + the weights + + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector, should be + in [0, 1) + + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector, should be + in [0, 1) + + epsilon : float, default=1e-8 + Value for numerical stability + + Attributes + ---------- + learning_rate : float + The current learning rate + + t : int + Timestep + + ms : list, length = len(params) + First moment vectors + + vs : list, length = len(params) + Second moment vectors + + References + ---------- + :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for + stochastic optimization." 
<1412.6980> + """ + + def __init__( + self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8 + ): + super().__init__(learning_rate_init) + + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.t = 0 + self.ms = [np.zeros_like(param) for param in params] + self.vs = [np.zeros_like(param) for param in params] + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + self.t += 1 + self.ms = [ + self.beta_1 * m + (1 - self.beta_1) * grad + for m, grad in zip(self.ms, grads) + ] + self.vs = [ + self.beta_2 * v + (1 - self.beta_2) * (grad**2) + for v, grad in zip(self.vs, grads) + ] + self.learning_rate = ( + self.learning_rate_init + * np.sqrt(1 - self.beta_2**self.t) + / (1 - self.beta_1**self.t) + ) + updates = [ + -self.learning_rate * m / (np.sqrt(v) + self.epsilon) + for m, v in zip(self.ms, self.vs) + ] + return updates diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..598b7e6054eead605e47fbf4e067ba2119f8d5b6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from sklearn._loss import HalfPoissonLoss +from sklearn.neural_network._base import binary_log_loss, log_loss, poisson_loss + + +def test_binary_log_loss_1_prob_finite(): + # y_proba is equal to one should result in a finite logloss + y_true = np.array([[0, 0, 1]]).T + y_prob = np.array([[0.9, 1.0, 1.0]]).T + + loss = binary_log_loss(y_true, y_prob) + assert np.isfinite(loss) + + +@pytest.mark.parametrize( + "y_true, y_prob", + [ + ( + np.array([[1, 0, 0], [0, 1, 0]]), + np.array([[0.0, 1.0, 0.0], [0.9, 0.05, 0.05]]), + ), + (np.array([[0, 0, 1]]).T, np.array([[0.9, 1.0, 1.0]]).T), + ], +) +def test_log_loss_1_prob_finite(y_true, y_prob): + # y_proba is equal to 1 should result in a finite logloss + loss = log_loss(y_true, y_prob) + assert np.isfinite(loss) + + +def test_poisson_loss(global_random_seed): + """Test Poisson loss against well tested HalfPoissonLoss.""" + n = 1000 + rng = np.random.default_rng(global_random_seed) + y_true = rng.integers(low=0, high=10, size=n).astype(float) + y_raw = rng.standard_normal(n) + y_pred = np.exp(y_raw) + sw = rng.uniform(low=0.1, high=10, size=n) + + assert 0 in y_true + + loss = poisson_loss(y_true=y_true, y_pred=y_pred, sample_weight=sw) + pl = HalfPoissonLoss() + loss_ref = ( + pl(y_true=y_true, raw_prediction=y_raw, sample_weight=sw) + + pl.constant_to_optimal_zero(y_true=y_true, sample_weight=sw).mean() + / sw.mean() + ) + + assert loss == pytest.approx(loss_ref, rel=1e-12) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py new file mode 
100644 index 0000000000000000000000000000000000000000..9dddb78223ea71cfdfa9dfa9755fe74efef6a42c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py @@ -0,0 +1,1094 @@ +""" +Testing for Multi-layer Perceptron module (sklearn.neural_network) +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import sys +import warnings +from io import StringIO + +import joblib +import numpy as np +import pytest + +from sklearn.datasets import ( + load_digits, + load_iris, + make_multilabel_classification, + make_regression, +) +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model import PoissonRegressor +from sklearn.metrics import roc_auc_score +from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, scale +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + ignore_warnings, +) +from sklearn.utils.fixes import CSR_CONTAINERS + +ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"] + +X_digits, y_digits = load_digits(n_class=3, return_X_y=True) + +X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_multi = y_digits[:200] + +X_digits, y_digits = load_digits(n_class=2, return_X_y=True) + +X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200]) +y_digits_binary = y_digits[:200] + +classification_datasets = [ + (X_digits_multi, y_digits_multi), + (X_digits_binary, y_digits_binary), +] + +X_reg, y_reg = make_regression( + n_samples=200, n_features=10, bias=20.0, noise=100.0, random_state=7 +) +y_reg = scale(y_reg) +regression_datasets = [(X_reg, y_reg)] + +iris = load_iris() + +X_iris = iris.data +y_iris = iris.target + + +def test_alpha(): + # Test that larger alpha yields weights closer to zero + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + alpha_vectors = [] + alpha_values = np.arange(2) + absolute_sum = lambda x: np.sum(np.abs(x)) + + for alpha in alpha_values: + mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha, random_state=1) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + alpha_vectors.append( + np.array([absolute_sum(mlp.coefs_[0]), absolute_sum(mlp.coefs_[1])]) + ) + + for i in range(len(alpha_values) - 1): + assert (alpha_vectors[i] > alpha_vectors[i + 1]).all() + + +def test_fit(): + # Test that the algorithm solution is equal to a worked out example. 
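    # (Editorial summary of what follows: the test pins the initial coefs_ and
    # intercepts_ to fixed values, runs a single partial_fit step with plain SGD
    # (learning_rate_init=0.1, momentum=0, one sample, logistic activations), and
    # then checks the updated weights and predict_proba against the hand-derived
    # backpropagation values reproduced in the long comment below.)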
+ X = np.array([[0.6, 0.8, 0.7]]) + y = np.array([0]) + mlp = MLPClassifier( + solver="sgd", + learning_rate_init=0.1, + alpha=0.1, + activation="logistic", + random_state=1, + max_iter=1, + hidden_layer_sizes=2, + momentum=0, + ) + # set weights + mlp.coefs_ = [0] * 2 + mlp.intercepts_ = [0] * 2 + mlp.n_outputs_ = 1 + mlp.coefs_[0] = np.array([[0.1, 0.2], [0.3, 0.1], [0.5, 0]]) + mlp.coefs_[1] = np.array([[0.1], [0.2]]) + mlp.intercepts_[0] = np.array([0.1, 0.1]) + mlp.intercepts_[1] = np.array([1.0]) + mlp._coef_grads = [] * 2 + mlp._intercept_grads = [] * 2 + mlp.n_features_in_ = 3 + + # Initialize parameters + mlp.n_iter_ = 0 + mlp.learning_rate_ = 0.1 + + # Compute the number of layers + mlp.n_layers_ = 3 + + # Pre-allocate gradient matrices + mlp._coef_grads = [0] * (mlp.n_layers_ - 1) + mlp._intercept_grads = [0] * (mlp.n_layers_ - 1) + + mlp.out_activation_ = "logistic" + mlp.t_ = 0 + mlp.best_loss_ = np.inf + mlp.loss_curve_ = [] + mlp._no_improvement_count = 0 + mlp._intercept_velocity = [ + np.zeros_like(intercepts) for intercepts in mlp.intercepts_ + ] + mlp._coef_velocity = [np.zeros_like(coefs) for coefs in mlp.coefs_] + + mlp.partial_fit(X, y, classes=[0, 1]) + # Manually worked out example + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1) + # = 0.679178699175393 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0 + 0.1) + # = 0.574442516811659 + # o1 = g(h * W2 + b21) = g(0.679 * 0.1 + 0.574 * 0.2 + 1) + # = 0.7654329236196236 + # d21 = -(0 - 0.765) = 0.765 + # d11 = (1 - 0.679) * 0.679 * 0.765 * 0.1 = 0.01667 + # d12 = (1 - 0.574) * 0.574 * 0.765 * 0.2 = 0.0374 + # W1grad11 = X1 * d11 + alpha * W11 = 0.6 * 0.01667 + 0.1 * 0.1 = 0.0200 + # W1grad11 = X1 * d12 + alpha * W12 = 0.6 * 0.0374 + 0.1 * 0.2 = 0.04244 + # W1grad21 = X2 * d11 + alpha * W13 = 0.8 * 0.01667 + 0.1 * 0.3 = 0.043336 + # W1grad22 = X2 * d12 + alpha * W14 = 0.8 * 0.0374 + 0.1 * 0.1 = 0.03992 + # W1grad31 = X3 * d11 + alpha * W15 = 0.6 * 0.01667 + 0.1 * 0.5 = 0.060002 + # W1grad32 = X3 * d12 + alpha * W16 = 0.6 * 0.0374 + 0.1 * 0 = 0.02244 + # W2grad1 = h1 * d21 + alpha * W21 = 0.679 * 0.765 + 0.1 * 0.1 = 0.5294 + # W2grad2 = h2 * d21 + alpha * W22 = 0.574 * 0.765 + 0.1 * 0.2 = 0.45911 + # b1grad1 = d11 = 0.01667 + # b1grad2 = d12 = 0.0374 + # b2grad = d21 = 0.765 + # W1 = W1 - eta * [W1grad11, .., W1grad32] = [[0.1, 0.2], [0.3, 0.1], + # [0.5, 0]] - 0.1 * [[0.0200, 0.04244], [0.043336, 0.03992], + # [0.060002, 0.02244]] = [[0.098, 0.195756], [0.2956664, + # 0.096008], [0.4939998, -0.002244]] + # W2 = W2 - eta * [W2grad1, W2grad2] = [[0.1], [0.2]] - 0.1 * + # [[0.5294], [0.45911]] = [[0.04706], [0.154089]] + # b1 = b1 - eta * [b1grad1, b1grad2] = 0.1 - 0.1 * [0.01667, 0.0374] + # = [0.098333, 0.09626] + # b2 = b2 - eta * b2grad = 1.0 - 0.1 * 0.765 = 0.9235 + assert_almost_equal( + mlp.coefs_[0], + np.array([[0.098, 0.195756], [0.2956664, 0.096008], [0.4939998, -0.002244]]), + decimal=3, + ) + assert_almost_equal(mlp.coefs_[1], np.array([[0.04706], [0.154089]]), decimal=3) + assert_almost_equal(mlp.intercepts_[0], np.array([0.098333, 0.09626]), decimal=3) + assert_almost_equal(mlp.intercepts_[1], np.array(0.9235), decimal=3) + # Testing output + # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.098 + 0.8 * 0.2956664 + + # 0.7 * 0.4939998 + 0.098333) = 0.677 + # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.195756 + 0.8 * 0.096008 + + # 0.7 * -0.002244 + 0.09626) = 0.572 + # o1 = h * W2 + b21 = 0.677 * 0.04706 + + # 0.572 * 0.154089 + 0.9235 = 1.043 + # prob = sigmoid(o1) = 0.739 + 
assert_almost_equal(mlp.predict_proba(X)[0, 1], 0.739, decimal=3) + + +def test_gradient(): + # Test gradient. + + # This makes sure that the activation functions and their derivatives + # are correct. The numerical and analytical computation of the gradient + # should be close. + for n_labels in [2, 3]: + n_samples = 5 + n_features = 10 + random_state = np.random.RandomState(seed=42) + X = random_state.rand(n_samples, n_features) + y = 1 + np.mod(np.arange(n_samples) + 1, n_labels) + Y = LabelBinarizer().fit_transform(y) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + activation=activation, + hidden_layer_sizes=10, + solver="lbfgs", + alpha=1e-5, + learning_rate_init=0.2, + max_iter=1, + random_state=1, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + + theta = np.hstack([l.ravel() for l in mlp.coefs_ + mlp.intercepts_]) + + layer_units = [X.shape[1]] + [mlp.hidden_layer_sizes] + [mlp.n_outputs_] + + activations = [] + deltas = [] + coef_grads = [] + intercept_grads = [] + + activations.append(X) + for i in range(mlp.n_layers_ - 1): + activations.append(np.empty((X.shape[0], layer_units[i + 1]))) + deltas.append(np.empty((X.shape[0], layer_units[i + 1]))) + + fan_in = layer_units[i] + fan_out = layer_units[i + 1] + coef_grads.append(np.empty((fan_in, fan_out))) + intercept_grads.append(np.empty(fan_out)) + + # analytically compute the gradients + def loss_grad_fun(t): + return mlp._loss_grad_lbfgs( + t, X, Y, None, activations, deltas, coef_grads, intercept_grads + ) + + [value, grad] = loss_grad_fun(theta) + numgrad = np.zeros(np.size(theta)) + n = np.size(theta, 0) + E = np.eye(n) + epsilon = 1e-5 + # numerically compute the gradients + for i in range(n): + dtheta = E[:, i] * epsilon + numgrad[i] = ( + loss_grad_fun(theta + dtheta)[0] - loss_grad_fun(theta - dtheta)[0] + ) / (epsilon * 2.0) + assert_almost_equal(numgrad, grad) + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification(X, y): + # Test lbfgs on classification. + # It should achieve a score higher than 0.95 for the binary and multi-class + # versions of the digits dataset. + X_train = X[:150] + y_train = y[:150] + X_test = X[150:] + expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind) + + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X_train, y_train) + y_predict = mlp.predict(X_test) + assert mlp.score(X_train, y_train) > 0.95 + assert (y_predict.shape[0], y_predict.dtype.kind) == expected_shape_dtype + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression(X, y): + # Test lbfgs on the regression dataset. + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=200, + tol=1e-3, + shuffle=True, + random_state=1, + activation=activation, + ) + mlp.fit(X, y) + if activation == "identity": + assert mlp.score(X, y) > 0.80 + else: + # Non linear models perform much better than linear bottleneck: + assert mlp.score(X, y) > 0.98 + + +@pytest.mark.parametrize("X,y", classification_datasets) +def test_lbfgs_classification_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. 
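+ # max_fun is expected to be forwarded to SciPy's L-BFGS-B solver as its
+ # `maxfun` option; since every iteration needs at least one function
+ # evaluation, n_iter_ should never exceed it, which is what is asserted below.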
+ max_fun = 10 + # classification tests + for activation in ACTIVATION_TYPES: + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +@pytest.mark.parametrize("X,y", regression_datasets) +def test_lbfgs_regression_maxfun(X, y): + # Test lbfgs parameter max_fun. + # It should independently limit the number of iterations for lbfgs. + max_fun = 10 + # regression tests + for activation in ACTIVATION_TYPES: + mlp = MLPRegressor( + solver="lbfgs", + hidden_layer_sizes=50, + tol=0.0, + max_iter=150, + max_fun=max_fun, + shuffle=True, + random_state=1, + activation=activation, + ) + with pytest.warns(ConvergenceWarning): + mlp.fit(X, y) + assert max_fun >= mlp.n_iter_ + + +def test_learning_rate_warmstart(): + # Tests that warm_start reuse past solutions. + X = [[3, 2], [1, 6], [5, 6], [-2, -4]] + y = [1, 1, 1, 0] + for learning_rate in ["invscaling", "constant"]: + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=4, + learning_rate=learning_rate, + max_iter=1, + power_t=0.25, + warm_start=True, + ) + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + prev_eta = mlp._optimizer.learning_rate + mlp.fit(X, y) + post_eta = mlp._optimizer.learning_rate + + if learning_rate == "constant": + assert prev_eta == post_eta + elif learning_rate == "invscaling": + assert mlp.learning_rate_init / pow(8 + 1, mlp.power_t) == post_eta + + +def test_multilabel_classification(): + # Test that multi-label classification works as expected. + # test fit method + X, y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + mlp = MLPClassifier( + solver="lbfgs", + hidden_layer_sizes=50, + alpha=1e-5, + max_iter=150, + random_state=0, + activation="logistic", + learning_rate_init=0.2, + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.97 + + # test partial fit method + mlp = MLPClassifier( + solver="sgd", + hidden_layer_sizes=50, + max_iter=150, + random_state=0, + activation="logistic", + alpha=1e-5, + learning_rate_init=0.2, + ) + for i in range(100): + mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4]) + assert mlp.score(X, y) > 0.9 + + # Make sure early stopping still work now that splitting is stratified by + # default (it is disabled for multilabel classification) + mlp = MLPClassifier(early_stopping=True) + mlp.fit(X, y).predict(X) + + +def test_multioutput_regression(): + # Test that multi-output regression works as expected + X, y = make_regression(n_samples=200, n_targets=5, random_state=11) + mlp = MLPRegressor( + solver="lbfgs", hidden_layer_sizes=50, max_iter=200, tol=1e-2, random_state=1 + ) + mlp.fit(X, y) + assert mlp.score(X, y) > 0.9 + + +def test_partial_fit_classes_error(): + # Tests that passing different classes to partial_fit raises an error + X = [[3, 2]] + y = [0] + clf = MLPClassifier(solver="sgd") + clf.partial_fit(X, y, classes=[0, 1]) + with pytest.raises(ValueError): + clf.partial_fit(X, y, classes=[1, 2]) + + +def test_partial_fit_classification(): + # Test partial_fit on classification. + # `partial_fit` should yield the same results as 'fit' for binary and + # multi-class classification. 
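+ # Rationale: one partial_fit call performs a single pass (epoch) over the data,
+ # so 100 calls with the same random_state should replay the 100 epochs
+ # performed by fit(max_iter=100, tol=0), yielding identical predictions.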
+ for X, y in classification_datasets: + mlp = MLPClassifier( + solver="sgd", + max_iter=100, + random_state=1, + tol=0, + alpha=1e-5, + learning_rate_init=0.2, + ) + + with ignore_warnings(category=ConvergenceWarning): + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPClassifier( + solver="sgd", random_state=1, alpha=1e-5, learning_rate_init=0.2 + ) + for i in range(100): + mlp.partial_fit(X, y, classes=np.unique(y)) + pred2 = mlp.predict(X) + assert_array_equal(pred1, pred2) + assert mlp.score(X, y) > 0.95 + + +def test_partial_fit_unseen_classes(): + # Non regression test for bug 6994 + # Tests for labeling errors in partial fit + + clf = MLPClassifier(random_state=0) + clf.partial_fit([[1], [2], [3]], ["a", "b", "c"], classes=["a", "b", "c", "d"]) + clf.partial_fit([[4]], ["d"]) + assert clf.score([[1], [2], [3], [4]], ["a", "b", "c", "d"]) > 0 + + +def test_partial_fit_regression(): + # Test partial_fit on regression. + # `partial_fit` should yield the same results as 'fit' for regression. + X = X_reg + y = y_reg + + for momentum in [0, 0.9]: + mlp = MLPRegressor( + solver="sgd", + max_iter=100, + activation="relu", + random_state=1, + learning_rate_init=0.01, + batch_size=X.shape[0], + momentum=momentum, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp = MLPRegressor( + solver="sgd", + activation="relu", + learning_rate_init=0.01, + random_state=1, + batch_size=X.shape[0], + momentum=momentum, + ) + for i in range(100): + mlp.partial_fit(X, y) + + pred2 = mlp.predict(X) + assert_allclose(pred1, pred2) + score = mlp.score(X, y) + assert score > 0.65 + + +def test_partial_fit_errors(): + # Test partial_fit error handling. + X = [[3, 2], [1, 6]] + y = [1, 0] + + # no classes passed + with pytest.raises(ValueError): + MLPClassifier(solver="sgd").partial_fit(X, y, classes=[2]) + + # lbfgs doesn't support partial_fit + assert not hasattr(MLPClassifier(solver="lbfgs"), "partial_fit") + + +def test_nonfinite_params(): + # Check that MLPRegressor throws ValueError when dealing with non-finite + # parameter values + rng = np.random.RandomState(0) + n_samples = 10 + fmax = np.finfo(np.float64).max + X = fmax * rng.uniform(size=(n_samples, 2)) + y = rng.standard_normal(size=n_samples) + + clf = MLPRegressor() + msg = ( + "Solver produced non-finite parameter weights. The input data may contain large" + " values and need to be preprocessed." + ) + with pytest.raises(ValueError, match=msg): + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in square + warnings.simplefilter("ignore") + clf.fit(X, y) + + +def test_predict_proba_binary(): + # Test that predict_proba works as expected for binary class. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + + clf = MLPClassifier(hidden_layer_sizes=5, activation="logistic", random_state=1) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], 2 + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + assert roc_auc_score(y, y_proba[:, 1]) == 1.0 + + +def test_predict_proba_multiclass(): + # Test that predict_proba works as expected for multi class. 
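+ # With more than two classes the output layer uses a softmax, so each row of
+ # predict_proba is expected to sum to 1 and predict_log_proba to be its log.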
+ X = X_digits_multi[:10] + y = y_digits_multi[:10] + + clf = MLPClassifier(hidden_layer_sizes=5) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + y_proba = clf.predict_proba(X) + y_log_proba = clf.predict_log_proba(X) + + (n_samples, n_classes) = y.shape[0], np.unique(y).size + + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_predict_proba_multilabel(): + # Test that predict_proba works as expected for multilabel. + # Multilabel should not use softmax which makes probabilities sum to 1 + X, Y = make_multilabel_classification( + n_samples=50, random_state=0, return_indicator=True + ) + n_samples, n_classes = Y.shape + + clf = MLPClassifier(solver="lbfgs", hidden_layer_sizes=30, random_state=0) + clf.fit(X, Y) + y_proba = clf.predict_proba(X) + + assert y_proba.shape == (n_samples, n_classes) + assert_array_equal(y_proba > 0.5, Y) + + y_log_proba = clf.predict_log_proba(X) + proba_max = y_proba.argmax(axis=1) + proba_log_max = y_log_proba.argmax(axis=1) + + assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10 + assert_array_equal(proba_max, proba_log_max) + assert_allclose(y_log_proba, np.log(y_proba)) + + +def test_shuffle(): + # Test that the shuffle parameter affects the training process (it should) + X, y = make_regression(n_samples=50, n_features=5, n_targets=1, random_state=0) + + # The coefficients will be identical if both do or do not shuffle + for shuffle in [True, False]: + mlp1 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, + max_iter=1, + batch_size=1, + random_state=0, + shuffle=shuffle, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + # The coefficients will be slightly different if shuffle=True + mlp1 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=True + ) + mlp2 = MLPRegressor( + hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=False + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp1.fit(X, y) + mlp2.fit(X, y) + + assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_matrices(csr_container): + # Test that sparse and dense input matrices output the same results. + X = X_digits_binary[:50] + y = y_digits_binary[:50] + X_sparse = csr_container(X) + mlp = MLPClassifier(solver="lbfgs", hidden_layer_sizes=15, random_state=1) + mlp.fit(X, y) + pred1 = mlp.predict(X) + mlp.fit(X_sparse, y) + pred2 = mlp.predict(X_sparse) + assert_almost_equal(pred1, pred2) + pred1 = mlp.predict(X) + pred2 = mlp.predict(X_sparse) + assert_array_equal(pred1, pred2) + + +def test_tolerance(): + # Test tolerance. + # It should force the solver to exit the loop when it converges. + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + + +def test_verbose_sgd(): + # Test verbose. 
+ X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(solver="sgd", max_iter=2, verbose=10, hidden_layer_sizes=2) + old_stdout = sys.stdout + sys.stdout = output = StringIO() + + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + clf.partial_fit(X, y) + + sys.stdout = old_stdout + assert "Iteration" in output.getvalue() + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_early_stopping(MLPEstimator): + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.2 + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=True + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.max_iter > mlp_estimator.n_iter_ + + assert mlp_estimator.best_loss_ is None + assert isinstance(mlp_estimator.validation_scores_, list) + + valid_scores = mlp_estimator.validation_scores_ + best_valid_score = mlp_estimator.best_validation_score_ + assert max(valid_scores) == best_valid_score + assert best_valid_score + tol > valid_scores[-2] + assert best_valid_score + tol > valid_scores[-1] + + # check that the attributes `validation_scores_` and `best_validation_score_` + # are set to None when `early_stopping=False` + mlp_estimator = MLPEstimator( + tol=tol, max_iter=3000, solver="sgd", early_stopping=False + ) + mlp_estimator.fit(X, y) + assert mlp_estimator.validation_scores_ is None + assert mlp_estimator.best_validation_score_ is None + assert mlp_estimator.best_loss_ is not None + + +def test_adaptive_learning_rate(): + X = [[3, 2], [1, 6]] + y = [1, 0] + clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd", learning_rate="adaptive") + clf.fit(X, y) + assert clf.max_iter > clf.n_iter_ + assert 1e-6 > clf._optimizer.learning_rate + + +def test_warm_start(): + X = X_iris + y = y_iris + + y_2classes = np.array([0] * 75 + [1] * 75) + y_3classes = np.array([0] * 40 + [1] * 40 + [2] * 70) + y_3classes_alt = np.array([0] * 50 + [1] * 50 + [3] * 50) + y_4classes = np.array([0] * 37 + [1] * 37 + [2] * 38 + [3] * 38) + y_5classes = np.array([0] * 30 + [1] * 30 + [2] * 30 + [3] * 30 + [4] * 30) + + # No error raised + clf = MLPClassifier( + hidden_layer_sizes=2, solver="lbfgs", warm_start=True, random_state=42, tol=1e-2 + ).fit(X, y) + clf.fit(X, y) + clf.fit(X, y_3classes) + + for y_i in (y_2classes, y_3classes_alt, y_4classes, y_5classes): + clf = MLPClassifier( + hidden_layer_sizes=2, + solver="lbfgs", + warm_start=True, + random_state=42, + tol=1e-2, + ).fit(X, y) + message = ( + "warm_start can only be used where `y` has the same " + "classes as in the previous call to fit." + " Previously got [0 1 2], `y` has %s" % np.unique(y_i) + ) + with pytest.raises(ValueError, match=re.escape(message)): + clf.fit(X, y_i) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_warm_start_full_iteration(MLPEstimator): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16812 + # Check that the MLP estimator accomplish `max_iter` with a + # warm started estimator. 
+ X, y = X_iris, y_iris + max_iter = 3 + clf = MLPEstimator( + hidden_layer_sizes=2, solver="sgd", warm_start=True, max_iter=max_iter + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + clf.fit(X, y) + assert max_iter == clf.n_iter_ + clf.fit(X, y) + assert max_iter == clf.n_iter_ + + +def test_n_iter_no_change(): + # test n_iter_no_change using binary data set + # the classifying fitting process is not prone to loss curve fluctuations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + tol = 0.01 + max_iter = 3000 + + # test multiple n_iter_no_change + for n_iter_no_change in [2, 5, 10, 50, 100]: + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change + assert clf._no_improvement_count == n_iter_no_change + 1 + assert max_iter > clf.n_iter_ + + +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_n_iter_no_change_inf(): + # test n_iter_no_change using binary data set + # the fitting process should go to max_iter iterations + X = X_digits_binary[:100] + y = y_digits_binary[:100] + + # set a ridiculous tolerance + # this should always trigger _update_no_improvement_count() + tol = 1e9 + + # fit + n_iter_no_change = np.inf + max_iter = 3000 + clf = MLPClassifier( + tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change + ) + clf.fit(X, y) + + # validate n_iter_no_change doesn't cause early stopping + assert clf.n_iter_ == max_iter + + # validate _update_no_improvement_count() was always triggered + assert clf._no_improvement_count == clf.n_iter_ - 1 + + +def test_early_stopping_stratified(): + # Make sure data splitting for early stopping is stratified + X = [[1, 2], [2, 3], [3, 4], [4, 5]] + y = [0, 0, 0, 1] + + mlp = MLPClassifier(early_stopping=True) + with pytest.raises( + ValueError, match="The least populated class in y has only 1 member" + ): + mlp.fit(X, y) + + +def test_mlp_classifier_dtypes_casting(): + # Compare predictions for different dtypes + mlp_64 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + proba_64 = mlp_64.predict_proba(X_digits[300:]) + + mlp_32 = MLPClassifier( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=100, tol=1e-1 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + proba_32 = mlp_32.predict_proba(X_digits[300:].astype(np.float32)) + + assert_array_equal(pred_64, pred_32) + assert_allclose(proba_64, proba_32, rtol=1e-02) + + +def test_mlp_regressor_dtypes_casting(): + mlp_64 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_64.fit(X_digits[:300], y_digits[:300]) + pred_64 = mlp_64.predict(X_digits[300:]) + + mlp_32 = MLPRegressor( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=150, tol=1e-3 + ) + mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300]) + pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32)) + + assert_allclose(pred_64, pred_32, rtol=5e-04) + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_mlp_param_dtypes(dtype, Estimator): + # Checks if input dtype is used for network parameters + # and predictions + X, y = X_digits.astype(dtype), 
y_digits + mlp = Estimator( + alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50, tol=1e-1 + ) + mlp.fit(X[:300], y[:300]) + pred = mlp.predict(X[300:]) + + assert all([intercept.dtype == dtype for intercept in mlp.intercepts_]) + + assert all([coef.dtype == dtype for coef in mlp.coefs_]) + + if Estimator == MLPRegressor: + assert pred.dtype == dtype + + +def test_mlp_loading_from_joblib_partial_fit(tmp_path): + """Loading from MLP and partial fitting updates weights. Non-regression + test for #19626.""" + pre_trained_estimator = MLPRegressor( + hidden_layer_sizes=(42,), random_state=42, learning_rate_init=0.01, max_iter=200 + ) + features, target = [[2]], [4] + + # Fit on x=2, y=4 + pre_trained_estimator.fit(features, target) + + # dump and load model + pickled_file = tmp_path / "mlp.pkl" + joblib.dump(pre_trained_estimator, pickled_file) + load_estimator = joblib.load(pickled_file) + + # Train for a more epochs on point x=2, y=1 + fine_tune_features, fine_tune_target = [[2]], [1] + + for _ in range(200): + load_estimator.partial_fit(fine_tune_features, fine_tune_target) + + # finetuned model learned the new target + predicted_value = load_estimator.predict(fine_tune_features) + assert_allclose(predicted_value, fine_tune_target, rtol=1e-4) + + +@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor]) +def test_preserve_feature_names(Estimator): + """Check that feature names are preserved when early stopping is enabled. + + Feature names are required for consistency checks during scoring. + + Non-regression test for gh-24846 + """ + pd = pytest.importorskip("pandas") + rng = np.random.RandomState(0) + + X = pd.DataFrame(data=rng.randn(10, 2), columns=["colname_a", "colname_b"]) + y = pd.Series(data=np.full(10, 1), name="colname_y") + + model = Estimator(early_stopping=True, validation_fraction=0.2) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + model.fit(X, y) + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_warm_start_with_early_stopping(MLPEstimator): + """Check that early stopping works with warm start.""" + mlp = MLPEstimator( + max_iter=10, random_state=0, warm_start=True, early_stopping=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + n_validation_scores = len(mlp.validation_scores_) + mlp.set_params(max_iter=20) + mlp.fit(X_iris, y_iris) + assert len(mlp.validation_scores_) > n_validation_scores + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +@pytest.mark.parametrize("solver", ["sgd", "adam", "lbfgs"]) +def test_mlp_warm_start_no_convergence(MLPEstimator, solver): + """Check that we stop the number of iteration at `max_iter` when warm starting. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/24764 + """ + model = MLPEstimator( + solver=solver, + warm_start=True, + early_stopping=False, + max_iter=10, + n_iter_no_change=np.inf, + random_state=0, + ) + + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 10 + + model.set_params(max_iter=20) + with pytest.warns(ConvergenceWarning): + model.fit(X_iris, y_iris) + assert model.n_iter_ == 20 + + +@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor]) +def test_mlp_partial_fit_after_fit(MLPEstimator): + """Check partial fit does not fail after fit when early_stopping=True. + + Non-regression test for gh-25693. 
+ """ + mlp = MLPEstimator(early_stopping=True, random_state=0).fit(X_iris, y_iris) + + msg = "partial_fit does not support early_stopping=True" + with pytest.raises(ValueError, match=msg): + mlp.partial_fit(X_iris, y_iris) + + +def test_mlp_diverging_loss(): + """Test that a diverging model does not raise errors when early stopping is enabled. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/29504 + """ + mlp = MLPRegressor( + hidden_layer_sizes=100, + activation="identity", + solver="sgd", + alpha=0.0001, + learning_rate="constant", + learning_rate_init=1, + shuffle=True, + max_iter=20, + early_stopping=True, + n_iter_no_change=10, + random_state=0, + ) + + with warnings.catch_warnings(): + # RuntimeWarning: overflow encountered in matmul + # ConvergenceWarning: Stochastic Optimizer: Maximum iteration + warnings.simplefilter("ignore", RuntimeWarning) + warnings.simplefilter("ignore", ConvergenceWarning) + mlp.fit(X_iris, y_iris) + + # In python, float("nan") != float("nan") + assert str(mlp.validation_scores_[-1]) == str(np.nan) + assert isinstance(mlp.validation_scores_[-1], float) + + +def test_mlp_sample_weight_with_early_stopping(): + # Test code path for inner validation set splitting. + X, y = make_regression( + n_samples=100, + n_features=2, + n_informative=2, + random_state=42, + ) + sw = np.ones_like(y) + params = dict( + hidden_layer_sizes=10, + solver="adam", + early_stopping=True, + tol=1e-2, + learning_rate_init=0.01, + batch_size=10, + random_state=42, + ) + m1 = MLPRegressor( + **params, + ) + m1.fit(X, y, sample_weight=sw) + + m2 = MLPRegressor(**params).fit(X, y, sample_weight=None) + assert_allclose(m1.predict(X), m2.predict(X)) + + +def test_mlp_vs_poisson_glm_equivalent(global_random_seed): + """Test MLP with Poisson loss and no hidden layer equals GLM.""" + n = 100 + rng = np.random.default_rng(global_random_seed) + X = np.linspace(0, 1, n) + y = rng.poisson(np.exp(X + 1)) + X = X.reshape(n, -1) + glm = PoissonRegressor(alpha=0, tol=1e-7).fit(X, y) + # Unfortunately, we can't set a zero hidden_layer_size, so we use a trick by using + # just one hidden layer node with an identity activation. Coefficients will + # therefore be different, but predictions are the same. + mlp = MLPRegressor( + loss="poisson", + hidden_layer_sizes=(1,), + activation="identity", + alpha=0, + solver="lbfgs", + tol=1e-7, + random_state=np.random.RandomState(global_random_seed + 1), + ).fit(X, y) + + assert_allclose(mlp.predict(X), glm.predict(X), rtol=1e-4) + + # The same does not work with the squared error because the output activation is + # the identity instead of the exponential. 
+ mlp = MLPRegressor( + loss="squared_error", + hidden_layer_sizes=(1,), + activation="identity", + alpha=0, + solver="lbfgs", + tol=1e-7, + random_state=np.random.RandomState(global_random_seed + 1), + ).fit(X, y) + assert not np.allclose(mlp.predict(X), glm.predict(X), rtol=1e-4) + + +def test_minimum_input_sample_size(): + """Check error message when the validation set is too small.""" + X, y = make_regression(n_samples=2, n_features=5, random_state=0) + model = MLPRegressor(early_stopping=True, random_state=0) + with pytest.raises(ValueError, match="The validation set is too small"): + model.fit(X, y) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py new file mode 100644 index 0000000000000000000000000000000000000000..8211c9735923d650234d4268cb30336ddc3ebbb1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py @@ -0,0 +1,251 @@ +import re +import sys +from io import StringIO + +import numpy as np +import pytest + +from sklearn.datasets import load_digits +from sklearn.neural_network import BernoulliRBM +from sklearn.utils._testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS +from sklearn.utils.validation import assert_all_finite + +Xdigits, _ = load_digits(return_X_y=True) +Xdigits -= Xdigits.min() +Xdigits /= Xdigits.max() + + +def test_fit(): + X = Xdigits.copy() + + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, n_iter=7, random_state=9 + ) + rbm.fit(X) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + + # in-place tricks shouldn't have modified X + assert_array_equal(X, Xdigits) + + +def test_partial_fit(): + X = Xdigits.copy() + rbm = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=20, random_state=9 + ) + n_samples = X.shape[0] + n_batches = int(np.ceil(float(n_samples) / rbm.batch_size)) + batch_slices = np.array_split(X, n_batches) + + for i in range(7): + for batch in batch_slices: + rbm.partial_fit(batch) + + assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0) + assert_array_equal(X, Xdigits) + + +def test_transform(): + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + Xt1 = rbm1.transform(X) + Xt2 = rbm1._mean_hiddens(X) + + assert_array_equal(Xt1, Xt2) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_small_sparse(csr_container): + # BernoulliRBM should work on small sparse matrices. 
+ X = csr_container(Xdigits[:4]) + BernoulliRBM().fit(X) # no exception + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_small_sparse_partial_fit(sparse_container): + X_sparse = sparse_container(Xdigits[:100]) + X = Xdigits[:100].copy() + + rbm1 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + rbm2 = BernoulliRBM( + n_components=64, learning_rate=0.1, batch_size=10, random_state=9 + ) + + rbm1.partial_fit(X_sparse) + rbm2.partial_fit(X) + + assert_almost_equal( + rbm1.score_samples(X).mean(), rbm2.score_samples(X).mean(), decimal=0 + ) + + +def test_sample_hiddens(): + rng = np.random.RandomState(0) + X = Xdigits[:100] + rbm1 = BernoulliRBM(n_components=2, batch_size=5, n_iter=5, random_state=42) + rbm1.fit(X) + + h = rbm1._mean_hiddens(X[0]) + hs = np.mean([rbm1._sample_hiddens(X[0], rng) for i in range(100)], 0) + + assert_almost_equal(h, hs, decimal=1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_fit_gibbs(csc_container): + # XXX: this test is very seed-dependent! It probably needs to be rewritten. + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] + # from the same input + rng = np.random.RandomState(42) + X = np.array([[0.0], [1.0]]) + rbm1 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + # you need that much iters + rbm1.fit(X) + assert_almost_equal( + rbm1.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm1.gibbs(X), X) + + # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] from + # the same input even when the input is sparse, and test against non-sparse + rng = np.random.RandomState(42) + X = csc_container([[0.0], [1.0]]) + rbm2 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng) + rbm2.fit(X) + assert_almost_equal( + rbm2.components_, np.array([[0.02649814], [0.02009084]]), decimal=4 + ) + assert_almost_equal(rbm2.gibbs(X), X.toarray()) + assert_almost_equal(rbm1.components_, rbm2.components_) + + +def test_gibbs_smoke(): + # Check if we don't get NaNs sampling the full digits dataset. + # Also check that sampling again will yield different results. + X = Xdigits + rbm1 = BernoulliRBM(n_components=42, batch_size=40, n_iter=20, random_state=42) + rbm1.fit(X) + X_sampled = rbm1.gibbs(X) + assert_all_finite(X_sampled) + X_sampled2 = rbm1.gibbs(X) + assert np.all((X_sampled != X_sampled2).max(axis=1)) + + +@pytest.mark.parametrize("lil_containers", LIL_CONTAINERS) +def test_score_samples(lil_containers): + # Test score_samples (pseudo-likelihood) method. + # Assert that pseudo-likelihood is computed without clipping. + # See Fabian's blog, http://bit.ly/1iYefRk + rng = np.random.RandomState(42) + X = np.vstack([np.zeros(1000), np.ones(1000)]) + rbm1 = BernoulliRBM(n_components=10, batch_size=2, n_iter=10, random_state=rng) + rbm1.fit(X) + assert (rbm1.score_samples(X) < -300).all() + + # Sparse vs. dense should not affect the output. Also test sparse input + # validation. + rbm1.random_state = 42 + d_score = rbm1.score_samples(X) + rbm1.random_state = 42 + s_score = rbm1.score_samples(lil_containers(X)) + assert_almost_equal(d_score, s_score) + + # Test numerical stability (#2785): would previously generate infinities + # and crash with an exception. 
+ with np.errstate(under="ignore"): + rbm1.score_samples([np.arange(1000) * 100]) + + +def test_rbm_verbose(): + rbm = BernoulliRBM(n_iter=2, verbose=10) + old_stdout = sys.stdout + sys.stdout = StringIO() + try: + rbm.fit(Xdigits) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_sparse_and_verbose(csc_container): + # Make sure RBM works with sparse input when verbose=True + old_stdout = sys.stdout + sys.stdout = StringIO() + + X = csc_container([[0.0], [1.0]]) + rbm = BernoulliRBM( + n_components=2, batch_size=2, n_iter=1, random_state=42, verbose=True + ) + try: + rbm.fit(X) + s = sys.stdout.getvalue() + # make sure output is sound + assert re.match( + r"\[BernoulliRBM\] Iteration 1," + r" pseudo-likelihood = -?(\d)+(\.\d+)?," + r" time = (\d|\.)+s", + s, + ) + finally: + sys.stdout = old_stdout + + +@pytest.mark.parametrize( + "dtype_in, dtype_out", + [(np.float32, np.float32), (np.float64, np.float64), (int, np.float64)], +) +def test_transformer_dtypes_casting(dtype_in, dtype_out): + X = Xdigits[:100].astype(dtype_in) + rbm = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt = rbm.fit_transform(X) + + # dtype_in and dtype_out should be consistent + assert Xt.dtype == dtype_out, "transform dtype: {} - original dtype: {}".format( + Xt.dtype, X.dtype + ) + + +def test_convergence_dtype_consistency(): + # float 64 transformer + X_64 = Xdigits[:100].astype(np.float64) + rbm_64 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_64 = rbm_64.fit_transform(X_64) + + # float 32 transformer + X_32 = Xdigits[:100].astype(np.float32) + rbm_32 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42) + Xt_32 = rbm_32.fit_transform(X_32) + + # results and attributes should be close enough in 32 bit and 64 bit + assert_allclose(Xt_64, Xt_32, rtol=1e-06, atol=0) + assert_allclose( + rbm_64.intercept_hidden_, rbm_32.intercept_hidden_, rtol=1e-06, atol=0 + ) + assert_allclose( + rbm_64.intercept_visible_, rbm_32.intercept_visible_, rtol=1e-05, atol=0 + ) + assert_allclose(rbm_64.components_, rbm_32.components_, rtol=1e-03, atol=0) + assert_allclose(rbm_64.h_samples_, rbm_32.h_samples_) + + +@pytest.mark.parametrize("method", ["fit", "partial_fit"]) +def test_feature_names_out(method): + """Check `get_feature_names_out` for `BernoulliRBM`.""" + n_components = 10 + rbm = BernoulliRBM(n_components=n_components) + getattr(rbm, method)(Xdigits) + + names = rbm.get_feature_names_out() + expected_names = [f"bernoullirbm{i}" for i in range(n_components)] + assert_array_equal(expected_names, names) diff --git a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..58a9f0c7dda13fd288c1c86f6a52fede485787ad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py @@ -0,0 +1,112 @@ +import numpy as np + +from sklearn.neural_network._stochastic_optimizers import ( + AdamOptimizer, + BaseOptimizer, + SGDOptimizer, +) +from sklearn.utils._testing import assert_array_equal + +shapes = [(4, 6), (6, 8), (7, 8, 9)] + + +def test_base_optimizer(): + for lr in [10**i for i in range(-3, 4)]: + optimizer = BaseOptimizer(lr) + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_no_momentum(): + params = [np.zeros(shape) for 
shape in shapes] + rng = np.random.RandomState(0) + + for lr in [10**i for i in range(-3, 4)]: + optimizer = SGDOptimizer(params, lr, momentum=0, nesterov=False) + grads = [rng.random_sample(shape) for shape in shapes] + expected = [param - lr * grad for param, grad in zip(params, grads)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=False) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_sgd_optimizer_trigger_stopping(): + params = [np.zeros(shape) for shape in shapes] + lr = 2e-6 + optimizer = SGDOptimizer(params, lr, lr_schedule="adaptive") + assert not optimizer.trigger_stopping("", False) + assert lr / 5 == optimizer.learning_rate + assert optimizer.trigger_stopping("", False) + + +def test_sgd_optimizer_nesterovs_momentum(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.1 + rng = np.random.RandomState(0) + + for momentum in np.arange(0.5, 0.9, 0.1): + optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=True) + velocities = [rng.random_sample(shape) for shape in shapes] + optimizer.velocities = velocities + grads = [rng.random_sample(shape) for shape in shapes] + updates = [ + momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads) + ] + updates = [ + momentum * update - lr * grad for update, grad in zip(updates, grads) + ] + expected = [param + update for param, update in zip(params, updates)] + optimizer.update_params(params, grads) + + for exp, param in zip(expected, params): + assert_array_equal(exp, param) + + +def test_adam_optimizer(): + params = [np.zeros(shape) for shape in shapes] + lr = 0.001 + epsilon = 1e-8 + rng = np.random.RandomState(0) + + for beta_1 in np.arange(0.9, 1.0, 0.05): + for beta_2 in np.arange(0.995, 1.0, 0.001): + optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon) + ms = [rng.random_sample(shape) for shape in shapes] + vs = [rng.random_sample(shape) for shape in shapes] + t = 10 + optimizer.ms = ms + optimizer.vs = vs + optimizer.t = t - 1 + grads = [rng.random_sample(shape) for shape in shapes] + + ms = [beta_1 * m + (1 - beta_1) * grad for m, grad in zip(ms, grads)] + vs = [beta_2 * v + (1 - beta_2) * (grad**2) for v, grad in zip(vs, grads)] + learning_rate = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t) + updates = [ + -learning_rate * m / (np.sqrt(v) + epsilon) for m, v in zip(ms, vs) + ] + expected = [param + update for param, update in zip(params, updates)] + + optimizer.update_params(params, grads) + for exp, param in zip(expected, params): + assert_array_equal(exp, param) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48bb3aa6a7a4e811f02e13924658858984a21681 --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__init__.py @@ -0,0 +1,63 @@ +"""Methods for scaling, centering, normalization, binarization, and more.""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._data import ( + Binarizer, + KernelCenterer, + MaxAbsScaler, + MinMaxScaler, + Normalizer, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + add_dummy_feature, + binarize, + maxabs_scale, + minmax_scale, + normalize, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from ._discretization import KBinsDiscretizer +from ._encoders import OneHotEncoder, OrdinalEncoder +from ._function_transformer import FunctionTransformer +from ._label import LabelBinarizer, LabelEncoder, MultiLabelBinarizer, label_binarize +from ._polynomial import PolynomialFeatures, SplineTransformer +from ._target_encoder import TargetEncoder + +__all__ = [ + "Binarizer", + "FunctionTransformer", + "KBinsDiscretizer", + "KernelCenterer", + "LabelBinarizer", + "LabelEncoder", + "MaxAbsScaler", + "MinMaxScaler", + "MultiLabelBinarizer", + "Normalizer", + "OneHotEncoder", + "OrdinalEncoder", + "PolynomialFeatures", + "PowerTransformer", + "QuantileTransformer", + "RobustScaler", + "SplineTransformer", + "StandardScaler", + "TargetEncoder", + "add_dummy_feature", + "binarize", + "label_binarize", + "maxabs_scale", + "minmax_scale", + "normalize", + "power_transform", + "quantile_transform", + "robust_scale", + "scale", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17e15e243e288a4e726cc39c790f924fb72be9b0 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/__init__.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2c5c382922e1e41dd745f2471b2f91e5c41d254 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_discretization.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08c8341a0166b0d02ddcdf75eddb1ca54d45f53d Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_encoders.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc5ad9de109e7ee0bd4649989883f9117058eedd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_function_transformer.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-312.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..228020175451be16e55ca1c41da0938173e635bd Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_label.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c992002dbc75cc8be6e8da029f0509ec869f2b5 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_polynomial.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-312.pyc b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d111b0de0024befb293fead397d14adceeea9f4f Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/__pycache__/_target_encoder.cpython-312.pyc differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx new file mode 100644 index 0000000000000000000000000000000000000000..38e5c3069d252c0f31db2fe7b3046390eb30be12 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -0,0 +1,258 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ..utils._typedefs cimport uint8_t, int64_t, intp_t + +ctypedef uint8_t FLAG_t + +# We use the following verbatim block to determine whether the current +# platform's compiler supports 128-bit integer values intrinsically. +# This should work for GCC and CLANG on 64-bit architectures, but doesn't for +# MSVC on any architecture. We prefer to use 128-bit integers when possible +# because the intermediate calculations have a non-trivial risk of overflow. It +# is, however, very unlikely to come up on an average use case, hence 64-bit +# integers (i.e. `long long`) are "good enough" for most common cases. There is +# not much we can do to efficiently mitigate the overflow risk on the Windows +# platform at this time. Consider this a "best effort" design decision that +# could be revisited later in case someone comes up with a safer option that +# does not hurt the performance of the common cases. +# See `test_sizeof_LARGEST_INT_t()`for more information on exact type expectations. +cdef extern from *: + """ + #ifdef __SIZEOF_INT128__ + typedef __int128 LARGEST_INT_t; + #elif (__clang__ || __EMSCRIPTEN__) && !__i386__ + typedef _BitInt(128) LARGEST_INT_t; + #else + typedef long long LARGEST_INT_t; + #endif + """ + ctypedef long long LARGEST_INT_t + + +# Determine the size of `LARGEST_INT_t` at runtime. +# Used in `test_sizeof_LARGEST_INT_t`. +def _get_sizeof_LARGEST_INT_t(): + return sizeof(LARGEST_INT_t) + + +# TODO: use `{int,float}{32,64}_t` when cython#5230 is resolved: +# https://github.com/cython/cython/issues/5230 +ctypedef fused DATA_t: + float + double + int + long long +# INDEX_{A,B}_t are defined to generate a proper Cartesian product +# of types through Cython fused-type expansion. 
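+# Roughly: INDEX_A_t types the index arrays of the input CSR matrix and INDEX_B_t
+# those of the expanded output, so specializations are generated for every
+# combination of 32-bit and 64-bit index types.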
+ctypedef fused INDEX_A_t: + signed int + signed long long +ctypedef fused INDEX_B_t: + signed int + signed long long + +cdef inline int64_t _deg2_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 2 expansion + + n_features is the dimensionality of the input data, i and j are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return n_features * i - i * (i + 3) / 2 - 1 + j + else: + return n_features * i - i* (i + 1) / 2 + j + + +cdef inline int64_t _deg3_column( + LARGEST_INT_t n_features, + LARGEST_INT_t i, + LARGEST_INT_t j, + LARGEST_INT_t k, + FLAG_t interaction_only +) nogil: + """Compute the index of the column for a degree 3 expansion + + n_features is the dimensionality of the input data, i, j and k are the indices + for the columns involved in the expansion. + """ + if interaction_only: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i * (i**2 + 11) - (3 * j) * (j + 3) + ) / 6 + i**2 + n_features * (j - 1 - 2 * i) + k + ) + else: + return ( + ( + (3 * n_features) * (n_features * i - i**2) + + i ** 3 - i - (3 * j) * (j + 1) + ) / 6 + n_features * j + k + ) + + +def py_calc_expanded_nnz_deg2(n, interaction_only): + return n * (n + 1) // 2 - interaction_only * n + + +def py_calc_expanded_nnz_deg3(n, interaction_only): + return n * (n**2 + 3 * n + 2) // 6 - interaction_only * n**2 + + +cpdef int64_t _calc_expanded_nnz( + LARGEST_INT_t n, + FLAG_t interaction_only, + LARGEST_INT_t degree +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements of a single row. + """ + # This is the maximum value before the intermediate computation + # d**2 + d overflows + # Solution to d**2 + d = maxint64 + # SymPy: solve(x**2 + x - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG2 = 3037000499 + + # This is the maximum value before the intermediate computation + # d**3 + 3 * d**2 + 2*d overflows + # Solution to d**3 + 3 * d**2 + 2*d = maxint64 + # SymPy: solve(x * (x**2 + 3 * x + 2) - int64_max, x) + cdef int64_t MAX_SAFE_INDEX_CALC_DEG3 = 2097151 + + if degree == 2: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG2: + return n * (n + 1) / 2 - interaction_only * n + return py_calc_expanded_nnz_deg2(n, interaction_only) + else: + # Only need to check when not using 128-bit integers + if sizeof(LARGEST_INT_t) < 16 and n <= MAX_SAFE_INDEX_CALC_DEG3: + return n * (n**2 + 3 * n + 2) / 6 - interaction_only * n**2 + return py_calc_expanded_nnz_deg3(n, interaction_only) + +cpdef int64_t _calc_total_nnz( + INDEX_A_t[:] indptr, + FLAG_t interaction_only, + int64_t degree, +): + """ + Calculates the number of non-zero interaction terms generated by the + non-zero elements across all rows for a single degree. 
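+ For a single row with d stored values this is d * (d + 1) / 2 for degree 2
+ and d * (d**2 + 3 * d + 2) / 6 for degree 3, minus d (respectively d**2)
+ when interaction_only is set; see _calc_expanded_nnz above.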
+ """ + cdef int64_t total_nnz=0 + cdef intp_t row_idx + for row_idx in range(len(indptr) - 1): + total_nnz += _calc_expanded_nnz( + indptr[row_idx + 1] - indptr[row_idx], + interaction_only, + degree + ) + return total_nnz + + +cpdef void _csr_polynomial_expansion( + const DATA_t[:] data, # IN READ-ONLY + const INDEX_A_t[:] indices, # IN READ-ONLY + const INDEX_A_t[:] indptr, # IN READ-ONLY + INDEX_A_t n_features, + DATA_t[:] result_data, # OUT + INDEX_B_t[:] result_indices, # OUT + INDEX_B_t[:] result_indptr, # OUT + FLAG_t interaction_only, + FLAG_t degree +): + """ + Perform a second or third degree polynomial or interaction expansion on a + compressed sparse row (CSR) matrix. The method used only takes products of + non-zero features. For a matrix with density :math:`d`, this results in a + speedup on the order of :math:`(1/d)^k` where :math:`k` is the degree of + the expansion, assuming all rows are of similar density. + + Parameters + ---------- + data : memory view on nd-array + The "data" attribute of the input CSR matrix. + + indices : memory view on nd-array + The "indices" attribute of the input CSR matrix. + + indptr : memory view on nd-array + The "indptr" attribute of the input CSR matrix. + + n_features : int + The dimensionality of the input CSR matrix. + + result_data : nd-array + The output CSR matrix's "data" attribute. + It is modified by this routine. + + result_indices : nd-array + The output CSR matrix's "indices" attribute. + It is modified by this routine. + + result_indptr : nd-array + The output CSR matrix's "indptr" attribute. + It is modified by this routine. + + interaction_only : int + 0 for a polynomial expansion, 1 for an interaction expansion. + + degree : int + The degree of the expansion. This must be either 2 or 3. + + References + ---------- + "Leveraging Sparsity to Speed Up Polynomial Feature Expansions of CSR + Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes. + """ + + # Make the arrays that will form the CSR matrix of the expansion. 
+ cdef INDEX_A_t row_i, row_starts, row_ends, i, j, k, i_ptr, j_ptr, k_ptr + cdef INDEX_B_t expanded_index=0, num_cols_in_row, col + with nogil: + result_indptr[0] = indptr[0] + for row_i in range(indptr.shape[0]-1): + row_starts = indptr[row_i] + row_ends = indptr[row_i + 1] + num_cols_in_row = 0 + for i_ptr in range(row_starts, row_ends): + i = indices[i_ptr] + for j_ptr in range(i_ptr + interaction_only, row_ends): + j = indices[j_ptr] + if degree == 2: + col = _deg2_column( + n_features, + i, j, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + else: + # degree == 3 + for k_ptr in range(j_ptr + interaction_only, row_ends): + k = indices[k_ptr] + col = _deg3_column( + n_features, + i, j, k, + interaction_only + ) + result_indices[expanded_index] = col + result_data[expanded_index] = ( + data[i_ptr] * data[j_ptr] * data[k_ptr] + ) + expanded_index += 1 + num_cols_in_row += 1 + + result_indptr[row_i+1] = result_indptr[row_i] + num_cols_in_row + return diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py new file mode 100644 index 0000000000000000000000000000000000000000..fe138cda73803ea7612215b0f9ca3abd11083f23 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_data.py @@ -0,0 +1,3706 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral, Real + +import numpy as np +from scipy import sparse, stats +from scipy.special import boxcox, inv_boxcox + +from sklearn.utils import metadata_routing + +from ..base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + OneToOneFeatureMixin, + TransformerMixin, + _fit_context, +) +from ..utils import _array_api, check_array, resample +from ..utils._array_api import ( + _find_matching_floating_dtype, + _modify_in_place_if_numpy, + device, + get_namespace, + get_namespace_and_device, +) +from ..utils._param_validation import Interval, Options, StrOptions, validate_params +from ..utils.extmath import _incremental_mean_and_var, row_norms +from ..utils.fixes import _yeojohnson_lambda +from ..utils.sparsefuncs import ( + incr_mean_variance_axis, + inplace_column_scale, + mean_variance_axis, + min_max_axis, +) +from ..utils.sparsefuncs_fast import ( + inplace_csr_row_normalize_l1, + inplace_csr_row_normalize_l2, +) +from ..utils.validation import ( + FLOAT_DTYPES, + _check_sample_weight, + check_is_fitted, + check_random_state, + validate_data, +) +from ._encoders import OneHotEncoder + +BOUNDS_THRESHOLD = 1e-7 + +__all__ = [ + "Binarizer", + "KernelCenterer", + "MaxAbsScaler", + "MinMaxScaler", + "Normalizer", + "OneHotEncoder", + "PowerTransformer", + "QuantileTransformer", + "RobustScaler", + "StandardScaler", + "add_dummy_feature", + "binarize", + "maxabs_scale", + "minmax_scale", + "normalize", + "power_transform", + "quantile_transform", + "robust_scale", + "scale", +] + + +def _is_constant_feature(var, mean, n_samples): + """Detect if a feature is indistinguishable from a constant feature. + + The detection is based on its computed variance and on the theoretical + error bounds of the '2 pass algorithm' for variance computation. + + See "Algorithms for computing the sample variance: analysis and + recommendations", by Chan, Golub, and LeVeque. 
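+ Concretely, a feature is flagged as constant when its variance satisfies
+ var <= n_samples * eps * var + (n_samples * mean * eps) ** 2, with eps the
+ float64 machine epsilon (see the implementation below).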
+ """ + # In scikit-learn, variance is always computed using float64 accumulators. + eps = np.finfo(np.float64).eps + + upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2 + return var <= upper_bound + + +def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): + """Set scales of near constant features to 1. + + The goal is to avoid division by very small or zero values. + + Near constant features are detected automatically by identifying + scales close to machine precision unless they are precomputed by + the caller and passed with the `constant_mask` kwarg. + + Typically for standard scaling, the scales are the standard + deviation while near constant features are better detected on the + computed variances which are closer to machine precision by + construction. + """ + # if we are fitting on 1D arrays, scale might be a scalar + if np.isscalar(scale): + if scale == 0.0: + scale = 1.0 + return scale + # scale is an array + else: + xp, _ = get_namespace(scale) + if constant_mask is None: + # Detect near constant values to avoid dividing by a very small + # value that could lead to surprising results and numerical + # stability issues. + constant_mask = scale < 10 * xp.finfo(scale.dtype).eps + + if copy: + # New array to avoid side-effects + scale = xp.asarray(scale, copy=True) + scale[constant_mask] = 1.0 + return scale + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + "with_mean": ["boolean"], + "with_std": ["boolean"], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): + """Standardize a dataset along any axis. + + Center to the mean and component wise scale to unit variance. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to center and scale. + + axis : {0, 1}, default=0 + Axis used to compute the means and standard deviations along. If 0, + independently standardize each feature, otherwise (if 1) standardize + each sample. + + with_mean : bool, default=True + If True, center the data before scaling. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + StandardScaler : Performs scaling to unit variance using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_mean=False` (in that case, only variance scaling will be + performed on the features of the CSC matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSC matrix. + + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. 
+ + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.StandardScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`. + + Examples + -------- + >>> from sklearn.preprocessing import scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> scale(X, axis=0) # scaling each column independently + array([[-1., 1., 1.], + [ 1., -1., -1.]]) + >>> scale(X, axis=1) # scaling each row independently + array([[-1.37, 0.39, 0.98], + [-1.22, 0. , 1.22]]) + """ + X = check_array( + X, + accept_sparse="csc", + copy=copy, + ensure_2d=False, + estimator="the scale function", + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + if sparse.issparse(X): + if with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` instead" + " See docstring for motivation and alternatives." + ) + if axis != 0: + raise ValueError( + "Can only scale sparse matrix on axis=0, got axis=%d" % axis + ) + if with_std: + _, var = mean_variance_axis(X, axis=0) + var = _handle_zeros_in_scale(var, copy=False) + inplace_column_scale(X, 1 / np.sqrt(var)) + else: + X = np.asarray(X) + if with_mean: + mean_ = np.nanmean(X, axis) + if with_std: + scale_ = np.nanstd(X, axis) + # Xr is a view on the original array that enables easy use of + # broadcasting on the axis in which we are interested in + Xr = np.rollaxis(X, axis) + if with_mean: + Xr -= mean_ + mean_1 = np.nanmean(Xr, axis=0) + # Verify that mean_1 is 'close to zero'. If X contains very + # large values, mean_1 can also be very large, due to a lack of + # precision of mean_. In this case, a pre-scaling of the + # concerned feature is efficient, for instance by its mean or + # maximum. + if not np.allclose(mean_1, 0): + warnings.warn( + "Numerical issues were encountered " + "when centering the data " + "and might not be solved. Dataset may " + "contain too large values. You may need " + "to prescale your features." + ) + Xr -= mean_1 + if with_std: + scale_ = _handle_zeros_in_scale(scale_, copy=False) + Xr /= scale_ + if with_mean: + mean_2 = np.nanmean(Xr, axis=0) + # If mean_2 is not 'close to zero', it comes from the fact that + # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even + # if mean_1 was close to zero. The problem is thus essentially + # due to the lack of precision of mean_. A solution is then to + # subtract the mean again: + if not np.allclose(mean_2, 0): + warnings.warn( + "Numerical issues were encountered " + "when scaling the data " + "and might not be solved. The standard " + "deviation of the data is probably " + "very close to 0. " + ) + Xr -= mean_2 + return X + + +class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Transform features by scaling each feature to a given range. 
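Aside, not part of the patched file: before the `MinMaxScaler` definition continues, a small sketch of the sparse-input behaviour of `scale` noted above; centering is refused for sparse matrices, so only variance scaling is applied. The toy matrix is arbitrary.

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import scale

    X = sp.csc_matrix(np.array([[1.0, 0.0],
                                [3.0, 0.0],
                                [5.0, 4.0]]))

    # with_mean=True (the default) would raise ValueError for sparse input;
    # with_mean=False divides each column by its standard deviation only.
    X_scaled = scale(X, with_mean=False)
    print(X_scaled.toarray())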
+ + This estimator scales and translates each feature individually such + that it is in the given range on the training set, e.g. between + zero and one. + + The transformation is given by:: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly + scales them down into a fixed range, where the largest occurring data point + corresponds to the maximum value and the smallest one corresponds to the + minimum value. For an example visualization, refer to :ref:`Compare + MinMaxScaler with other scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array). + + clip : bool, default=False + Set to True to clip transformed values of held-out data to + provided `feature range`. + + .. versionadded:: 0.24 + + Attributes + ---------- + min_ : ndarray of shape (n_features,) + Per feature adjustment for minimum. Equivalent to + ``min - X.min(axis=0) * self.scale_`` + + scale_ : ndarray of shape (n_features,) + Per feature relative scaling of the data. Equivalent to + ``(max - min) / (X.max(axis=0) - X.min(axis=0))`` + + .. versionadded:: 0.17 + *scale_* attribute. + + data_min_ : ndarray of shape (n_features,) + Per feature minimum seen in the data + + .. versionadded:: 0.17 + *data_min_* + + data_max_ : ndarray of shape (n_features,) + Per feature maximum seen in the data + + .. versionadded:: 0.17 + *data_max_* + + data_range_ : ndarray of shape (n_features,) + Per feature range ``(data_max_ - data_min_)`` seen in the data + + .. versionadded:: 0.17 + *data_range_* + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + n_samples_seen_ : int + The number of samples processed by the estimator. + It will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + minmax_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> from sklearn.preprocessing import MinMaxScaler + >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] + >>> scaler = MinMaxScaler() + >>> print(scaler.fit(data)) + MinMaxScaler() + >>> print(scaler.data_max_) + [ 1. 18.] + >>> print(scaler.transform(data)) + [[0. 0. ] + [0.25 0.25] + [0.5 0.5 ] + [1. 1. ]] + >>> print(scaler.transform([[2, 2]])) + [[1.5 0. ]] + """ + + _parameter_constraints: dict = { + "feature_range": [tuple], + "copy": ["boolean"], + "clip": ["boolean"], + } + + def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): + self.feature_range = feature_range + self.copy = copy + self.clip = clip + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. 
+ """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.min_ + del self.n_samples_seen_ + del self.data_min_ + del self.data_max_ + del self.data_range_ + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of min and max on X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + feature_range = self.feature_range + if feature_range[0] >= feature_range[1]: + raise ValueError( + "Minimum of desired feature range must be smaller than maximum. Got %s." + % str(feature_range) + ) + + if sparse.issparse(X): + raise TypeError( + "MinMaxScaler does not support sparse input. " + "Consider using MaxAbsScaler instead." + ) + + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + device_ = device(X) + feature_range = ( + xp.asarray(feature_range[0], dtype=X.dtype, device=device_), + xp.asarray(feature_range[1], dtype=X.dtype, device=device_), + ) + + data_min = _array_api._nanmin(X, axis=0, xp=xp) + data_max = _array_api._nanmax(X, axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + data_min = xp.minimum(self.data_min_, data_min) + data_max = xp.maximum(self.data_max_, data_max) + self.n_samples_seen_ += X.shape[0] + + data_range = data_max - data_min + self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale( + data_range, copy=True + ) + self.min_ = feature_range[0] - data_min * self.scale_ + self.data_min_ = data_min + self.data_max_ = data_max + self.data_range_ = data_range + return self + + def transform(self, X): + """Scale features of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. + + Returns + ------- + Xt : ndarray of shape (n_samples, n_features) + Transformed data. 
+ """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + reset=False, + ) + + X *= self.scale_ + X += self.min_ + if self.clip: + device_ = device(X) + X = _modify_in_place_if_numpy( + xp, + xp.clip, + X, + xp.asarray(self.feature_range[0], dtype=X.dtype, device=device_), + xp.asarray(self.feature_range[1], dtype=X.dtype, device=device_), + out=X, + ) + return X + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data that will be transformed. It cannot be sparse. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Transformed data. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + X -= self.min_ + X /= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, i.e. between + zero and one. + + The transformation is given by (when ``axis=0``):: + + X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) + X_scaled = X_std * (max - min) + min + + where min, max = feature_range. + + The transformation is calculated as (when ``axis=0``):: + + X_scaled = scale * X + min - X.min(axis=0) * scale + where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) + + This transformation is often used as an alternative to zero mean, + unit variance scaling. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.17 + *minmax_scale* function interface + to :class:`~sklearn.preprocessing.MinMaxScaler`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + feature_range : tuple (min, max), default=(0, 1) + Desired range of transformed data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MinMaxScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`. 
+ + See Also + -------- + MinMaxScaler : Performs scaling to a given range using the Transformer + API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import minmax_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> minmax_scale(X, axis=0) # scale each column independently + array([[0., 1., 1.], + [1., 0., 0.]]) + >>> minmax_scale(X, axis=1) # scale each row independently + array([[0. , 0.75, 1. ], + [0. , 0.5 , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MinMaxScaler(feature_range=feature_range, copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Standardize features by removing the mean and scaling to unit variance. + + The standard score of a sample `x` is calculated as: + + .. code-block:: text + + z = (x - u) / s + + where `u` is the mean of the training samples or zero if `with_mean=False`, + and `s` is the standard deviation of the training samples or one if + `with_std=False`. + + Centering and scaling happen independently on each feature by computing + the relevant statistics on the samples in the training set. Mean and + standard deviation are then stored to be used on later data using + :meth:`transform`. + + Standardization of a dataset is a common requirement for many + machine learning estimators: they might behave badly if the + individual features do not more or less look like standard normally + distributed data (e.g. Gaussian with 0 mean and unit variance). + + For instance many elements used in the objective function of + a learning algorithm (such as the RBF kernel of Support Vector + Machines or the L1 and L2 regularizers of linear models) assume that + all features are centered around 0 and have variance in the same + order. If a feature has a variance that is orders of magnitude larger + than others, it might dominate the objective function and make the + estimator unable to learn from other features correctly as expected. + + `StandardScaler` is sensitive to outliers, and the features may scale + differently from each other in the presence of outliers. For an example + visualization, refer to :ref:`Compare StandardScaler with other scalers + `. + + This scaler can also be applied to sparse CSR or CSC matrices by passing + `with_mean=False` to avoid breaking the sparsity structure of the data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + copy : bool, default=True + If False, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + with_mean : bool, default=True + If True, center the data before scaling. 
+ This does not work (and will raise an exception) when attempted on + sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_std : bool, default=True + If True, scale the data to unit variance (or equivalently, + unit standard deviation). + + Attributes + ---------- + scale_ : ndarray of shape (n_features,) or None + Per feature relative scaling of the data to achieve zero mean and unit + variance. Generally this is calculated using `np.sqrt(var_)`. If a + variance is zero, we can't achieve unit variance, and the data is left + as-is, giving a scaling factor of 1. `scale_` is equal to `None` + when `with_std=False`. + + .. versionadded:: 0.17 + *scale_* + + mean_ : ndarray of shape (n_features,) or None + The mean value for each feature in the training set. + Equal to ``None`` when ``with_mean=False`` and ``with_std=False``. + + var_ : ndarray of shape (n_features,) or None + The variance for each feature in the training set. Used to compute + `scale_`. Equal to ``None`` when ``with_mean=False`` and + ``with_std=False``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_seen_ : int or ndarray of shape (n_features,) + The number of samples processed by the estimator for each feature. + If there are no missing samples, the ``n_samples_seen`` will be an + integer, otherwise it will be an array of dtype int. If + `sample_weights` are used it will be a float (if no missing data) + or an array of dtype float that sums the weights seen so far. + Will be reset on new calls to fit, but increments across + ``partial_fit`` calls. + + See Also + -------- + scale : Equivalent function without the estimator API. + + :class:`~sklearn.decomposition.PCA` : Further removes the linear + correlation across features with 'whiten=True'. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. + + Examples + -------- + >>> from sklearn.preprocessing import StandardScaler + >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]] + >>> scaler = StandardScaler() + >>> print(scaler.fit(data)) + StandardScaler() + >>> print(scaler.mean_) + [0.5 0.5] + >>> print(scaler.transform(data)) + [[-1. -1.] + [-1. -1.] + [ 1. 1.] + [ 1. 1.]] + >>> print(scaler.transform([[2, 2]])) + [[3. 3.]] + """ + + _parameter_constraints: dict = { + "copy": ["boolean"], + "with_mean": ["boolean"], + "with_std": ["boolean"], + } + + def __init__(self, *, copy=True, with_mean=True, with_std=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.n_samples_seen_ + del self.mean_ + del self.var_ + + def fit(self, X, y=None, sample_weight=None): + """Compute the mean and std to be used for later scaling. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y, sample_weight) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None, sample_weight=None): + """Online computation of mean and std on X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + The algorithm for incremental mean and std is given in Equation 1.5a,b + in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms + for computing the sample variance: Analysis and recommendations." + The American Statistician 37.3 (1983): 242-247: + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default=None + Individual weights for each sample. + + .. versionadded:: 0.24 + parameter *sample_weight* support to StandardScaler. + + Returns + ------- + self : object + Fitted scaler. + """ + first_call = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + reset=first_call, + ) + n_features = X.shape[1] + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + # Even in the case of `with_mean=False`, we update the mean anyway + # This is needed for the incremental computation of the var + # See incr_mean_variance_axis and _incremental_mean_variance_axis + + # if n_samples_seen_ is an integer (i.e. no missing values), we need to + # transform it to a NumPy array of shape (n_features,) required by + # incr_mean_variance_axis and _incremental_variance_axis + dtype = np.int64 if sample_weight is None else X.dtype + if not hasattr(self, "n_samples_seen_"): + self.n_samples_seen_ = np.zeros(n_features, dtype=dtype) + elif np.size(self.n_samples_seen_) == 1: + self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]) + self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives." 
+ ) + sparse_constructor = ( + sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix + ) + + if self.with_std: + # First pass + if not hasattr(self, "scale_"): + self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis( + X, axis=0, weights=sample_weight, return_sum_weights=True + ) + # Next passes + else: + ( + self.mean_, + self.var_, + self.n_samples_seen_, + ) = incr_mean_variance_axis( + X, + axis=0, + last_mean=self.mean_, + last_var=self.var_, + last_n=self.n_samples_seen_, + weights=sample_weight, + ) + # We force the mean and variance to float64 for large arrays + # See https://github.com/scikit-learn/scikit-learn/pull/12338 + self.mean_ = self.mean_.astype(np.float64, copy=False) + self.var_ = self.var_.astype(np.float64, copy=False) + else: + self.mean_ = None # as with_mean must be False for sparse + self.var_ = None + weights = _check_sample_weight(sample_weight, X) + sum_weights_nan = weights @ sparse_constructor( + (np.isnan(X.data), X.indices, X.indptr), shape=X.shape + ) + self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype( + dtype + ) + else: + # First pass + if not hasattr(self, "scale_"): + self.mean_ = 0.0 + if self.with_std: + self.var_ = 0.0 + else: + self.var_ = None + + if not self.with_mean and not self.with_std: + self.mean_ = None + self.var_ = None + self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0) + + else: + self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var( + X, + self.mean_, + self.var_, + self.n_samples_seen_, + sample_weight=sample_weight, + ) + + # for backward-compatibility, reduce n_samples_seen_ to an integer + # if the number of samples is the same for each feature (i.e. no + # missing values) + if np.ptp(self.n_samples_seen_) == 0: + self.n_samples_seen_ = self.n_samples_seen_[0] + + if self.with_std: + # Extract the list of near constant features on the raw variances, + # before taking the square root. + constant_mask = _is_constant_feature( + self.var_, self.mean_, self.n_samples_seen_ + ) + self.scale_ = _handle_zeros_in_scale( + np.sqrt(self.var_), copy=False, constant_mask=constant_mask + ) + else: + self.scale_ = None + + return self + + def transform(self, X, copy=None): + """Perform standardization by centering and scaling. + + Parameters + ---------- + X : {array-like, sparse matrix of shape (n_samples, n_features) + The data used to scale along the features axis. + copy : bool, default=None + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + copy = copy if copy is not None else self.copy + X = validate_data( + self, + X, + reset=False, + accept_sparse="csr", + copy=copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot center sparse matrices: pass `with_mean=False` " + "instead. See docstring for motivation and alternatives." + ) + if self.scale_ is not None: + inplace_column_scale(X, 1 / self.scale_) + else: + if self.with_mean: + X -= self.mean_ + if self.with_std: + X /= self.scale_ + return X + + def inverse_transform(self, X, copy=None): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. + + copy : bool, default=None + Copy the input `X` or not. 
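Aside, not part of the patched file: a small sketch of the incremental path implemented by `partial_fit` above; fitting in two batches should agree with a single `fit` up to floating-point error. The batch split is arbitrary.

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 3))

    full = StandardScaler().fit(X)

    incremental = StandardScaler()
    incremental.partial_fit(X[:120])    # first batch
    incremental.partial_fit(X[120:])    # remaining samples

    assert np.allclose(full.mean_, incremental.mean_)
    assert np.allclose(full.var_, incremental.var_)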
+ + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + copy = copy if copy is not None else self.copy + X = check_array( + X, + accept_sparse="csr", + copy=copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_mean: + raise ValueError( + "Cannot uncenter sparse matrices: pass `with_mean=False` " + "instead See docstring for motivation and alternatives." + ) + if self.scale_ is not None: + inplace_column_scale(X, self.scale_) + else: + if self.with_std: + X *= self.scale_ + if self.with_mean: + X += self.mean_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.sparse = not self.with_mean + tags.transformer_tags.preserves_dtype = ["float64", "float32"] + return tags + + +class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Scale each feature by its maximum absolute value. + + This estimator scales and translates each feature individually such + that the maximal absolute value of each feature in the + training set will be 1.0. It does not shift/center the data, and + thus does not destroy any sparsity. + + This scaler can also be applied to sparse CSR or CSC matrices. + + `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly + scales them down. For an example visualization, refer to :ref:`Compare + MaxAbsScaler with other scalers `. + + .. versionadded:: 0.17 + + Parameters + ---------- + copy : bool, default=True + Set to False to perform inplace scaling and avoid a copy (if the input + is already a numpy array). + + Attributes + ---------- + scale_ : ndarray of shape (n_features,) + Per feature relative scaling of the data. + + .. versionadded:: 0.17 + *scale_* attribute. + + max_abs_ : ndarray of shape (n_features,) + Per feature maximum absolute value. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_samples_seen_ : int + The number of samples processed by the estimator. Will be reset on + new calls to fit, but increments across ``partial_fit`` calls. + + See Also + -------- + maxabs_scale : Equivalent function without the estimator API. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> from sklearn.preprocessing import MaxAbsScaler + >>> X = [[ 1., -1., 2.], + ... [ 2., 0., 0.], + ... [ 0., 1., -1.]] + >>> transformer = MaxAbsScaler().fit(X) + >>> transformer + MaxAbsScaler() + >>> transformer.transform(X) + array([[ 0.5, -1. , 1. ], + [ 1. , 0. , 0. ], + [ 0. , 1. , -0.5]]) + """ + + _parameter_constraints: dict = {"copy": ["boolean"]} + + def __init__(self, *, copy=True): + self.copy = copy + + def _reset(self): + """Reset internal data-dependent state of the scaler, if necessary. + + __init__ parameters are not touched. + """ + # Checking one attribute is enough, because they are all set together + # in partial_fit + if hasattr(self, "scale_"): + del self.scale_ + del self.n_samples_seen_ + del self.max_abs_ + + def fit(self, X, y=None): + """Compute the maximum absolute value to be used for later scaling. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the per-feature minimum and maximum + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + # Reset internal state before fitting + self._reset() + return self.partial_fit(X, y) + + @_fit_context(prefer_skip_nested_validation=True) + def partial_fit(self, X, y=None): + """Online computation of max absolute value of X for later scaling. + + All of X is processed as a single batch. This is intended for cases + when :meth:`fit` is not feasible due to very large number of + `n_samples` or because X is read from a continuous stream. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the mean and standard deviation + used for later scaling along the features axis. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted scaler. + """ + xp, _ = get_namespace(X) + + first_pass = not hasattr(self, "n_samples_seen_") + X = validate_data( + self, + X, + reset=first_pass, + accept_sparse=("csr", "csc"), + dtype=_array_api.supported_float_dtypes(xp), + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) + max_abs = np.maximum(np.abs(mins), np.abs(maxs)) + else: + max_abs = _array_api._nanmax(xp.abs(X), axis=0, xp=xp) + + if first_pass: + self.n_samples_seen_ = X.shape[0] + else: + max_abs = xp.maximum(self.max_abs_, max_abs) + self.n_samples_seen_ += X.shape[0] + + self.max_abs_ = max_abs + self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) + return self + + def transform(self, X): + """Scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be scaled. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + reset=False, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, 1.0 / self.scale_) + else: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data that should be transformed back. + + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + inplace_column_scale(X, self.scale_) + else: + X *= self.scale_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.sparse = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "axis": [Options(Integral, {0, 1})], + }, + prefer_skip_nested_validation=False, +) +def maxabs_scale(X, *, axis=0, copy=True): + """Scale each feature to the [-1, 1] range without breaking the sparsity. 
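Aside, not part of the patched file: a small sketch of the point made just above; max-abs scaling only divides each column by a positive constant, so zeros stay zeros and the sparse structure is preserved. The data reuse the MaxAbsScaler docstring example.

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import MaxAbsScaler

    X = sp.csr_matrix(np.array([[ 1.0, -1.0,  2.0],
                                [ 2.0,  0.0,  0.0],
                                [ 0.0,  1.0, -1.0]]))

    Xt = MaxAbsScaler().fit_transform(X)
    print(sp.issparse(Xt), Xt.nnz == X.nnz)   # True True: no fill-in, no lost entries
    print(Xt.toarray())                       # each column divided by its max absolute value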
+ + This estimator scales each feature individually such + that the maximal absolute value of each feature in the + training set will be 1.0. + + This scaler can also be applied to sparse CSR or CSC matrices. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + axis : {0, 1}, default=0 + Axis used to scale along. If 0, independently scale each feature, + otherwise (if 1) scale each sample. + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know + what you are doing. A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.MaxAbsScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`. + + See Also + -------- + MaxAbsScaler : Performs scaling to the [-1, 1] range using + the Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + NaNs are treated as missing values: disregarded to compute the statistics, + and maintained during the data transformation. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import maxabs_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> maxabs_scale(X, axis=0) # scale each column independently + array([[-1. , 1. , 1. ], + [-0.5, 0. , 0.5]]) + >>> maxabs_scale(X, axis=1) # scale each row independently + array([[-1. , 0.5, 1. ], + [-1. , 0. , 1. ]]) + """ + # Unlike the scaler object, this function allows 1d input. + + # If copy is required, it will be done inside the scaler object. + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = MaxAbsScaler(copy=copy) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Scale features using statistics that are robust to outliers. + + This Scaler removes the median and scales the data according to + the quantile range (defaults to IQR: Interquartile Range). + The IQR is the range between the 1st quartile (25th quantile) + and the 3rd quartile (75th quantile). + + Centering and scaling happen independently on each feature by + computing the relevant statistics on the samples in the training + set. Median and interquartile range are then stored to be used on + later data using the :meth:`transform` method. + + Standardization of a dataset is a common preprocessing for many machine + learning estimators. Typically this is done by removing the mean and + scaling to unit variance. 
However, outliers can often influence the sample + mean / variance in a negative way. In such cases, using the median and the + interquartile range often give better results. For an example visualization + and comparison to other scalers, refer to :ref:`Compare RobustScaler with + other scalers `. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + with_centering : bool, default=True + If `True`, center the data before scaling. + This will cause :meth:`transform` to raise an exception when attempted + on sparse matrices, because centering them entails building a dense + matrix which in common use cases is likely to be too large to fit in + memory. + + with_scaling : bool, default=True + If `True`, scale the data to interquartile range. + + quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \ + default=(25.0, 75.0) + Quantile range used to calculate `scale_`. By default this is equal to + the IQR, i.e., `q_min` is the first quantile and `q_max` is the third + quantile. + + .. versionadded:: 0.18 + + copy : bool, default=True + If `False`, try to avoid a copy and do inplace scaling instead. + This is not guaranteed to always work inplace; e.g. if the data is + not a NumPy array or scipy.sparse CSR matrix, a copy may still be + returned. + + unit_variance : bool, default=False + If `True`, scale data so that normally distributed features have a + variance of 1. In general, if the difference between the x-values of + `q_max` and `q_min` for a standard normal distribution is greater + than 1, the dataset will be scaled down. If less than 1, the dataset + will be scaled up. + + .. versionadded:: 0.24 + + Attributes + ---------- + center_ : array of floats + The median value for each feature in the training set. + + scale_ : array of floats + The (scaled) interquartile range for each feature in the training set. + + .. versionadded:: 0.17 + *scale_* attribute. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + robust_scale : Equivalent function without the estimator API. + sklearn.decomposition.PCA : Further removes the linear correlation across + features with 'whiten=True'. + + Notes + ----- + + https://en.wikipedia.org/wiki/Median + https://en.wikipedia.org/wiki/Interquartile_range + + Examples + -------- + >>> from sklearn.preprocessing import RobustScaler + >>> X = [[ 1., -2., 2.], + ... [ -2., 1., 3.], + ... [ 4., 1., -2.]] + >>> transformer = RobustScaler().fit(X) + >>> transformer + RobustScaler() + >>> transformer.transform(X) + array([[ 0. , -2. , 0. ], + [-1. , 0. , 0.4], + [ 1. , 0. , -1.6]]) + """ + + _parameter_constraints: dict = { + "with_centering": ["boolean"], + "with_scaling": ["boolean"], + "quantile_range": [tuple], + "copy": ["boolean"], + "unit_variance": ["boolean"], + } + + def __init__( + self, + *, + with_centering=True, + with_scaling=True, + quantile_range=(25.0, 75.0), + copy=True, + unit_variance=False, + ): + self.with_centering = with_centering + self.with_scaling = with_scaling + self.quantile_range = quantile_range + self.unit_variance = unit_variance + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the median and quantiles to be used for scaling. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to compute the median and quantiles + used for later scaling along the features axis. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted scaler. + """ + # at fit, convert sparse matrices to csc for optimized computation of + # the quantiles + X = validate_data( + self, + X, + accept_sparse="csc", + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + + q_min, q_max = self.quantile_range + if not 0 <= q_min <= q_max <= 100: + raise ValueError("Invalid quantile range: %s" % str(self.quantile_range)) + + if self.with_centering: + if sparse.issparse(X): + raise ValueError( + "Cannot center sparse matrices: use `with_centering=False`" + " instead. See docstring for motivation and alternatives." + ) + self.center_ = np.nanmedian(X, axis=0) + else: + self.center_ = None + + if self.with_scaling: + quantiles = [] + for feature_idx in range(X.shape[1]): + if sparse.issparse(X): + column_nnz_data = X.data[ + X.indptr[feature_idx] : X.indptr[feature_idx + 1] + ] + column_data = np.zeros(shape=X.shape[0], dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + else: + column_data = X[:, feature_idx] + + quantiles.append(np.nanpercentile(column_data, self.quantile_range)) + + quantiles = np.transpose(quantiles) + + self.scale_ = quantiles[1] - quantiles[0] + self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False) + if self.unit_variance: + adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0) + self.scale_ = self.scale_ / adjust + else: + self.scale_ = None + + return self + + def transform(self, X): + """Center and scale the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the specified axis. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + check_is_fitted(self) + X = validate_data( + self, + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + reset=False, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, 1.0 / self.scale_) + else: + if self.with_centering: + X -= self.center_ + if self.with_scaling: + X /= self.scale_ + return X + + def inverse_transform(self, X): + """Scale back the data to the original representation. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The rescaled data to be transformed back. + + Returns + ------- + X_original : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. 
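Aside, not part of the patched file: a small check, on dense data, that the default transform matches centering on the column median and scaling by the 25th-75th percentile range, as described above. The data reuse the RobustScaler docstring example.

    import numpy as np
    from sklearn.preprocessing import RobustScaler

    X = np.array([[ 1.0, -2.0,  2.0],
                  [-2.0,  1.0,  3.0],
                  [ 4.0,  1.0, -2.0]])

    center = np.median(X, axis=0)
    q25, q75 = np.percentile(X, [25.0, 75.0], axis=0)
    manual = (X - center) / (q75 - q25)

    assert np.allclose(RobustScaler().fit_transform(X), manual)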
+ """ + check_is_fitted(self) + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=self.copy, + dtype=FLOAT_DTYPES, + force_writeable=True, + ensure_all_finite="allow-nan", + ) + + if sparse.issparse(X): + if self.with_scaling: + inplace_column_scale(X, self.scale_) + else: + if self.with_scaling: + X *= self.scale_ + if self.with_centering: + X += self.center_ + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = not self.with_centering + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def robust_scale( + X, + *, + axis=0, + with_centering=True, + with_scaling=True, + quantile_range=(25.0, 75.0), + copy=True, + unit_variance=False, +): + """Standardize a dataset along any axis. + + Center to the median and component wise scale + according to the interquartile range. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_sample, n_features) + The data to center and scale. + + axis : int, default=0 + Axis used to compute the medians and IQR along. If 0, + independently scale each feature, otherwise (if 1) scale + each sample. + + with_centering : bool, default=True + If `True`, center the data before scaling. + + with_scaling : bool, default=True + If `True`, scale the data to unit variance (or equivalently, + unit standard deviation). + + quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0,\ + default=(25.0, 75.0) + Quantile range used to calculate `scale_`. By default this is equal to + the IQR, i.e., `q_min` is the first quantile and `q_max` is the third + quantile. + + .. versionadded:: 0.18 + + copy : bool, default=True + If False, try to avoid a copy and scale in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + unit_variance : bool, default=False + If `True`, scale data so that normally distributed features have a + variance of 1. In general, if the difference between the x-values of + `q_max` and `q_min` for a standard normal distribution is greater + than 1, the dataset will be scaled down. If less than 1, the dataset + will be scaled up. + + .. versionadded:: 0.24 + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + RobustScaler : Performs centering and scaling using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + This implementation will refuse to center scipy.sparse matrices + since it would make them non-sparse and would potentially crash the + program with memory exhaustion problems. + + Instead the caller is expected to either set explicitly + `with_centering=False` (in that case, only variance scaling will be + performed on the features of the CSR matrix) or to call `X.toarray()` + if he/she expects the materialized dense array to fit in memory. + + To avoid memory copy the caller should pass a CSR matrix. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know + what you are doing. 
A common mistake is to apply it to the entire data + *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.RobustScaler` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`. + + Examples + -------- + >>> from sklearn.preprocessing import robust_scale + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> robust_scale(X, axis=0) # scale each column independently + array([[-1., 1., 1.], + [ 1., -1., -1.]]) + >>> robust_scale(X, axis=1) # scale each row independently + array([[-1.5, 0. , 0.5], + [-1. , 0. , 1. ]]) + """ + X = check_array( + X, + accept_sparse=("csr", "csc"), + copy=False, + ensure_2d=False, + dtype=FLOAT_DTYPES, + ensure_all_finite="allow-nan", + ) + original_ndim = X.ndim + + if original_ndim == 1: + X = X.reshape(X.shape[0], 1) + + s = RobustScaler( + with_centering=with_centering, + with_scaling=with_scaling, + quantile_range=quantile_range, + unit_variance=unit_variance, + copy=copy, + ) + if axis == 0: + X = s.fit_transform(X) + else: + X = s.fit_transform(X.T).T + + if original_ndim == 1: + X = X.ravel() + + return X + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "norm": [StrOptions({"l1", "l2", "max"})], + "axis": [Options(Integral, {0, 1})], + "copy": ["boolean"], + "return_norm": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False): + """Scale input vectors individually to unit norm (vector length). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample (or each non-zero + feature if axis is 0). + + axis : {0, 1}, default=1 + Define axis used to normalize the data along. If 1, independently + normalize each sample, otherwise (if 0) normalize each feature. + + copy : bool, default=True + If False, try to avoid a copy and normalize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + return_norm : bool, default=False + Whether to return the computed norms. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + Normalized input X. + + norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, ) + An array of norms along given axis for X. + When X is sparse, a NotImplementedError will be raised + for norm 'l1' or 'l2'. + + See Also + -------- + Normalizer : Performs normalization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). + + Notes + ----- + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> from sklearn.preprocessing import normalize + >>> X = [[-2, 1, 2], [-1, 0, 1]] + >>> normalize(X, norm="l1") # L1 normalization each row independently + array([[-0.4, 0.2, 0.4], + [-0.5, 0. , 0.5]]) + >>> normalize(X, norm="l2") # L2 normalization each row independently + array([[-0.67, 0.33, 0.67], + [-0.71, 0. 
, 0.71]]) + """ + if axis == 0: + sparse_format = "csc" + else: # axis == 1: + sparse_format = "csr" + + xp, _ = get_namespace(X) + + X = check_array( + X, + accept_sparse=sparse_format, + copy=copy, + estimator="the normalize function", + dtype=_array_api.supported_float_dtypes(xp), + force_writeable=True, + ) + if axis == 0: + X = X.T + + if sparse.issparse(X): + if return_norm and norm in ("l1", "l2"): + raise NotImplementedError( + "return_norm=True is not implemented " + "for sparse matrices with norm 'l1' " + "or norm 'l2'" + ) + if norm == "l1": + inplace_csr_row_normalize_l1(X) + elif norm == "l2": + inplace_csr_row_normalize_l2(X) + elif norm == "max": + mins, maxes = min_max_axis(X, 1) + norms = np.maximum(abs(mins), maxes) + norms_elementwise = norms.repeat(np.diff(X.indptr)) + mask = norms_elementwise != 0 + X.data[mask] /= norms_elementwise[mask] + else: + if norm == "l1": + norms = xp.sum(xp.abs(X), axis=1) + elif norm == "l2": + norms = row_norms(X) + elif norm == "max": + norms = xp.max(xp.abs(X), axis=1) + norms = _handle_zeros_in_scale(norms, copy=False) + X /= norms[:, None] + + if axis == 0: + X = X.T + + if return_norm: + return X, norms + else: + return X + + +class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Normalize samples individually to unit norm. + + Each sample (i.e. each row of the data matrix) with at least one + non zero component is rescaled independently of other samples so + that its norm (l1, l2 or inf) equals one. + + This transformer is able to work both with dense numpy arrays and + scipy.sparse matrix (use CSR format if you want to avoid the burden of + a copy / conversion). + + Scaling inputs to unit norms is a common operation for text + classification or clustering for instance. For instance the dot + product of two l2-normalized TF-IDF vectors is the cosine similarity + of the vectors and is the base similarity metric for the Vector + Space Model commonly used by the Information Retrieval community. + + For an example visualization, refer to :ref:`Compare Normalizer with other + scalers `. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + norm : {'l1', 'l2', 'max'}, default='l2' + The norm to use to normalize each non zero sample. If norm='max' + is used, values will be rescaled by the maximum of the absolute + values. + + copy : bool, default=True + Set to False to perform inplace row normalization and avoid a + copy (if the input is already a numpy array or a scipy.sparse + CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + normalize : Equivalent function without the estimator API. + + Notes + ----- + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Normalizer + >>> X = [[4, 1, 2, 2], + ... [1, 3, 9, 3], + ... [5, 7, 5, 1]] + >>> transformer = Normalizer().fit(X) # fit does nothing. 
+ >>> transformer + Normalizer() + >>> transformer.transform(X) + array([[0.8, 0.2, 0.4, 0.4], + [0.1, 0.3, 0.9, 0.3], + [0.5, 0.7, 0.5, 0.1]]) + """ + + _parameter_constraints: dict = { + "norm": [StrOptions({"l1", "l2", "max"})], + "copy": ["boolean"], + } + + def __init__(self, norm="l2", *, copy=True): + self.norm = norm + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. + + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to estimate the normalization parameters. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + validate_data(self, X, accept_sparse="csr") + return self + + def transform(self, X, copy=None): + """Scale each non zero row of X to unit norm. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to normalize, row by row. scipy.sparse matrices should be + in CSR format to avoid an un-necessary copy. + + copy : bool, default=None + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + copy = copy if copy is not None else self.copy + X = validate_data( + self, X, accept_sparse="csr", force_writeable=True, copy=copy, reset=False + ) + return normalize(X, norm=self.norm, axis=1, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.requires_fit = False + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "threshold": [Interval(Real, None, None, closed="neither")], + "copy": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def binarize(X, *, threshold=0.0, copy=True): + """Boolean thresholding of array-like or scipy.sparse matrix. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to binarize, element by element. + scipy.sparse matrices should be in CSR or CSC format to avoid an + un-necessary copy. + + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + If False, try to avoid a copy and binarize in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an object dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + Binarizer : Performs binarization using the Transformer API + (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). 
+ + Examples + -------- + >>> from sklearn.preprocessing import binarize + >>> X = [[0.4, 0.6, 0.5], [0.6, 0.1, 0.2]] + >>> binarize(X, threshold=0.5) + array([[0., 1., 0.], + [1., 0., 0.]]) + """ + X = check_array(X, accept_sparse=["csr", "csc"], force_writeable=True, copy=copy) + if sparse.issparse(X): + if threshold < 0: + raise ValueError("Cannot binarize a sparse matrix with threshold < 0") + cond = X.data > threshold + not_cond = np.logical_not(cond) + X.data[cond] = 1 + X.data[not_cond] = 0 + X.eliminate_zeros() + else: + xp, _, device = get_namespace_and_device(X) + float_dtype = _find_matching_floating_dtype(X, threshold, xp=xp) + cond = xp.astype(X, float_dtype, copy=False) > threshold + not_cond = xp.logical_not(cond) + X[cond] = 1 + X[not_cond] = 0 + return X + + +class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Binarize data (set feature values to 0 or 1) according to a threshold. + + Values greater than the threshold map to 1, while values less than + or equal to the threshold map to 0. With the default threshold of 0, + only positive values map to 1. + + Binarization is a common operation on text count data where the + analyst can decide to only consider the presence or absence of a + feature rather than a quantified number of occurrences for instance. + + It can also be used as a pre-processing step for estimators that + consider boolean random variables (e.g. modelled using the Bernoulli + distribution in a Bayesian setting). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + threshold : float, default=0.0 + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + + copy : bool, default=True + Set to False to perform inplace binarization and avoid a copy (if + the input is already a numpy array or a scipy.sparse CSR matrix). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + binarize : Equivalent function without the estimator API. + KBinsDiscretizer : Bin continuous data into intervals. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Notes + ----- + If the input is a sparse matrix, only the non-zero values are subject + to update by the :class:`Binarizer` class. + + This estimator is :term:`stateless` and does not need to be fitted. + However, we recommend to call :meth:`fit_transform` instead of + :meth:`transform`, as parameter validation is only performed in + :meth:`fit`. + + Examples + -------- + >>> from sklearn.preprocessing import Binarizer + >>> X = [[ 1., -1., 2.], + ... [ 2., 0., 0.], + ... [ 0., 1., -1.]] + >>> transformer = Binarizer().fit(X) # fit does nothing. + >>> transformer + Binarizer() + >>> transformer.transform(X) + array([[1., 0., 1.], + [1., 0., 0.], + [0., 1., 0.]]) + """ + + _parameter_constraints: dict = { + "threshold": [Real], + "copy": ["boolean"], + } + + def __init__(self, *, threshold=0.0, copy=True): + self.threshold = threshold + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Only validates estimator's parameters. 
+ + This method allows to: (i) validate the estimator's parameters and + (ii) be consistent with the scikit-learn transformer API. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + validate_data(self, X, accept_sparse="csr") + return self + + def transform(self, X, copy=None): + """Binarize each element of X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to binarize, element by element. + scipy.sparse matrices should be in CSR format to avoid an + un-necessary copy. + + copy : bool + Copy the input X or not. + + Returns + ------- + X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) + Transformed array. + """ + copy = copy if copy is not None else self.copy + # TODO: This should be refactored because binarize also calls + # check_array + X = validate_data( + self, + X, + accept_sparse=["csr", "csc"], + force_writeable=True, + copy=copy, + reset=False, + ) + return binarize(X, threshold=self.threshold, copy=False) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.requires_fit = False + tags.array_api_support = True + tags.input_tags.sparse = True + return tags + + +class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): + r"""Center an arbitrary kernel matrix :math:`K`. + + Let define a kernel :math:`K` such that: + + .. math:: + K(X, Y) = \phi(X) . \phi(Y)^{T} + + :math:`\phi(X)` is a function mapping of rows of :math:`X` to a + Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`. + + This class allows to compute :math:`\tilde{K}(X, Y)` such that: + + .. math:: + \tilde{K(X, Y)} = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T} + + :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert + space. + + `KernelCenterer` centers the features without explicitly computing the + mapping :math:`\phi(\cdot)`. Working with centered kernels is sometime + expected when dealing with algebra computation such as eigendecomposition + for :class:`~sklearn.decomposition.KernelPCA` for instance. + + Read more in the :ref:`User Guide `. + + Attributes + ---------- + K_fit_rows_ : ndarray of shape (n_samples,) + Average of each column of kernel matrix. + + K_fit_all_ : float + Average of kernel matrix. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + sklearn.kernel_approximation.Nystroem : Approximate a kernel map + using a subset of the training data. + + References + ---------- + .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + "Nonlinear component analysis as a kernel eigenvalue problem." + Neural computation 10.5 (1998): 1299-1319. + `_ + + Examples + -------- + >>> from sklearn.preprocessing import KernelCenterer + >>> from sklearn.metrics.pairwise import pairwise_kernels + >>> X = [[ 1., -2., 2.], + ... [ -2., 1., 3.], + ... 
[ 4., 1., -2.]] + >>> K = pairwise_kernels(X, metric='linear') + >>> K + array([[ 9., 2., -2.], + [ 2., 14., -13.], + [ -2., -13., 21.]]) + >>> transformer = KernelCenterer().fit(K) + >>> transformer + KernelCenterer() + >>> transformer.transform(K) + array([[ 5., 0., -5.], + [ 0., 14., -14.], + [ -5., -14., 19.]]) + """ + + # X is called K in these methods. + __metadata_request__transform = {"K": metadata_routing.UNUSED} + __metadata_request__fit = {"K": metadata_routing.UNUSED} + + def fit(self, K, y=None): + """Fit KernelCenterer. + + Parameters + ---------- + K : ndarray of shape (n_samples, n_samples) + Kernel matrix. + + y : None + Ignored. + + Returns + ------- + self : object + Returns the instance itself. + """ + xp, _ = get_namespace(K) + + K = validate_data(self, K, dtype=_array_api.supported_float_dtypes(xp)) + + if K.shape[0] != K.shape[1]: + raise ValueError( + "Kernel matrix must be a square matrix." + " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1]) + ) + + n_samples = K.shape[0] + self.K_fit_rows_ = xp.sum(K, axis=0) / n_samples + self.K_fit_all_ = xp.sum(self.K_fit_rows_) / n_samples + return self + + def transform(self, K, copy=True): + """Center kernel matrix. + + Parameters + ---------- + K : ndarray of shape (n_samples1, n_samples2) + Kernel matrix. + + copy : bool, default=True + Set to False to perform inplace computation. + + Returns + ------- + K_new : ndarray of shape (n_samples1, n_samples2) + Returns the instance itself. + """ + check_is_fitted(self) + + xp, _ = get_namespace(K) + + K = validate_data( + self, + K, + copy=copy, + force_writeable=True, + dtype=_array_api.supported_float_dtypes(xp), + reset=False, + ) + + K_pred_cols = (xp.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, None] + + K -= self.K_fit_rows_ + K -= K_pred_cols + K += self.K_fit_all_ + + return K + + @property + def _n_features_out(self): + """Number of transformed output features.""" + # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the + # number of input features but this is not a one-to-one mapping in the + # usual sense. Hence the choice not to use OneToOneFeatureMixin to + # implement get_feature_names_out for this class. + return self.n_features_in_ + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.pairwise = True + tags.array_api_support = True + return tags + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "value": [Interval(Real, None, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def add_dummy_feature(X, value=1.0): + """Augment dataset with an additional dummy feature. + + This is useful for fitting an intercept term with implementations which + cannot otherwise fit it directly. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Data. + + value : float + Value to use for the dummy feature. + + Returns + ------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1) + Same data with dummy feature added as first column. + + Examples + -------- + >>> from sklearn.preprocessing import add_dummy_feature + >>> add_dummy_feature([[0, 1], [1, 0]]) + array([[1., 0., 1.], + [1., 1., 0.]]) + """ + X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES) + n_samples, n_features = X.shape + shape = (n_samples, n_features + 1) + if sparse.issparse(X): + if X.format == "coo": + # Shift columns to the right. + col = X.col + 1 + # Column indices of dummy feature are 0 everywhere. 
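+ # Illustration (hypothetical; the docstring example above actually passes a
+ # dense list and therefore takes the dense branch): if the same toy matrix
+ # X = [[0, 1], [1, 0]] were passed as a COO sparse matrix with value=1.0,
+ # the arrays assembled below would be
+ #   col  = [0, 0, 2, 1]  (two dummy entries in column 0, then the original
+ #                         column indices shifted by one)
+ #   row  = [0, 1, 0, 1]
+ #   data = [1.0, 1.0, 1.0, 1.0]
+ # which densifies to [[1., 0., 1.], [1., 1., 0.]], the same result as the
+ # docstring output.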
+ col = np.concatenate((np.zeros(n_samples), col)) + # Row indices of dummy feature are 0, ..., n_samples-1. + row = np.concatenate((np.arange(n_samples), X.row)) + # Prepend the dummy feature n_samples times. + data = np.concatenate((np.full(n_samples, value), X.data)) + return sparse.coo_matrix((data, (row, col)), shape) + elif X.format == "csc": + # Shift index pointers since we need to add n_samples elements. + indptr = X.indptr + n_samples + # indptr[0] must be 0. + indptr = np.concatenate((np.array([0]), indptr)) + # Row indices of dummy feature are 0, ..., n_samples-1. + indices = np.concatenate((np.arange(n_samples), X.indices)) + # Prepend the dummy feature n_samples times. + data = np.concatenate((np.full(n_samples, value), X.data)) + return sparse.csc_matrix((data, indices, indptr), shape) + else: + klass = X.__class__ + return klass(add_dummy_feature(X.tocoo(), value)) + else: + return np.hstack((np.full((n_samples, 1), value), X)) + + +class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Transform features using quantiles information. + + This method transforms the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. First an + estimate of the cumulative distribution function of a feature is + used to map the original values to a uniform distribution. The obtained + values are then mapped to the desired output distribution using the + associated quantile function. Features values of new/unseen data that fall + below or above the fitted range will be mapped to the bounds of the output + distribution. Note that this transform is non-linear. It may distort linear + correlations between variables measured at the same scale but renders + variables measured at different scales more directly comparable. + + For example visualizations, refer to :ref:`Compare QuantileTransformer with + other scalers `. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.19 + + Parameters + ---------- + n_quantiles : int, default=1000 or n_samples + Number of quantiles to be computed. It corresponds to the number + of landmarks used to discretize the cumulative distribution function. + If n_quantiles is larger than the number of samples, n_quantiles is set + to the number of samples as a larger number of quantiles does not give + a better approximation of the cumulative distribution function + estimator. + + output_distribution : {'uniform', 'normal'}, default='uniform' + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'normal'. + + ignore_implicit_zeros : bool, default=False + Only applies to sparse matrices. If True, the sparse entries of the + matrix are discarded to compute the quantile statistics. If False, + these entries are treated as zeros. + + subsample : int or None, default=10_000 + Maximum number of samples used to estimate the quantiles for + computational efficiency. Note that the subsampling procedure may + differ for value-identical sparse and dense matrices. + Disable subsampling by setting `subsample=None`. + + .. versionadded:: 1.5 + The option `None` to disable subsampling was added. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling and smoothing + noise. 
+ Please see ``subsample`` for more details. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + copy : bool, default=True + Set to False to perform inplace transformation and avoid a copy (if the + input is already a numpy array). + + Attributes + ---------- + n_quantiles_ : int + The actual number of quantiles used to discretize the cumulative + distribution function. + + quantiles_ : ndarray of shape (n_quantiles, n_features) + The values corresponding the quantiles of reference. + + references_ : ndarray of shape (n_quantiles, ) + Quantiles of references. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + quantile_transform : Equivalent function without the estimator API. + PowerTransformer : Perform mapping to a normal distribution using a power + transform. + StandardScaler : Perform standardization that is faster, but less robust + to outliers. + RobustScaler : Perform robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import QuantileTransformer + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) + >>> qt.fit_transform(X) + array([...]) + """ + + _parameter_constraints: dict = { + "n_quantiles": [Interval(Integral, 1, None, closed="left")], + "output_distribution": [StrOptions({"uniform", "normal"})], + "ignore_implicit_zeros": ["boolean"], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + "copy": ["boolean"], + } + + def __init__( + self, + *, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=10_000, + random_state=None, + copy=True, + ): + self.n_quantiles = n_quantiles + self.output_distribution = output_distribution + self.ignore_implicit_zeros = ignore_implicit_zeros + self.subsample = subsample + self.random_state = random_state + self.copy = copy + + def _dense_fit(self, X, random_state): + """Compute percentiles for dense matrices. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + """ + if self.ignore_implicit_zeros: + warnings.warn( + "'ignore_implicit_zeros' takes effect only with" + " sparse matrix. This parameter has no effect." + ) + + n_samples, n_features = X.shape + references = self.references_ * 100 + + if self.subsample is not None and self.subsample < n_samples: + # Take a subsample of `X` + X = resample( + X, replace=False, n_samples=self.subsample, random_state=random_state + ) + + self.quantiles_ = np.nanpercentile(X, references, axis=0) + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + def _sparse_fit(self, X, random_state): + """Compute percentiles for sparse matrices. 
+ + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The data used to scale along the features axis. The sparse matrix + needs to be nonnegative. If a sparse matrix is provided, + it will be converted into a sparse ``csc_matrix``. + """ + n_samples, n_features = X.shape + references = self.references_ * 100 + + self.quantiles_ = [] + for feature_idx in range(n_features): + column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]] + if self.subsample is not None and len(column_nnz_data) > self.subsample: + column_subsample = self.subsample * len(column_nnz_data) // n_samples + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=column_subsample, dtype=X.dtype) + else: + column_data = np.zeros(shape=self.subsample, dtype=X.dtype) + column_data[:column_subsample] = random_state.choice( + column_nnz_data, size=column_subsample, replace=False + ) + else: + if self.ignore_implicit_zeros: + column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype) + else: + column_data = np.zeros(shape=n_samples, dtype=X.dtype) + column_data[: len(column_nnz_data)] = column_nnz_data + + if not column_data.size: + # if no nnz, an error will be raised for computing the + # quantiles. Force the quantiles to be zeros. + self.quantiles_.append([0] * len(references)) + else: + self.quantiles_.append(np.nanpercentile(column_data, references)) + self.quantiles_ = np.transpose(self.quantiles_) + # due to floating-point precision error in `np.nanpercentile`, + # make sure the quantiles are monotonically increasing + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Compute the quantiles used for transforming. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + if self.subsample is not None and self.n_quantiles > self.subsample: + raise ValueError( + "The number of quantiles cannot be greater than" + " the number of samples used. Got {} quantiles" + " and {} samples.".format(self.n_quantiles, self.subsample) + ) + + X = self._check_inputs(X, in_fit=True, copy=False) + n_samples = X.shape[0] + + if self.n_quantiles > n_samples: + warnings.warn( + "n_quantiles (%s) is greater than the total number " + "of samples (%s). n_quantiles is set to " + "n_samples." 
% (self.n_quantiles, n_samples) + ) + self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) + + rng = check_random_state(self.random_state) + + # Create the quantiles of reference + self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True) + if sparse.issparse(X): + self._sparse_fit(X, rng) + else: + self._dense_fit(X, rng) + + return self + + def _transform_col(self, X_col, quantiles, inverse): + """Private function to transform a single feature.""" + + output_distribution = self.output_distribution + + if not inverse: + lower_bound_x = quantiles[0] + upper_bound_x = quantiles[-1] + lower_bound_y = 0 + upper_bound_y = 1 + else: + lower_bound_x = 0 + upper_bound_x = 1 + lower_bound_y = quantiles[0] + upper_bound_y = quantiles[-1] + # for inverse transform, match a uniform distribution + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.cdf(X_col) + # else output distribution is already a uniform distribution + + # find index for lower and higher bounds + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x + upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x + if output_distribution == "uniform": + lower_bounds_idx = X_col == lower_bound_x + upper_bounds_idx = X_col == upper_bound_x + + isfinite_mask = ~np.isnan(X_col) + X_col_finite = X_col[isfinite_mask] + if not inverse: + # Interpolate in one direction and in the other and take the + # mean. This is in case of repeated values in the features + # and hence repeated quantiles + # + # If we don't do this, only one extreme of the duplicated is + # used (the upper when we do ascending, and the + # lower for descending). We take the mean of these two + X_col[isfinite_mask] = 0.5 * ( + np.interp(X_col_finite, quantiles, self.references_) + - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1]) + ) + else: + X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles) + + X_col[upper_bounds_idx] = upper_bound_y + X_col[lower_bounds_idx] = lower_bound_y + # for forward transform, match the output distribution + if not inverse: + with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if output_distribution == "normal": + X_col = stats.norm.ppf(X_col) + # find the value to clip the data to avoid mapping to + # infinity. Clip such that the inverse transform will be + # consistent + clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) + clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1))) + X_col = np.clip(X_col, clip_min, clip_max) + # else output distribution is uniform and the ppf is the + # identity function so we let X_col unchanged + + return X_col + + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): + """Check inputs before fit and transform.""" + X = validate_data( + self, + X, + reset=in_fit, + accept_sparse="csc", + copy=copy, + dtype=FLOAT_DTYPES, + # only set force_writeable for the validation at transform time because + # it's the only place where QuantileTransformer performs inplace operations. + force_writeable=True if not in_fit else None, + ensure_all_finite="allow-nan", + ) + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false and that we call fit or transform. 
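+ # `accept_sparse_negative=True` is only passed by `inverse_transform` below:
+ # its input lives in the transformer's output space and may legitimately be
+ # negative (e.g. with output_distribution='normal'), which is presumably why
+ # the non-negativity check for sparse input is skipped in that case.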
+ with np.errstate(invalid="ignore"): # hide NaN comparison warnings + if ( + not accept_sparse_negative + and not self.ignore_implicit_zeros + and (sparse.issparse(X) and np.any(X.data < 0)) + ): + raise ValueError( + "QuantileTransformer only accepts non-negative sparse matrices." + ) + + return X + + def _transform(self, X, inverse=False): + """Forward and inverse transform. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + + inverse : bool, default=False + If False, apply forward transform. If True, apply + inverse transform. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Projected data. + """ + if sparse.issparse(X): + for feature_idx in range(X.shape[1]): + column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) + X.data[column_slice] = self._transform_col( + X.data[column_slice], self.quantiles_[:, feature_idx], inverse + ) + else: + for feature_idx in range(X.shape[1]): + X[:, feature_idx] = self._transform_col( + X[:, feature_idx], self.quantiles_[:, feature_idx], inverse + ) + + return X + + def transform(self, X): + """Feature-wise transformation of the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs(X, in_fit=False, copy=self.copy) + + return self._transform(X, inverse=False) + + def inverse_transform(self, X): + """Back-projection to the original space. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + X_original : {ndarray, sparse matrix} of (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs( + X, in_fit=False, accept_sparse_negative=True, copy=self.copy + ) + + return self._transform(X, inverse=True) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, + prefer_skip_nested_validation=False, +) +def quantile_transform( + X, + *, + axis=0, + n_quantiles=1000, + output_distribution="uniform", + ignore_implicit_zeros=False, + subsample=int(1e5), + random_state=None, + copy=True, +): + """Transform features using quantiles information. + + This method transforms the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. First an + estimate of the cumulative distribution function of a feature is + used to map the original values to a uniform distribution. 
The obtained + values are then mapped to the desired output distribution using the + associated quantile function. Features values of new/unseen data that fall + below or above the fitted range will be mapped to the bounds of the output + distribution. Note that this transform is non-linear. It may distort linear + correlations between variables measured at the same scale but renders + variables measured at different scales more directly comparable. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform. + + axis : int, default=0 + Axis used to compute the means and standard deviations along. If 0, + transform each feature, otherwise (if 1) transform each sample. + + n_quantiles : int, default=1000 or n_samples + Number of quantiles to be computed. It corresponds to the number + of landmarks used to discretize the cumulative distribution function. + If n_quantiles is larger than the number of samples, n_quantiles is set + to the number of samples as a larger number of quantiles does not give + a better approximation of the cumulative distribution function + estimator. + + output_distribution : {'uniform', 'normal'}, default='uniform' + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'normal'. + + ignore_implicit_zeros : bool, default=False + Only applies to sparse matrices. If True, the sparse entries of the + matrix are discarded to compute the quantile statistics. If False, + these entries are treated as zeros. + + subsample : int or None, default=1e5 + Maximum number of samples used to estimate the quantiles for + computational efficiency. Note that the subsampling procedure may + differ for value-identical sparse and dense matrices. + Disable subsampling by setting `subsample=None`. + + .. versionadded:: 1.5 + The option `None` to disable subsampling was added. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling and smoothing + noise. + Please see ``subsample`` for more details. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + copy : bool, default=True + If False, try to avoid a copy and transform in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + .. versionchanged:: 0.23 + The default value of `copy` changed from False to True in 0.23. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + QuantileTransformer : Performs quantile-based scaling using the + Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + power_transform : Maps data to a normal distribution using a + power transformation. + scale : Performs standardization that is faster, but less robust + to outliers. + robust_scale : Performs robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + .. warning:: Risk of data leak + + Do not use :func:`~sklearn.preprocessing.quantile_transform` unless + you know what you are doing. A common mistake is to apply it + to the entire data *before* splitting into training and + test sets. 
This will bias the model evaluation because + information would have leaked from the test set to the + training set. + In general, we recommend using + :class:`~sklearn.preprocessing.QuantileTransformer` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking:`pipe = make_pipeline(QuantileTransformer(), + LogisticRegression())`. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import quantile_transform + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True) + array([...]) + """ + n = QuantileTransformer( + n_quantiles=n_quantiles, + output_distribution=output_distribution, + subsample=subsample, + ignore_implicit_zeros=ignore_implicit_zeros, + random_state=random_state, + copy=copy, + ) + if axis == 0: + X = n.fit_transform(X) + else: # axis == 1 + X = n.fit_transform(X.T).T + return X + + +class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Apply a power transform featurewise to make data more Gaussian-like. + + Power transforms are a family of parametric, monotonic transformations + that are applied to make data more Gaussian-like. This is useful for + modeling issues related to heteroscedasticity (non-constant variance), + or other situations where normality is desired. + + Currently, PowerTransformer supports the Box-Cox transform and the + Yeo-Johnson transform. The optimal parameter for stabilizing variance and + minimizing skewness is estimated through maximum likelihood. + + Box-Cox requires input data to be strictly positive, while Yeo-Johnson + supports both positive or negative data. + + By default, zero-mean, unit-variance normalization is applied to the + transformed data. + + For an example visualization, refer to :ref:`Compare PowerTransformer with + other scalers `. To see the + effect of Box-Cox and Yeo-Johnson transformations on different + distributions, see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' + The power transform method. Available methods are: + + - 'yeo-johnson' [1]_, works with positive and negative values + - 'box-cox' [2]_, only works with strictly positive values + + standardize : bool, default=True + Set to True to apply zero-mean, unit-variance normalization to the + transformed output. + + copy : bool, default=True + Set to False to perform inplace computation during transformation. + + Attributes + ---------- + lambdas_ : ndarray of float of shape (n_features,) + The parameters of the power transformation for the selected features. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + power_transform : Equivalent function without the estimator API. + + QuantileTransformer : Maps data to a standard normal distribution with + the parameter `output_distribution='normal'`. 
+ + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + References + ---------- + + .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power + transformations to improve normality or symmetry." Biometrika, + 87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>` + + .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations", + Journal of the Royal Statistical Society B, 26, 211-252 (1964). + <10.1111/j.2517-6161.1964.tb00553.x>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PowerTransformer + >>> pt = PowerTransformer() + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(pt.fit(data)) + PowerTransformer() + >>> print(pt.lambdas_) + [ 1.386 -3.100] + >>> print(pt.transform(data)) + [[-1.316 -0.707] + [ 0.209 -0.707] + [ 1.106 1.414]] + """ + + _parameter_constraints: dict = { + "method": [StrOptions({"yeo-johnson", "box-cox"})], + "standardize": ["boolean"], + "copy": ["boolean"], + } + + def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): + self.method = method + self.standardize = standardize + self.copy = copy + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """Estimate the optimal parameter lambda for each feature. + + The optimal lambda parameter for minimizing skewness is estimated on + each feature independently using maximum likelihood. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + self._fit(X, y=y, force_transform=False) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y=None): + """Fit `PowerTransformer` to `X`, then transform `X`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data used to estimate the optimal transformation parameters + and to be transformed using a power transformation. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_features) + Transformed data. 
+ """ + return self._fit(X, y, force_transform=True) + + def _fit(self, X, y=None, force_transform=False): + X = self._check_input(X, in_fit=True, check_positive=True) + + if not self.copy and not force_transform: # if call from fit() + X = X.copy() # force copy so that fit does not change X inplace + + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + + optim_function = { + "box-cox": self._box_cox_optimize, + "yeo-johnson": self._yeo_johnson_optimize, + }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + + with np.errstate(invalid="ignore"): # hide NaN warnings + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) + + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) + + if self.standardize: + self._scaler = StandardScaler(copy=False).set_output(transform="default") + if force_transform: + X = self._scaler.fit_transform(X) + else: + self._scaler.fit(X) + + return X + + def transform(self, X): + """Apply the power transform to each feature using the fitted lambdas. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True) + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = transform_function(X[:, i], lmbda) + + if self.standardize: + X = self._scaler.transform(X) + + return X + + def inverse_transform(self, X): + """Apply the inverse power transformation using the fitted lambdas. + + The inverse of the Box-Cox transformation is given by:: + + if lambda_ == 0: + X_original = exp(X_trans) + else: + X_original = (X * lambda_ + 1) ** (1 / lambda_) + + The inverse of the Yeo-Johnson transformation is given by:: + + if X >= 0 and lambda_ == 0: + X_original = exp(X) - 1 + elif X >= 0 and lambda_ != 0: + X_original = (X * lambda_ + 1) ** (1 / lambda_) - 1 + elif X < 0 and lambda_ != 2: + X_original = 1 - (-(2 - lambda_) * X + 1) ** (1 / (2 - lambda_)) + elif X < 0 and lambda_ == 2: + X_original = 1 - exp(-X) + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The transformed data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + The original data. 
+ """ + check_is_fitted(self) + X = self._check_input(X, in_fit=False, check_shape=True) + + if self.standardize: + X = self._scaler.inverse_transform(X) + + inv_fun = { + "box-cox": inv_boxcox, + "yeo-johnson": self._yeo_johnson_inverse_transform, + }[self.method] + for i, lmbda in enumerate(self.lambdas_): + with np.errstate(invalid="ignore"): # hide NaN warnings + X[:, i] = inv_fun(X[:, i], lmbda) + + return X + + def _yeo_johnson_inverse_transform(self, x, lmbda): + """Return inverse-transformed input x following Yeo-Johnson inverse + transform with parameter lambda. + """ + x_inv = np.zeros_like(x) + pos = x >= 0 + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + x_inv[pos] = np.exp(x[pos]) - 1 + else: # lmbda != 0 + x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1 + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda)) + else: # lmbda == 2 + x_inv[~pos] = 1 - np.exp(-x[~pos]) + + return x_inv + + def _yeo_johnson_transform(self, x, lmbda): + """Return transformed input x following Yeo-Johnson transform with + parameter lambda. + """ + + out = np.zeros_like(x) + pos = x >= 0 # binary mask + + # when x >= 0 + if abs(lmbda) < np.spacing(1.0): + out[pos] = np.log1p(x[pos]) + else: # lmbda != 0 + out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda + + # when x < 0 + if abs(lmbda - 2) > np.spacing(1.0): + out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) + else: # lmbda == 2 + out[~pos] = -np.log1p(-x[~pos]) + + return out + + def _box_cox_optimize(self, x): + """Find and return optimal lambda parameter of the Box-Cox transform by + MLE, for observed data x. + + We here use scipy builtins which uses the brent optimizer. + """ + mask = np.isnan(x) + if np.all(mask): + raise ValueError("Column must not be all nan.") + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + _, lmbda = stats.boxcox(x[~mask], lmbda=None) + + return lmbda + + def _yeo_johnson_optimize(self, x): + """Find and return optimal lambda parameter of the Yeo-Johnson + transform by MLE, for observed data x. + + Like for Box-Cox, MLE is done via the brent optimizer. + """ + x_tiny = np.finfo(np.float64).tiny + + def _neg_log_likelihood(lmbda): + """Return the negative log likelihood of the observed data x as a + function of lambda.""" + x_trans = self._yeo_johnson_transform(x, lmbda) + n_samples = x.shape[0] + x_trans_var = x_trans.var() + + # Reject transformed data that would raise a RuntimeWarning in np.log + if x_trans_var < x_tiny: + return np.inf + + log_var = np.log(x_trans_var) + loglike = -n_samples / 2 * log_var + loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum() + + return -loglike + + # the computation of lambda is influenced by NaNs so we need to + # get rid of them + x = x[~np.isnan(x)] + + return _yeojohnson_lambda(_neg_log_likelihood, x) + + def _check_input(self, X, in_fit, check_positive=False, check_shape=False): + """Validate the input before fit and transform. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + in_fit : bool + Whether or not `_check_input` is called from `fit` or other + methods, e.g. `predict`, `transform`, etc. + + check_positive : bool, default=False + If True, check that all data is positive and non-zero (only if + ``self.method=='box-cox'``). 
+ + check_shape : bool, default=False + If True, check that n_features matches the length of self.lambdas_ + """ + X = validate_data( + self, + X, + ensure_2d=True, + dtype=FLOAT_DTYPES, + force_writeable=True, + copy=self.copy, + ensure_all_finite="allow-nan", + reset=in_fit, + ) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") + if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0: + raise ValueError( + "The Box-Cox transformation can only be " + "applied to strictly positive data" + ) + + if check_shape and not X.shape[1] == len(self.lambdas_): + raise ValueError( + "Input data has a different number of features " + "than fitting data. Should have {n}, data has {m}".format( + n=len(self.lambdas_), m=X.shape[1] + ) + ) + + return X + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +@validate_params( + {"X": ["array-like"]}, + prefer_skip_nested_validation=False, +) +def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True): + """Parametric, monotonic transformation to make data more Gaussian-like. + + Power transforms are a family of parametric, monotonic transformations + that are applied to make data more Gaussian-like. This is useful for + modeling issues related to heteroscedasticity (non-constant variance), + or other situations where normality is desired. + + Currently, power_transform supports the Box-Cox transform and the + Yeo-Johnson transform. The optimal parameter for stabilizing variance and + minimizing skewness is estimated through maximum likelihood. + + Box-Cox requires input data to be strictly positive, while Yeo-Johnson + supports both positive or negative data. + + By default, zero-mean, unit-variance normalization is applied to the + transformed data. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to be transformed using a power transformation. + + method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' + The power transform method. Available methods are: + + - 'yeo-johnson' [1]_, works with positive and negative values + - 'box-cox' [2]_, only works with strictly positive values + + .. versionchanged:: 0.23 + The default value of the `method` parameter changed from + 'box-cox' to 'yeo-johnson' in 0.23. + + standardize : bool, default=True + Set to True to apply zero-mean, unit-variance normalization to the + transformed output. + + copy : bool, default=True + If False, try to avoid a copy and transform in place. + This is not guaranteed to always work in place; e.g. if the data is + a numpy array with an int dtype, a copy will be returned even with + copy=False. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) + The transformed data. + + See Also + -------- + PowerTransformer : Equivalent transformation with the + Transformer API (e.g. as part of a preprocessing + :class:`~sklearn.pipeline.Pipeline`). + + quantile_transform : Maps data to a standard normal distribution with + the parameter `output_distribution='normal'`. + + Notes + ----- + NaNs are treated as missing values: disregarded in ``fit``, and maintained + in ``transform``. + + For a comparison of the different scalers, transformers, and normalizers, + see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. + + References + ---------- + + .. [1] I.K. Yeo and R.A. 
Johnson, "A new family of power transformations to + improve normality or symmetry." Biometrika, 87(4), pp.954-959, + (2000). + + .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal + of the Royal Statistical Society B, 26, 211-252 (1964). + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import power_transform + >>> data = [[1, 2], [3, 2], [4, 5]] + >>> print(power_transform(data, method='box-cox')) + [[-1.332 -0.707] + [ 0.256 -0.707] + [ 1.076 1.414]] + + .. warning:: Risk of data leak. + Do not use :func:`~sklearn.preprocessing.power_transform` unless you + know what you are doing. A common mistake is to apply it to the entire + data *before* splitting into training and test sets. This will bias the + model evaluation because information would have leaked from the test + set to the training set. + In general, we recommend using + :class:`~sklearn.preprocessing.PowerTransformer` within a + :ref:`Pipeline ` in order to prevent most risks of data + leaking, e.g.: `pipe = make_pipeline(PowerTransformer(), + LogisticRegression())`. + """ + pt = PowerTransformer(method=method, standardize=standardize, copy=copy) + return pt.fit_transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_discretization.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_discretization.py new file mode 100644 index 0000000000000000000000000000000000000000..ef5081080bda1813d4f16b9931dc58cf608c9818 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_discretization.py @@ -0,0 +1,548 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +import warnings +from numbers import Integral + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import resample +from ..utils._param_validation import Interval, Options, StrOptions +from ..utils.stats import _averaged_weighted_percentile, _weighted_percentile +from ..utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + check_array, + check_is_fitted, + validate_data, +) +from ._encoders import OneHotEncoder + + +class KBinsDiscretizer(TransformerMixin, BaseEstimator): + """ + Bin continuous data into intervals. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.20 + + Parameters + ---------- + n_bins : int or array-like of shape (n_features,), default=5 + The number of bins to produce. Raises ValueError if ``n_bins < 2``. + + encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' + Method used to encode the transformed result. + + - 'onehot': Encode the transformed result with one-hot encoding + and return a sparse matrix. Ignored features are always + stacked to the right. + - 'onehot-dense': Encode the transformed result with one-hot encoding + and return a dense array. Ignored features are always + stacked to the right. + - 'ordinal': Return the bin identifier encoded as an integer value. + + strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile' + Strategy used to define the widths of the bins. + + - 'uniform': All bins in each feature have identical widths. + - 'quantile': All bins in each feature have the same number of points. + - 'kmeans': Values in each bin have the same nearest center of a 1D + k-means cluster. + + For an example of the different strategies see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`. 
+ + quantile_method : {"inverted_cdf", "averaged_inverted_cdf", + "closest_observation", "interpolated_inverted_cdf", "hazen", + "weibull", "linear", "median_unbiased", "normal_unbiased"}, + default="linear" + Method to pass on to np.percentile calculation when using + strategy="quantile". Only `averaged_inverted_cdf` and `inverted_cdf` + support the use of `sample_weight != None` when subsampling is not + active. + + .. versionadded:: 1.7 + + dtype : {np.float32, np.float64}, default=None + The desired data-type for the output. If None, output dtype is + consistent with input dtype. Only np.float32 and np.float64 are + supported. + + .. versionadded:: 0.24 + + subsample : int or None, default=200_000 + Maximum number of samples, used to fit the model, for computational + efficiency. + `subsample=None` means that all the training samples are used when + computing the quantiles that determine the binning thresholds. + Since quantile computation relies on sorting each column of `X` and + that sorting has an `n log(n)` time complexity, + it is recommended to use subsampling on datasets with a + very large number of samples. + + .. versionchanged:: 1.3 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="quantile"`. + + .. versionchanged:: 1.5 + The default value of `subsample` changed from `None` to `200_000` when + `strategy="uniform"` or `strategy="kmeans"`. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling. + Pass an int for reproducible results across multiple function calls. + See the `subsample` parameter for more details. + See :term:`Glossary `. + + .. versionadded:: 1.1 + + Attributes + ---------- + bin_edges_ : ndarray of ndarray of shape (n_features,) + The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` + Ignored features will have empty arrays. + + n_bins_ : ndarray of shape (n_features,), dtype=np.int64 + Number of bins per feature. Bins whose width are too small + (i.e., <= 1e-8) are removed with a warning. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + Binarizer : Class used to bin values as ``0`` or + ``1`` based on a parameter ``threshold``. + + Notes + ----- + + For a visualization of discretization on different datasets refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`. + On the effect of discretization on linear models see: + :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`. + + In bin edges for feature ``i``, the first and last values are used only for + ``inverse_transform``. During transform, bin edges are extended to:: + + np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf]) + + You can combine ``KBinsDiscretizer`` with + :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess + part of the features. + + ``KBinsDiscretizer`` might produce constant features (e.g., when + ``encode = 'onehot'`` and certain bins do not contain any data). + These features can be removed with feature selection algorithms + (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`). + + Examples + -------- + >>> from sklearn.preprocessing import KBinsDiscretizer + >>> X = [[-2, 1, -4, -1], + ... 
[-1, 2, -3, -0.5], + ... [ 0, 3, -2, 0.5], + ... [ 1, 4, -1, 2]] + >>> est = KBinsDiscretizer( + ... n_bins=3, encode='ordinal', strategy='uniform' + ... ) + >>> est.fit(X) + KBinsDiscretizer(...) + >>> Xt = est.transform(X) + >>> Xt # doctest: +SKIP + array([[ 0., 0., 0., 0.], + [ 1., 1., 1., 0.], + [ 2., 2., 2., 1.], + [ 2., 2., 2., 2.]]) + + Sometimes it may be useful to convert the data back into the original + feature space. The ``inverse_transform`` function converts the binned + data into the original feature space. Each value will be equal to the mean + of the two bin edges. + + >>> est.bin_edges_[0] + array([-2., -1., 0., 1.]) + >>> est.inverse_transform(Xt) + array([[-1.5, 1.5, -3.5, -0.5], + [-0.5, 2.5, -2.5, -0.5], + [ 0.5, 3.5, -1.5, 0.5], + [ 0.5, 3.5, -1.5, 1.5]]) + """ + + _parameter_constraints: dict = { + "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"], + "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})], + "strategy": [StrOptions({"uniform", "quantile", "kmeans"})], + "quantile_method": [ + StrOptions( + { + "warn", + "inverted_cdf", + "averaged_inverted_cdf", + "closest_observation", + "interpolated_inverted_cdf", + "hazen", + "weibull", + "linear", + "median_unbiased", + "normal_unbiased", + } + ) + ], + "dtype": [Options(type, {np.float64, np.float32}), None], + "subsample": [Interval(Integral, 1, None, closed="left"), None], + "random_state": ["random_state"], + } + + def __init__( + self, + n_bins=5, + *, + encode="onehot", + strategy="quantile", + quantile_method="warn", + dtype=None, + subsample=200_000, + random_state=None, + ): + self.n_bins = n_bins + self.encode = encode + self.strategy = strategy + self.quantile_method = quantile_method + self.dtype = dtype + self.subsample = subsample + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """ + Fit the estimator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + sample_weight : ndarray of shape (n_samples,) + Contains weight values to be associated with each sample. + + .. versionadded:: 1.3 + + .. versionchanged:: 1.7 + Added support for strategy="uniform". + + Returns + ------- + self : object + Returns the instance itself. + """ + X = validate_data(self, X, dtype="numeric") + + if self.dtype in (np.float64, np.float32): + output_dtype = self.dtype + else: # self.dtype is None + output_dtype = X.dtype + + n_samples, n_features = X.shape + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + if self.subsample is not None and n_samples > self.subsample: + # Take a subsample of `X` + # When resampling, it is important to subsample **with replacement** to + # preserve the distribution, in particular in the presence of a few data + # points with large weights. You can check this by setting `replace=False` + # in sklearn.utils.test.test_indexing.test_resample_weighted and check that + # it fails as a justification for this claim. 
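+ # In this weighted resampling with replacement, a sample's expected
+ # multiplicity in the subsample is proportional to its weight (e.g. a point
+ # with weight 3 is drawn about three times as often as a unit-weight point),
+ # which is why the weights can safely be dropped right after the call, as
+ # done below.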
+ X = resample( + X, + replace=True, + n_samples=self.subsample, + random_state=self.random_state, + sample_weight=sample_weight, + ) + # Since we already used the weights when resampling when provided, + # we set them back to `None` to avoid accounting for the weights twice + # in subsequent operations to compute weight-aware bin edges with + # quantiles or k-means. + sample_weight = None + + n_features = X.shape[1] + n_bins = self._validate_n_bins(n_features) + + bin_edges = np.zeros(n_features, dtype=object) + + # TODO(1.9): remove and switch to quantile_method="averaged_inverted_cdf" + # by default. + quantile_method = self.quantile_method + if self.strategy == "quantile" and quantile_method == "warn": + warnings.warn( + "The current default behavior, quantile_method='linear', will be " + "changed to quantile_method='averaged_inverted_cdf' in " + "scikit-learn version 1.9 to naturally support sample weight " + "equivalence properties by default. Pass " + "quantile_method='averaged_inverted_cdf' explicitly to silence this " + "warning.", + FutureWarning, + ) + quantile_method = "linear" + + if ( + self.strategy == "quantile" + and quantile_method not in ["inverted_cdf", "averaged_inverted_cdf"] + and sample_weight is not None + ): + raise ValueError( + "When fitting with strategy='quantile' and sample weights, " + "quantile_method should either be set to 'averaged_inverted_cdf' or " + f"'inverted_cdf', got quantile_method='{quantile_method}' instead." + ) + + if self.strategy != "quantile" and sample_weight is not None: + # Prepare a mask to filter out zero-weight samples when extracting + # the min and max values of each columns which are needed for the + # "uniform" and "kmeans" strategies. + nnz_weight_mask = sample_weight != 0 + else: + # Otherwise, all samples are used. Use a slice to avoid creating a + # new array. + nnz_weight_mask = slice(None) + + for jj in range(n_features): + column = X[:, jj] + col_min = column[nnz_weight_mask].min() + col_max = column[nnz_weight_mask].max() + + if col_min == col_max: + warnings.warn( + "Feature %d is constant and will be replaced with 0." % jj + ) + n_bins[jj] = 1 + bin_edges[jj] = np.array([-np.inf, np.inf]) + continue + + if self.strategy == "uniform": + bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1) + + elif self.strategy == "quantile": + percentile_levels = np.linspace(0, 100, n_bins[jj] + 1) + + # method="linear" is the implicit default for any numpy + # version. So we keep it version independent in that case by + # using an empty param dict. + percentile_kwargs = {} + if quantile_method != "linear" and sample_weight is None: + percentile_kwargs["method"] = quantile_method + + if sample_weight is None: + bin_edges[jj] = np.asarray( + np.percentile(column, percentile_levels, **percentile_kwargs), + dtype=np.float64, + ) + else: + # TODO: make _weighted_percentile and + # _averaged_weighted_percentile accept an array of + # quantiles instead of calling it multiple times and + # sorting the column multiple times as a result. 
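+ # e.g. quantile_method="averaged_inverted_cdf" selects
+ # _averaged_weighted_percentile below; each requested level in
+ # percentile_levels triggers one call (and hence one sort of `column`).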
+ percentile_func = { + "inverted_cdf": _weighted_percentile, + "averaged_inverted_cdf": _averaged_weighted_percentile, + }[quantile_method] + bin_edges[jj] = np.asarray( + [ + percentile_func(column, sample_weight, percentile_rank=p) + for p in percentile_levels + ], + dtype=np.float64, + ) + elif self.strategy == "kmeans": + from ..cluster import KMeans # fixes import loops + + # Deterministic initialization with uniform spacing + uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1) + init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 + + # 1D k-means procedure + km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) + centers = km.fit( + column[:, None], sample_weight=sample_weight + ).cluster_centers_[:, 0] + # Must sort, centers may be unsorted even with sorted init + centers.sort() + bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 + bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] + + # Remove bins whose width are too small (i.e., <= 1e-8) + if self.strategy in ("quantile", "kmeans"): + mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 + bin_edges[jj] = bin_edges[jj][mask] + if len(bin_edges[jj]) - 1 != n_bins[jj]: + warnings.warn( + "Bins whose width are too small (i.e., <= " + "1e-8) in feature %d are removed. Consider " + "decreasing the number of bins." % jj + ) + n_bins[jj] = len(bin_edges[jj]) - 1 + + self.bin_edges_ = bin_edges + self.n_bins_ = n_bins + + if "onehot" in self.encode: + self._encoder = OneHotEncoder( + categories=[np.arange(i) for i in self.n_bins_], + sparse_output=self.encode == "onehot", + dtype=output_dtype, + ) + # Fit the OneHotEncoder with toy datasets + # so that it's ready for use after the KBinsDiscretizer is fitted + self._encoder.fit(np.zeros((1, len(self.n_bins_)))) + + return self + + def _validate_n_bins(self, n_features): + """Returns n_bins_, the number of bins per feature.""" + orig_bins = self.n_bins + if isinstance(orig_bins, Integral): + return np.full(n_features, orig_bins, dtype=int) + + n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False) + + if n_bins.ndim > 1 or n_bins.shape[0] != n_features: + raise ValueError("n_bins must be a scalar or array of shape (n_features,).") + + bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins) + + violating_indices = np.where(bad_nbins_value)[0] + if violating_indices.shape[0] > 0: + indices = ", ".join(str(i) for i in violating_indices) + raise ValueError( + "{} received an invalid number " + "of bins at indices {}. Number of bins " + "must be at least 2, and must be an int.".format( + KBinsDiscretizer.__name__, indices + ) + ) + return n_bins + + def transform(self, X): + """ + Discretize the data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to be discretized. + + Returns + ------- + Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64} + Data in the binned space. Will be a sparse matrix if + `self.encode='onehot'` and ndarray otherwise. 
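+ With ``encode='ordinal'``, each output value is the zero-based index of
+ the bin that the corresponding input value falls into.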
+ """ + check_is_fitted(self) + + # check input and attribute dtypes + dtype = (np.float64, np.float32) if self.dtype is None else self.dtype + Xt = validate_data(self, X, copy=True, dtype=dtype, reset=False) + + bin_edges = self.bin_edges_ + for jj in range(Xt.shape[1]): + Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right") + + if self.encode == "ordinal": + return Xt + + dtype_init = None + if "onehot" in self.encode: + dtype_init = self._encoder.dtype + self._encoder.dtype = Xt.dtype + try: + Xt_enc = self._encoder.transform(Xt) + finally: + # revert the initial dtype to avoid modifying self. + self._encoder.dtype = dtype_init + return Xt_enc + + def inverse_transform(self, X): + """ + Transform discretized data back to original feature space. + + Note that this function does not regenerate the original data + due to discretization rounding. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Transformed data in the binned space. + + Returns + ------- + X_original : ndarray, dtype={np.float32, np.float64} + Data in the original feature space. + """ + + check_is_fitted(self) + + if "onehot" in self.encode: + X = self._encoder.inverse_transform(X) + + Xinv = check_array(X, copy=True, dtype=(np.float64, np.float32)) + n_features = self.n_bins_.shape[0] + if Xinv.shape[1] != n_features: + raise ValueError( + "Incorrect number of features. Expecting {}, received {}.".format( + n_features, Xinv.shape[1] + ) + ) + + for jj in range(n_features): + bin_edges = self.bin_edges_[jj] + bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5 + Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)] + + return Xinv + + def get_feature_names_out(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + check_is_fitted(self, "n_features_in_") + input_features = _check_feature_names_in(self, input_features) + if hasattr(self, "_encoder"): + return self._encoder.get_feature_names_out(input_features) + + # ordinal encoding + return input_features diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..5f41c9d0c6d22822efd228a94d3c8a8b27b053a3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py @@ -0,0 +1,1698 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import numbers +import warnings +from numbers import Integral + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context +from ..utils import _safe_indexing, check_array +from ..utils._encode import _check_unknown, _encode, _get_counts, _unique +from ..utils._mask import _get_mask +from ..utils._missing import is_scalar_nan +from ..utils._param_validation import Interval, RealNotInt, StrOptions +from ..utils._set_output import _get_output_config +from ..utils.validation import ( + _check_feature_names, + _check_feature_names_in, + _check_n_features, + check_is_fitted, +) + +__all__ = ["OneHotEncoder", "OrdinalEncoder"] + + +class _BaseEncoder(TransformerMixin, BaseEstimator): + """ + Base class for encoders that includes the code to categorize and + transform the input features. + + """ + + def _check_X(self, X, ensure_all_finite=True): + """ + Perform custom check_array: + - convert list of strings to object dtype + - check for missing values for object dtype data (check_array does + not do that) + - return list of features (arrays): this list of features is + constructed feature by feature to preserve the data types + of pandas DataFrame columns, as otherwise information is lost + and cannot be used, e.g. for the `categories_` attribute. + + """ + if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2): + # if not a dataframe, do normal check_array validation + X_temp = check_array(X, dtype=None, ensure_all_finite=ensure_all_finite) + if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_): + X = check_array(X, dtype=object, ensure_all_finite=ensure_all_finite) + else: + X = X_temp + needs_validation = False + else: + # pandas dataframe, do validation later column by column, in order + # to keep the dtype information to be used in the encoder. + needs_validation = ensure_all_finite + + n_samples, n_features = X.shape + X_columns = [] + + for i in range(n_features): + Xi = _safe_indexing(X, indices=i, axis=1) + Xi = check_array( + Xi, ensure_2d=False, dtype=None, ensure_all_finite=needs_validation + ) + X_columns.append(Xi) + + return X_columns, n_samples, n_features + + def _fit( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + return_counts=False, + return_and_ignore_missing_for_infrequent=False, + ): + self._check_infrequent_enabled() + _check_n_features(self, X, reset=True) + _check_feature_names(self, X, reset=True) + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + self.n_features_in_ = n_features + + if self.categories != "auto": + if len(self.categories) != n_features: + raise ValueError( + "Shape mismatch: if categories is an array," + " it has to be of shape (n_features,)." 
+ ) + + self.categories_ = [] + category_counts = [] + compute_counts = return_counts or self._infrequent_enabled + + for i in range(n_features): + Xi = X_list[i] + + if self.categories == "auto": + result = _unique(Xi, return_counts=compute_counts) + if compute_counts: + cats, counts = result + category_counts.append(counts) + else: + cats = result + else: + if np.issubdtype(Xi.dtype, np.str_): + # Always convert string categories to objects to avoid + # unexpected string truncation for longer category labels + # passed in the constructor. + Xi_dtype = object + else: + Xi_dtype = Xi.dtype + + cats = np.array(self.categories[i], dtype=Xi_dtype) + if ( + cats.dtype == object + and isinstance(cats[0], bytes) + and Xi.dtype.kind != "S" + ): + msg = ( + f"In column {i}, the predefined categories have type 'bytes'" + " which is incompatible with values of type" + f" '{type(Xi[0]).__name__}'." + ) + raise ValueError(msg) + + # `nan` must be the last stated category + for category in cats[:-1]: + if is_scalar_nan(category): + raise ValueError( + "Nan should be the last element in user" + f" provided categories, see categories {cats}" + f" in column #{i}" + ) + + if cats.size != len(_unique(cats)): + msg = ( + f"In column {i}, the predefined categories" + " contain duplicate elements." + ) + raise ValueError(msg) + + if Xi.dtype.kind not in "OUS": + sorted_cats = np.sort(cats) + error_msg = ( + "Unsorted categories are not supported for numerical categories" + ) + # if there are nans, nan should be the last element + stop_idx = -1 if np.isnan(sorted_cats[-1]) else None + if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]): + raise ValueError(error_msg) + + if handle_unknown == "error": + diff = _check_unknown(Xi, cats) + if diff: + msg = ( + "Found unknown categories {0} in column {1}" + " during fit".format(diff, i) + ) + raise ValueError(msg) + if compute_counts: + category_counts.append(_get_counts(Xi, cats)) + + self.categories_.append(cats) + + output = {"n_samples": n_samples} + if return_counts: + output["category_counts"] = category_counts + + missing_indices = {} + if return_and_ignore_missing_for_infrequent: + for feature_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + # `nan` values can only be placed in the latest position + missing_indices[feature_idx] = categories_for_idx.size - 1 + output["missing_indices"] = missing_indices + + if self._infrequent_enabled: + self._fit_infrequent_category_mapping( + n_samples, + category_counts, + missing_indices, + ) + return output + + def _transform( + self, + X, + handle_unknown="error", + ensure_all_finite=True, + warn_on_unknown=False, + ignore_category_indices=None, + ): + X_list, n_samples, n_features = self._check_X( + X, ensure_all_finite=ensure_all_finite + ) + _check_feature_names(self, X, reset=False) + _check_n_features(self, X, reset=False) + + X_int = np.zeros((n_samples, n_features), dtype=int) + X_mask = np.ones((n_samples, n_features), dtype=bool) + + columns_with_unknown = [] + for i in range(n_features): + Xi = X_list[i] + diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True) + + if not np.all(valid_mask): + if handle_unknown == "error": + msg = ( + "Found unknown categories {0} in column {1}" + " during transform".format(diff, i) + ) + raise ValueError(msg) + else: + if warn_on_unknown: + columns_with_unknown.append(i) + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later. 
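+ # Note: the placeholder written below (the first known category) is not
+ # trusted downstream; X_mask records which entries were unknown so that,
+ # e.g., OneHotEncoder.transform can zero them out and
+ # OrdinalEncoder.transform can overwrite them with `unknown_value`.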
+ X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if ( + self.categories_[i].dtype.kind in ("U", "S") + and self.categories_[i].itemsize > Xi.itemsize + ): + Xi = Xi.astype(self.categories_[i].dtype) + elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U": + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype("O") + else: + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] + # We use check_unknown=False, since _check_unknown was + # already called above. + X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False) + if columns_with_unknown: + warnings.warn( + ( + "Found unknown categories in columns " + f"{columns_with_unknown} during transform. These " + "unknown categories will be encoded as all zeros" + ), + UserWarning, + ) + + self._map_infrequent_categories(X_int, X_mask, ignore_category_indices) + return X_int, X_mask + + @property + def infrequent_categories_(self): + """Infrequent categories for each feature.""" + # raises an AttributeError if `_infrequent_indices` is not defined + infrequent_indices = self._infrequent_indices + return [ + None if indices is None else category[indices] + for category, indices in zip(self.categories_, infrequent_indices) + ] + + def _check_infrequent_enabled(self): + """ + This functions checks whether _infrequent_enabled is True or False. + This has to be called after parameter validation in the fit function. + """ + max_categories = getattr(self, "max_categories", None) + min_frequency = getattr(self, "min_frequency", None) + self._infrequent_enabled = ( + max_categories is not None and max_categories >= 1 + ) or min_frequency is not None + + def _identify_infrequent(self, category_count, n_samples, col_idx): + """Compute the infrequent indices. + + Parameters + ---------- + category_count : ndarray of shape (n_cardinality,) + Category counts. + + n_samples : int + Number of samples. + + col_idx : int + Index of the current category. Only used for the error message. + + Returns + ------- + output : ndarray of shape (n_infrequent_categories,) or None + If there are infrequent categories, indices of infrequent + categories. Otherwise None. + """ + if isinstance(self.min_frequency, numbers.Integral): + infrequent_mask = category_count < self.min_frequency + elif isinstance(self.min_frequency, numbers.Real): + min_frequency_abs = n_samples * self.min_frequency + infrequent_mask = category_count < min_frequency_abs + else: + infrequent_mask = np.zeros(category_count.shape[0], dtype=bool) + + n_current_features = category_count.size - infrequent_mask.sum() + 1 + if self.max_categories is not None and self.max_categories < n_current_features: + # max_categories includes the one infrequent category + frequent_category_count = self.max_categories - 1 + if frequent_category_count == 0: + # All categories are infrequent + infrequent_mask[:] = True + else: + # stable sort to preserve original count order + smallest_levels = np.argsort(category_count, kind="mergesort")[ + :-frequent_category_count + ] + infrequent_mask[smallest_levels] = True + + output = np.flatnonzero(infrequent_mask) + return output if output.size > 0 else None + + def _fit_infrequent_category_mapping( + self, n_samples, category_counts, missing_indices + ): + """Fit infrequent categories. + + Defines the private attribute: `_default_to_infrequent_mappings`. 
For + feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping + from the integer encoding returned by `super().transform()` into + infrequent categories. If `_default_to_infrequent_mappings[i]` is None, + there were no infrequent categories in the training set. + + For example if categories 0, 2 and 4 were frequent, while categories + 1, 3, 5 were infrequent for feature 7, then these categories are mapped + to a single output: + `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])` + + Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]` + is an array of indices such that + `categories_[i][_infrequent_indices[i]]` are all the infrequent category + labels. If the feature `i` has no infrequent categories + `_infrequent_indices[i]` is None. + + .. versionadded:: 1.1 + + Parameters + ---------- + n_samples : int + Number of samples in training set. + category_counts: list of ndarray + `category_counts[i]` is the category counts corresponding to + `self.categories_[i]`. + missing_indices : dict + Dict mapping from feature_idx to category index with a missing value. + """ + # Remove missing value from counts, so it is not considered as infrequent + if missing_indices: + category_counts_ = [] + for feature_idx, count in enumerate(category_counts): + if feature_idx in missing_indices: + category_counts_.append( + np.delete(count, missing_indices[feature_idx]) + ) + else: + category_counts_.append(count) + else: + category_counts_ = category_counts + + self._infrequent_indices = [ + self._identify_infrequent(category_count, n_samples, col_idx) + for col_idx, category_count in enumerate(category_counts_) + ] + + # compute mapping from default mapping to infrequent mapping + self._default_to_infrequent_mappings = [] + + for feature_idx, infreq_idx in enumerate(self._infrequent_indices): + cats = self.categories_[feature_idx] + # no infrequent categories + if infreq_idx is None: + self._default_to_infrequent_mappings.append(None) + continue + + n_cats = len(cats) + if feature_idx in missing_indices: + # Missing index was removed from this category when computing + # infrequent indices, thus we need to decrease the number of + # total categories when considering the infrequent mapping. + n_cats -= 1 + + # infrequent indices exist + mapping = np.empty(n_cats, dtype=np.int64) + n_infrequent_cats = infreq_idx.size + + # infrequent categories are mapped to the last element. + n_frequent_cats = n_cats - n_infrequent_cats + mapping[infreq_idx] = n_frequent_cats + + frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx) + mapping[frequent_indices] = np.arange(n_frequent_cats) + + self._default_to_infrequent_mappings.append(mapping) + + def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices): + """Map infrequent categories to integer representing the infrequent category. + + This modifies X_int in-place. Values that were invalid based on `X_mask` + are mapped to the infrequent category if there was an infrequent + category for that feature. + + Parameters + ---------- + X_int: ndarray of shape (n_samples, n_features) + Integer encoded categories. + + X_mask: ndarray of shape (n_samples, n_features) + Bool mask for valid values in `X_int`. + + ignore_category_indices : dict + Dictionary mapping from feature_idx to category index to ignore. + Ignored indexes will not be grouped and the original ordinal encoding + will remain. 
+ """ + if not self._infrequent_enabled: + return + + ignore_category_indices = ignore_category_indices or {} + + for col_idx in range(X_int.shape[1]): + infrequent_idx = self._infrequent_indices[col_idx] + if infrequent_idx is None: + continue + + X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0] + if self.handle_unknown == "infrequent_if_exist": + # All the unknown values are now mapped to the + # infrequent_idx[0], which makes the unknown values valid + # This is needed in `transform` when the encoding is formed + # using `X_mask`. + X_mask[:, col_idx] = True + + # Remaps encoding in `X_int` where the infrequent categories are + # grouped together. + for i, mapping in enumerate(self._default_to_infrequent_mappings): + if mapping is None: + continue + + if i in ignore_category_indices: + # Update rows that are **not** ignored + rows_to_update = X_int[:, i] != ignore_category_indices[i] + else: + rows_to_update = slice(None) + + X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i]) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.categorical = True + tags.input_tags.allow_nan = True + return tags + + +class OneHotEncoder(_BaseEncoder): + """ + Encode categorical features as a one-hot numeric array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') + encoding scheme. This creates a binary column for each category and + returns a sparse matrix or dense array (depending on the ``sparse_output`` + parameter). + + By default, the encoder derives the categories based on the unique values + in each feature. Alternatively, you can also specify the `categories` + manually. + + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Note: a one-hot encoding of y labels should use a LabelBinarizer + instead. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values within a single feature, and should be sorted in case of + numeric values. + + The used categories can be found in the ``categories_`` attribute. + + .. versionadded:: 0.20 + + drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \ + default=None + Specifies a methodology to use to drop one of the categories per + feature. This is useful in situations where perfectly collinear + features cause problems, such as when feeding the resulting data + into an unregularized linear regression model. + + However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + + - None : retain all features (the default). + - 'first' : drop the first category in each feature. If only one + category is present, the feature will be dropped entirely. + - 'if_binary' : drop the first category in each feature with two + categories. 
Features with 1 or more than 2 categories are + left intact. + - array : ``drop[i]`` is the category in feature ``X[:, i]`` that + should be dropped. + + When `max_categories` or `min_frequency` is configured to group + infrequent categories, the dropping behavior is handled after the + grouping. + + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + + .. versionchanged:: 0.23 + The option `drop='if_binary'` was added in 0.23. + + .. versionchanged:: 1.1 + Support for dropping infrequent categories. + + sparse_output : bool, default=True + When ``True``, it returns a :class:`scipy.sparse.csr_matrix`, + i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format. + + .. versionadded:: 1.2 + `sparse` was renamed to `sparse_output` + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'ignore', 'infrequent_if_exist', 'warn'}, \ + default='error' + Specifies the way unknown categories are handled during :meth:`transform`. + + - 'error' : Raise an error if an unknown category is present during transform. + - 'ignore' : When an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + - 'infrequent_if_exist' : When an unknown category is encountered + during transform, the resulting one-hot encoded columns for this + feature will map to the infrequent category if it exists. The + infrequent category will be mapped to the last position in the + encoding. During inverse transform, an unknown category will be + mapped to the category denoted `'infrequent'` if it exists. If the + `'infrequent'` category does not exist, then :meth:`transform` and + :meth:`inverse_transform` will handle an unknown category as with + `handle_unknown='ignore'`. Infrequent categories exist based on + `min_frequency` and `max_categories`. Read more in the + :ref:`User Guide `. + - 'warn' : When an unknown category is encountered during transform + a warning is issued, and the encoding then proceeds as described for + `handle_unknown="infrequent_if_exist"`. + + .. versionchanged:: 1.1 + `'infrequent_if_exist'` was added to automatically handle unknown + categories and infrequent categories. + + .. versionadded:: 1.6 + The option `"warn"` was added in 1.6. + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + max_categories : int, default=None + Specifies an upper limit to the number of output features for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + .. versionadded:: 1.1 + Read more in the :ref:`User Guide `. + + feature_name_combiner : "concat" or callable, default="concat" + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. 
+ + `"concat"` concatenates encoded feature name and category with + `feature + "_" + str(category)`.E.g. feature X with values 1, 6, 7 create + feature names `X_1, X_6, X_7`. + + .. versionadded:: 1.3 + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during fitting + (in order of the features in X and corresponding with the output + of ``transform``). This includes the category specified in ``drop`` + (if any). + + drop_idx_ : array of shape (n_features,) + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. + + If infrequent categories are enabled by setting `min_frequency` or + `max_categories` to a non-default value and `drop_idx[i]` corresponds + to a infrequent category, then the entire infrequent category is + dropped. + + .. versionchanged:: 0.23 + Added the possibility to contain `None` values. + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.1 + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + feature_name_combiner : callable or None + Callable with signature `def callable(input_feature, category)` that returns a + string. This is used to create feature names to be returned by + :meth:`get_feature_names_out`. + + .. versionadded:: 1.3 + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) + encoding of the categorical features. + TargetEncoder : Encodes categorical features using the target. + sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot + encoding of dictionary items or strings. + LabelBinarizer : Binarizes labels in a one-vs-all + fashion. + MultiLabelBinarizer : Transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. 
+ + >>> from sklearn.preprocessing import OneHotEncoder + + One can discard categories not seen during `fit`: + + >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() + array([[1., 0., 1., 0., 0.], + [0., 1., 0., 0., 0.]]) + >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) + array([['Male', 1], + [None, 2]], dtype=object) + >>> enc.get_feature_names_out(['gender', 'group']) + array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...) + + One can always drop the first column for each feature: + + >>> drop_enc = OneHotEncoder(drop='first').fit(X) + >>> drop_enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 0., 0.], + [1., 1., 0.]]) + + Or drop a column for feature only having 2 categories: + + >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X) + >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 1., 0., 0.], + [1., 0., 1., 0.]]) + + One can change the way feature names are created. + + >>> def custom_combiner(feature, category): + ... return str(feature) + "_" + type(category).__name__ + "_" + str(category) + >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X) + >>> custom_fnames_enc.get_feature_names_out() + array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'], + dtype=object) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. + + >>> import numpy as np + >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T + >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X) + >>> ohe.infrequent_categories_ + [array(['a', 'd'], dtype=object)] + >>> ohe.transform([["a"], ["b"]]) + array([[0., 0., 1.], + [1., 0., 0.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "drop": [StrOptions({"first", "if_binary"}), "array-like", None], + "dtype": "no_validation", # validation delegated to numpy + "handle_unknown": [ + StrOptions({"error", "ignore", "infrequent_if_exist", "warn"}) + ], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + "sparse_output": ["boolean"], + "feature_name_combiner": [StrOptions({"concat"}), callable], + } + + def __init__( + self, + *, + categories="auto", + drop=None, + sparse_output=True, + dtype=np.float64, + handle_unknown="error", + min_frequency=None, + max_categories=None, + feature_name_combiner="concat", + ): + self.categories = categories + self.sparse_output = sparse_output + self.dtype = dtype + self.handle_unknown = handle_unknown + self.drop = drop + self.min_frequency = min_frequency + self.max_categories = max_categories + self.feature_name_combiner = feature_name_combiner + + def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx): + """Convert `drop_idx` into the index for infrequent categories. + + If there are no infrequent categories, then `drop_idx` is + returned. This method is called in `_set_drop_idx` when the `drop` + parameter is an array-like. 
+ """ + if not self._infrequent_enabled: + return drop_idx + + default_to_infrequent = self._default_to_infrequent_mappings[feature_idx] + if default_to_infrequent is None: + return drop_idx + + # Raise error when explicitly dropping a category that is infrequent + infrequent_indices = self._infrequent_indices[feature_idx] + if infrequent_indices is not None and drop_idx in infrequent_indices: + categories = self.categories_[feature_idx] + raise ValueError( + f"Unable to drop category {categories[drop_idx].item()!r} from" + f" feature {feature_idx} because it is infrequent" + ) + return default_to_infrequent[drop_idx] + + def _set_drop_idx(self): + """Compute the drop indices associated with `self.categories_`. + + If `self.drop` is: + - `None`, No categories have been dropped. + - `'first'`, All zeros to drop the first category. + - `'if_binary'`, All zeros if the category is binary and `None` + otherwise. + - array-like, The indices of the categories that match the + categories in `self.drop`. If the dropped category is an infrequent + category, then the index for the infrequent category is used. This + means that the entire infrequent category is dropped. + + This methods defines a public `drop_idx_` and a private + `_drop_idx_after_grouping`. + + - `drop_idx_`: Public facing API that references the drop category in + `self.categories_`. + - `_drop_idx_after_grouping`: Used internally to drop categories *after* the + infrequent categories are grouped together. + + If there are no infrequent categories or drop is `None`, then + `drop_idx_=_drop_idx_after_grouping`. + """ + if self.drop is None: + drop_idx_after_grouping = None + elif isinstance(self.drop, str): + if self.drop == "first": + drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object) + elif self.drop == "if_binary": + n_features_out_no_drop = [len(cat) for cat in self.categories_] + if self._infrequent_enabled: + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + n_features_out_no_drop[i] -= infreq_idx.size - 1 + + drop_idx_after_grouping = np.array( + [ + 0 if n_features_out == 2 else None + for n_features_out in n_features_out_no_drop + ], + dtype=object, + ) + + else: + drop_array = np.asarray(self.drop, dtype=object) + droplen = len(drop_array) + + if droplen != len(self.categories_): + msg = ( + "`drop` should have length equal to the number " + "of features ({}), got {}" + ) + raise ValueError(msg.format(len(self.categories_), droplen)) + missing_drops = [] + drop_indices = [] + for feature_idx, (drop_val, cat_list) in enumerate( + zip(drop_array, self.categories_) + ): + if not is_scalar_nan(drop_val): + drop_idx = np.where(cat_list == drop_val)[0] + if drop_idx.size: # found drop idx + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0]) + ) + else: + missing_drops.append((feature_idx, drop_val)) + continue + + # drop_val is nan, find nan in categories manually + if is_scalar_nan(cat_list[-1]): + drop_indices.append( + self._map_drop_idx_to_infrequent(feature_idx, cat_list.size - 1) + ) + else: # nan is missing + missing_drops.append((feature_idx, drop_val)) + + if any(missing_drops): + msg = ( + "The following categories were supposed to be " + "dropped, but were not found in the training " + "data.\n{}".format( + "\n".join( + [ + "Category: {}, Feature: {}".format(c, v) + for c, v in missing_drops + ] + ) + ) + ) + raise ValueError(msg) + drop_idx_after_grouping = np.array(drop_indices, dtype=object) + + # 
`_drop_idx_after_grouping` are the categories to drop *after* the infrequent + # categories are grouped together. If needed, we remap `drop_idx` back + # to the categories seen in `self.categories_`. + self._drop_idx_after_grouping = drop_idx_after_grouping + + if not self._infrequent_enabled or drop_idx_after_grouping is None: + self.drop_idx_ = self._drop_idx_after_grouping + else: + drop_idx_ = [] + for feature_idx, drop_idx in enumerate(drop_idx_after_grouping): + default_to_infrequent = self._default_to_infrequent_mappings[ + feature_idx + ] + if drop_idx is None or default_to_infrequent is None: + orig_drop_idx = drop_idx + else: + orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0] + + drop_idx_.append(orig_drop_idx) + + self.drop_idx_ = np.asarray(drop_idx_, dtype=object) + + def _compute_transformed_categories(self, i, remove_dropped=True): + """Compute the transformed categories used for column `i`. + + 1. If there are infrequent categories, the category is named + 'infrequent_sklearn'. + 2. Dropped columns are removed when remove_dropped=True. + """ + cats = self.categories_[i] + + if self._infrequent_enabled: + infreq_map = self._default_to_infrequent_mappings[i] + if infreq_map is not None: + frequent_mask = infreq_map < infreq_map.max() + infrequent_cat = "infrequent_sklearn" + # infrequent category is always at the end + cats = np.concatenate( + (cats[frequent_mask], np.array([infrequent_cat], dtype=object)) + ) + + if remove_dropped: + cats = self._remove_dropped_categories(cats, i) + return cats + + def _remove_dropped_categories(self, categories, i): + """Remove dropped categories.""" + if ( + self._drop_idx_after_grouping is not None + and self._drop_idx_after_grouping[i] is not None + ): + return np.delete(categories, self._drop_idx_after_grouping[i]) + return categories + + def _compute_n_features_outs(self): + """Compute the n_features_out for each input feature.""" + output = [len(cats) for cats in self.categories_] + + if self._drop_idx_after_grouping is not None: + for i, drop_idx in enumerate(self._drop_idx_after_grouping): + if drop_idx is not None: + output[i] -= 1 + + if not self._infrequent_enabled: + return output + + # infrequent is enabled, the number of features out are reduced + # because the infrequent categories are grouped together + for i, infreq_idx in enumerate(self._infrequent_indices): + if infreq_idx is None: + continue + output[i] -= infreq_idx.size - 1 + + return output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit OneHotEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self + Fitted encoder. + """ + self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ) + self._set_drop_idx() + self._n_features_outs = self._compute_n_features_outs() + return self + + def transform(self, X): + """ + Transform X using one-hot encoding. + + If `sparse_output=True` (default), it returns an instance of + :class:`scipy.sparse._csr.csr_matrix` (CSR format). + + If there are infrequent categories for a feature, set by specifying + `max_categories` or `min_frequency`, the infrequent categories are + grouped into a single category. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. 
+ + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse_output=True`, a sparse matrix will be + returned. + """ + check_is_fitted(self) + transform_output = _get_output_config("transform", estimator=self)["dense"] + if transform_output != "default" and self.sparse_output: + capitalize_transform_output = transform_output.capitalize() + raise ValueError( + f"{capitalize_transform_output} output does not support sparse data." + f" Set sparse_output=False to output {transform_output} dataframes or" + f" disable {capitalize_transform_output} output via" + '` ohe.set_output(transform="default").' + ) + + # validation of X happens in _check_X called by _transform + if self.handle_unknown == "warn": + warn_on_unknown, handle_unknown = True, "infrequent_if_exist" + else: + warn_on_unknown = self.drop is not None and self.handle_unknown in { + "ignore", + "infrequent_if_exist", + } + handle_unknown = self.handle_unknown + X_int, X_mask = self._transform( + X, + handle_unknown=handle_unknown, + ensure_all_finite="allow-nan", + warn_on_unknown=warn_on_unknown, + ) + + n_samples, n_features = X_int.shape + + if self._drop_idx_after_grouping is not None: + to_drop = self._drop_idx_after_grouping.copy() + # We remove all the dropped categories from mask, and decrement all + # categories that occur after them to avoid an empty column. + keep_cells = X_int != to_drop + for i, cats in enumerate(self.categories_): + # drop='if_binary' but feature isn't binary + if to_drop[i] is None: + # set to cardinality to not drop from X_int + to_drop[i] = len(cats) + + to_drop = to_drop.reshape(1, -1) + X_int[X_int > to_drop] -= 1 + X_mask &= keep_cells + + mask = X_mask.ravel() + feature_indices = np.cumsum([0] + self._n_features_outs) + indices = (X_int + feature_indices[:-1]).ravel()[mask] + + indptr = np.empty(n_samples + 1, dtype=int) + indptr[0] = 0 + np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype) + np.cumsum(indptr[1:], out=indptr[1:]) + data = np.ones(indptr[-1]) + + out = sparse.csr_matrix( + (data, indices, indptr), + shape=(n_samples, feature_indices[-1]), + dtype=self.dtype, + ) + if not self.sparse_output: + return out.toarray() + else: + return out + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + When unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. If the + feature with the unknown category has a dropped category, the dropped + category will be its inverse. + + For a given input feature, if there is an infrequent category, + 'infrequent_sklearn' will be used to represent the infrequent category. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Inverse transformed array. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse="csr") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + n_features_out = np.sum(self._n_features_outs) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." 
+ ) + if X.shape[1] != n_features_out: + raise ValueError(msg.format(n_features_out, X.shape[1])) + + transformed_features = [ + self._compute_transformed_categories(i, remove_dropped=False) + for i, _ in enumerate(self.categories_) + ] + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in transformed_features]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + j = 0 + found_unknown = {} + + if self._infrequent_enabled: + infrequent_indices = self._infrequent_indices + else: + infrequent_indices = [None] * n_features + + for i in range(n_features): + cats_wo_dropped = self._remove_dropped_categories( + transformed_features[i], i + ) + n_categories = cats_wo_dropped.shape[0] + + # Only happens if there was a column with a unique + # category. In this case we just fill the column with this + # unique category value. + if n_categories == 0: + X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]] + j += n_categories + continue + sub = X[:, j : j + n_categories] + # for sparse X argmax returns 2D matrix, ensure 1D array + labels = np.asarray(sub.argmax(axis=1)).flatten() + X_tr[:, i] = cats_wo_dropped[labels] + + if self.handle_unknown == "ignore" or ( + self.handle_unknown in ("infrequent_if_exist", "warn") + and infrequent_indices[i] is None + ): + unknown = np.asarray(sub.sum(axis=1) == 0).flatten() + # ignored unknown categories: we have a row of all zero + if unknown.any(): + # if categories were dropped then unknown categories will + # be mapped to the dropped category + if ( + self._drop_idx_after_grouping is None + or self._drop_idx_after_grouping[i] is None + ): + found_unknown[i] = unknown + else: + X_tr[unknown, i] = self.categories_[i][ + self._drop_idx_after_grouping[i] + ] + else: + dropped = np.asarray(sub.sum(axis=1) == 0).flatten() + if dropped.any(): + if self._drop_idx_after_grouping is None: + all_zero_samples = np.flatnonzero(dropped) + raise ValueError( + f"Samples {all_zero_samples} can not be inverted " + "when drop=None and handle_unknown='error' " + "because they contain all zeros" + ) + # we can safely assume that all of the nulls in each column + # are the dropped value + drop_idx = self._drop_idx_after_grouping[i] + X_tr[dropped, i] = transformed_features[i][drop_idx] + + j += n_categories + + # if ignored are found: potentially need to upcast result to + # insert None values + if found_unknown: + if X_tr.dtype != object: + X_tr = X_tr.astype(object) + + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + return X_tr + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
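+ When infrequent categories are grouped together, they appear under the
+ single name ``'infrequent_sklearn'``.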
+ """ + check_is_fitted(self) + input_features = _check_feature_names_in(self, input_features) + cats = [ + self._compute_transformed_categories(i) + for i, _ in enumerate(self.categories_) + ] + + name_combiner = self._check_get_feature_name_combiner() + feature_names = [] + for i in range(len(cats)): + names = [name_combiner(input_features[i], t) for t in cats[i]] + feature_names.extend(names) + + return np.array(feature_names, dtype=object) + + def _check_get_feature_name_combiner(self): + if self.feature_name_combiner == "concat": + return lambda feature, category: feature + "_" + str(category) + else: # callable + dry_run_combiner = self.feature_name_combiner("feature", "category") + if not isinstance(dry_run_combiner, str): + raise TypeError( + "When `feature_name_combiner` is a callable, it should return a " + f"Python string. Got {type(dry_run_combiner)} instead." + ) + return self.feature_name_combiner + + +class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder): + """ + Encode categorical features as an integer array. + + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are converted to ordinal integers. This results in + a single column of integers (0 to n_categories - 1) per feature. + + Read more in the :ref:`User Guide `. + For a comparison of different encoders, refer to: + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. + + .. versionadded:: 0.20 + + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values, and should be sorted in case of numeric values. + + The used categories can be found in the ``categories_`` attribute. + + dtype : number type, default=np.float64 + Desired dtype of output. + + handle_unknown : {'error', 'use_encoded_value'}, default='error' + When set to 'error' an error will be raised in case an unknown + categorical feature is present during transform. When set to + 'use_encoded_value', the encoded value of unknown categories will be + set to the value given for the parameter `unknown_value`. In + :meth:`inverse_transform`, an unknown category will be denoted as None. + + .. versionadded:: 0.24 + + unknown_value : int or np.nan, default=None + When the parameter handle_unknown is set to 'use_encoded_value', this + parameter is required and will set the encoded value of unknown + categories. It has to be distinct from the values used to encode any of + the categories in `fit`. If set to np.nan, the `dtype` parameter must + be a float dtype. + + .. versionadded:: 0.24 + + encoded_missing_value : int or np.nan, default=np.nan + Encoded value of missing categories. If set to `np.nan`, then the `dtype` + parameter must be a float dtype. + + .. versionadded:: 1.1 + + min_frequency : int or float, default=None + Specifies the minimum frequency below which a category will be + considered infrequent. + + - If `int`, categories with a smaller cardinality will be considered + infrequent. + + - If `float`, categories with a smaller cardinality than + `min_frequency * n_samples` will be considered infrequent. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. 
+ + max_categories : int, default=None + Specifies an upper limit to the number of output categories for each input + feature when considering infrequent categories. If there are infrequent + categories, `max_categories` includes the category representing the + infrequent categories along with the frequent categories. If `None`, + there is no limit to the number of output features. + + `max_categories` do **not** take into account missing or unknown + categories. Setting `unknown_value` or `encoded_missing_value` to an + integer will increase the number of unique integer codes by one each. + This can result in up to `max_categories + 2` integer codes. + + .. versionadded:: 1.3 + Read more in the :ref:`User Guide `. + + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during ``fit`` (in order of + the features in X and corresponding with the output of ``transform``). + This does not include categories that weren't seen during ``fit``. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 1.0 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + infrequent_categories_ : list of ndarray + Defined only if infrequent categories are enabled by setting + `min_frequency` or `max_categories` to a non-default value. + `infrequent_categories_[i]` are the infrequent categories for feature + `i`. If the feature `i` has no infrequent categories + `infrequent_categories_[i]` is None. + + .. versionadded:: 1.3 + + See Also + -------- + OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding + is suitable for low to medium cardinality categorical variables, both in + supervised and unsupervised settings. + TargetEncoder : Encodes categorical features using supervised signal + in a classification or regression pipeline. This encoding is typically + suitable for high cardinality categorical variables. + LabelEncoder : Encodes target labels with values between 0 and + ``n_classes-1``. + + Notes + ----- + With a high proportion of `nan` values, inferring categories becomes slow with + Python versions before 3.10. The handling of `nan` values was improved + from Python 3.10 onwards, (c.f. + `bpo-43475 `_). + + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to an ordinal encoding. + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder() + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OrdinalEncoder() + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 3], ['Male', 1]]) + array([[0., 2.], + [1., 0.]]) + + >>> enc.inverse_transform([[1, 0], [0, 1]]) + array([['Male', 1], + ['Female', 2]], dtype=object) + + By default, :class:`OrdinalEncoder` is lenient towards missing values by + propagating them. + + >>> import numpy as np + >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]] + >>> enc.fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., nan]]) + + You can use the parameter `encoded_missing_value` to encode missing values. + + >>> enc.set_params(encoded_missing_value=-1).fit_transform(X) + array([[ 1., 0.], + [ 0., 1.], + [ 0., -1.]]) + + Infrequent categories are enabled by setting `max_categories` or `min_frequency`. 
+ In the following example, "a" and "d" are considered infrequent and grouped + together into a single category, "b" and "c" are their own categories, unknown + values are encoded as 3 and missing values are encoded as 4. + + >>> X_train = np.array( + ... [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], + ... dtype=object).T + >>> enc = OrdinalEncoder( + ... handle_unknown="use_encoded_value", unknown_value=3, + ... max_categories=3, encoded_missing_value=4) + >>> _ = enc.fit(X_train) + >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + >>> enc.transform(X_test) + array([[2.], + [0.], + [1.], + [2.], + [3.], + [4.]]) + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "dtype": "no_validation", # validation delegated to numpy + "encoded_missing_value": [Integral, type(np.nan)], + "handle_unknown": [StrOptions({"error", "use_encoded_value"})], + "unknown_value": [Integral, type(np.nan), None], + "max_categories": [Interval(Integral, 1, None, closed="left"), None], + "min_frequency": [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0, 1, closed="neither"), + None, + ], + } + + def __init__( + self, + *, + categories="auto", + dtype=np.float64, + handle_unknown="error", + unknown_value=None, + encoded_missing_value=np.nan, + min_frequency=None, + max_categories=None, + ): + self.categories = categories + self.dtype = dtype + self.handle_unknown = handle_unknown + self.unknown_value = unknown_value + self.encoded_missing_value = encoded_missing_value + self.min_frequency = min_frequency + self.max_categories = max_categories + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Fit the OrdinalEncoder to X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + + Returns + ------- + self : object + Fitted encoder. + """ + if self.handle_unknown == "use_encoded_value": + if is_scalar_nan(self.unknown_value): + if np.dtype(self.dtype).kind != "f": + raise ValueError( + "When unknown_value is np.nan, the dtype " + "parameter should be " + f"a float dtype. Got {self.dtype}." + ) + elif not isinstance(self.unknown_value, numbers.Integral): + raise TypeError( + "unknown_value should be an integer or " + "np.nan when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." + ) + elif self.unknown_value is not None: + raise TypeError( + "unknown_value should only be set when " + "handle_unknown is 'use_encoded_value', " + f"got {self.unknown_value}." 
+ ) + + # `_fit` will only raise an error when `self.handle_unknown="error"` + fit_results = self._fit( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + return_and_ignore_missing_for_infrequent=True, + ) + self._missing_indices = fit_results["missing_indices"] + + cardinalities = [len(categories) for categories in self.categories_] + if self._infrequent_enabled: + # Cardinality decreases because the infrequent categories are grouped + # together + for feature_idx, infrequent in enumerate(self.infrequent_categories_): + if infrequent is not None: + cardinalities[feature_idx] -= len(infrequent) + + # missing values are not considered part of the cardinality + # when considering unknown categories or encoded_missing_value + for cat_idx, categories_for_idx in enumerate(self.categories_): + if is_scalar_nan(categories_for_idx[-1]): + cardinalities[cat_idx] -= 1 + + if self.handle_unknown == "use_encoded_value": + for cardinality in cardinalities: + if 0 <= self.unknown_value < cardinality: + raise ValueError( + "The used value for unknown_value " + f"{self.unknown_value} is one of the " + "values already used for encoding the " + "seen categories." + ) + + if self._missing_indices: + if np.dtype(self.dtype).kind != "f" and is_scalar_nan( + self.encoded_missing_value + ): + raise ValueError( + "There are missing values in features " + f"{list(self._missing_indices)}. For OrdinalEncoder to " + f"encode missing values with dtype: {self.dtype}, set " + "encoded_missing_value to a non-nan value, or " + "set dtype to a float" + ) + + if not is_scalar_nan(self.encoded_missing_value): + # Features are invalid when they contain a missing category + # and encoded_missing_value was already used to encode a + # known category + invalid_features = [ + cat_idx + for cat_idx, cardinality in enumerate(cardinalities) + if cat_idx in self._missing_indices + and 0 <= self.encoded_missing_value < cardinality + ] + + if invalid_features: + # Use feature names if they are available + if hasattr(self, "feature_names_in_"): + invalid_features = self.feature_names_in_[invalid_features] + raise ValueError( + f"encoded_missing_value ({self.encoded_missing_value}) " + "is already used to encode a known category in features: " + f"{invalid_features}" + ) + + return self + + def transform(self, X): + """ + Transform X to ordinal codes. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + + Returns + ------- + X_out : ndarray of shape (n_samples, n_features) + Transformed input. + """ + check_is_fitted(self, "categories_") + X_int, X_mask = self._transform( + X, + handle_unknown=self.handle_unknown, + ensure_all_finite="allow-nan", + ignore_category_indices=self._missing_indices, + ) + X_trans = X_int.astype(self.dtype, copy=False) + + for cat_idx, missing_idx in self._missing_indices.items(): + X_missing_mask = X_int[:, cat_idx] == missing_idx + X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value + + # create separate category for unknown values + if self.handle_unknown == "use_encoded_value": + X_trans[~X_mask] = self.unknown_value + return X_trans + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + + Parameters + ---------- + X : array-like of shape (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_original : ndarray of shape (n_samples, n_features) + Inverse transformed array. 
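+
+ Notes
+ -----
+ Encodings produced for unknown categories (when
+ ``handle_unknown="use_encoded_value"``) are inverted to ``None``, and
+ encodings of infrequent categories are inverted to the string
+ ``"infrequent_sklearn"``. A small illustrative example:
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import OrdinalEncoder
+ >>> enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+ >>> _ = enc.fit([["a"], ["b"]])
+ >>> enc.inverse_transform(np.array([[0.0], [-1.0]]))
+ array([['a'],
+ [None]], dtype=object)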
+ """ + check_is_fitted(self) + X = check_array(X, ensure_all_finite="allow-nan") + + n_samples, _ = X.shape + n_features = len(self.categories_) + + # validate shape of passed X + msg = ( + "Shape of the passed X data is not correct. Expected {0} columns, got {1}." + ) + if X.shape[1] != n_features: + raise ValueError(msg.format(n_features, X.shape[1])) + + # create resulting array of appropriate dtype + dt = np.result_type(*[cat.dtype for cat in self.categories_]) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + found_unknown = {} + infrequent_masks = {} + + infrequent_indices = getattr(self, "_infrequent_indices", None) + + for i in range(n_features): + labels = X[:, i] + + # replace values of X[:, i] that were nan with actual indices + if i in self._missing_indices: + X_i_mask = _get_mask(labels, self.encoded_missing_value) + labels[X_i_mask] = self._missing_indices[i] + + rows_to_update = slice(None) + categories = self.categories_[i] + + if infrequent_indices is not None and infrequent_indices[i] is not None: + # Compute mask for frequent categories + infrequent_encoding_value = len(categories) - len(infrequent_indices[i]) + infrequent_masks[i] = labels == infrequent_encoding_value + rows_to_update = ~infrequent_masks[i] + + # Remap categories to be only frequent categories. The infrequent + # categories will be mapped to "infrequent_sklearn" later + frequent_categories_mask = np.ones_like(categories, dtype=bool) + frequent_categories_mask[infrequent_indices[i]] = False + categories = categories[frequent_categories_mask] + + if self.handle_unknown == "use_encoded_value": + unknown_labels = _get_mask(labels, self.unknown_value) + found_unknown[i] = unknown_labels + + known_labels = ~unknown_labels + if isinstance(rows_to_update, np.ndarray): + rows_to_update &= known_labels + else: + rows_to_update = known_labels + + labels_int = labels[rows_to_update].astype("int64", copy=False) + X_tr[rows_to_update, i] = categories[labels_int] + + if found_unknown or infrequent_masks: + X_tr = X_tr.astype(object, copy=False) + + # insert None values for unknown values + if found_unknown: + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + if infrequent_masks: + for idx, mask in infrequent_masks.items(): + X_tr[mask, idx] = "infrequent_sklearn" + + return X_tr diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f3530f3284dc941f582acd254f563fb29b3215c1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py @@ -0,0 +1,449 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from functools import partial + +import numpy as np + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils._param_validation import StrOptions +from ..utils._repr_html.estimator import _VisualBlock +from ..utils._set_output import ( + _get_adapter_from_container, + _get_output_config, +) +from ..utils.metaestimators import available_if +from ..utils.validation import ( + _allclose_dense_sparse, + _check_feature_names, + _check_feature_names_in, + _check_n_features, + _get_feature_names, + _is_pandas_df, + _is_polars_df, + check_array, + validate_data, +) + + +def _identity(X): + """The identity function.""" + return X + + +class FunctionTransformer(TransformerMixin, BaseEstimator): + 
"""Constructs a transformer from an arbitrary callable. + + A FunctionTransformer forwards its X (and optionally y) arguments to a + user-defined function or function object and returns the result of this + function. This is useful for stateless transformations such as taking the + log of frequencies, doing custom scaling, etc. + + Note: If a lambda is used as the function, then the resulting + transformer will not be pickleable. + + .. versionadded:: 0.17 + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + func : callable, default=None + The callable to use for the transformation. This will be passed + the same arguments as transform, with args and kwargs forwarded. + If func is None, then func will be the identity function. + + inverse_func : callable, default=None + The callable to use for the inverse transformation. This will be + passed the same arguments as inverse transform, with args and + kwargs forwarded. If inverse_func is None, then inverse_func + will be the identity function. + + validate : bool, default=False + Indicate that the input X array should be checked before calling + ``func``. The possibilities are: + + - If False, there is no input validation. + - If True, then X will be converted to a 2-dimensional NumPy array or + sparse matrix. If the conversion is not possible an exception is + raised. + + .. versionchanged:: 0.22 + The default of ``validate`` changed from True to False. + + accept_sparse : bool, default=False + Indicate that func accepts a sparse matrix as input. If validate is + False, this has no effect. Otherwise, if accept_sparse is false, + sparse matrix inputs will cause an exception to be raised. + + check_inverse : bool, default=True + Whether to check that or ``func`` followed by ``inverse_func`` leads to + the original inputs. It can be used for a sanity check, raising a + warning when the condition is not fulfilled. + + .. versionadded:: 0.20 + + feature_names_out : callable, 'one-to-one' or None, default=None + Determines the list of feature names that will be returned by the + `get_feature_names_out` method. If it is 'one-to-one', then the output + feature names will be equal to the input feature names. If it is a + callable, then it must take two positional arguments: this + `FunctionTransformer` (`self`) and an array-like of input feature names + (`input_features`). It must return an array-like of output feature + names. The `get_feature_names_out` method is only defined if + `feature_names_out` is not None. + + See ``get_feature_names_out`` for more details. + + .. versionadded:: 1.1 + + kw_args : dict, default=None + Dictionary of additional keyword arguments to pass to func. + + .. versionadded:: 0.18 + + inv_kw_args : dict, default=None + Dictionary of additional keyword arguments to pass to inverse_func. + + .. versionadded:: 0.18 + + Attributes + ---------- + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 1.0 + + See Also + -------- + MaxAbsScaler : Scale each feature by its maximum absolute value. + StandardScaler : Standardize features by removing the mean and + scaling to unit variance. + LabelBinarizer : Binarize labels in a one-vs-all fashion. + MultiLabelBinarizer : Transform between iterable of iterables + and a multilabel format. 
+
+ Notes
+ -----
+ If `func` returns an output with a `columns` attribute, then the columns are enforced
+ to be consistent with the output of `get_feature_names_out`.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> transformer = FunctionTransformer(np.log1p)
+ >>> X = np.array([[0, 1], [2, 3]])
+ >>> transformer.transform(X)
+ array([[0. , 0.6931],
+ [1.0986, 1.3862]])
+ """
+
+ _parameter_constraints: dict = {
+ "func": [callable, None],
+ "inverse_func": [callable, None],
+ "validate": ["boolean"],
+ "accept_sparse": ["boolean"],
+ "check_inverse": ["boolean"],
+ "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
+ "kw_args": [dict, None],
+ "inv_kw_args": [dict, None],
+ }
+
+ def __init__(
+ self,
+ func=None,
+ inverse_func=None,
+ *,
+ validate=False,
+ accept_sparse=False,
+ check_inverse=True,
+ feature_names_out=None,
+ kw_args=None,
+ inv_kw_args=None,
+ ):
+ self.func = func
+ self.inverse_func = inverse_func
+ self.validate = validate
+ self.accept_sparse = accept_sparse
+ self.check_inverse = check_inverse
+ self.feature_names_out = feature_names_out
+ self.kw_args = kw_args
+ self.inv_kw_args = inv_kw_args
+
+ def _check_input(self, X, *, reset):
+ if self.validate:
+ return validate_data(self, X, accept_sparse=self.accept_sparse, reset=reset)
+ elif reset:
+ # Set feature_names_in_ and n_features_in_ even if validate=False
+ # We run this only when reset==True to store the attributes but not
+ # validate them, because validate=False
+ _check_n_features(self, X, reset=reset)
+ _check_feature_names(self, X, reset=reset)
+ return X
+
+ def _check_inverse_transform(self, X):
+ """Check that func and inverse_func are inverses of each other."""
+ idx_selected = slice(None, None, max(1, X.shape[0] // 100))
+ X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
+
+ if hasattr(X, "dtype"):
+ dtypes = [X.dtype]
+ elif hasattr(X, "dtypes"):
+ # Dataframes can have multiple dtypes
+ dtypes = X.dtypes
+
+ # Not all dtypes are numpy dtypes, they can be pandas dtypes as well
+ if not all(
+ isinstance(d, np.dtype) and np.issubdtype(d, np.number) for d in dtypes
+ ):
+ raise ValueError(
+ "'check_inverse' is only supported when all the elements in `X` are"
+ " numerical."
+ )
+
+ if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
+ warnings.warn(
+ (
+ "The provided functions are not strictly"
+ " inverse of each other. If you are sure you"
+ " want to proceed regardless, set"
+ " 'check_inverse=False'."
+ ),
+ UserWarning,
+ )
+
+ @_fit_context(prefer_skip_nested_validation=True)
+ def fit(self, X, y=None):
+ """Fit transformer by checking X.
+
+ If ``validate`` is ``True``, ``X`` will be checked.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+
+ y : Ignored
+ Not used, present here for API consistency by convention.
+
+ Returns
+ -------
+ self : object
+ FunctionTransformer class instance.
+ """
+ X = self._check_input(X, reset=True)
+ if self.check_inverse and not (self.func is None or self.inverse_func is None):
+ self._check_inverse_transform(X)
+ return self
+
+ def transform(self, X):
+ """Transform X using the forward function.
+
+ Parameters
+ ----------
+ X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
+ if `validate=True` else any object that `func` can handle
+ Input array.
+ + Returns + ------- + X_out : array-like, shape (n_samples, n_features) + Transformed input. + """ + X = self._check_input(X, reset=False) + out = self._transform(X, func=self.func, kw_args=self.kw_args) + output_config = _get_output_config("transform", self)["dense"] + + if hasattr(out, "columns") and self.feature_names_out is not None: + # check the consistency between the column provided by `transform` and + # the column names provided by `get_feature_names_out`. + feature_names_out = self.get_feature_names_out() + if list(out.columns) != list(feature_names_out): + # we can override the column names of the output if it is inconsistent + # with the column names provided by `get_feature_names_out` in the + # following cases: + # * `func` preserved the column names between the input and the output + # * the input column names are all numbers + # * the output is requested to be a DataFrame (pandas or polars) + feature_names_in = getattr( + X, "feature_names_in_", _get_feature_names(X) + ) + same_feature_names_in_out = feature_names_in is not None and list( + feature_names_in + ) == list(out.columns) + not_all_str_columns = not all( + isinstance(col, str) for col in out.columns + ) + if same_feature_names_in_out or not_all_str_columns: + adapter = _get_adapter_from_container(out) + out = adapter.create_container( + X_output=out, + X_original=out, + columns=feature_names_out, + inplace=False, + ) + else: + raise ValueError( + "The output generated by `func` have different column names " + "than the ones provided by `get_feature_names_out`. " + f"Got output with columns names: {list(out.columns)} and " + "`get_feature_names_out` returned: " + f"{list(self.get_feature_names_out())}. " + "The column names can be overridden by setting " + "`set_output(transform='pandas')` or " + "`set_output(transform='polars')` such that the column names " + "are set to the names provided by `get_feature_names_out`." + ) + + if self.feature_names_out is None: + warn_msg = ( + "When `set_output` is configured to be '{0}', `func` should return " + "a {0} DataFrame to follow the `set_output` API or `feature_names_out`" + " should be defined." + ) + if output_config == "pandas" and not _is_pandas_df(out): + warnings.warn(warn_msg.format("pandas")) + elif output_config == "polars" and not _is_polars_df(out): + warnings.warn(warn_msg.format("polars")) + + return out + + def inverse_transform(self, X): + """Transform X using the inverse function. + + Parameters + ---------- + X : {array-like, sparse-matrix} of shape (n_samples, n_features) \ + if `validate=True` else any object that `inverse_func` can handle + Input array. + + Returns + ------- + X_original : array-like, shape (n_samples, n_features) + Transformed input. + """ + if self.validate: + X = check_array(X, accept_sparse=self.accept_sparse) + return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) + + @available_if(lambda self: self.feature_names_out is not None) + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + This method is only defined if `feature_names_out` is not None. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input feature names. + + - If `input_features` is None, then `feature_names_in_` is + used as the input feature names. If `feature_names_in_` is not + defined, then names are generated: + `[x0, x1, ..., x(n_features_in_ - 1)]`. 
+ - If `input_features` is array-like, then `input_features` must
+ match `feature_names_in_` if `feature_names_in_` is defined.
+
+ Returns
+ -------
+ feature_names_out : ndarray of str objects
+ Transformed feature names.
+
+ - If `feature_names_out` is 'one-to-one', the input feature names
+ are returned (see `input_features` above). This requires
+ `feature_names_in_` and/or `n_features_in_` to be defined, which
+ is done automatically if `validate=True`. Alternatively, you can
+ set them in `func`.
+ - If `feature_names_out` is a callable, then it is called with two
+ arguments, `self` and `input_features`, and its return value is
+ returned by this method.
+ """
+ if hasattr(self, "n_features_in_") or input_features is not None:
+ input_features = _check_feature_names_in(self, input_features)
+ if self.feature_names_out == "one-to-one":
+ names_out = input_features
+ elif callable(self.feature_names_out):
+ names_out = self.feature_names_out(self, input_features)
+ else:
+ raise ValueError(
+ f"feature_names_out={self.feature_names_out!r} is invalid. "
+ 'It must either be "one-to-one" or a callable with two '
+ "arguments: the function transformer and an array-like of "
+ "input feature names. The callable must return an array-like "
+ "of output feature names."
+ )
+ return np.asarray(names_out, dtype=object)
+
+ def _transform(self, X, func=None, kw_args=None):
+ if func is None:
+ func = _identity
+
+ return func(X, **(kw_args if kw_args else {}))
+
+ def __sklearn_is_fitted__(self):
+ """Return True since FunctionTransformer is stateless."""
+ return True
+
+ def __sklearn_tags__(self):
+ tags = super().__sklearn_tags__()
+ tags.no_validation = not self.validate
+ tags.requires_fit = False
+ tags.input_tags.sparse = not self.validate or self.accept_sparse
+ return tags
+
+ def set_output(self, *, transform=None):
+ """Set output container.
+
+ See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
+ for an example on how to use the API.
+
+ Parameters
+ ----------
+ transform : {"default", "pandas", "polars"}, default=None
+ Configure output of `transform` and `fit_transform`.
+
+ - `"default"`: Default output format of a transformer
+ - `"pandas"`: DataFrame output
+ - `"polars"`: Polars output
+ - `None`: Transform configuration is unchanged
+
+ .. versionadded:: 1.4
+ `"polars"` option was added.
+
+ Returns
+ -------
+ self : estimator instance
+ Estimator instance.
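+
+ For example, to request pandas output from :meth:`transform` (pandas must be
+ installed when `transform` is later called):
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import FunctionTransformer
+ >>> transformer = FunctionTransformer(np.log1p, feature_names_out="one-to-one")
+ >>> _ = transformer.set_output(transform="pandas")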
+ """ + if not hasattr(self, "_sklearn_output_config"): + self._sklearn_output_config = {} + + self._sklearn_output_config["transform"] = transform + return self + + def _get_function_name(self): + """Get the name display of the `func` used in HTML representation.""" + if hasattr(self.func, "__name__"): + return self.func.__name__ + if isinstance(self.func, partial): + return self.func.func.__name__ + return f"{self.func.__class__.__name__}(...)" + + def _sk_visual_block_(self): + return _VisualBlock( + "single", + self, + names=self._get_function_name(), + name_details=str(self), + name_caption="FunctionTransformer", + doc_link_label="FunctionTransformer", + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_label.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_label.py new file mode 100644 index 0000000000000000000000000000000000000000..dd721b35a35217bc6cb8badfb8ff66e2bdc15c8e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_label.py @@ -0,0 +1,963 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import array +import itertools +import warnings +from collections import defaultdict +from numbers import Integral + +import numpy as np +import scipy.sparse as sp + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import column_or_1d +from ..utils._array_api import device, get_namespace, xpx +from ..utils._encode import _encode, _unique +from ..utils._param_validation import Interval, validate_params +from ..utils.multiclass import type_of_target, unique_labels +from ..utils.sparsefuncs import min_max_axis +from ..utils.validation import _num_samples, check_array, check_is_fitted + +__all__ = [ + "LabelBinarizer", + "LabelEncoder", + "MultiLabelBinarizer", + "label_binarize", +] + + +class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Encode target labels with value between 0 and n_classes-1. + + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.12 + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + + Examples + -------- + `LabelEncoder` can be used to normalize labels. + + >>> from sklearn.preprocessing import LabelEncoder + >>> le = LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) + >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + + >>> le = LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + [np.str_('amsterdam'), np.str_('paris'), np.str_('tokyo')] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + [np.str_('tokyo'), np.str_('tokyo'), np.str_('paris')] + """ + + def fit(self, y): + """Fit label encoder. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted label encoder. 
+ """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Encoded labels. + """ + y = column_or_1d(y, warn=True) + self.classes_, y = _unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y : array-like of shape (n_samples,) + Labels as normalized encodings. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, dtype=self.classes_.dtype, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + return _encode(y, uniques=self.classes_) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + y_original : ndarray of shape (n_samples,) + Original encoding. + """ + check_is_fitted(self) + xp, _ = get_namespace(y) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return xp.asarray([]) + + diff = xpx.setdiff1d( + y, + xp.arange(self.classes_.shape[0], device=device(y)), + xp=xp, + ) + if diff.shape[0]: + raise ValueError("y contains previously unseen labels: %s" % str(diff)) + y = xp.asarray(y) + return xp.take(self.classes_, y, axis=0) + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.array_api_support = True + tags.input_tags.two_d_array = False + tags.target_tags.one_d_labels = True + return tags + + +class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + At learning time, this simply consists in learning one regressor + or binary classifier per class. In doing so, one needs to convert + multi-class labels to binary labels (belong or does not belong + to the class). `LabelBinarizer` makes this process easy with the + transform method. + + At prediction time, one assigns the class for which the corresponding + model gave the greatest confidence. `LabelBinarizer` makes this easy + with the :meth:`inverse_transform` method. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False + True if the returned array from transform is desired to be in sparse + CSR format. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + + y_type_ : str + Represents the type of the target data as evaluated by + :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are + 'continuous', 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'. + + sparse_input_ : bool + `True` if the input data to transform is given as a sparse matrix, + `False` otherwise. 
+ + See Also + -------- + label_binarize : Function to perform the transform operation of + LabelBinarizer with fixed classes. + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + + Examples + -------- + >>> from sklearn.preprocessing import LabelBinarizer + >>> lb = LabelBinarizer() + >>> lb.fit([1, 2, 6, 4, 2]) + LabelBinarizer() + >>> lb.classes_ + array([1, 2, 4, 6]) + >>> lb.transform([1, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + Binary targets transform to a column vector + + >>> lb = LabelBinarizer() + >>> lb.fit_transform(['yes', 'no', 'no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + Passing a 2D matrix for multilabel classification + + >>> import numpy as np + >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]])) + LabelBinarizer() + >>> lb.classes_ + array([0, 1, 2]) + >>> lb.transform([0, 1, 2, 1]) + array([[1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, 1, 0]]) + """ + + _parameter_constraints: dict = { + "neg_label": [Integral], + "pos_label": [Integral], + "sparse_output": ["boolean"], + } + + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): + self.neg_label = neg_label + self.pos_label = pos_label + self.sparse_output = sparse_output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, y): + """Fit label binarizer. + + Parameters + ---------- + y : ndarray of shape (n_samples,) or (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. + + Returns + ------- + self : object + Returns the instance itself. + """ + if self.neg_label >= self.pos_label: + raise ValueError( + f"neg_label={self.neg_label} must be strictly less than " + f"pos_label={self.pos_label}." + ) + + if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + f"pos_label={self.pos_label} and neg_label={self.neg_label}" + ) + + self.y_type_ = type_of_target(y, input_name="y") + + if "multioutput" in self.y_type_: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + + self.sparse_input_ = sp.issparse(y) + self.classes_ = unique_labels(y) + return self + + def fit_transform(self, y): + """Fit label binarizer/transform multi-class labels to binary labels. + + The output of transform is sometimes referred to as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {ndarray, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + return self.fit(y).transform(y) + + def transform(self, y): + """Transform multi-class labels to binary labels. + + The output of transform is sometimes referred to by some authors as + the 1-of-K coding scheme. + + Parameters + ---------- + y : {array, sparse matrix} of shape (n_samples,) or \ + (n_samples, n_classes) + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. 
+ + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix + will be of CSR format. + """ + check_is_fitted(self) + + y_is_multilabel = type_of_target(y).startswith("multilabel") + if y_is_multilabel and not self.y_type_.startswith("multilabel"): + raise ValueError("The object was not fitted with multilabel input.") + + return label_binarize( + y, + classes=self.classes_, + pos_label=self.pos_label, + neg_label=self.neg_label, + sparse_output=self.sparse_output, + ) + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels. + + Parameters + ---------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Target values. All sparse matrices are converted to CSR before + inverse transformation. + + threshold : float, default=None + Threshold used in the binary and multi-label cases. + + Use 0 when ``Y`` contains the output of :term:`decision_function` + (classifier). + Use 0.5 when ``Y`` contains the output of :term:`predict_proba`. + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y_original : {ndarray, sparse matrix} of shape (n_samples,) + Target values. Sparse matrix will be of CSR format. + + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), :meth:`inverse_transform` chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's :term:`decision_function` method directly as the input + of :meth:`inverse_transform`. + """ + check_is_fitted(self) + + if threshold is None: + threshold = (self.pos_label + self.neg_label) / 2.0 + + if self.y_type_ == "multiclass": + y_inv = _inverse_binarize_multiclass(Y, self.classes_) + else: + y_inv = _inverse_binarize_thresholding( + Y, self.y_type_, self.classes_, threshold + ) + + if self.sparse_input_: + y_inv = sp.csr_matrix(y_inv) + elif sp.issparse(y_inv): + y_inv = y_inv.toarray() + + return y_inv + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.target_tags.one_d_labels = True + return tags + + +@validate_params( + { + "y": ["array-like", "sparse matrix"], + "classes": ["array-like"], + "neg_label": [Interval(Integral, None, None, closed="neither")], + "pos_label": [Interval(Integral, None, None, closed="neither")], + "sparse_output": ["boolean"], + }, + prefer_skip_nested_validation=True, +) +def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): + """Binarize labels in a one-vs-all fashion. + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + This function makes it possible to compute this transformation for a + fixed set of class labels known ahead of time. + + Parameters + ---------- + y : array-like or sparse matrix + Sequence of integer labels or multilabel data to encode. + + classes : array-like of shape (n_classes,) + Uniquely holds the label for each class. + + neg_label : int, default=0 + Value with which negative labels must be encoded. + + pos_label : int, default=1 + Value with which positive labels must be encoded. + + sparse_output : bool, default=False, + Set to true if output binary array is desired in CSR sparse format. 
+ + Returns + ------- + Y : {ndarray, sparse matrix} of shape (n_samples, n_classes) + Shape will be (n_samples, 1) for binary problems. Sparse matrix will + be of CSR format. + + See Also + -------- + LabelBinarizer : Class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation. + + Examples + -------- + >>> from sklearn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + """ + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + y = check_array( + y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None + ) + else: + if _num_samples(y) == 0: + raise ValueError("y has 0 samples: %r" % y) + if neg_label >= pos_label: + raise ValueError( + "neg_label={0} must be strictly less than pos_label={1}.".format( + neg_label, pos_label + ) + ) + + if sparse_output and (pos_label == 0 or neg_label != 0): + raise ValueError( + "Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label) + ) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = type_of_target(y) + if "multioutput" in y_type: + raise ValueError( + "Multioutput target data is not supported with label binarization" + ) + if y_type == "unknown": + raise ValueError("The type of target data is not known") + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + classes = np.asarray(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return sp.csr_matrix((n_samples, 1), dtype=int) + else: + Y = np.zeros((len(y), 1), dtype=int) + Y += neg_label + return Y + elif len(classes) >= 3: + y_type = "multiclass" + + sorted_class = np.sort(classes) + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0]) + if classes.size != y_n_classes: + raise ValueError( + "classes {0} mismatch with the labels {1} found in the data".format( + classes, unique_labels(y) + ) + ) + + if y_type in ("binary", "multiclass"): + y = column_or_1d(y) + + # pick out the known labels from y + y_in_classes = np.isin(y, classes) + y_seen = y[y_in_classes] + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: + raise ValueError( + "%s target data is not supported with label binarization" % y_type + ) + + if not sparse_output: + Y = Y.toarray() + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == 
"binary": + if sparse_output: + Y = Y[:, [-1]] + else: + Y = Y[:, -1].reshape((-1, 1)) + + return Y + + +def _inverse_binarize_multiclass(y, classes): + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. + """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i] : y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding(y, output_type, classes, threshold): + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError( + "The number of class is not equal to the number of dimension of y." + ) + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ("csr", "csc"): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=int) + else: + y = np.array(y > threshold, dtype=int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) + + +class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None): + """Transform between iterable of iterables and a multilabel format. + + Although a list of sets or tuples is a very intuitive format for multilabel + data, it is unwieldy to process. This transformer converts between this + intuitive format and the supported multilabel format: a (samples x classes) + binary matrix indicating the presence of a class label. + + Parameters + ---------- + classes : array-like of shape (n_classes,), default=None + Indicates an ordering for the class labels. + All entries should be unique (cannot contain duplicate classes). + + sparse_output : bool, default=False + Set to True if output binary array is desired in CSR sparse format. 
+ + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + A copy of the `classes` parameter when provided. + Otherwise it corresponds to the sorted set of classes found + when fitting. + + See Also + -------- + OneHotEncoder : Encode categorical features using a one-hot aka one-of-K + scheme. + + Examples + -------- + >>> from sklearn.preprocessing import MultiLabelBinarizer + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit_transform([(1, 2), (3,)]) + array([[1, 1, 0], + [0, 0, 1]]) + >>> mlb.classes_ + array([1, 2, 3]) + + >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}]) + array([[0, 1, 1], + [1, 0, 0]]) + >>> list(mlb.classes_) + ['comedy', 'sci-fi', 'thriller'] + + A common mistake is to pass in a list, which leads to the following issue: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit(['sci-fi', 'thriller', 'comedy']) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't', + 'y'], dtype=object) + + To correct this, the list of labels should be passed in as: + + >>> mlb = MultiLabelBinarizer() + >>> mlb.fit([['sci-fi', 'thriller', 'comedy']]) + MultiLabelBinarizer() + >>> mlb.classes_ + array(['comedy', 'sci-fi', 'thriller'], dtype=object) + """ + + _parameter_constraints: dict = { + "classes": ["array-like", None], + "sparse_output": ["boolean"], + } + + def __init__(self, *, classes=None, sparse_output=False): + self.classes = classes + self.sparse_output = sparse_output + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, y): + """Fit the label sets binarizer, storing :term:`classes_`. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + self : object + Fitted estimator. + """ + self._cached_dict = None + + if self.classes is None: + classes = sorted(set(itertools.chain.from_iterable(y))) + elif len(set(self.classes)) < len(self.classes): + raise ValueError( + "The classes argument contains duplicate " + "classes. Remove these duplicates before passing " + "them to MultiLabelBinarizer." + ) + else: + classes = self.classes + dtype = int if all(isinstance(c, int) for c in classes) else object + self.classes_ = np.empty(len(classes), dtype=dtype) + self.classes_[:] = classes + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, y): + """Fit the label sets binarizer and transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` + is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR + format. 
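+
+ A minimal example, letting the classes be inferred from `y` itself:
+
+ >>> from sklearn.preprocessing import MultiLabelBinarizer
+ >>> MultiLabelBinarizer().fit_transform([{"a", "b"}, {"a"}])
+ array([[1, 1],
+ [1, 0]])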
+ """ + if self.classes is not None: + return self.fit(y).transform(y) + + self._cached_dict = None + + # Automatically increment on new class + class_mapping = defaultdict(int) + class_mapping.default_factory = class_mapping.__len__ + yt = self._transform(y, class_mapping) + + # sort classes and reorder columns + tmp = sorted(class_mapping, key=class_mapping.get) + + # (make safe for tuples) + dtype = int if all(isinstance(c, int) for c in tmp) else object + class_mapping = np.empty(len(tmp), dtype=dtype) + class_mapping[:] = tmp + self.classes_, inverse = np.unique(class_mapping, return_inverse=True) + # ensure yt.indices keeps its current dtype + yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def transform(self, y): + """Transform the given label sets. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + Returns + ------- + y_indicator : array or CSR matrix, shape (n_samples, n_classes) + A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in + `y[i]`, and 0 otherwise. + """ + check_is_fitted(self) + + class_to_index = self._build_cache() + yt = self._transform(y, class_to_index) + + if not self.sparse_output: + yt = yt.toarray() + + return yt + + def _build_cache(self): + if self._cached_dict is None: + self._cached_dict = dict(zip(self.classes_, range(len(self.classes_)))) + + return self._cached_dict + + def _transform(self, y, class_mapping): + """Transforms the label sets with a given mapping. + + Parameters + ---------- + y : iterable of iterables + A set of labels (any orderable and hashable object) for each + sample. If the `classes` parameter is set, `y` will not be + iterated. + + class_mapping : Mapping + Maps from label to column index in label indicator matrix. + + Returns + ------- + y_indicator : sparse matrix of shape (n_samples, n_classes) + Label indicator matrix. Will be of CSR format. + """ + indices = array.array("i") + indptr = array.array("i", [0]) + unknown = set() + for labels in y: + index = set() + for label in labels: + try: + index.add(class_mapping[label]) + except KeyError: + unknown.add(label) + indices.extend(index) + indptr.append(len(indices)) + if unknown: + warnings.warn( + "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str)) + ) + data = np.ones(len(indices), dtype=int) + + return sp.csr_matrix( + (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)) + ) + + def inverse_transform(self, yt): + """Transform the given indicator matrix into label sets. + + Parameters + ---------- + yt : {ndarray, sparse matrix} of shape (n_samples, n_classes) + A matrix containing only 1s ands 0s. + + Returns + ------- + y_original : list of tuples + The set of labels for each sample such that `y[i]` consists of + `classes_[j]` for each `yt[i, j] == 1`. 
+ """ + check_is_fitted(self) + + if yt.shape[1] != len(self.classes_): + raise ValueError( + "Expected indicator for {0} classes, but got {1}".format( + len(self.classes_), yt.shape[1] + ) + ) + + if sp.issparse(yt): + yt = yt.tocsr() + if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0: + raise ValueError("Expected only 0s and 1s in label indicator.") + return [ + tuple(self.classes_.take(yt.indices[start:end])) + for start, end in zip(yt.indptr[:-1], yt.indptr[1:]) + ] + else: + unexpected = np.setdiff1d(yt, [0, 1]) + if len(unexpected) > 0: + raise ValueError( + "Expected only 0s and 1s in label indicator. Also got {0}".format( + unexpected + ) + ) + return [tuple(self.classes_.compress(indicators)) for indicators in yt] + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.two_d_array = False + tags.target_tags.two_d_labels = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_polynomial.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_polynomial.py new file mode 100644 index 0000000000000000000000000000000000000000..69bfe7b212bba6b3bfaaa021eed9d26b21b8fd68 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_polynomial.py @@ -0,0 +1,1153 @@ +""" +This file contains preprocessing tools based on polynomials. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import collections +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r +from numbers import Integral + +import numpy as np +from scipy import sparse +from scipy.interpolate import BSpline +from scipy.special import comb + +from ..base import BaseEstimator, TransformerMixin, _fit_context +from ..utils import check_array +from ..utils._param_validation import Interval, StrOptions +from ..utils.fixes import parse_version, sp_version +from ..utils.stats import _weighted_percentile +from ..utils.validation import ( + FLOAT_DTYPES, + _check_feature_names_in, + _check_sample_weight, + check_is_fitted, + validate_data, +) +from ._csr_polynomial_expansion import ( + _calc_expanded_nnz, + _calc_total_nnz, + _csr_polynomial_expansion, +) + +__all__ = [ + "PolynomialFeatures", + "SplineTransformer", +] + + +def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0): + """Helper function for creating and appending sparse expansion matrices""" + + total_nnz = _calc_total_nnz(X.indptr, interaction_only, deg) + expanded_col = _calc_expanded_nnz(n_features, interaction_only, deg) + + if expanded_col == 0: + return None + # This only checks whether each block needs 64bit integers upon + # expansion. We prefer to keep int32 indexing where we can, + # since currently SciPy's CSR construction downcasts when possible, + # so we prefer to avoid an unnecessary cast. The dtype may still + # change in the concatenation process if needed. + # See: https://github.com/scipy/scipy/issues/16569 + max_indices = expanded_col - 1 + max_indptr = total_nnz + max_int32 = np.iinfo(np.int32).max + needs_int64 = max(max_indices, max_indptr) > max_int32 + index_dtype = np.int64 if needs_int64 else np.int32 + + # Result of the expansion, modified in place by the + # `_csr_polynomial_expansion` routine. 
+ expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype) + expanded_indices = np.empty(shape=total_nnz, dtype=index_dtype) + expanded_indptr = np.empty(shape=X.indptr.shape[0], dtype=index_dtype) + _csr_polynomial_expansion( + X.data, + X.indices, + X.indptr, + X.shape[1], + expanded_data, + expanded_indices, + expanded_indptr, + interaction_only, + deg, + ) + return sparse.csr_matrix( + (expanded_data, expanded_indices, expanded_indptr), + shape=(X.indptr.shape[0] - 1, expanded_col), + dtype=X.dtype, + ) + + +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + degree : int or tuple (min_degree, max_degree), default=2 + If a single int is given, it specifies the maximal degree of the + polynomial features. If a tuple `(min_degree, max_degree)` is passed, + then `min_degree` is the minimum and `max_degree` is the maximum + polynomial degree of the generated features. Note that `min_degree=0` + and `min_degree=1` are equivalent as outputting the degree zero term is + determined by `include_bias`. + + interaction_only : bool, default=False + If `True`, only interaction features are produced: features that are + products of at most `degree` *distinct* input features, i.e. terms with + power of 2 or higher of the same input feature are excluded: + + - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc. + - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc. + + include_bias : bool, default=True + If `True` (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Attributes + ---------- + powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`) + `powers_[i, j]` is the exponent of the jth input in the ith output. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. + + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features. + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. 
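+
+ With `include_bias=True` and `interaction_only=False`, the number of output
+ columns for `n_features` inputs and `degree=d` is `comb(n_features + d, d)`;
+ for instance:
+
+ >>> from scipy.special import comb
+ >>> int(comb(10 + 3, 3)) # 10 features at degree 3 -> 286 output features
+ 286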
+ + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + ` + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + """ + + _parameter_constraints: dict = { + "degree": [Interval(Integral, 0, None, closed="left"), "array-like"], + "interaction_only": ["boolean"], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + } + + def __init__( + self, degree=2, *, interaction_only=False, include_bias=True, order="C" + ): + self.degree = degree + self.interaction_only = interaction_only + self.include_bias = include_bias + self.order = order + + @staticmethod + def _combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + comb = combinations if interaction_only else combinations_w_r + start = max(1, min_degree) + iter = chain.from_iterable( + comb(range(n_features), i) for i in range(start, max_degree + 1) + ) + if include_bias: + iter = chain(comb(range(n_features), 0), iter) + return iter + + @staticmethod + def _num_combinations( + n_features, min_degree, max_degree, interaction_only, include_bias + ): + """Calculate number of terms in polynomial expansion + + This should be equivalent to counting the number of terms returned by + _combinations(...) but much faster. + """ + + if interaction_only: + combinations = sum( + [ + comb(n_features, i, exact=True) + for i in range(max(1, min_degree), min(max_degree, n_features) + 1) + ] + ) + else: + combinations = comb(n_features + max_degree, max_degree, exact=True) - 1 + if min_degree > 0: + d = min_degree - 1 + combinations -= comb(n_features + d, d, exact=True) - 1 + + if include_bias: + combinations += 1 + + return combinations + + @property + def powers_(self): + """Exponent for each of the inputs in the output.""" + check_is_fitted(self) + + combinations = self._combinations( + n_features=self.n_features_in_, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + return np.vstack( + [np.bincount(c, minlength=self.n_features_in_) for c in combinations] + ) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features is None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
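+
+ For example, with two input features named "a" and "b":
+
+ >>> from sklearn.preprocessing import PolynomialFeatures
+ >>> poly = PolynomialFeatures(degree=2, include_bias=False).fit([[0, 1]])
+ >>> poly.get_feature_names_out(["a", "b"])
+ array(['a', 'b', 'a^2', 'a b', 'b^2'], dtype=object)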
+ """ + powers = self.powers_ + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join( + ( + "%s^%d" % (input_features[ind], exp) + if exp != 1 + else input_features[ind] + ) + for ind, exp in zip(inds, row[inds]) + ) + else: + name = "1" + feature_names.append(name) + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None): + """ + Compute number of output features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Fitted transformer. + """ + _, n_features = validate_data(self, X, accept_sparse=True).shape + + if isinstance(self.degree, Integral): + if self.degree == 0 and not self.include_bias: + raise ValueError( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + + self._min_degree = 0 + self._max_degree = self.degree + elif ( + isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2 + ): + self._min_degree, self._max_degree = self.degree + if not ( + isinstance(self._min_degree, Integral) + and isinstance(self._max_degree, Integral) + and self._min_degree >= 0 + and self._min_degree <= self._max_degree + ): + raise ValueError( + "degree=(min_degree, max_degree) must " + "be non-negative integers that fulfil " + "min_degree <= max_degree, got " + f"{self.degree}." + ) + elif self._max_degree == 0 and not self.include_bias: + raise ValueError( + "Setting both min_degree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + else: + raise ValueError( + "degree must be a non-negative int or tuple " + "(min_degree, max_degree), got " + f"{self.degree}." + ) + + self.n_output_features_ = self._num_combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + if self.n_output_features_ > np.iinfo(np.intp).max: + msg = ( + "The output that would result from the current configuration would" + f" have {self.n_output_features_} features which is too large to be" + f" indexed by {np.intp().dtype.name}. Please change some or all of the" + " following:\n- The number of features in the input, currently" + f" {n_features=}\n- The range of degrees to calculate, currently" + f" [{self._min_degree}, {self._max_degree}]\n- Whether to include only" + f" interaction terms, currently {self.interaction_only}\n- Whether to" + f" include a bias term, currently {self.include_bias}." + ) + if ( + np.intp == np.int32 + and self.n_output_features_ <= np.iinfo(np.int64).max + ): # pragma: nocover + msg += ( + "\nNote that the current Python runtime has a limited 32 bit " + "address space and that this configuration would have been " + "admissible if run on a 64 bit Python runtime." + ) + raise ValueError(msg) + # We also record the number of output features for + # _min_degree = 0 + self._n_out_full = self._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + + return self + + def transform(self, X): + """Transform data to polynomial features. 
+ + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. + + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. + + Returns + ------- + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where `NP` is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + `csr_matrix`. + """ + check_is_fitted(self) + + X = validate_data( + self, + X, + order="F", + dtype=FLOAT_DTYPES, + reset=False, + accept_sparse=("csr", "csc"), + ) + + n_samples, n_features = X.shape + max_int32 = np.iinfo(np.int32).max + if sparse.issparse(X) and X.format == "csr": + if self._max_degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append( + sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + ) + if self._min_degree <= 1 and self._max_degree > 0: + to_stack.append(X) + + cumulative_size = sum(mat.shape[1] for mat in to_stack) + for deg in range(max(2, self._min_degree), self._max_degree + 1): + expanded = _create_expansion( + X=X, + interaction_only=self.interaction_only, + deg=deg, + n_features=n_features, + cumulative_size=cumulative_size, + ) + if expanded is not None: + to_stack.append(expanded) + cumulative_size += expanded.shape[1] + if len(to_stack) == 0: + # edge case: deal with empty matrix + XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype) + else: + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_output_features_ > max_int32` + all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack) + if ( + sp_version < parse_version("1.9.2") + and self.n_output_features_ > max_int32 + and all_int32 + ): + raise ValueError( # pragma: no cover + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n2. 
All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `PolynomialFeatures`" + " transformer to produce fewer than 2^31 output features" + ) + XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr") + elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4: + return self.transform(X.tocsr()).tocsc() + elif sparse.issparse(X): + combinations = self._combinations( + n_features=n_features, + min_degree=self._min_degree, + max_degree=self._max_degree, + interaction_only=self.interaction_only, + include_bias=self.include_bias, + ) + columns = [] + for combi in combinations: + if combi: + out_col = 1 + for col_idx in combi: + out_col = X[:, [col_idx]].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + # Do as if _min_degree = 0 and cut down array after the + # computation, i.e. use _n_out_full instead of n_output_features_. + XP = np.empty( + shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order + ) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + # degree 0 term + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + if self._max_degree == 0: + return XP + + # degree 1 term + XP[:, current_col : current_col + n_features] = X + index = list(range(current_col, current_col + n_features)) + current_col += n_features + index.append(current_col) + + # loop over degree >= 2 terms + for _ in range(2, self._max_degree + 1): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += index[feature_idx + 1] - index[feature_idx] + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply( + XP[:, start:end], + X[:, feature_idx : feature_idx + 1], + out=XP[:, current_col:next_col], + casting="no", + ) + current_col = next_col + + new_index.append(current_col) + index = new_index + + if self._min_degree > 1: + n_XP, n_Xout = self._n_out_full, self.n_output_features_ + if self.include_bias: + Xout = np.empty( + shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order + ) + Xout[:, 0] = 1 + Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :] + else: + Xout = XP[:, n_XP - n_Xout :].copy() + XP = Xout + return XP + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SplineTransformer(TransformerMixin, BaseEstimator): + """Generate univariate B-spline bases for features. + + Generate a new feature matrix consisting of + `n_splines=n_knots + degree - 1` (`n_knots - 1` for + `extrapolation="periodic"`) spline basis functions + (B-splines) of polynomial order=`degree` for each feature. 
+ + In order to learn more about the SplineTransformer class go to: + :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py` + + Read more in the :ref:`User Guide `. + + .. versionadded:: 1.0 + + Parameters + ---------- + n_knots : int, default=5 + Number of knots of the splines if `knots` equals one of + {'uniform', 'quantile'}. Must be larger or equal to 2. Ignored if `knots` + is array-like. + + degree : int, default=3 + The polynomial degree of the spline basis. Must be a non-negative + integer. + + knots : {'uniform', 'quantile'} or array-like of shape \ + (n_knots, n_features), default='uniform' + Set knot positions such that first knot <= features <= last knot. + + - If 'uniform', `n_knots` number of knots are distributed uniformly + from min to max values of the features. + - If 'quantile', they are distributed uniformly along the quantiles of + the features. + - If an array-like is given, it directly specifies the sorted knot + positions including the boundary knots. Note that, internally, + `degree` number of knots are added before the first knot, the same + after the last knot. + + extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \ + default='constant' + If 'error', values outside the min and max values of the training + features raise a `ValueError`. If 'constant', the value of the + splines at minimum and maximum value of the features is used as + constant extrapolation. If 'linear', a linear extrapolation is used. + If 'continue', the splines are extrapolated as is, i.e. option + `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If + 'periodic', periodic splines with a periodicity equal to the distance + between the first and last knot are used. Periodic splines enforce + equal function values and derivatives at the first and last knot. + For example, this makes it possible to avoid introducing an arbitrary + jump between Dec 31st and Jan 1st in spline features derived from a + naturally periodic "day-of-year" input feature. In this case it is + recommended to manually set the knot values to control the period. + + include_bias : bool, default=True + If False, then the last spline element inside the data range + of a feature is dropped. As B-splines sum to one over the spline basis + functions for each data point, they implicitly include a bias term, + i.e. a column of ones. It acts as an intercept term in a linear model. + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. `'F'` order is faster to compute, but + may slow down subsequent estimators. + + sparse_output : bool, default=False + Will return a sparse CSR matrix if set True, else will return a dense array. + + .. versionadded:: 1.2 + + Attributes + ---------- + bsplines_ : list of shape (n_features,) + List of BSplines objects, one for each feature. + + n_features_in_ : int + The total number of input features. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_features_out_ : int + The total number of output features, which is computed as + `n_features * n_splines`, where `n_splines` is + the number of bases elements of the B-splines, + `n_knots + degree - 1` for non-periodic splines and + `n_knots - 1` for periodic ones. + If `include_bias=False`, then it is only + `n_features * (n_splines - 1)`. + + See Also + -------- + KBinsDiscretizer : Transformer that bins continuous data into intervals.
+ + PolynomialFeatures : Transformer that generates polynomial and interaction + features. + + Notes + ----- + High degrees and a high number of knots can cause overfitting. + + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import SplineTransformer + >>> X = np.arange(6).reshape(6, 1) + >>> spline = SplineTransformer(degree=2, n_knots=3) + >>> spline.fit_transform(X) + array([[0.5 , 0.5 , 0. , 0. ], + [0.18, 0.74, 0.08, 0. ], + [0.02, 0.66, 0.32, 0. ], + [0. , 0.32, 0.66, 0.02], + [0. , 0.08, 0.74, 0.18], + [0. , 0. , 0.5 , 0.5 ]]) + """ + + _parameter_constraints: dict = { + "n_knots": [Interval(Integral, 2, None, closed="left")], + "degree": [Interval(Integral, 0, None, closed="left")], + "knots": [StrOptions({"uniform", "quantile"}), "array-like"], + "extrapolation": [ + StrOptions({"error", "constant", "linear", "continue", "periodic"}) + ], + "include_bias": ["boolean"], + "order": [StrOptions({"C", "F"})], + "sparse_output": ["boolean"], + } + + def __init__( + self, + n_knots=5, + degree=3, + *, + knots="uniform", + extrapolation="constant", + include_bias=True, + order="C", + sparse_output=False, + ): + self.n_knots = n_knots + self.degree = degree + self.knots = knots + self.extrapolation = extrapolation + self.include_bias = include_bias + self.order = order + self.sparse_output = sparse_output + + @staticmethod + def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None): + """Calculate base knot positions. + + Base knots such that first knot <= feature <= last knot. For the + B-spline construction with scipy.interpolate.BSpline, 2*degree knots + beyond the base interval are added. + + Returns + ------- + knots : ndarray of shape (n_knots, n_features), dtype=np.float64 + Knot positions (points) of base interval. + """ + if knots == "quantile": + percentile_ranks = 100 * np.linspace( + start=0, stop=1, num=n_knots, dtype=np.float64 + ) + + if sample_weight is None: + knots = np.percentile(X, percentile_ranks, axis=0) + else: + knots = np.array( + [ + _weighted_percentile(X, sample_weight, percentile_rank) + for percentile_rank in percentile_ranks + ] + ) + + else: + # knots == 'uniform': + # Note that the variable `knots` has already been validated and + # `else` is therefore safe. + # Disregard observations with zero weight. + mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0 + x_min = np.amin(X[mask], axis=0) + x_max = np.amax(X[mask], axis=0) + + knots = np.linspace( + start=x_min, + stop=x_max, + num=n_knots, + endpoint=True, + dtype=np.float64, + ) + + return knots + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then the following input feature names are generated: + `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
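+
+ Examples
+ --------
+ For instance, a single input feature expanded with `n_knots=3` and
+ `degree=2` gives `n_splines = n_knots + degree - 1 = 4` basis columns, so
+ the generated names should look as follows:
+
+ >>> import numpy as np
+ >>> from sklearn.preprocessing import SplineTransformer
+ >>> spline = SplineTransformer(degree=2, n_knots=3).fit(np.arange(6).reshape(6, 1))
+ >>> spline.get_feature_names_out()
+ array(['x0_sp_0', 'x0_sp_1', 'x0_sp_2', 'x0_sp_3'], dtype=object)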
+ """ + check_is_fitted(self, "n_features_in_") + n_splines = self.bsplines_[0].c.shape[1] + + input_features = _check_feature_names_in(self, input_features) + feature_names = [] + for i in range(self.n_features_in_): + for j in range(n_splines - 1 + self.include_bias): + feature_names.append(f"{input_features[i]}_sp_{j}") + return np.asarray(feature_names, dtype=object) + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y=None, sample_weight=None): + """Compute knot positions of splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + sample_weight : array-like of shape (n_samples,), default = None + Individual weights for each sample. Used to calculate quantiles if + `knots="quantile"`. For `knots="uniform"`, zero weighted + observations are ignored for finding the min and max of `X`. + + Returns + ------- + self : object + Fitted transformer. + """ + X = validate_data( + self, + X, + reset=True, + accept_sparse=False, + ensure_min_samples=2, + ensure_2d=True, + ) + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + + _, n_features = X.shape + + if isinstance(self.knots, str): + base_knots = self._get_base_knot_positions( + X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight + ) + else: + base_knots = check_array(self.knots, dtype=np.float64) + if base_knots.shape[0] < 2: + raise ValueError("Number of knots, knots.shape[0], must be >= 2.") + elif base_knots.shape[1] != n_features: + raise ValueError("knots.shape[1] == n_features is violated.") + elif not np.all(np.diff(base_knots, axis=0) > 0): + raise ValueError("knots must be sorted without duplicates.") + + # number of knots for base interval + n_knots = base_knots.shape[0] + + if self.extrapolation == "periodic" and n_knots <= self.degree: + raise ValueError( + "Periodic splines require degree < n_knots. Got n_knots=" + f"{n_knots} and degree={self.degree}." + ) + + # number of splines basis functions + if self.extrapolation != "periodic": + n_splines = n_knots + self.degree - 1 + else: + # periodic splines have self.degree less degrees of freedom + n_splines = n_knots - 1 + + degree = self.degree + n_out = n_features * n_splines + # We have to add degree number of knots below, and degree number knots + # above the base knots in order to make the spline basis complete. + if self.extrapolation == "periodic": + # For periodic splines the spacing of the first / last degree knots + # needs to be a continuation of the spacing of the last / first + # base knots. + period = base_knots[-1] - base_knots[0] + knots = np.r_[ + base_knots[-(degree + 1) : -1] - period, + base_knots, + base_knots[1 : (degree + 1)] + period, + ] + + else: + # Eilers & Marx in "Flexible smoothing with B-splines and + # penalties" https://doi.org/10.1214/ss/1038425655 advice + # against repeating first and last knot several times, which + # would have inferior behaviour at boundaries if combined with + # a penalty (hence P-Spline). We follow this advice even if our + # splines are unpenalized. Meaning we do not: + # knots = np.r_[ + # np.tile(base_knots.min(axis=0), reps=[degree, 1]), + # base_knots, + # np.tile(base_knots.max(axis=0), reps=[degree, 1]) + # ] + # Instead, we reuse the distance of the 2 fist/last knots. 
+ dist_min = base_knots[1] - base_knots[0] + dist_max = base_knots[-1] - base_knots[-2] + + knots = np.r_[ + np.linspace( + base_knots[0] - degree * dist_min, + base_knots[0] - dist_min, + num=degree, + ), + base_knots, + np.linspace( + base_knots[-1] + dist_max, + base_knots[-1] + degree * dist_max, + num=degree, + ), + ] + + # With a diagonal coefficient matrix, we get back the spline basis + # elements, i.e. the design matrix of the spline. + # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. + coef = np.eye(n_splines, dtype=np.float64) + if self.extrapolation == "periodic": + coef = np.concatenate((coef, coef[:degree, :])) + + extrapolate = self.extrapolation in ["periodic", "continue"] + + bsplines = [ + BSpline.construct_fast( + knots[:, i], coef, self.degree, extrapolate=extrapolate + ) + for i in range(n_features) + ] + self.bsplines_ = bsplines + + self.n_features_out_ = n_out - n_features * (1 - self.include_bias) + return self + + def transform(self, X): + """Transform each feature data to B-splines. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines) + The matrix of features, where n_splines is the number of bases + elements of the B-splines, n_knots + degree - 1. + """ + check_is_fitted(self) + + X = validate_data(self, X, reset=False, accept_sparse=False, ensure_2d=True) + + n_samples, n_features = X.shape + n_splines = self.bsplines_[0].c.shape[1] + degree = self.degree + + # TODO: Remove this condition, once scipy 1.10 is the minimum version. + # Only scipy => 1.10 supports design_matrix(.., extrapolate=..). + # The default (implicit in scipy < 1.10) is extrapolate=False. + scipy_1_10 = sp_version >= parse_version("1.10.0") + # Note: self.bsplines_[0].extrapolate is True for extrapolation in + # ["periodic", "continue"] + if scipy_1_10: + use_sparse = self.sparse_output + kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate} + else: + use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate + kwargs_extrapolate = dict() + + # Note that scipy BSpline returns float64 arrays and converts input + # x=X[:, i] to c-contiguous float64. + n_out = self.n_features_out_ + n_features * (1 - self.include_bias) + if X.dtype in FLOAT_DTYPES: + dtype = X.dtype + else: + dtype = np.float64 + if use_sparse: + output_list = [] + else: + XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) + + for i in range(n_features): + spl = self.bsplines_[i] + + if self.extrapolation in ("continue", "error", "periodic"): + if self.extrapolation == "periodic": + # With periodic extrapolation we map x to the segment + # [spl.t[k], spl.t[n]]. + # This is equivalent to BSpline(.., extrapolate="periodic") + # for scipy>=1.0.0. + n = spl.t.size - spl.k - 1 + # Assign to new array to avoid inplace operation + x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % ( + spl.t[n] - spl.t[spl.k] + ) + else: + x = X[:, i] + + if use_sparse: + XBS_sparse = BSpline.design_matrix( + x, spl.t, spl.k, **kwargs_extrapolate + ) + if self.extrapolation == "periodic": + # See the construction of coef in fit. We need to add the last + # degree spline basis function to the first degree ones and + # then drop the last ones. + # Note: See comment about SparseEfficiencyWarning below. 
+ XBS_sparse = XBS_sparse.tolil() + XBS_sparse[:, :degree] += XBS_sparse[:, -degree:] + XBS_sparse = XBS_sparse[:, :-degree] + else: + XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x) + else: # extrapolation in ("constant", "linear") + xmin, xmax = spl.t[degree], spl.t[-degree - 1] + # spline values at boundaries + f_min, f_max = spl(xmin), spl(xmax) + mask = (xmin <= X[:, i]) & (X[:, i] <= xmax) + if use_sparse: + mask_inv = ~mask + x = X[:, i].copy() + # Set some arbitrary values outside boundary that will be reassigned + # later. + x[mask_inv] = spl.t[self.degree] + XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k) + # Note: Without converting to lil_matrix we would get: + # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity + # structure of a csr_matrix is expensive. lil_matrix is more + # efficient. + if np.any(mask_inv): + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask_inv, :] = 0 + else: + XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i]) + + # Note for extrapolation: + # 'continue' is already returned as is by scipy BSplines + if self.extrapolation == "error": + # BSpline with extrapolate=False does not raise an error, but + # outputs np.nan. + if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or ( + not use_sparse + and np.any( + np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)]) + ) + ): + raise ValueError( + "X contains values beyond the limits of the knots." + ) + elif self.extrapolation == "constant": + # Set all values beyond xmin and xmax to the value of the + # spline basis functions at those two positions. + # Only the first degree and last degree number of splines + # have non-zero values at the boundaries. + + mask = X[:, i] < xmin + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, :degree] = f_min[:degree] + + else: + XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[ + :degree + ] + + mask = X[:, i] > xmax + if np.any(mask): + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, -degree:] = f_max[-degree:] + else: + XBS[ + mask, + ((i + 1) * n_splines - degree) : ((i + 1) * n_splines), + ] = f_max[-degree:] + + elif self.extrapolation == "linear": + # Continue the degree first and degree last spline bases + # linearly beyond the boundaries, with slope = derivative at + # the boundary. + # Note that all others have derivative = value = 0 at the + # boundaries. + + # spline derivatives = slopes at boundaries + fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1) + # Compute the linear continuation. + if degree <= 1: + # For degree=1, the derivative of 2nd spline is not zero at + # boundary. For degree=0 it is the same as 'constant'. + degree += 1 + for j in range(degree): + mask = X[:, i] < xmin + if np.any(mask): + linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. + XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, j] = linear_extr + else: + XBS[mask, i * n_splines + j] = linear_extr + + mask = X[:, i] > xmax + if np.any(mask): + k = n_splines - 1 - j + linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k] + if use_sparse: + # Note: See comment about SparseEfficiencyWarning above. 
+ XBS_sparse = XBS_sparse.tolil() + XBS_sparse[mask, k : k + 1] = linear_extr[:, None] + else: + XBS[mask, i * n_splines + k] = linear_extr + + if use_sparse: + XBS_sparse = XBS_sparse.tocsr() + output_list.append(XBS_sparse) + + if use_sparse: + # TODO: Remove this conditional error when the minimum supported version of + # SciPy is 1.9.2 + # `scipy.sparse.hstack` breaks in scipy<1.9.2 + # when `n_features_out_ > max_int32` + max_int32 = np.iinfo(np.int32).max + all_int32 = True + for mat in output_list: + all_int32 &= mat.indices.dtype == np.int32 + if ( + sp_version < parse_version("1.9.2") + and self.n_features_out_ > max_int32 + and all_int32 + ): + raise ValueError( + "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + " produces negative columns when:\n1. The output shape contains" + " `n_cols` too large to be represented by a 32bit signed" + " integer.\n2. All sub-matrices to be stacked have indices of" + " dtype `np.int32`.\nTo avoid this error, either use a version" + " of scipy `>=1.9.2` or alter the `SplineTransformer`" + " transformer to produce fewer than 2^31 output features" + ) + XBS = sparse.hstack(output_list, format="csr") + elif self.sparse_output: + # TODO: Remove once scipy 1.10 is the minimum version. See comments above. + XBS = sparse.csr_matrix(XBS) + + if self.include_bias: + return XBS + else: + # We throw away one spline basis per feature. + # We chose the last one. + indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0] + return XBS[:, indices] diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..77b404e3e39e9e7173d995f207ae1fca30f19f15 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder.py @@ -0,0 +1,534 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import OneToOneFeatureMixin, _fit_context +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import type_of_target +from ..utils.validation import ( + _check_feature_names_in, + _check_y, + check_consistent_length, + check_is_fitted, +) +from ._encoders import _BaseEncoder +from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth + + +class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder): + """Target Encoder for regression and classification targets. + + Each category is encoded based on a shrunk estimate of the average target + values for observations belonging to the category. The encoding scheme mixes + the global target mean with the target mean conditioned on the value of the + category (see [MIC]_). + + When the target type is "multiclass", encodings are based + on the conditional probability estimate for each class. The target is first + binarized using the "one-vs-all" scheme via + :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target + value for each class and each category is used for encoding, resulting in + `n_features` * `n_classes` encoded output features. + + :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`, + as another category and encodes them like any other category. Categories + that are not seen during :meth:`fit` are encoded with the target mean, i.e. + `target_mean_`.
+ + For a demo on the importance of the `TargetEncoder` internal cross-fitting, + see + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`. + For a comparison of different encoders, refer to + :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read + more in the :ref:`User Guide `. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide ` for details. + + .. versionadded:: 1.3 + + Parameters + ---------- + categories : "auto" or list of shape (n_features,) of array-like, default="auto" + Categories (unique values) per feature: + + - `"auto"` : Determine categories automatically from the training data. + - list : `categories[i]` holds the categories expected in the i-th column. The + passed categories should not mix strings and numeric values within a single + feature, and should be sorted in case of numeric values. + + The used categories are stored in the `categories_` fitted attribute. + + target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto" + Type of target. + + - `"auto"` : Type of target is inferred with + :func:`~sklearn.utils.multiclass.type_of_target`. + - `"continuous"` : Continuous target + - `"binary"` : Binary target + - `"multiclass"` : Multiclass target + + .. note:: + The type of target inferred with `"auto"` may not be the desired target + type used for modeling. For example, if the target consisted of integers + between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target` + will infer the target as `"multiclass"`. In this case, setting + `target_type="continuous"` will specify the target as a regression + problem. The `target_type_` attribute gives the target type used by the + encoder. + + .. versionchanged:: 1.4 + Added the option 'multiclass'. + + smooth : "auto" or float, default="auto" + The amount of mixing of the target mean conditioned on the value of the + category with the global target mean. A larger `smooth` value will put + more weight on the global target mean. + If `"auto"`, then `smooth` is set to an empirical Bayes estimate. + + cv : int, default=5 + Determines the number of folds in the :term:`cross fitting` strategy used in + :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used + and for continuous targets, `KFold` is used. + + shuffle : bool, default=True + Whether to shuffle the data in :meth:`fit_transform` before splitting into + folds. Note that the samples within each split will not be shuffled. + + random_state : int, RandomState instance or None, default=None + When `shuffle` is True, `random_state` affects the ordering of the + indices, which controls the randomness of each fold. Otherwise, this + parameter has no effect. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + encodings_ : list of shape (n_features,) or (n_features * n_classes) of \ + ndarray + Encodings learnt on all of `X`. + For feature `i`, `encodings_[i]` are the encodings matching the + categories listed in `categories_[i]`. When `target_type_` is + "multiclass", the encoding for feature `i` and class `j` is stored in + `encodings_[j + (i * len(classes_))]`. 
E.g., for 2 features (f) and + 3 classes (c), encodings are ordered: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2, + + categories_ : list of shape (n_features,) of ndarray + The categories of each input feature determined during fitting or + specified in `categories` + (in order of the features in `X` and corresponding with the output + of :meth:`transform`). + + target_type_ : str + Type of target. + + target_mean_ : float + The overall mean of the target. This value is only used in :meth:`transform` + to encode categories. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + classes_ : ndarray or None + If `target_type_` is 'binary' or 'multiclass', holds the label for each class, + otherwise `None`. + + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features. + Contrary to TargetEncoder, this encoding is not supervised. Treating the + resulting encoding as numerical features therefore leads to arbitrarily + ordered values and typically to lower predictive performance + when used as preprocessing for a classifier or regressor. + OneHotEncoder : Performs a one-hot encoding of categorical features. This + unsupervised encoding is better suited for low cardinality categorical + variables as it generates one new feature per unique category. + + References + ---------- + .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>` + + Examples + -------- + With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate: + + >>> import numpy as np + >>> from sklearn.preprocessing import TargetEncoder + >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T + >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30 + >>> enc_auto = TargetEncoder(smooth="auto") + >>> X_trans = enc_auto.fit_transform(X, y) + + >>> # A high `smooth` parameter puts more weight on the global mean in the categorical + >>> # encodings: + >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y) + >>> enc_high_smooth.target_mean_ + np.float64(44.3) + >>> enc_high_smooth.encodings_ + [array([44.1, 44.4, 44.3])] + + >>> # On the other hand, a low `smooth` parameter puts more weight on the target + >>> # conditioned on the value of the category: + >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y) + >>> enc_low_smooth.encodings_ + [array([21, 80.8, 43.2])] + """ + + _parameter_constraints: dict = { + "categories": [StrOptions({"auto"}), list], + "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})], + "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")], + "cv": [Interval(Integral, 2, None, closed="left")], + "shuffle": ["boolean"], + "random_state": ["random_state"], + } + + def __init__( + self, + categories="auto", + target_type="auto", + smooth="auto", + cv=5, + shuffle=True, + random_state=None, + ): + self.categories = categories + self.smooth = smooth + self.target_type = target_type + self.cv = cv + self.shuffle = shuffle + self.random_state = random_state + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit the :class:`TargetEncoder` to X and y.
+ + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + self : object + Fitted encoder. + """ + self._fit_encodings_all(X, y) + return self + + @_fit_context(prefer_skip_nested_validation=True) + def fit_transform(self, X, y): + """Fit :class:`TargetEncoder` and transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + y : array-like of shape (n_samples,) + The target data used to encode the categories. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. + """ + from ..model_selection import KFold, StratifiedKFold # avoid circular import + + X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y) + + # The cv splitter is voluntarily restricted to *KFold to enforce non + # overlapping validation folds, otherwise the fit_transform output will + # not be well-specified. + if self.target_type_ == "continuous": + cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state) + else: + cv = StratifiedKFold( + self.cv, shuffle=self.shuffle, random_state=self.random_state + ) + + # If 'multiclass' multiply axis=1 by num classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + for train_idx, test_idx in cv.split(X, y): + X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx] + y_train_mean = np.mean(y_train, axis=0) + + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_train, + y_train, + n_categories, + y_train_mean, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_train, + y_train, + n_categories, + y_train_mean, + ) + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + test_idx, + encodings, + y_train_mean, + ) + return X_out + + def transform(self, X): + """Transform X with the target encoding. + + .. note:: + `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a + :term:`cross fitting` scheme is used in `fit_transform` for encoding. + See the :ref:`User Guide `. for details. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + + Returns + ------- + X_trans : ndarray of shape (n_samples, n_features) or \ + (n_samples, (n_features * n_classes)) + Transformed input. 
+ """ + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + + # If 'multiclass' multiply axis=1 by num of classes else keep shape the same + if self.target_type_ == "multiclass": + X_out = np.empty( + (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)), + dtype=np.float64, + ) + else: + X_out = np.empty_like(X_ordinal, dtype=np.float64) + + self._transform_X_ordinal( + X_out, + X_ordinal, + ~X_known_mask, + slice(None), + self.encodings_, + self.target_mean_, + ) + return X_out + + def _fit_encodings_all(self, X, y): + """Fit a target encoding with all the data.""" + # avoid circular import + from ..preprocessing import ( + LabelBinarizer, + LabelEncoder, + ) + + check_consistent_length(X, y) + self._fit(X, handle_unknown="ignore", ensure_all_finite="allow-nan") + + if self.target_type == "auto": + accepted_target_types = ("binary", "multiclass", "continuous") + inferred_type_of_target = type_of_target(y, input_name="y") + if inferred_type_of_target not in accepted_target_types: + raise ValueError( + "Unknown label type: Target type was inferred to be " + f"{inferred_type_of_target!r}. Only {accepted_target_types} are " + "supported." + ) + self.target_type_ = inferred_type_of_target + else: + self.target_type_ = self.target_type + + self.classes_ = None + if self.target_type_ == "binary": + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + self.classes_ = label_encoder.classes_ + elif self.target_type_ == "multiclass": + label_binarizer = LabelBinarizer() + y = label_binarizer.fit_transform(y) + self.classes_ = label_binarizer.classes_ + else: # continuous + y = _check_y(y, y_numeric=True, estimator=self) + + self.target_mean_ = np.mean(y, axis=0) + + X_ordinal, X_known_mask = self._transform( + X, handle_unknown="ignore", ensure_all_finite="allow-nan" + ) + n_categories = np.fromiter( + (len(category_for_feature) for category_for_feature in self.categories_), + dtype=np.int64, + count=len(self.categories_), + ) + if self.target_type_ == "multiclass": + encodings = self._fit_encoding_multiclass( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + else: + encodings = self._fit_encoding_binary_or_continuous( + X_ordinal, + y, + n_categories, + self.target_mean_, + ) + self.encodings_ = encodings + + return X_ordinal, X_known_mask, y, n_categories + + def _fit_encoding_binary_or_continuous( + self, X_ordinal, y, n_categories, target_mean + ): + """Learn target encodings.""" + if self.smooth == "auto": + y_variance = np.var(y) + encodings = _fit_encoding_fast_auto_smooth( + X_ordinal, + y, + n_categories, + target_mean, + y_variance, + ) + else: + encodings = _fit_encoding_fast( + X_ordinal, + y, + n_categories, + self.smooth, + target_mean, + ) + return encodings + + def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean): + """Learn multiclass encodings. + + Learn encodings for each class (c) then reorder encodings such that + the same features (f) are grouped together. 
`reorder_index` enables + converting from: + f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2 + to: + f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2 + """ + n_features = self.n_features_in_ + n_classes = len(self.classes_) + + encodings = [] + for i in range(n_classes): + y_class = y[:, i] + encoding = self._fit_encoding_binary_or_continuous( + X_ordinal, + y_class, + n_categories, + target_mean[i], + ) + encodings.extend(encoding) + + reorder_index = ( + idx + for start in range(n_features) + for idx in range(start, (n_classes * n_features), n_features) + ) + return [encodings[idx] for idx in reorder_index] + + def _transform_X_ordinal( + self, + X_out, + X_ordinal, + X_unknown_mask, + row_indices, + encodings, + target_mean, + ): + """Transform X_ordinal using encodings. + + In the multiclass case, `X_ordinal` and `X_unknown_mask` have column + (axis=1) size `n_features`, while `encodings` has length of size + `n_features * n_classes`. `feat_idx` deals with this by repeating + feature indices by `n_classes`. E.g., for 3 features, 2 classes: + 0,0,1,1,2,2 + + Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx` + cycles through 0 to `n_classes` - 1, `n_features` times. + """ + if self.target_type_ == "multiclass": + n_classes = len(self.classes_) + for e_idx, encoding in enumerate(encodings): + # Repeat feature indices by n_classes + feat_idx = e_idx // n_classes + # Cycle through each class + mean_idx = e_idx % n_classes + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]] + X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx] + else: + for e_idx, encoding in enumerate(encodings): + X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]] + X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Not used, present here for API consistency by convention. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. `feature_names_in_` is used unless it is + not defined, in which case the following input feature names are + generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. + When `target_type_` is "multiclass", the names are of the format + `<feature_name>_<class_name>`.
+ """ + check_is_fitted(self, "n_features_in_") + feature_names = _check_feature_names_in(self, input_features) + if self.target_type_ == "multiclass": + feature_names = [ + f"{feature_name}_{class_name}" + for feature_name in feature_names + for class_name in self.classes_ + ] + return np.asarray(feature_names, dtype=object) + else: + return feature_names + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.target_tags.required = True + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx new file mode 100644 index 0000000000000000000000000000000000000000..dca5f78e8d60fd70906b63cc434309b832e68d57 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/_target_encoder_fast.pyx @@ -0,0 +1,167 @@ +from libc.math cimport isnan +from libcpp.vector cimport vector + +from ..utils._typedefs cimport float32_t, float64_t, int32_t, int64_t + +import numpy as np + + +ctypedef fused INT_DTYPE: + int64_t + int32_t + +ctypedef fused Y_DTYPE: + int64_t + int32_t + float64_t + float32_t + + +def _fit_encoding_fast( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double smooth, + double y_mean, +): + """Fit a target encoding on X_int and y. + + This implementation uses Eq 7 from [1] to compute the encoding. + As stated in the paper, Eq 7 is the same as Eq 3. + + [1]: Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + double smooth_sum = smooth * y_mean + int64_t max_n_cats = np.max(n_categories) + double[::1] sums = np.empty(max_n_cats, dtype=np.float64) + double[::1] counts = np.empty(max_n_cats, dtype=np.float64) + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + sums[cat_idx] = smooth_sum + counts[cat_idx] = smooth + + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + sums[X_int_tmp] += y[sample_idx] + counts[X_int_tmp] += 1.0 + + for cat_idx in range(n_cats): + if counts[cat_idx] == 0: + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = sums[cat_idx] / counts[cat_idx] + + return encodings + + +def _fit_encoding_fast_auto_smooth( + INT_DTYPE[:, ::1] X_int, + const Y_DTYPE[:] y, + int64_t[::1] n_categories, + double y_mean, + double y_variance, +): + """Fit a target encoding on X_int and y with auto smoothing. + + This implementation uses Eq 5 and 6 from [1]. + + [1]: Micci-Barreca, Daniele. 
"A preprocessing scheme for high-cardinality + categorical attributes in classification and prediction problems" + """ + cdef: + int64_t sample_idx, feat_idx, cat_idx, n_cats + INT_DTYPE X_int_tmp + double diff + int n_samples = X_int.shape[0] + int n_features = X_int.shape[1] + int64_t max_n_cats = np.max(n_categories) + double[::1] means = np.empty(max_n_cats, dtype=np.float64) + int64_t[::1] counts = np.empty(max_n_cats, dtype=np.int64) + double[::1] sum_of_squared_diffs = np.empty(max_n_cats, dtype=np.float64) + double lambda_ + list encodings = [] + double[::1] current_encoding + # Gives access to encodings without gil + vector[double*] encoding_vec + + encoding_vec.resize(n_features) + for feat_idx in range(n_features): + current_encoding = np.empty(shape=n_categories[feat_idx], dtype=np.float64) + encoding_vec[feat_idx] = ¤t_encoding[0] + encodings.append(np.asarray(current_encoding)) + + # TODO: parallelize this with OpenMP prange. When n_features >= n_threads, it's + # probably good to parallelize the outer loop. When n_features is too small, + # then it would probably better to parallelize the nested loops on n_samples and + # n_cats, but the code to handle thread-local temporary variables might be + # significantly more complex. + with nogil: + for feat_idx in range(n_features): + n_cats = n_categories[feat_idx] + + for cat_idx in range(n_cats): + means[cat_idx] = 0.0 + counts[cat_idx] = 0 + sum_of_squared_diffs[cat_idx] = 0.0 + + # first pass to compute the mean + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + + # -1 are unknown categories, which are not counted + if X_int_tmp == -1: + continue + counts[X_int_tmp] += 1 + means[X_int_tmp] += y[sample_idx] + + for cat_idx in range(n_cats): + means[cat_idx] /= counts[cat_idx] + + # second pass to compute the sum of squared differences + for sample_idx in range(n_samples): + X_int_tmp = X_int[sample_idx, feat_idx] + if X_int_tmp == -1: + continue + diff = y[sample_idx] - means[X_int_tmp] + sum_of_squared_diffs[X_int_tmp] += diff * diff + + for cat_idx in range(n_cats): + lambda_ = ( + y_variance * counts[cat_idx] / + (y_variance * counts[cat_idx] + sum_of_squared_diffs[cat_idx] / + counts[cat_idx]) + ) + if isnan(lambda_): + # A nan can happen when: + # 1. counts[cat_idx] == 0 + # 2. 
y_variance == 0 and sum_of_squared_diffs[cat_idx] == 0 + encoding_vec[feat_idx][cat_idx] = y_mean + else: + encoding_vec[feat_idx][cat_idx] = ( + lambda_ * means[cat_idx] + (1 - lambda_) * y_mean + ) + + return encodings diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/meson.build b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..052c4a6766ad4ed409a08ffe3c8ff31a7412d3dd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/meson.build @@ -0,0 +1,13 @@ +py.extension_module( + '_csr_polynomial_expansion', + [cython_gen.process('_csr_polynomial_expansion.pyx'), utils_cython_tree], + subdir: 'sklearn/preprocessing', + install: true +) + +py.extension_module( + '_target_encoder_fast', + [cython_gen_cpp.process('_target_encoder_fast.pyx'), utils_cython_tree], + subdir: 'sklearn/preprocessing', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_common.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..09f702f64ce2367ef6fe47fdb789e0475bf11def --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_common.py @@ -0,0 +1,187 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.base import clone +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import ( + MaxAbsScaler, + MinMaxScaler, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + maxabs_scale, + minmax_scale, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import ( + BSR_CONTAINERS, + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DIA_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +iris = load_iris() + + +def _get_valid_samples_by_column(X, col): + """Get non NaN samples in column of X""" + return X[:, [col]][~np.isnan(X[:, col])] + + +@pytest.mark.parametrize( + "est, func, support_sparse, strictly_positive, omit_kwargs", + [ + (MaxAbsScaler(), maxabs_scale, True, False, []), + (MinMaxScaler(), minmax_scale, False, False, ["clip"]), + (StandardScaler(), scale, False, False, []), + (StandardScaler(with_mean=False), scale, True, False, []), + (PowerTransformer("yeo-johnson"), power_transform, False, False, []), + (PowerTransformer("box-cox"), power_transform, False, True, []), + (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []), + (RobustScaler(), robust_scale, False, False, []), + (RobustScaler(with_centering=False), robust_scale, True, False, []), + ], +) +def test_missing_value_handling( + est, func, support_sparse, strictly_positive, omit_kwargs +): + # check that the preprocessing method let pass nan + rng = np.random.RandomState(42) + X = iris.data.copy() + n_missing = 50 + X[ + rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing) + ] = np.nan + if strictly_positive: + X += np.nanmin(X) + 0.1 + X_train, X_test = train_test_split(X, random_state=1) + # sanity 
check + assert not np.all(np.isnan(X_train), axis=0).any() + assert np.any(np.isnan(X_train), axis=0).all() + assert np.any(np.isnan(X_test), axis=0).all() + X_test[:, 0] = np.nan # make sure this boundary case is tested + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt = est.fit(X_train).transform(X_test) + # ensure no warnings are raised + # missing values should still be missing, and only them + assert_array_equal(np.isnan(Xt), np.isnan(X_test)) + + # check that the function leads to the same results as the class + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_class = est.transform(X_train) + kwargs = est.get_params() + # remove the parameters which should be omitted because they + # are not defined in the counterpart function of the preprocessing class + for kwarg in omit_kwargs: + _ = kwargs.pop(kwarg) + Xt_func = func(X_train, **kwargs) + assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class)) + assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)]) + + # check that the inverse transform keep NaN + Xt_inv = est.inverse_transform(Xt) + assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test)) + # FIXME: we can introduce equal_nan=True in recent version of numpy. + # For the moment which just check that non-NaN values are almost equal. + assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)]) + + for i in range(X.shape[1]): + # train only on non-NaN + est.fit(_get_valid_samples_by_column(X_train, i)) + # check transforming with NaN works even when training without NaN + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_col = est.transform(X_test[:, [i]]) + assert_allclose(Xt_col, Xt[:, [i]]) + # check non-NaN is handled as before - the 1st column is all nan + if not np.isnan(X_test[:, i]).all(): + Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i)) + assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())]) + + if support_sparse: + est_dense = clone(est) + est_sparse = clone(est) + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + Xt_dense = est_dense.fit(X_train).transform(X_test) + Xt_inv_dense = est_dense.inverse_transform(Xt_dense) + + for sparse_container in ( + BSR_CONTAINERS + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DIA_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS + ): + # check that the dense and sparse inputs lead to the same results + # precompute the matrix to avoid catching side warnings + X_train_sp = sparse_container(X_train) + X_test_sp = sparse_container(X_test) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp) + + assert_allclose(Xt_sp.toarray(), Xt_dense) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PendingDeprecationWarning) + warnings.simplefilter("error", RuntimeWarning) + Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) + + assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense) + + +@pytest.mark.parametrize( + "est, func", + [ + (MaxAbsScaler(), maxabs_scale), + (MinMaxScaler(), minmax_scale), + (StandardScaler(), scale), + (StandardScaler(with_mean=False), scale), + (PowerTransformer("yeo-johnson"), power_transform), + ( + PowerTransformer("box-cox"), + power_transform, + ), + (QuantileTransformer(n_quantiles=3), quantile_transform), + (RobustScaler(), 
robust_scale), + (RobustScaler(with_centering=False), robust_scale), + ], +) +def test_missing_value_pandas_na_support(est, func): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip("pandas") + + X = np.array( + [ + [1, 2, 3, np.nan, np.nan, 4, 5, 1], + [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], + [1, 2, 3, 4, 5, 6, 7, 8], + ] + ).T + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"]) + X_df["c"] = X_df["c"].astype("int") + + X_trans = est.fit_transform(X) + X_df_trans = est.fit_transform(X_df) + + assert_allclose(X_trans, X_df_trans) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_data.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..a618d426a7dcb28da4ea858bec03dd957de5eb0c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_data.py @@ -0,0 +1,2693 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import re +import warnings + +import numpy as np +import numpy.linalg as la +import pytest +from scipy import sparse, stats + +from sklearn import config_context, datasets +from sklearn.base import clone +from sklearn.exceptions import NotFittedError +from sklearn.externals._packaging.version import parse as parse_version +from sklearn.metrics.pairwise import linear_kernel +from sklearn.model_selection import cross_val_predict +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import ( + Binarizer, + KernelCenterer, + MaxAbsScaler, + MinMaxScaler, + Normalizer, + PowerTransformer, + QuantileTransformer, + RobustScaler, + StandardScaler, + add_dummy_feature, + maxabs_scale, + minmax_scale, + normalize, + power_transform, + quantile_transform, + robust_scale, + scale, +) +from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale +from sklearn.svm import SVR +from sklearn.utils import gen_batches, shuffle +from sklearn.utils._array_api import ( + _convert_to_numpy, + _get_namespace_device_dtype_ids, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._test_common.instance_generator import _get_check_estimator_ids +from sklearn.utils._testing import ( + _array_api_for_tests, + _convert_container, + assert_allclose, + assert_allclose_dense_sparse, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + assert_array_less, + skip_if_32bit, +) +from sklearn.utils.estimator_checks import ( + check_array_api_input_and_values, +) +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + LIL_CONTAINERS, + sp_version, +) +from sklearn.utils.sparsefuncs import mean_variance_axis + +iris = datasets.load_iris() + +# Make some data to be used many times +rng = np.random.RandomState(0) +n_features = 30 +n_samples = 1000 +offsets = rng.uniform(-1, 1, size=n_features) +scales = rng.uniform(1, 10, size=n_features) +X_2d = rng.randn(n_samples, n_features) * scales + offsets +X_1row = X_2d[0, :].reshape(1, n_features) +X_1col = X_2d[:, 0].reshape(n_samples, 1) +X_list_1row = X_1row.tolist() +X_list_1col = X_1col.tolist() + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def _check_dim_1axis(a): + return np.asarray(a).shape[0] + + +def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen): + if batch_stop != n: + assert (i + 1) * chunk_size == n_samples_seen + else: + 
assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen + + +def test_raises_value_error_if_sample_weights_greater_than_1d(): + # Sample weights must be either scalar or 1D + + n_sampless = [2, 3] + n_featuress = [3, 2] + + for n_samples, n_features in zip(n_sampless, n_featuress): + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + + scaler = StandardScaler() + + # make sure Error is raised the sample weights greater than 1d + sample_weight_notOK = rng.randn(n_samples, 1) ** 2 + with pytest.raises(ValueError): + scaler.fit(X, y, sample_weight=sample_weight_notOK) + + +@pytest.mark.parametrize( + ["Xw", "X", "sample_weight"], + [ + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]), + ( + [[1, 0, 1], [0, 0, 1]], + [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]], + np.array([1, 3]), + ), + ( + [[1, np.nan, 1], [np.nan, np.nan, 1]], + [ + [1, np.nan, 1], + [np.nan, np.nan, 1], + [np.nan, np.nan, 1], + [np.nan, np.nan, 1], + ], + np.array([1, 3]), + ), + ], +) +@pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"]) +def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor): + with_mean = not array_constructor.startswith("sparse") + X = _convert_container(X, array_constructor) + Xw = _convert_container(Xw, array_constructor) + + # weighted StandardScaler + yw = np.ones(Xw.shape[0]) + scaler_w = StandardScaler(with_mean=with_mean) + scaler_w.fit(Xw, yw, sample_weight=sample_weight) + + # unweighted, but with repeated samples + y = np.ones(X.shape[0]) + scaler = StandardScaler(with_mean=with_mean) + scaler.fit(X, y) + + X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]] + + assert_almost_equal(scaler.mean_, scaler_w.mean_) + assert_almost_equal(scaler.var_, scaler_w.var_) + assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test)) + + +def test_standard_scaler_1d(): + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1row]: + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + + if isinstance(X, list): + X = np.array(X) # cast only after scaling done + + if _check_dim_1axis(X) == 1: + assert_almost_equal(scaler.mean_, X.ravel()) + assert_almost_equal(scaler.scale_, np.ones(n_features)) + assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) + assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features)) + else: + assert_almost_equal(scaler.mean_, X.mean()) + assert_almost_equal(scaler.scale_, X.std()) + assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) + assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) + assert_array_almost_equal(X_scaled.std(axis=0), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = np.ones((5, 1)) + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert_almost_equal(scaler.mean_, 1.0) + assert_almost_equal(scaler.scale_, 1.0) + assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) + assert_array_almost_equal(X_scaled.std(axis=0), 0.0) + assert scaler.n_samples_seen_ == X.shape[0] + + +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +def test_standard_scaler_dtype(add_sample_weight, sparse_container): + # Ensure scaling does not affect dtype + rng = 
np.random.RandomState(0) + n_samples = 10 + n_features = 3 + if add_sample_weight: + sample_weight = np.ones(n_samples) + else: + sample_weight = None + with_mean = True + if sparse_container is not None: + # scipy sparse containers do not support float16, see + # https://github.com/scipy/scipy/issues/7408 for more details. + supported_dtype = [np.float64, np.float32] + else: + supported_dtype = [np.float64, np.float32, np.float16] + for dtype in supported_dtype: + X = rng.randn(n_samples, n_features).astype(dtype) + if sparse_container is not None: + X = sparse_container(X) + with_mean = False + + scaler = StandardScaler(with_mean=with_mean) + X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X) + assert X.dtype == X_scaled.dtype + assert scaler.mean_.dtype == np.float64 + assert scaler.scale_.dtype == np.float64 + + +@pytest.mark.parametrize( + "scaler", + [ + StandardScaler(with_mean=False), + RobustScaler(with_centering=False), + ], +) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +@pytest.mark.parametrize("add_sample_weight", [False, True]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("constant", [0, 1.0, 100.0]) +def test_standard_scaler_constant_features( + scaler, add_sample_weight, sparse_container, dtype, constant +): + if isinstance(scaler, RobustScaler) and add_sample_weight: + pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight") + + rng = np.random.RandomState(0) + n_samples = 100 + n_features = 1 + if add_sample_weight: + fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) + else: + fit_params = {} + X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype) + X = X_array if sparse_container is None else sparse_container(X_array) + X_scaled = scaler.fit(X, **fit_params).transform(X) + + if isinstance(scaler, StandardScaler): + # The variance info should be close to zero for constant features. + assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) + + # Constant features should not be scaled (scale of 1.): + assert_allclose(scaler.scale_, np.ones(X.shape[1])) + + assert X_scaled is not X # make sure we make a copy + assert_allclose_dense_sparse(X_scaled, X) + + if isinstance(scaler, StandardScaler) and not add_sample_weight: + # Also check consistency with the standard scale function. + X_scaled_2 = scale(X, with_mean=scaler.with_mean) + assert X_scaled_2 is not X # make sure we did a copy + assert_allclose_dense_sparse(X_scaled_2, X) + + +@pytest.mark.parametrize("n_samples", [10, 100, 10_000]) +@pytest.mark.parametrize("average", [1e-10, 1, 1e10]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_standard_scaler_near_constant_features( + n_samples, sparse_container, average, dtype +): + # Check that when the variance is too small (var << mean**2) the feature + # is considered constant and not scaled. 
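As a standalone illustration of the comment above (a sketch added for readability, not part of the patch): when a feature's variance is far below float64 precision relative to its squared mean, StandardScaler leaves its scale_ at 1.0 rather than rescaling.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Feature with mean ~1e10 and spread ~1e-5: var << mean**2, so it falls
# under the precision bound and is treated as constant.
average, spread = 1e10, 1e-5
X = np.empty((100, 1))
X[:50, 0] = average + spread
X[50:, 0] = average - spread
scaler = StandardScaler(with_mean=False).fit(X)
print(scaler.scale_)  # expected: [1.] -- the near-constant feature is not rescaled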
+ + scale_min, scale_max = -30, 19 + scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype) + + n_features = scales.shape[0] + X = np.empty((n_samples, n_features), dtype=dtype) + # Make a dataset of known var = scales**2 and mean = average + X[: n_samples // 2, :] = average + scales + X[n_samples // 2 :, :] = average - scales + X_array = X if sparse_container is None else sparse_container(X) + + scaler = StandardScaler(with_mean=False).fit(X_array) + + # StandardScaler uses float64 accumulators even if the data has a float32 + # dtype. + eps = np.finfo(np.float64).eps + + # if var < bound = N.eps.var + N².eps².mean², the feature is considered + # constant and the scale_ attribute is set to 1. + bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2 + within_bounds = scales**2 <= bounds + + # Check that scale_min is small enough to have some scales below the + # bound and therefore detected as constant: + assert np.any(within_bounds) + + # Check that such features are actually treated as constant by the scaler: + assert all(scaler.var_[within_bounds] <= bounds[within_bounds]) + assert_allclose(scaler.scale_[within_bounds], 1.0) + + # Depending the on the dtype of X, some features might not actually be + # representable as non constant for small scales (even if above the + # precision bound of the float64 variance estimate). Such feature should + # be correctly detected as constants with 0 variance by StandardScaler. + representable_diff = X[0, :] - X[-1, :] != 0 + assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0) + assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1) + + # The other features are scaled and scale_ is equal to sqrt(var_) assuming + # that scales are large enough for average + scale and average - scale to + # be distinct in X (depending on X's dtype). + common_mask = np.logical_and(scales**2 > bounds, representable_diff) + assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask]) + + +def test_scale_1d(): + # 1-d inputs + X_list = [1.0, 3.0, 5.0, 0.0] + X_arr = np.array(X_list) + + for X in [X_list, X_arr]: + X_scaled = scale(X) + assert_array_almost_equal(X_scaled.mean(), 0.0) + assert_array_almost_equal(X_scaled.std(), 1.0) + assert_array_equal(scale(X, with_mean=False, with_std=False), X) + + +@skip_if_32bit +def test_standard_scaler_numerical_stability(): + # Test numerical stability of scaling + # np.log(1e-5) is taken because of its floating point representation + # was empirically found to cause numerical problems with np.mean & np.std. 
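A quick sketch (illustrative only) of the failure mode this test guards against: scaling a vector of identical values should give (nearly) all zeros, and with enough samples the near-zero standard deviation can trigger a UserWarning, depending on the numpy version.

import warnings
import numpy as np
from sklearn.preprocessing import scale

x = np.full(10, np.log(1e-5))
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    x_scaled = scale(x)                       # result should be ~0 everywhere
print(np.abs(x_scaled).max())                 # tiny value close to 0
print([str(w.message) for w in caught])       # may mention a std close to 0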
+ x = np.full(8, np.log(1e-5), dtype=np.float64) + # This does not raise a warning as the number of samples is too low + # to trigger the problem in recent numpy + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + scale(x) + assert_array_almost_equal(scale(x), np.zeros(8)) + + # with 2 more samples, the std computation run into numerical issues: + x = np.full(10, np.log(1e-5), dtype=np.float64) + warning_message = "standard deviation of the data is probably very close to 0" + with pytest.warns(UserWarning, match=warning_message): + x_scaled = scale(x) + assert_array_almost_equal(x_scaled, np.zeros(10)) + + x = np.full(10, 1e-100, dtype=np.float64) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + x_small_scaled = scale(x) + assert_array_almost_equal(x_small_scaled, np.zeros(10)) + + # Large values can cause (often recoverable) numerical stability issues: + x_big = np.full(10, 1e100, dtype=np.float64) + warning_message = "Dataset may contain too large values" + with pytest.warns(UserWarning, match=warning_message): + x_big_scaled = scale(x_big) + assert_array_almost_equal(x_big_scaled, np.zeros(10)) + assert_array_almost_equal(x_big_scaled, x_small_scaled) + with pytest.warns(UserWarning, match=warning_message): + x_big_centered = scale(x_big, with_std=False) + assert_array_almost_equal(x_big_centered, np.zeros(10)) + assert_array_almost_equal(x_big_centered, x_small_scaled) + + +def test_scaler_2d_arrays(): + # Test scaling of 2d array along first axis + rng = np.random.RandomState(0) + n_features = 5 + n_samples = 4 + X = rng.randn(n_samples, n_features) + X[:, 0] = 0.0 # first feature is always of zero + + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + assert scaler.n_samples_seen_ == n_samples + + assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has been copied + assert X_scaled is not X + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + + X_scaled = scale(X, axis=1, with_std=False) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) + X_scaled = scale(X, axis=1, with_std=True) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0]) + # Check that the data hasn't been modified + assert X_scaled is not X + + X_scaled = scaler.fit(X).transform(X, copy=False) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has not been copied + assert X_scaled is X + + X = rng.randn(4, 5) + X[:, 0] = 1.0 # first feature is a constant, non zero feature + scaler = StandardScaler() + X_scaled = scaler.fit(X).transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has not been copied + assert X_scaled is not X + + +def test_scaler_float16_overflow(): + # Test if the scaler will not overflow on float16 numpy arrays + 
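For context, a small sketch of why float16 needs care here (illustrative, not part of the patch): the dtype saturates near 65504, so naive accumulation overflows while a wider accumulator does not.

import numpy as np

x = np.full(200_000, 7.5, dtype=np.float16)
# float16 arithmetic overflows to inf very quickly ...
print(np.float16(65504) * np.float16(2))  # inf (may emit an overflow RuntimeWarning)
# ... so summing many float16 values in float16 also overflows,
print(x.sum(dtype=np.float16))            # inf
# while accumulating in float64 stays finite.
print(x.mean(dtype=np.float64))           # 7.5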
rng = np.random.RandomState(0) + # float16 has a maximum of 65500.0. On the worst case 5 * 200000 is 100000 + # which is enough to overflow the data type + X = rng.uniform(5, 10, [200000, 1]).astype(np.float16) + + with np.errstate(over="raise"): + scaler = StandardScaler().fit(X) + X_scaled = scaler.transform(X) + + # Calculate the float64 equivalent to verify result + X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64)) + + # Overflow calculations may cause -inf, inf, or nan. Since there is no nan + # input, all of the outputs should be finite. This may be redundant since a + # FloatingPointError exception will be thrown on overflow above. + assert np.all(np.isfinite(X_scaled)) + + # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the + # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are + # checked to account for precision differences. + assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2) + + +def test_handle_zeros_in_scale(): + s1 = np.array([0, 1e-16, 1, 2, 3]) + s2 = _handle_zeros_in_scale(s1, copy=True) + + assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3])) + assert_allclose(s2, np.array([1, 1, 1, 2, 3])) + + +def test_minmax_scaler_partial_fit(): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d + n = X.shape[0] + + for chunk_size in [1, 2, 50, n, n + 42]: + # Test mean at the end of the process + scaler_batch = MinMaxScaler().fit(X) + + scaler_incr = MinMaxScaler() + for batch in gen_batches(n_samples, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_batch = MinMaxScaler().fit(X[batch0]) + scaler_incr = MinMaxScaler().partial_fit(X[batch0]) + + assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) + assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) + + # Test std until the end of partial fits, and + scaler_batch = MinMaxScaler().fit(X) + scaler_incr = MinMaxScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n_samples, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + +def test_standard_scaler_partial_fit(): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d + n = X.shape[0] + + for chunk_size in [1, 2, 50, n, n + 42]: + # Test mean at the end of the process + scaler_batch = StandardScaler(with_std=False).fit(X) + + scaler_incr = StandardScaler(with_std=False) + for batch in gen_batches(n_samples, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + 
assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_) + assert scaler_batch.var_ == scaler_incr.var_ # Nones + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_incr = StandardScaler().partial_fit(X[batch0]) + if chunk_size == 1: + assert_array_almost_equal( + np.zeros(n_features, dtype=np.float64), scaler_incr.var_ + ) + assert_array_almost_equal( + np.ones(n_features, dtype=np.float64), scaler_incr.scale_ + ) + else: + assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_) + assert_array_almost_equal( + np.std(X[batch0], axis=0), scaler_incr.scale_ + ) # no constants + + # Test std until the end of partial fits, and + scaler_batch = StandardScaler().fit(X) + scaler_incr = StandardScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n_samples, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_standard_scaler_partial_fit_numerical_stability(sparse_container): + # Test if the incremental computation introduces significative errors + # for large datasets with values of large magniture + rng = np.random.RandomState(0) + n_features = 2 + n_samples = 100 + offsets = rng.uniform(-1e15, 1e15, size=n_features) + scales = rng.uniform(1e3, 1e6, size=n_features) + X = rng.randn(n_samples, n_features) * scales + offsets + + scaler_batch = StandardScaler().fit(X) + scaler_incr = StandardScaler() + for chunk in X: + scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features)) + + # Regardless of abs values, they must not be more diff 6 significant digits + tol = 10 ** (-6) + assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol) + assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol) + assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol) + # NOTE Be aware that for much larger offsets std is very unstable (last + # assert) while mean is OK. + + # Sparse input + size = (100, 3) + scale = 1e20 + X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * scale) + + # with_mean=False is required with sparse input + scaler = StandardScaler(with_mean=False).fit(X) + scaler_incr = StandardScaler(with_mean=False) + + for chunk in X: + if chunk.ndim == 1: + # Sparse arrays can be 1D (in scipy 1.14 and later) while old + # sparse matrix instances are always 2D. 
+ chunk = chunk.reshape(1, -1) + scaler_incr = scaler_incr.partial_fit(chunk) + + # Regardless of magnitude, they must not differ more than of 6 digits + tol = 10 ** (-6) + assert scaler.mean_ is not None + assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol) + assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol) + + +@pytest.mark.parametrize("sample_weight", [True, None]) +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_partial_fit_sparse_input(sample_weight, sparse_container): + # Check that sparsity is not destroyed + X = sparse_container(np.array([[1.0], [0.0], [0.0], [5.0]])) + + if sample_weight: + sample_weight = rng.rand(X.shape[0]) + + null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) + X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X) + assert_array_equal(X_null.toarray(), X.toarray()) + X_orig = null_transform.inverse_transform(X_null) + assert_array_equal(X_orig.toarray(), X_null.toarray()) + assert_array_equal(X_orig.toarray(), X.toarray()) + + +@pytest.mark.parametrize("sample_weight", [True, None]) +def test_standard_scaler_trasform_with_partial_fit(sample_weight): + # Check some postconditions after applying partial_fit and transform + X = X_2d[:100, :] + + if sample_weight: + sample_weight = rng.rand(X.shape[0]) + + scaler_incr = StandardScaler() + for i, batch in enumerate(gen_batches(X.shape[0], 1)): + X_sofar = X[: (i + 1), :] + chunks_copy = X_sofar.copy() + if sample_weight is None: + scaled_batch = StandardScaler().fit_transform(X_sofar) + scaler_incr = scaler_incr.partial_fit(X[batch]) + else: + scaled_batch = StandardScaler().fit_transform( + X_sofar, sample_weight=sample_weight[: i + 1] + ) + scaler_incr = scaler_incr.partial_fit( + X[batch], sample_weight=sample_weight[batch] + ) + scaled_incr = scaler_incr.transform(X_sofar) + + assert_array_almost_equal(scaled_batch, scaled_incr) + assert_array_almost_equal(X_sofar, chunks_copy) # No change + right_input = scaler_incr.inverse_transform(scaled_incr) + assert_array_almost_equal(X_sofar, right_input) + + zero = np.zeros(X.shape[1]) + epsilon = np.finfo(float).eps + assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal + assert_array_less(zero, scaler_incr.scale_ + epsilon) + if sample_weight is None: + # (i+1) because the Scaler has been already fitted + assert (i + 1) == scaler_incr.n_samples_seen_ + else: + assert np.sum(sample_weight[: i + 1]) == pytest.approx( + scaler_incr.n_samples_seen_ + ) + + +def test_standard_check_array_of_inverse_transform(): + # Check if StandardScaler inverse_transform is + # converting the integer array to float + x = np.array( + [ + [1, 1, 1, 0, 1, 0], + [1, 1, 1, 0, 1, 0], + [0, 8, 0, 1, 0, 0], + [1, 4, 1, 1, 0, 0], + [0, 1, 0, 0, 1, 0], + [0, 4, 0, 1, 0, 1], + ], + dtype=np.int32, + ) + + scaler = StandardScaler() + scaler.fit(x) + + # The of inverse_transform should be converted + # to a float array. + # If not X *= self.scale_ will fail. 
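A minimal sketch of the comment above (illustrative only): numpy refuses an in-place multiply of an integer array by a float scale, which is why the input is first converted to a float array.

import numpy as np

x_int = np.array([1, 2, 3], dtype=np.int32)
try:
    x_int *= 0.5                      # in-place int *= float is rejected
except TypeError as exc:              # numpy raises a casting (UFuncType) error
    print(exc)
x_float = x_int.astype(np.float64)    # after casting to float it works
x_float *= 0.5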
+ scaler.inverse_transform(x) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "check", + [check_array_api_input_and_values], + ids=_get_check_estimator_ids, +) +@pytest.mark.parametrize( + "estimator", + [ + MaxAbsScaler(), + MinMaxScaler(), + MinMaxScaler(clip=True), + KernelCenterer(), + Normalizer(norm="l1"), + Normalizer(norm="l2"), + Normalizer(norm="max"), + Binarizer(), + ], + ids=_get_check_estimator_ids, +) +def test_preprocessing_array_api_compliance( + estimator, check, array_namespace, device, dtype_name +): + name = estimator.__class__.__name__ + check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) + + +def test_min_max_scaler_iris(): + X = iris.data + scaler = MinMaxScaler() + # default params + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 0) + assert_array_almost_equal(X_trans.max(axis=0), 1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # not default params: min=1, max=2 + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), 1) + assert_array_almost_equal(X_trans.max(axis=0), 2) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # min=-.5, max=.6 + scaler = MinMaxScaler(feature_range=(-0.5, 0.6)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(X_trans.min(axis=0), -0.5) + assert_array_almost_equal(X_trans.max(axis=0), 0.6) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # raises on invalid range + scaler = MinMaxScaler(feature_range=(2, 1)) + with pytest.raises(ValueError): + scaler.fit(X) + + +def test_min_max_scaler_zero_variance_features(): + # Check min max scaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] + + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + + # default params + scaler = MinMaxScaler() + X_trans = scaler.fit_transform(X) + X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]] + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + X_trans_new = scaler.transform(X_new) + X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]] + assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) + + # not default params + scaler = MinMaxScaler(feature_range=(1, 2)) + X_trans = scaler.fit_transform(X) + X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]] + assert_array_almost_equal(X_trans, X_expected_1_2) + + # function interface + X_trans = minmax_scale(X) + assert_array_almost_equal(X_trans, X_expected_0_1) + X_trans = minmax_scale(X, feature_range=(1, 2)) + assert_array_almost_equal(X_trans, X_expected_1_2) + + +def test_minmax_scale_axis1(): + X = iris.data + X_trans = minmax_scale(X, axis=1) + assert_array_almost_equal(np.min(X_trans, axis=1), 0) + assert_array_almost_equal(np.max(X_trans, axis=1), 1) + + +def test_min_max_scaler_1d(): + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1row]: + scaler = MinMaxScaler(copy=True) + X_scaled = scaler.fit(X).transform(X) + + if isinstance(X, list): + X = np.array(X) # cast only after scaling done + + 
if _check_dim_1axis(X) == 1: + assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features)) + assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features)) + else: + assert_array_almost_equal(X_scaled.min(axis=0), 0.0) + assert_array_almost_equal(X_scaled.max(axis=0), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = np.ones((5, 1)) + scaler = MinMaxScaler() + X_scaled = scaler.fit(X).transform(X) + assert X_scaled.min() >= 0.0 + assert X_scaled.max() <= 1.0 + assert scaler.n_samples_seen_ == X.shape[0] + + # Function interface + X_1d = X_1row.ravel() + min_ = X_1d.min() + max_ = X_1d.max() + assert_array_almost_equal( + (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True) + ) + + +@pytest.mark.parametrize("sample_weight", [True, None]) +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_without_centering(sample_weight, sparse_container): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_sparse = sparse_container(X) + + if sample_weight: + sample_weight = rng.rand(X.shape[0]) + + with pytest.raises(ValueError): + StandardScaler().fit(X_sparse) + + scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight) + X_scaled = scaler.transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + + scaler_sparse = StandardScaler(with_mean=False).fit( + X_sparse, sample_weight=sample_weight + ) + X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) + assert not np.any(np.isnan(X_sparse_scaled.data)) + + assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) + assert_array_almost_equal(scaler.var_, scaler_sparse.var_) + assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) + assert_array_almost_equal(scaler.n_samples_seen_, scaler_sparse.n_samples_seen_) + + if sample_weight is None: + assert_array_almost_equal( + X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 + ) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + + X_sparse_scaled_mean, X_sparse_scaled_var = mean_variance_axis(X_sparse_scaled, 0) + assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_sparse_scaled_var, X_scaled.var(axis=0)) + + # Check that X has not been modified (copy) + assert X_scaled is not X + assert X_sparse_scaled is not X_sparse + + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + + X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) + assert X_sparse_scaled_back is not X_sparse + assert X_sparse_scaled_back is not X_sparse_scaled + assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) + + if sparse_container in CSR_CONTAINERS: + null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) + X_null = null_transform.fit_transform(X_sparse) + assert_array_equal(X_null.data, X_sparse.data) + X_orig = null_transform.inverse_transform(X_null) + assert_array_equal(X_orig.data, X_sparse.data) + + +@pytest.mark.parametrize("with_mean", [True, False]) +@pytest.mark.parametrize("with_std", [True, False]) +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_n_samples_seen_with_nan(with_mean, with_std, 
sparse_container): + X = np.array( + [[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64 + ) + if sparse_container is not None: + X = sparse_container(X) + + if sparse.issparse(X) and with_mean: + pytest.skip("'with_mean=True' cannot be used with sparse matrix.") + + transformer = StandardScaler(with_mean=with_mean, with_std=with_std) + transformer.fit(X) + + assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2])) + + +def _check_identity_scalers_attributes(scaler_1, scaler_2): + assert scaler_1.mean_ is scaler_2.mean_ is None + assert scaler_1.var_ is scaler_2.var_ is None + assert scaler_1.scale_ is scaler_2.scale_ is None + assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_ + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_return_identity(sparse_container): + # test that the scaler return identity when with_mean and with_std are + # False + X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64) + X_sparse = sparse_container(X_dense) + + transformer_dense = StandardScaler(with_mean=False, with_std=False) + X_trans_dense = transformer_dense.fit_transform(X_dense) + assert_allclose(X_trans_dense, X_dense) + + transformer_sparse = clone(transformer_dense) + X_trans_sparse = transformer_sparse.fit_transform(X_sparse) + assert_allclose_dense_sparse(X_trans_sparse, X_sparse) + + _check_identity_scalers_attributes(transformer_dense, transformer_sparse) + + transformer_dense.partial_fit(X_dense) + transformer_sparse.partial_fit(X_sparse) + _check_identity_scalers_attributes(transformer_dense, transformer_sparse) + + transformer_dense.fit(X_dense) + transformer_sparse.fit(X_sparse) + _check_identity_scalers_attributes(transformer_dense, transformer_sparse) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_scaler_int(sparse_container): + # test that scaler converts integer input to floating + # for both sparse and dense matrices + rng = np.random.RandomState(42) + X = rng.randint(20, size=(4, 5)) + X[:, 0] = 0 # first feature is always of zero + X_sparse = sparse_container(X) + + with warnings.catch_warnings(record=True): + scaler = StandardScaler(with_mean=False).fit(X) + X_scaled = scaler.transform(X, copy=True) + assert not np.any(np.isnan(X_scaled)) + + with warnings.catch_warnings(record=True): + scaler_sparse = StandardScaler(with_mean=False).fit(X_sparse) + X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) + assert not np.any(np.isnan(X_sparse_scaled.data)) + + assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) + assert_array_almost_equal(scaler.var_, scaler_sparse.var_) + assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) + + assert_array_almost_equal( + X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2 + ) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + + X_sparse_scaled_mean, X_sparse_scaled_std = mean_variance_axis( + X_sparse_scaled.astype(float), 0 + ) + assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_sparse_scaled_std, X_scaled.std(axis=0)) + + # Check that X has not been modified (copy) + assert X_scaled is not X + assert X_sparse_scaled is not X_sparse + + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + + X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) + 
assert X_sparse_scaled_back is not X_sparse + assert X_sparse_scaled_back is not X_sparse_scaled + assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) + + if sparse_container in CSR_CONTAINERS: + null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) + with warnings.catch_warnings(record=True): + X_null = null_transform.fit_transform(X_sparse) + assert_array_equal(X_null.data, X_sparse.data) + X_orig = null_transform.inverse_transform(X_null) + assert_array_equal(X_orig.data, X_sparse.data) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_scaler_without_copy(sparse_container): + # Check that StandardScaler.fit does not change input + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_sparse = sparse_container(X) + + X_copy = X.copy() + StandardScaler(copy=False).fit(X) + assert_array_equal(X, X_copy) + + X_sparse_copy = X_sparse.copy() + StandardScaler(with_mean=False, copy=False).fit(X_sparse) + assert_array_equal(X_sparse.toarray(), X_sparse_copy.toarray()) + + +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_scale_sparse_with_mean_raise_exception(sparse_container): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X_sparse = sparse_container(X) + + # check scaling and fit with direct calls on sparse data + with pytest.raises(ValueError): + scale(X_sparse, with_mean=True) + with pytest.raises(ValueError): + StandardScaler(with_mean=True).fit(X_sparse) + + # check transform and inverse_transform after a fit on a dense array + scaler = StandardScaler(with_mean=True).fit(X) + with pytest.raises(ValueError): + scaler.transform(X_sparse) + + X_transformed_sparse = sparse_container(scaler.transform(X)) + with pytest.raises(ValueError): + scaler.inverse_transform(X_transformed_sparse) + + +def test_scale_input_finiteness_validation(): + # Check if non finite inputs raise ValueError + X = [[np.inf, 5, 6, 7, 8]] + with pytest.raises( + ValueError, match="Input contains infinity or a value too large" + ): + scale(X) + + +def test_robust_scaler_error_sparse(): + X_sparse = sparse.rand(1000, 10) + scaler = RobustScaler(with_centering=True) + err_msg = "Cannot center sparse matrices" + with pytest.raises(ValueError, match=err_msg): + scaler.fit(X_sparse) + + +@pytest.mark.parametrize("with_centering", [True, False]) +@pytest.mark.parametrize("with_scaling", [True, False]) +@pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)]) +def test_robust_scaler_attributes(X, with_centering, with_scaling): + # check consistent type of attributes + if with_centering and sparse.issparse(X): + pytest.skip("RobustScaler cannot center sparse matrix") + + scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling) + scaler.fit(X) + + if with_centering: + assert isinstance(scaler.center_, np.ndarray) + else: + assert scaler.center_ is None + if with_scaling: + assert isinstance(scaler.scale_, np.ndarray) + else: + assert scaler.scale_ is None + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_robust_scaler_col_zero_sparse(csr_container): + # check that the scaler is working when there is not data materialized in a + # column of a sparse matrix + X = np.random.randn(10, 5) + X[:, 0] = 0 + X = csr_container(X) + + scaler = RobustScaler(with_centering=False) + scaler.fit(X) + assert scaler.scale_[0] == pytest.approx(1) + + X_trans = scaler.transform(X) + 
assert_allclose(X[:, [0]].toarray(), X_trans[:, [0]].toarray()) + + +def test_robust_scaler_2d_arrays(): + # Test robust scaling of 2d array along first axis + rng = np.random.RandomState(0) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + + scaler = RobustScaler() + X_scaled = scaler.fit(X).transform(X) + + assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0]) + assert_array_almost_equal(X_scaled.std(axis=0)[0], 0) + + +@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1]) +@pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None]) +def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed): + # Check the equivalence of the fitting with dense and sparse matrices + X_sparse = sparse.rand(1000, 5, density=density).tocsc() + if strictly_signed == "positive": + X_sparse.data = np.abs(X_sparse.data) + elif strictly_signed == "negative": + X_sparse.data = -np.abs(X_sparse.data) + elif strictly_signed == "zeros": + X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64) + X_dense = X_sparse.toarray() + + scaler_sparse = RobustScaler(with_centering=False) + scaler_dense = RobustScaler(with_centering=False) + + scaler_sparse.fit(X_sparse) + scaler_dense.fit(X_dense) + + assert_allclose(scaler_sparse.scale_, scaler_dense.scale_) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_robust_scaler_transform_one_row_csr(csr_container): + # Check RobustScaler on transforming csr matrix with one row + rng = np.random.RandomState(0) + X = rng.randn(4, 5) + single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]]) + scaler = RobustScaler(with_centering=False) + scaler = scaler.fit(X) + row_trans = scaler.transform(csr_container(single_row)) + row_expected = single_row / scaler.scale_ + assert_array_almost_equal(row_trans.toarray(), row_expected) + row_scaled_back = scaler.inverse_transform(row_trans) + assert_array_almost_equal(single_row, row_scaled_back.toarray()) + + +def test_robust_scaler_iris(): + X = iris.data + scaler = RobustScaler() + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(np.median(X_trans, axis=0), 0) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + q = np.percentile(X_trans, q=(25, 75), axis=0) + iqr = q[1] - q[0] + assert_array_almost_equal(iqr, 1) + + +def test_robust_scaler_iris_quantiles(): + X = iris.data + scaler = RobustScaler(quantile_range=(10, 90)) + X_trans = scaler.fit_transform(X) + assert_array_almost_equal(np.median(X_trans, axis=0), 0) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + q = np.percentile(X_trans, q=(10, 90), axis=0) + q_range = q[1] - q[0] + assert_array_almost_equal(q_range, 1) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_iris(csc_container): + X = iris.data + # uniform output distribution + transformer = QuantileTransformer(n_quantiles=30) + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + # normal output distribution + transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal") + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + # make sure it is possible to take the inverse of a sparse matrix + # which contain negative value; this is the case in the iris dataset + X_sparse = csc_container(X) + 
X_sparse_tran = transformer.fit_transform(X_sparse) + X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran) + assert_array_almost_equal(X_sparse.toarray(), X_sparse_tran_inv.toarray()) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_check_error(csc_container): + X = np.transpose( + [ + [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], + ] + ) + X = csc_container(X) + X_neg = np.transpose( + [ + [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], + [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], + [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], + ] + ) + X_neg = csc_container(X_neg) + + err_msg = ( + "The number of quantiles cannot be greater than " + "the number of samples used. Got 1000 quantiles " + "and 10 samples." + ) + with pytest.raises(ValueError, match=err_msg): + QuantileTransformer(subsample=10).fit(X) + + transformer = QuantileTransformer(n_quantiles=10) + err_msg = "QuantileTransformer only accepts non-negative sparse matrices." + with pytest.raises(ValueError, match=err_msg): + transformer.fit(X_neg) + transformer.fit(X) + err_msg = "QuantileTransformer only accepts non-negative sparse matrices." + with pytest.raises(ValueError, match=err_msg): + transformer.transform(X_neg) + + X_bad_feat = np.transpose( + [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]] + ) + err_msg = ( + "X has 2 features, but QuantileTransformer is expecting 3 features as input." + ) + with pytest.raises(ValueError, match=err_msg): + transformer.inverse_transform(X_bad_feat) + + transformer = QuantileTransformer(n_quantiles=10).fit(X) + # check that an error is raised if input is scalar + with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"): + transformer.transform(10) + # check that a warning is raised is n_quantiles > n_samples + transformer = QuantileTransformer(n_quantiles=100) + warn_msg = "n_quantiles is set to n_samples" + with pytest.warns(UserWarning, match=warn_msg) as record: + transformer.fit(X) + assert len(record) == 1 + assert transformer.n_quantiles_ == X.shape[0] + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_sparse_ignore_zeros(csc_container): + X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]]) + X_sparse = csc_container(X) + transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) + + # dense case -> warning raise + warning_message = ( + "'ignore_implicit_zeros' takes effect" + " only with sparse matrix. This parameter has no" + " effect." 
+ ) + with pytest.warns(UserWarning, match=warning_message): + transformer.fit(X) + + X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]]) + X_trans = transformer.fit_transform(X_sparse) + assert_almost_equal(X_expected, X_trans.toarray()) + + # consider the case where sparse entries are missing values and user-given + # zeros are to be considered + X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0]) + X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) + X_sparse = csc_container((X_data, (X_row, X_col))) + X_trans = transformer.fit_transform(X_sparse) + X_expected = np.array( + [ + [0.0, 0.5], + [0.0, 0.0], + [0.0, 1.0], + [0.0, 1.0], + [0.0, 0.5], + [0.0, 0.0], + [0.0, 0.5], + [0.0, 1.0], + [0.0, 0.0], + ] + ) + assert_almost_equal(X_expected, X_trans.toarray()) + + transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) + X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) + X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) + X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) + X_sparse = csc_container((X_data, (X_row, X_col))) + X_trans = transformer.fit_transform(X_sparse) + X_expected = np.array( + [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]] + ) + assert_almost_equal(X_expected, X_trans.toarray()) + assert_almost_equal( + X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() + ) + + # check in conjunction with subsampling + transformer = QuantileTransformer( + ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0 + ) + X_trans = transformer.fit_transform(X_sparse) + assert_almost_equal(X_expected, X_trans.toarray()) + assert_almost_equal( + X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() + ) + + +def test_quantile_transform_dense_toy(): + X = np.array( + [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]] + ) + + transformer = QuantileTransformer(n_quantiles=5) + transformer.fit(X) + + # using a uniform output, each entry of X should be map between 0 and 1 + # and equally spaced + X_trans = transformer.fit_transform(X) + X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T + assert_almost_equal(np.sort(X_trans, axis=0), X_expected) + + X_test = np.array( + [ + [-1, 1, 0], + [101, 11, 10], + ] + ) + X_expected = np.array( + [ + [0, 0, 0], + [1, 1, 1], + ] + ) + assert_array_almost_equal(transformer.transform(X_test), X_expected) + + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + +def test_quantile_transform_subsampling(): + # Test that subsampling the input yield to a consistent results We check + # that the computed quantiles are almost mapped to a [0, 1] vector where + # values are equally spaced. The infinite norm is checked to be smaller + # than a given threshold. This is repeated 5 times. 
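For reference, a condensed sketch of the check described above (smaller sizes than the test, so the exact error magnitude will differ): fit on a subsample of uniform data and measure the infinity norm between the learned quantiles and an evenly spaced grid on [0, 1].

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.random_sample((100_000, 1))          # uniform data on [0, 1]
qt = QuantileTransformer(
    n_quantiles=1000, subsample=10_000, random_state=0
).fit(X)
diff = np.linspace(0, 1, 1000) - qt.quantiles_.ravel()
print(np.max(np.abs(diff)))                  # small, roughly 1e-2 or below here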
+ + # dense support + n_samples = 1000000 + n_quantiles = 1000 + X = np.sort(np.random.sample((n_samples, 1)), axis=0) + ROUND = 5 + inf_norm_arr = [] + for random_state in range(ROUND): + transformer = QuantileTransformer( + random_state=random_state, + n_quantiles=n_quantiles, + subsample=n_samples // 10, + ) + transformer.fit(X) + diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) + inf_norm = np.max(np.abs(diff)) + assert inf_norm < 1e-2 + inf_norm_arr.append(inf_norm) + # each random subsampling yield a unique approximation to the expected + # linspace CDF + assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) + + # sparse support + + X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0) + inf_norm_arr = [] + for random_state in range(ROUND): + transformer = QuantileTransformer( + random_state=random_state, + n_quantiles=n_quantiles, + subsample=n_samples // 10, + ) + transformer.fit(X) + diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) + inf_norm = np.max(np.abs(diff)) + assert inf_norm < 1e-1 + inf_norm_arr.append(inf_norm) + # each random subsampling yield a unique approximation to the expected + # linspace CDF + assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) + + +def test_quantile_transform_subsampling_disabled(): + """Check the behaviour of `QuantileTransformer` when `subsample=None`.""" + X = np.random.RandomState(0).normal(size=(200, 1)) + + n_quantiles = 5 + transformer = QuantileTransformer(n_quantiles=n_quantiles, subsample=None).fit(X) + + expected_references = np.linspace(0, 1, n_quantiles) + assert_allclose(transformer.references_, expected_references) + expected_quantiles = np.quantile(X.ravel(), expected_references) + assert_allclose(transformer.quantiles_.ravel(), expected_quantiles) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_sparse_toy(csc_container): + X = np.array( + [ + [0.0, 2.0, 0.0], + [25.0, 4.0, 0.0], + [50.0, 0.0, 2.6], + [0.0, 0.0, 4.1], + [0.0, 6.0, 0.0], + [0.0, 8.0, 0.0], + [75.0, 0.0, 2.3], + [0.0, 10.0, 0.0], + [0.0, 0.0, 9.5], + [100.0, 0.0, 0.1], + ] + ) + + X = csc_container(X) + + transformer = QuantileTransformer(n_quantiles=10) + transformer.fit(X) + + X_trans = transformer.fit_transform(X) + assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) + assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) + + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + + transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray()) + + X_trans = transformer_dense.transform(X) + assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) + assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) + + X_trans_inv = transformer_dense.inverse_transform(X_trans) + assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) + + +def test_quantile_transform_axis1(): + X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) + + X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5) + X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5) + assert_array_almost_equal(X_trans_a0, X_trans_a1.T) + + +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_quantile_transform_bounds(csc_container): + # Lower and upper bounds are manually mapped. We checked that in the case + # of a constant feature and binary feature, the bounds are properly mapped. 
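A short sketch of the boundary behaviour checked below (illustrative only): values outside the range seen during fit are clipped, so they transform exactly like the training minimum and maximum.

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.random_sample((1000, 1))
qt = QuantileTransformer(n_quantiles=100).fit(X)
assert qt.transform([[-10.0]]) == qt.transform([[X.min()]])
assert qt.transform([[10.0]]) == qt.transform([[X.max()]])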
+ X_dense = np.array([[0, 0], [0, 0], [1, 0]]) + X_sparse = csc_container(X_dense) + + # check sparse and dense are consistent + X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense) + assert_array_almost_equal(X_trans, X_dense) + X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform( + X_sparse + ) + assert_array_almost_equal(X_trans_sp.toarray(), X_dense) + assert_array_almost_equal(X_trans, X_trans_sp.toarray()) + + # check the consistency of the bounds by learning on 1 matrix + # and transforming another + X = np.array([[0, 1], [0, 0.5], [1, 0]]) + X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]]) + transformer = QuantileTransformer(n_quantiles=3).fit(X) + X_trans = transformer.transform(X1) + assert_array_almost_equal(X_trans, X1) + + # check that values outside of the range learned will be mapped properly. + X = np.random.random((1000, 1)) + transformer = QuantileTransformer() + transformer.fit(X) + assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]]) + assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]]) + assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform( + [[np.min(transformer.references_)]] + ) + assert transformer.inverse_transform([[10]]) == transformer.inverse_transform( + [[np.max(transformer.references_)]] + ) + + +def test_quantile_transform_and_inverse(): + X_1 = iris.data + X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]]) + for X in [X_1, X_2]: + transformer = QuantileTransformer(n_quantiles=1000, random_state=0) + X_trans = transformer.fit_transform(X) + X_trans_inv = transformer.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv, decimal=9) + + +def test_quantile_transform_nan(): + X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]]) + + transformer = QuantileTransformer(n_quantiles=10, random_state=42) + transformer.fit_transform(X) + + # check that the quantile of the first column is all NaN + assert np.isnan(transformer.quantiles_[:, 0]).all() + # all other column should not contain NaN + assert not np.isnan(transformer.quantiles_[:, 1:]).any() + + +@pytest.mark.parametrize("array_type", ["array", "sparse"]) +def test_quantile_transformer_sorted_quantiles(array_type): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/15733 + # Taken from upstream bug report: + # https://github.com/numpy/numpy/issues/14685 + X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) + X = 0.1 * X.reshape(-1, 1) + X = _convert_container(X, array_type) + + n_quantiles = 100 + qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) + + # Check that the estimated quantile thresholds are monotically + # increasing: + quantiles = qt.quantiles_[:, 0] + assert len(quantiles) == 100 + assert all(np.diff(quantiles) >= 0) + + +def test_robust_scaler_invalid_range(): + for range_ in [ + (-1, 90), + (-2, -3), + (10, 101), + (100.5, 101), + (90, 50), + ]: + scaler = RobustScaler(quantile_range=range_) + + with pytest.raises(ValueError, match=r"Invalid quantile range: \("): + scaler.fit(iris.data) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_scale_function_without_centering(csr_container): + rng = np.random.RandomState(42) + X = rng.randn(4, 5) + X[:, 0] = 0.0 # first feature is always of zero + X_csr = csr_container(X) + + X_scaled = scale(X, with_mean=False) + assert not np.any(np.isnan(X_scaled)) + + X_csr_scaled = 
scale(X_csr, with_mean=False) + assert not np.any(np.isnan(X_csr_scaled.data)) + + # test csc has same outcome + X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) + assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) + + # raises value error on axis != 0 + with pytest.raises(ValueError): + scale(X_csr, with_mean=False, axis=1) + + assert_array_almost_equal( + X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 + ) + assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) + # Check that X has not been copied + assert X_scaled is not X + + X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0) + assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) + assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) + + # null scale + X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True) + assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray()) + + +def test_robust_scale_axis1(): + X = iris.data + X_trans = robust_scale(X, axis=1) + assert_array_almost_equal(np.median(X_trans, axis=1), 0) + q = np.percentile(X_trans, q=(25, 75), axis=1) + iqr = q[1] - q[0] + assert_array_almost_equal(iqr, 1) + + +def test_robust_scale_1d_array(): + X = iris.data[:, 1] + X_trans = robust_scale(X) + assert_array_almost_equal(np.median(X_trans), 0) + q = np.percentile(X_trans, q=(25, 75)) + iqr = q[1] - q[0] + assert_array_almost_equal(iqr, 1) + + +def test_robust_scaler_zero_variance_features(): + # Check RobustScaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] + + scaler = RobustScaler() + X_trans = scaler.fit_transform(X) + + # NOTE: for such a small sample size, what we expect in the third column + # depends HEAVILY on the method used to calculate quantiles. The values + # here were calculated to fit the quantiles produces by np.percentile + # using numpy 1.9 Calculating quantiles with + # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles + # would yield very different results! 
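The expected third-column values can be reproduced by hand with np.percentile's default linear-interpolation rule (a small sketch for reference):

import numpy as np

col = np.array([0.5, -0.1, 1.1])              # third column of X above
center = np.median(col)                       # 0.5
q25, q75 = np.percentile(col, [25, 75])       # 0.2 and 0.8 with linear interpolation
print((col - center) / (q75 - q25))           # [ 0. -1.  1.]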
+ X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]] + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3) + + +def test_robust_scaler_unit_variance(): + # Check RobustScaler with unit_variance=True on standard normal data with + # outliers + rng = np.random.RandomState(42) + X = rng.randn(1000000, 1) + X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100]) + + quantile_range = (1, 99) + robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit( + X_with_outliers + ) + X_trans = robust_scaler.transform(X) + + assert robust_scaler.center_ == pytest.approx(0, abs=1e-3) + assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2) + assert X_trans.std() == pytest.approx(1, abs=1e-2) + + +@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) +def test_maxabs_scaler_zero_variance_features(sparse_container): + # Check MaxAbsScaler on toy data with zero variance features + X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans, X_expected) + X_trans_inv = scaler.inverse_transform(X_trans) + assert_array_almost_equal(X, X_trans_inv) + + # make sure new data gets transformed correctly + X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] + X_trans_new = scaler.transform(X_new) + X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]] + + assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2) + + # function interface + X_trans = maxabs_scale(X) + assert_array_almost_equal(X_trans, X_expected) + + # sparse data + X_sparse = sparse_container(X) + X_trans_sparse = scaler.fit_transform(X_sparse) + X_expected = [ + [0.0, 1.0, 1.0 / 3.0], + [0.0, 1.0, -0.2], + [0.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ] + assert_array_almost_equal(X_trans_sparse.toarray(), X_expected) + X_trans_sparse_inv = scaler.inverse_transform(X_trans_sparse) + assert_array_almost_equal(X, X_trans_sparse_inv.toarray()) + + +def test_maxabs_scaler_large_negative_value(): + # Check MaxAbsScaler on toy data with a large negative value + X = [ + [0.0, 1.0, +0.5, -1.0], + [0.0, 1.0, -0.3, -0.5], + [0.0, 1.0, -100.0, 0.0], + [0.0, 0.0, +0.0, -2.0], + ] + + scaler = MaxAbsScaler() + X_trans = scaler.fit_transform(X) + X_expected = [ + [0.0, 1.0, 0.005, -0.5], + [0.0, 1.0, -0.003, -0.25], + [0.0, 1.0, -1.0, 0.0], + [0.0, 0.0, 0.0, -1.0], + ] + assert_array_almost_equal(X_trans, X_expected) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_maxabs_scaler_transform_one_row_csr(csr_container): + # Check MaxAbsScaler on transforming csr matrix with one row + X = csr_container([[0.5, 1.0, 1.0]]) + scaler = MaxAbsScaler() + scaler = scaler.fit(X) + X_trans = scaler.transform(X) + X_expected = csr_container([[1.0, 1.0, 1.0]]) + assert_array_almost_equal(X_trans.toarray(), X_expected.toarray()) + X_scaled_back = scaler.inverse_transform(X_trans) + 
assert_array_almost_equal(X.toarray(), X_scaled_back.toarray()) + + +def test_maxabs_scaler_1d(): + # Test scaling of dataset along single axis + for X in [X_1row, X_1col, X_list_1row, X_list_1row]: + scaler = MaxAbsScaler(copy=True) + X_scaled = scaler.fit(X).transform(X) + + if isinstance(X, list): + X = np.array(X) # cast only after scaling done + + if _check_dim_1axis(X) == 1: + assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features)) + else: + assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X) + + # Constant feature + X = np.ones((5, 1)) + scaler = MaxAbsScaler() + X_scaled = scaler.fit(X).transform(X) + assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) + assert scaler.n_samples_seen_ == X.shape[0] + + # function interface + X_1d = X_1row.ravel() + max_abs = np.abs(X_1d).max() + assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_maxabs_scaler_partial_fit(csr_container): + # Test if partial_fit run over many batches of size 1 and 50 + # gives the same results as fit + X = X_2d[:100, :] + n = X.shape[0] + + for chunk_size in [1, 2, 50, n, n + 42]: + # Test mean at the end of the process + scaler_batch = MaxAbsScaler().fit(X) + + scaler_incr = MaxAbsScaler() + scaler_incr_csr = MaxAbsScaler() + scaler_incr_csc = MaxAbsScaler() + for batch in gen_batches(n, chunk_size): + scaler_incr = scaler_incr.partial_fit(X[batch]) + X_csr = csr_container(X[batch]) + scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr) + X_csc = csr_container(X[batch]) + scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc) + + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_) + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_ + assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_ + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_) + assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_) + assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) + + # Test std after 1 step + batch0 = slice(0, chunk_size) + scaler_batch = MaxAbsScaler().fit(X[batch0]) + scaler_incr = MaxAbsScaler().partial_fit(X[batch0]) + + assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) + assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ + assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) + assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) + + # Test std until the end of partial fits, and + scaler_batch = MaxAbsScaler().fit(X) + scaler_incr = MaxAbsScaler() # Clean estimator + for i, batch in enumerate(gen_batches(n, chunk_size)): + scaler_incr = scaler_incr.partial_fit(X[batch]) + assert_correct_incr( + i, + batch_start=batch.start, + batch_stop=batch.stop, + n=n, + chunk_size=chunk_size, + n_samples_seen=scaler_incr.n_samples_seen_, + ) + + +def check_normalizer(norm, X_norm): + """ + Convenient checking function for `test_normalizer_l1_l2_max` and + 
`test_normalizer_l1_l2_max_non_csr` + """ + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + for i in range(3): + assert_almost_equal(row_sums[i], 1.0) + assert_almost_equal(row_sums[3], 0.0) + elif norm == "l2": + for i in range(3): + assert_almost_equal(la.norm(X_norm[i]), 1.0) + assert_almost_equal(la.norm(X_norm[3]), 0.0) + elif norm == "max": + row_maxs = abs(X_norm).max(axis=1) + for i in range(3): + assert_almost_equal(row_maxs[i], 1.0) + assert_almost_equal(row_maxs[3], 0.0) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_l1_l2_max(norm, csr_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + X_sparse_unpruned = csr_container(X_dense) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + # set the row number 3 to zero without pruning (can happen in real life) + indptr_3 = X_sparse_unpruned.indptr[3] + indptr_4 = X_sparse_unpruned.indptr[4] + X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 + + # build the pruned variant using the regular constructor + X_sparse_pruned = csr_container(X_dense) + + # check inputs that support the no-copy optim + for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): + normalizer = Normalizer(norm=norm, copy=True) + X_norm1 = normalizer.transform(X) + assert X_norm1 is not X + X_norm1 = toarray(X_norm1) + + normalizer = Normalizer(norm=norm, copy=False) + X_norm2 = normalizer.transform(X) + assert X_norm2 is X + X_norm2 = toarray(X_norm2) + + for X_norm in (X_norm1, X_norm2): + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("norm", ["l1", "l2", "max"]) +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + LIL_CONTAINERS +) +def test_normalizer_l1_l2_max_non_csr(norm, sparse_container): + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + + # set the row number 3 to zero + X_dense[3, :] = 0.0 + + X = sparse_container(X_dense) + X_norm = Normalizer(norm=norm, copy=False).transform(X) + + assert X_norm is not X + assert sparse.issparse(X_norm) and X_norm.format == "csr" + + X_norm = toarray(X_norm) + check_normalizer(norm, X_norm) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalizer_max_sign(csr_container): + # check that we normalize by a positive number even for negative data + rng = np.random.RandomState(0) + X_dense = rng.randn(4, 5) + # set the row number 3 to zero + X_dense[3, :] = 0.0 + # check for mixed data where the value with + # largest magnitude is negative + X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 + X_all_neg = -np.abs(X_dense) + X_all_neg_sparse = csr_container(X_all_neg) + + for X in (X_dense, X_all_neg, X_all_neg_sparse): + normalizer = Normalizer(norm="max") + X_norm = normalizer.transform(X) + assert X_norm is not X + X_norm = toarray(X_norm) + assert_array_equal(np.sign(X_norm), np.sign(toarray(X))) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_normalize(csr_container): + # Test normalize function + # Only tests functionality not used by the tests for Normalizer. 
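+    # Editor's note -- illustrative sketch, not part of the upstream test: for
+    # a dense row, l2 normalization simply divides by the row's Euclidean
+    # norm, e.g. [[3, 4]] -> [[0.6, 0.8]]; the checks below additionally cover
+    # the axis=0 / transpose equivalence and the sparse code path.
+    assert_allclose(normalize(np.array([[3.0, 4.0]])), [[0.6, 0.8]])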
+ X = np.random.RandomState(37).randn(3, 2) + assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) + + rs = np.random.RandomState(0) + X_dense = rs.randn(10, 5) + X_sparse = csr_container(X_dense) + ones = np.ones((10)) + for X in (X_dense, X_sparse): + for dtype in (np.float32, np.float64): + for norm in ("l1", "l2"): + X = X.astype(dtype) + X_norm = normalize(X, norm=norm) + assert X_norm.dtype == dtype + + X_norm = toarray(X_norm) + if norm == "l1": + row_sums = np.abs(X_norm).sum(axis=1) + else: + X_norm_squared = X_norm**2 + row_sums = X_norm_squared.sum(axis=1) + + assert_array_almost_equal(row_sums, ones) + + # Test return_norm + X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]]) + for norm in ("l1", "l2", "max"): + _, norms = normalize(X_dense, norm=norm, return_norm=True) + if norm == "l1": + assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0])) + elif norm == "l2": + assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127])) + else: + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + X_sparse = csr_container(X_dense) + for norm in ("l1", "l2"): + with pytest.raises(NotImplementedError): + normalize(X_sparse, norm=norm, return_norm=True) + _, norms = normalize(X_sparse, norm="max", return_norm=True) + assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) + + +@pytest.mark.parametrize( + "constructor", [np.array, list] + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_binarizer(constructor): + X_ = np.array([[1, 0, 5], [2, 3, -1]]) + X = constructor(X_.copy()) + + binarizer = Binarizer(threshold=2.0, copy=True) + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 4 + assert np.sum(X_bin == 1) == 2 + X_bin = binarizer.transform(X) + assert sparse.issparse(X) == sparse.issparse(X_bin) + + binarizer = Binarizer(copy=True).fit(X) + X_bin = toarray(binarizer.transform(X)) + assert X_bin is not X + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=True) + X_bin = binarizer.transform(X) + assert X_bin is not X + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(copy=False) + X_bin = binarizer.transform(X) + if constructor is not list: + assert X_bin is X + + binarizer = Binarizer(copy=False) + X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) + X_bin = binarizer.transform(X_float) + if constructor is not list: + assert X_bin is X_float + + X_bin = toarray(X_bin) + assert np.sum(X_bin == 0) == 2 + assert np.sum(X_bin == 1) == 4 + + binarizer = Binarizer(threshold=-0.5, copy=True) + if constructor in (np.array, list): + X = constructor(X_.copy()) + + X_bin = toarray(binarizer.transform(X)) + assert np.sum(X_bin == 0) == 1 + assert np.sum(X_bin == 1) == 5 + X_bin = binarizer.transform(X) + + # Cannot use threshold < 0 for sparse + if constructor in CSC_CONTAINERS: + with pytest.raises(ValueError): + binarizer.transform(constructor(X)) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() +) +def test_binarizer_array_api_int(array_namespace, device, dtype_name): + # Checks that Binarizer works with integer elements and float threshold + xp = _array_api_for_tests(array_namespace, device) + for dtype_name_ in [dtype_name, "int32", "int64"]: + X_np = np.reshape(np.asarray([0, 1, 2, 3, 4], dtype=dtype_name_), (-1, 1)) + X_xp = xp.asarray(X_np, device=device) + binarized_np = 
Binarizer(threshold=2.5).fit_transform(X_np) + with config_context(array_api_dispatch=True): + binarized_xp = Binarizer(threshold=2.5).fit_transform(X_xp) + assert_array_equal(_convert_to_numpy(binarized_xp, xp), binarized_np) + + +def test_center_kernel(): + # Test that KernelCenterer is equivalent to StandardScaler + # in feature space + rng = np.random.RandomState(0) + X_fit = rng.random_sample((5, 4)) + scaler = StandardScaler(with_std=False) + scaler.fit(X_fit) + X_fit_centered = scaler.transform(X_fit) + K_fit = np.dot(X_fit, X_fit.T) + + # center fit time matrix + centerer = KernelCenterer() + K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) + K_fit_centered2 = centerer.fit_transform(K_fit) + assert_array_almost_equal(K_fit_centered, K_fit_centered2) + + # center predict time matrix + X_pred = rng.random_sample((2, 4)) + K_pred = np.dot(X_pred, X_fit.T) + X_pred_centered = scaler.transform(X_pred) + K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T) + K_pred_centered2 = centerer.transform(K_pred) + assert_array_almost_equal(K_pred_centered, K_pred_centered2) + + # check the results coherence with the method proposed in: + # B. Schölkopf, A. Smola, and K.R. Müller, + # "Nonlinear component analysis as a kernel eigenvalue problem" + # equation (B.3) + + # K_centered3 = (I - 1_M) K (I - 1_M) + # = K - 1_M K - K 1_M + 1_M K 1_M + ones_M = np.ones_like(K_fit) / K_fit.shape[0] + K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M + assert_allclose(K_fit_centered, K_fit_centered3) + + # K_test_centered3 = (K_test - 1'_M K)(I - 1_M) + # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M + ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0] + K_pred_centered3 = ( + K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M + ) + assert_allclose(K_pred_centered, K_pred_centered3) + + +def test_kernelcenterer_non_linear_kernel(): + """Check kernel centering for non-linear kernel.""" + rng = np.random.RandomState(0) + X, X_test = rng.randn(100, 50), rng.randn(20, 50) + + def phi(X): + """Our mapping function phi.""" + return np.vstack( + [ + np.clip(X, a_min=0, a_max=None), + -np.clip(X, a_min=None, a_max=0), + ] + ) + + phi_X = phi(X) + phi_X_test = phi(X_test) + + # centered the projection + scaler = StandardScaler(with_std=False) + phi_X_center = scaler.fit_transform(phi_X) + phi_X_test_center = scaler.transform(phi_X_test) + + # create the different kernel + K = phi_X @ phi_X.T + K_test = phi_X_test @ phi_X.T + K_center = phi_X_center @ phi_X_center.T + K_test_center = phi_X_test_center @ phi_X_center.T + + kernel_centerer = KernelCenterer() + kernel_centerer.fit(K) + + assert_allclose(kernel_centerer.transform(K), K_center) + assert_allclose(kernel_centerer.transform(K_test), K_test_center) + + # check the results coherence with the method proposed in: + # B. Schölkopf, A. Smola, and K.R. 
Müller, + # "Nonlinear component analysis as a kernel eigenvalue problem" + # equation (B.3) + + # K_centered = (I - 1_M) K (I - 1_M) + # = K - 1_M K - K 1_M + 1_M K 1_M + ones_M = np.ones_like(K) / K.shape[0] + K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M + assert_allclose(kernel_centerer.transform(K), K_centered) + + # K_test_centered = (K_test - 1'_M K)(I - 1_M) + # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M + ones_prime_M = np.ones_like(K_test) / K.shape[0] + K_test_centered = ( + K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M + ) + assert_allclose(kernel_centerer.transform(K_test), K_test_centered) + + +def test_cv_pipeline_precomputed(): + # Cross-validate a regression on four coplanar points with the same + # value. Use precomputed kernel to ensure Pipeline with KernelCenterer + # is treated as a pairwise operation. + X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]]) + y_true = np.ones((4,)) + K = X.dot(X.T) + kcent = KernelCenterer() + pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) + + # did the pipeline set the pairwise attribute? + assert pipeline.__sklearn_tags__().input_tags.pairwise + + # test cross-validation, score should be almost perfect + # NB: this test is pretty vacuous -- it's mainly to test integration + # of Pipeline and KernelCenterer + y_pred = cross_val_predict(pipeline, K, y_true, cv=2) + assert_array_almost_equal(y_true, y_pred) + + +def test_fit_transform(): + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + for obj in (StandardScaler(), Normalizer(), Binarizer()): + X_transformed = obj.fit(X).transform(X) + X_transformed2 = obj.fit_transform(X) + assert_array_equal(X_transformed, X_transformed2) + + +def test_add_dummy_feature(): + X = [[1, 0], [0, 1], [0, 1]] + X = add_dummy_feature(X) + assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +@pytest.mark.parametrize( + "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS +) +def test_add_dummy_feature_sparse(sparse_container): + X = sparse_container([[1, 0], [0, 1], [0, 1]]) + desired_format = X.format + X = add_dummy_feature(X) + assert sparse.issparse(X) and X.format == desired_format, X + assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) + + +def test_fit_cold_start(): + X = iris.data + X_2d = X[:, :2] + + # Scalers that have a partial_fit method + scalers = [ + StandardScaler(with_mean=False, with_std=False), + MinMaxScaler(), + MaxAbsScaler(), + ] + + for scaler in scalers: + scaler.fit_transform(X) + # with a different shape, this may break the scaler unless the internal + # state is reset + scaler.fit_transform(X_2d) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_notfitted(method): + pt = PowerTransformer(method=method) + X = np.abs(X_1col) + with pytest.raises(NotFittedError): + pt.transform(X) + with pytest.raises(NotFittedError): + pt.inverse_transform(X) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +@pytest.mark.parametrize("X", [X_1col, X_2d]) +def test_power_transformer_inverse(method, standardize, X): + # Make sure we get the original input when applying transform and then + # inverse transform + X = np.abs(X) if method == "box-cox" else X + pt = PowerTransformer(method=method, standardize=standardize) + X_trans = pt.fit_transform(X) + assert_almost_equal(X, pt.inverse_transform(X_trans)) + + +def test_power_transformer_1d(): + X = 
np.abs(X_1col) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + X_expected, lambda_expected = stats.boxcox(X.flatten()) + + if standardize: + X_expected = scale(X_expected) + + assert_almost_equal(X_expected.reshape(-1, 1), X_trans) + assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func) + + assert_almost_equal(X, pt.inverse_transform(X_trans)) + assert_almost_equal(lambda_expected, pt.lambdas_[0]) + + assert len(pt.lambdas_) == X.shape[1] + assert isinstance(pt.lambdas_, np.ndarray) + + +def test_power_transformer_2d(): + X = np.abs(X_2d) + + for standardize in [True, False]: + pt = PowerTransformer(method="box-cox", standardize=standardize) + + X_trans_class = pt.fit_transform(X) + X_trans_func = power_transform(X, method="box-cox", standardize=standardize) + + for X_trans in [X_trans_class, X_trans_func]: + for j in range(X_trans.shape[1]): + X_expected, lmbda = stats.boxcox(X[:, j].flatten()) + + if standardize: + X_expected = scale(X_expected) + + assert_almost_equal(X_trans[:, j], X_expected) + assert_almost_equal(lmbda, pt.lambdas_[j]) + + # Test inverse transformation + X_inv = pt.inverse_transform(X_trans) + assert_array_almost_equal(X_inv, X) + + assert len(pt.lambdas_) == X.shape[1] + assert isinstance(pt.lambdas_, np.ndarray) + + +def test_power_transformer_boxcox_strictly_positive_exception(): + # Exceptions should be raised for negative arrays and zero arrays when + # method is boxcox + + pt = PowerTransformer(method="box-cox") + pt.fit(np.abs(X_2d)) + X_with_negatives = X_2d + not_positive_message = "strictly positive" + + with pytest.raises(ValueError, match=not_positive_message): + pt.transform(X_with_negatives) + + with pytest.raises(ValueError, match=not_positive_message): + pt.fit(X_with_negatives) + + with pytest.raises(ValueError, match=not_positive_message): + power_transform(X_with_negatives, method="box-cox") + + with pytest.raises(ValueError, match=not_positive_message): + pt.transform(np.zeros(X_2d.shape)) + + with pytest.raises(ValueError, match=not_positive_message): + pt.fit(np.zeros(X_2d.shape)) + + with pytest.raises(ValueError, match=not_positive_message): + power_transform(np.zeros(X_2d.shape), method="box-cox") + + +@pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)]) +def test_power_transformer_yeojohnson_any_input(X): + # Yeo-Johnson method should support any kind of input + power_transform(X, method="yeo-johnson") + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_shape_exception(method): + pt = PowerTransformer(method=method) + X = np.abs(X_2d) + pt.fit(X) + + # Exceptions should be raised for arrays with different num_columns + # than during fitting + wrong_shape_message = ( + r"X has \d+ features, but PowerTransformer is expecting \d+ features" + ) + + with pytest.raises(ValueError, match=wrong_shape_message): + pt.transform(X[:, 0:1]) + + with pytest.raises(ValueError, match=wrong_shape_message): + pt.inverse_transform(X[:, 0:1]) + + +def test_power_transformer_lambda_zero(): + pt = PowerTransformer(method="box-cox", standardize=False) + X = np.abs(X_2d)[:, 0:1] + + # Test the lambda = 0 case + pt.lambdas_ = np.array([0]) + X_trans = pt.transform(X) + assert_array_almost_equal(pt.inverse_transform(X_trans), X) + + +def test_power_transformer_lambda_one(): + # Make sure lambda = 1 corresponds 
to the identity for yeo-johnson + pt = PowerTransformer(method="yeo-johnson", standardize=False) + X = np.abs(X_2d)[:, 0:1] + + pt.lambdas_ = np.array([1]) + X_trans = pt.transform(X) + assert_array_almost_equal(X_trans, X) + + +@pytest.mark.parametrize( + "method, lmbda", + [ + ("box-cox", 0.1), + ("box-cox", 0.5), + ("yeo-johnson", 0.1), + ("yeo-johnson", 0.5), + ("yeo-johnson", 1.0), + ], +) +def test_optimization_power_transformer(method, lmbda): + # Test the optimization procedure: + # - set a predefined value for lambda + # - apply inverse_transform to a normal dist (we get X_inv) + # - apply fit_transform to X_inv (we get X_inv_trans) + # - check that X_inv_trans is roughly equal to X + + rng = np.random.RandomState(0) + n_samples = 20000 + X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) + + if method == "box-cox": + # For box-cox, means that lmbda * y + 1 > 0 or y > - 1 / lmbda + # Clip the data here to make sure the inequality is valid. + X = np.clip(X, -1 / lmbda + 1e-5, None) + + pt = PowerTransformer(method=method, standardize=False) + pt.lambdas_ = [lmbda] + X_inv = pt.inverse_transform(X) + + pt = PowerTransformer(method=method, standardize=False) + X_inv_trans = pt.fit_transform(X_inv) + + assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2) + assert_almost_equal(0, X_inv_trans.mean(), decimal=1) + assert_almost_equal(1, X_inv_trans.std(), decimal=1) + + +def test_invserse_box_cox(): + # output nan if the input is invalid + pt = PowerTransformer(method="box-cox", standardize=False) + pt.lambdas_ = [0.5] + X_inv = pt.inverse_transform([[-2.1]]) + assert np.isnan(X_inv) + + +def test_yeo_johnson_darwin_example(): + # test from original paper "A new family of power transformations to + # improve normality or symmetry" by Yeo and Johnson. 
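+    # Editor's note -- illustrative sketch, not part of the upstream test: the
+    # Yeo-Johnson transform fitted below maps a value x, for parameter lmbda,
+    # to
+    #     ((x + 1) ** lmbda - 1) / lmbda                 if x >= 0, lmbda != 0
+    #     log(x + 1)                                     if x >= 0, lmbda == 0
+    #     -((1 - x) ** (2 - lmbda) - 1) / (2 - lmbda)    if x < 0,  lmbda != 2
+    #     -log(1 - x)                                    if x < 0,  lmbda == 2
+    # so, unlike Box-Cox, it is defined for the negative observations in the
+    # Darwin data below.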
+ X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0] + X = np.array(X).reshape(-1, 1) + lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_ + assert np.allclose(lmbda, 1.305, atol=1e-3) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +def test_power_transformer_nans(method): + # Make sure lambda estimation is not influenced by NaN values + # and that transform() supports NaN silently + + X = np.abs(X_1col) + pt = PowerTransformer(method=method) + pt.fit(X) + lmbda_no_nans = pt.lambdas_[0] + + # concat nans at the end and check lambda stays the same + X = np.concatenate([X, np.full_like(X, np.nan)]) + X = shuffle(X, random_state=0) + + pt.fit(X) + lmbda_nans = pt.lambdas_[0] + + assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) + + X_trans = pt.transform(X) + assert_array_equal(np.isnan(X_trans), np.isnan(X)) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_fit_transform(method, standardize): + # check that fit_transform() and fit().transform() return the same values + X = X_1col + if method == "box-cox": + X = np.abs(X) + + pt = PowerTransformer(method, standardize=standardize) + assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_copy_True(method, standardize): + # Check that neither fit, transform, fit_transform nor inverse_transform + # modify X inplace when copy=True + X = X_1col + if method == "box-cox": + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize=standardize, copy=True) + + pt.fit(X) + assert_array_almost_equal(X, X_original) + X_trans = pt.transform(X) + assert X_trans is not X + + X_trans = pt.fit_transform(X) + assert_array_almost_equal(X, X_original) + assert X_trans is not X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is not X_inv_trans + + +@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_copy_False(method, standardize): + # check that when copy=False fit doesn't change X inplace but transform, + # fit_transform and inverse_transform do. + X = X_1col + if method == "box-cox": + X = np.abs(X) + + X_original = X.copy() + assert X is not X_original # sanity checks + assert_array_almost_equal(X, X_original) + + pt = PowerTransformer(method, standardize=standardize, copy=False) + + pt.fit(X) + assert_array_almost_equal(X, X_original) # fit didn't change X + + X_trans = pt.transform(X) + assert X_trans is X + + if method == "box-cox": + X = np.abs(X) + X_trans = pt.fit_transform(X) + assert X_trans is X + + X_inv_trans = pt.inverse_transform(X_trans) + assert X_trans is X_inv_trans + + +def test_power_transformer_box_cox_raise_all_nans_col(): + """Check that box-cox raises informative when a column contains all nans. + + Non-regression test for gh-26303 + """ + X = rng.random_sample((4, 5)) + X[:, 0] = np.nan + + err_msg = "Column must not be all nan." 
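+    # Editor's note -- illustrative clarification, not part of the upstream
+    # test: the per-column Box-Cox lambda is estimated from that column's
+    # non-NaN values only, so a column containing nothing but NaN leaves no
+    # data to estimate from and the explicit error matched below is raised.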
+ + pt = PowerTransformer(method="box-cox") + with pytest.raises(ValueError, match=err_msg): + pt.fit_transform(X) + + +@pytest.mark.parametrize( + "X_2", + [sparse.random(10, 1, density=0.8, random_state=0)] + + [ + csr_container(np.full((10, 1), fill_value=np.nan)) + for csr_container in CSR_CONTAINERS + ], +) +def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): + # non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/16448 + X_1 = sparse.random(5, 1, density=0.8) + scaler = StandardScaler(with_mean=False) + scaler.fit(X_1).partial_fit(X_2) + assert np.isfinite(scaler.var_[0]) + + +@pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)]) +def test_minmax_scaler_clip(feature_range): + # test behaviour of the parameter 'clip' in MinMaxScaler + X = iris.data + scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X) + X_min, X_max = np.min(X, axis=0), np.max(X, axis=0) + X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]] + X_transformed = scaler.transform(X_test) + assert_allclose( + X_transformed, + [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]], + ) + + +def test_standard_scaler_raise_error_for_1d_input(): + """Check that `inverse_transform` from `StandardScaler` raises an error + with 1D array. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19518 + """ + scaler = StandardScaler().fit(X_2d) + err_msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=err_msg): + scaler.inverse_transform(X_2d[:, 0]) + + +def test_power_transformer_significantly_non_gaussian(): + """Check that significantly non-Gaussian data before transforms correctly. + + For some explored lambdas, the transformed data may be constant and will + be rejected. 
Non-regression test for + https://github.com/scikit-learn/scikit-learn/issues/14959 + """ + + X_non_gaussian = 1e6 * np.array( + [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64 + ).reshape(-1, 1) + pt = PowerTransformer() + + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + X_trans = pt.fit_transform(X_non_gaussian) + + assert not np.any(np.isnan(X_trans)) + assert X_trans.mean() == pytest.approx(0.0) + assert X_trans.std() == pytest.approx(1.0) + assert X_trans.min() > -2 + assert X_trans.max() < 2 + + +@pytest.mark.parametrize( + "Transformer", + [ + MinMaxScaler, + MaxAbsScaler, + RobustScaler, + StandardScaler, + QuantileTransformer, + PowerTransformer, + ], +) +def test_one_to_one_features(Transformer): + """Check one-to-one transformers give correct feature names.""" + tr = Transformer().fit(iris.data) + names_out = tr.get_feature_names_out(iris.feature_names) + assert_array_equal(names_out, iris.feature_names) + + +@pytest.mark.parametrize( + "Transformer", + [ + MinMaxScaler, + MaxAbsScaler, + RobustScaler, + StandardScaler, + QuantileTransformer, + PowerTransformer, + Normalizer, + Binarizer, + ], +) +def test_one_to_one_features_pandas(Transformer): + """Check one-to-one transformers give correct feature names.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame(iris.data, columns=iris.feature_names) + tr = Transformer().fit(df) + + names_out_df_default = tr.get_feature_names_out() + assert_array_equal(names_out_df_default, iris.feature_names) + + names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names) + assert_array_equal(names_out_df_valid_in, iris.feature_names) + + msg = re.escape("input_features is not equal to feature_names_in_") + with pytest.raises(ValueError, match=msg): + invalid_names = list("abcd") + tr.get_feature_names_out(invalid_names) + + +def test_kernel_centerer_feature_names_out(): + """Test that kernel centerer `feature_names_out`.""" + + rng = np.random.RandomState(0) + X = rng.random_sample((6, 4)) + X_pairwise = linear_kernel(X) + centerer = KernelCenterer().fit(X_pairwise) + + names_out = centerer.get_feature_names_out() + samples_out2 = X_pairwise.shape[1] + assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) + + +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_constant_feature(standardize): + """Check that PowerTransfomer leaves constant features unchanged.""" + X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] + + pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) + + assert_allclose(pt.lambdas_, [1, 1, 1]) + + Xft = pt.fit_transform(X) + Xt = pt.transform(X) + + for Xt_ in [Xft, Xt]: + if standardize: + assert_allclose(Xt_, np.zeros_like(X)) + else: + assert_allclose(Xt_, X) + + +@pytest.mark.skipif( + sp_version < parse_version("1.12"), + reason="scipy version 1.12 required for stable yeo-johnson", +) +def test_power_transformer_no_warnings(): + """Verify that PowerTransformer operates without raising any warnings on valid data. 
+ + This test addresses numerical issues with floating point numbers (mostly + overflows) with the Yeo-Johnson transform, see + https://github.com/scikit-learn/scikit-learn/issues/23319#issuecomment-1464933635 + """ + x = np.array( + [ + 2003.0, + 1950.0, + 1997.0, + 2000.0, + 2009.0, + 2009.0, + 1980.0, + 1999.0, + 2007.0, + 1991.0, + ] + ) + + def _test_no_warnings(data): + """Internal helper to test for unexpected warnings.""" + with warnings.catch_warnings(record=True) as caught_warnings: + warnings.simplefilter("always") # Ensure all warnings are captured + PowerTransformer(method="yeo-johnson", standardize=True).fit_transform(data) + + assert not caught_warnings, "Unexpected warnings were raised:\n" + "\n".join( + str(w.message) for w in caught_warnings + ) + + # Full dataset: Should not trigger overflow in variance calculation. + _test_no_warnings(x.reshape(-1, 1)) + + # Subset of data: Should not trigger overflow in power calculation. + _test_no_warnings(x[:5].reshape(-1, 1)) + + +def test_yeojohnson_for_different_scipy_version(): + """Check that the results are consistent across different SciPy versions.""" + pt = PowerTransformer(method="yeo-johnson").fit(X_1col) + pt.lambdas_[0] == pytest.approx(0.99546157, rel=1e-7) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_discretization.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_discretization.py new file mode 100644 index 0000000000000000000000000000000000000000..7463a8608291c9e9f580a3afe8a774a1b3f7e665 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_discretization.py @@ -0,0 +1,665 @@ +import warnings + +import numpy as np +import pytest +import scipy.sparse as sp + +from sklearn import clone +from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder +from sklearn.utils._testing import ( + assert_allclose, + assert_allclose_dense_sparse, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, +) + +X = [[-2, 1.5, -4, -1], [-1, 2.5, -3, -0.5], [0, 3.5, -2, 0.5], [1, 4.5, -1, 2]] + + +@pytest.mark.parametrize( + "strategy, quantile_method, expected, sample_weight", + [ + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], + None, + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + None, + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + None, + ), + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], + [1, 1, 2, 1], + ), + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 2, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [0, 1, 1, 1], + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [1, 0, 3, 1], + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 
0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], + [1, 1, 1, 1], + ), + ], +) +def test_fit_transform(strategy, quantile_method, expected, sample_weight): + est = KBinsDiscretizer( + n_bins=3, encode="ordinal", strategy=strategy, quantile_method=quantile_method + ) + with ignore_warnings(category=UserWarning): + # Ignore the warning on removed small bins. + est.fit(X, sample_weight=sample_weight) + assert_array_equal(est.transform(X), expected) + + +def test_valid_n_bins(): + KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit_transform(X) + KBinsDiscretizer( + n_bins=np.array([2])[0], quantile_method="averaged_inverted_cdf" + ).fit_transform(X) + assert KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf").fit( + X + ).n_bins_.dtype == np.dtype(int) + + +def test_invalid_n_bins_array(): + # Bad shape + n_bins = np.full((2, 4), 2.0) + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)." + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Incorrect number of features + n_bins = [1, 2, 2] + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)." + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Bad bin values + n_bins = [1, 2, 2, 1] + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = ( + "KBinsDiscretizer received an invalid number of bins " + "at indices 0, 3. Number of bins must be at least 2, " + "and must be an int." + ) + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + # Float bin values + n_bins = [2.1, 2, 2.1, 2] + est = KBinsDiscretizer(n_bins=n_bins, quantile_method="averaged_inverted_cdf") + err_msg = ( + "KBinsDiscretizer received an invalid number of bins " + "at indices 0, 2. Number of bins must be at least 2, " + "and must be an int." 
+ ) + with pytest.raises(ValueError, match=err_msg): + est.fit_transform(X) + + +@pytest.mark.parametrize( + "strategy, quantile_method, expected, sample_weight", + [ + ( + "uniform", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], + None, + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], + None, + ), + ( + "quantile", + "linear", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + None, + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + None, + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], + [1, 1, 1, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [0, 1, 3, 1], + ), + ( + "quantile", + "averaged_inverted_cdf", + [[0, 0, 0, 0], [0, 0, 0, 0], [1, 2, 2, 2], [1, 2, 2, 2]], + [1, 1, 3, 1], + ), + ( + "kmeans", + "warn", # default, will not warn when strategy != "quantile" + [[0, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [1, 2, 2, 2]], + [1, 0, 3, 1], + ), + ], +) +def test_fit_transform_n_bins_array(strategy, quantile_method, expected, sample_weight): + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], + encode="ordinal", + strategy=strategy, + quantile_method=quantile_method, + ).fit(X, sample_weight=sample_weight) + assert_array_equal(est.transform(X), expected) + + # test the shape of bin_edges_ + n_features = np.array(X).shape[1] + assert est.bin_edges_.shape == (n_features,) + for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_): + assert bin_edges.shape == (n_bins + 1,) + + +@pytest.mark.filterwarnings("ignore: Bins whose width are too small") +def test_kbinsdiscretizer_effect_sample_weight(): + """Check the impact of `sample_weight` one computed quantiles.""" + X = np.array([[-2], [-1], [1], [3], [500], [1000]]) + # add a large number of bins such that each sample with a non-null weight + # will be used as bin edge + est = KBinsDiscretizer( + n_bins=10, + encode="ordinal", + strategy="quantile", + quantile_method="averaged_inverted_cdf", + ) + est.fit(X, sample_weight=[1, 1, 1, 1, 0, 0]) + assert_allclose(est.bin_edges_[0], [-2, -1, 0, 1, 3]) + assert_allclose(est.transform(X), [[0.0], [1.0], [3.0], [3.0], [3.0], [3.0]]) + + +@pytest.mark.parametrize("strategy", ["kmeans", "quantile"]) +def test_kbinsdiscretizer_no_mutating_sample_weight(strategy): + """Make sure that `sample_weight` is not changed in place.""" + + if strategy == "quantile": + est = KBinsDiscretizer( + n_bins=3, + encode="ordinal", + strategy=strategy, + quantile_method="averaged_inverted_cdf", + ) + else: + est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy) + sample_weight = np.array([1, 3, 1, 2], dtype=np.float64) + sample_weight_copy = np.copy(sample_weight) + est.fit(X, sample_weight=sample_weight) + assert_allclose(sample_weight, sample_weight_copy) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_same_min_max(strategy): + warnings.simplefilter("always") + X = np.array([[1, -2], [1, -1], [1, 0], [1, 1]]) + if strategy == "quantile": + est = KBinsDiscretizer( + strategy=strategy, + n_bins=3, + encode="ordinal", + quantile_method="averaged_inverted_cdf", + ) + else: + est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode="ordinal") + warning_message = "Feature 0 is constant and will be replaced 
with 0." + with pytest.warns(UserWarning, match=warning_message): + est.fit(X) + assert est.n_bins_[0] == 1 + # replace the feature with zeros + Xt = est.transform(X) + assert_array_equal(Xt[:, 0], np.zeros(X.shape[0])) + + +def test_transform_1d_behavior(): + X = np.arange(4) + est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf") + with pytest.raises(ValueError): + est.fit(X) + + est = KBinsDiscretizer(n_bins=2, quantile_method="averaged_inverted_cdf") + est.fit(X.reshape(-1, 1)) + with pytest.raises(ValueError): + est.transform(X) + + +@pytest.mark.parametrize("i", range(1, 9)) +def test_numeric_stability(i): + X_init = np.array([2.0, 4.0, 6.0, 8.0, 10.0]).reshape(-1, 1) + Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1) + + # Test up to discretizing nano units + X = X_init / 10**i + Xt = KBinsDiscretizer( + n_bins=2, encode="ordinal", quantile_method="averaged_inverted_cdf" + ).fit_transform(X) + assert_array_equal(Xt_expected, Xt) + + +def test_encode_options(): + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], encode="ordinal", quantile_method="averaged_inverted_cdf" + ).fit(X) + Xt_1 = est.transform(X) + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], + encode="onehot-dense", + quantile_method="averaged_inverted_cdf", + ).fit(X) + Xt_2 = est.transform(X) + assert not sp.issparse(Xt_2) + assert_array_equal( + OneHotEncoder( + categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=False + ).fit_transform(Xt_1), + Xt_2, + ) + est = KBinsDiscretizer( + n_bins=[2, 3, 3, 3], encode="onehot", quantile_method="averaged_inverted_cdf" + ).fit(X) + Xt_3 = est.transform(X) + assert sp.issparse(Xt_3) + assert_array_equal( + OneHotEncoder( + categories=[np.arange(i) for i in [2, 3, 3, 3]], sparse_output=True + ) + .fit_transform(Xt_1) + .toarray(), + Xt_3.toarray(), + ) + + +@pytest.mark.parametrize( + "strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins", + [ + ("uniform", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]), + ("kmeans", "warn", [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]), + ( + "quantile", + "averaged_inverted_cdf", + [0, 0, 0, 1, 1, 1], + [0, 0, 1, 1, 2, 2], + [0, 1, 2, 3, 4, 4], + ), + ], +) +def test_nonuniform_strategies( + strategy, quantile_method, expected_2bins, expected_3bins, expected_5bins +): + X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1) + + # with 2 bins + est = KBinsDiscretizer( + n_bins=2, strategy=strategy, quantile_method=quantile_method, encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(expected_2bins, Xt.ravel()) + + # with 3 bins + est = KBinsDiscretizer( + n_bins=3, strategy=strategy, quantile_method=quantile_method, encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(expected_3bins, Xt.ravel()) + + # with 5 bins + est = KBinsDiscretizer( + n_bins=5, strategy=strategy, quantile_method=quantile_method, encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(expected_5bins, Xt.ravel()) + + +@pytest.mark.parametrize( + "strategy, expected_inv,quantile_method", + [ + ( + "uniform", + [ + [-1.5, 2.0, -3.5, -0.5], + [-0.5, 3.0, -2.5, -0.5], + [0.5, 4.0, -1.5, 0.5], + [0.5, 4.0, -1.5, 1.5], + ], + "warn", # default, will not warn when strategy != "quantile" + ), + ( + "kmeans", + [ + [-1.375, 2.125, -3.375, -0.5625], + [-1.375, 2.125, -3.375, -0.5625], + [-0.125, 3.375, -2.125, 0.5625], + [0.75, 4.25, -1.25, 1.625], + ], + "warn", # default, will not warn when strategy != "quantile" + ), + ( + "quantile", + [ + [-1.5, 
2.0, -3.5, -0.75], + [-0.5, 3.0, -2.5, 0.0], + [0.5, 4.0, -1.5, 1.25], + [0.5, 4.0, -1.5, 1.25], + ], + "averaged_inverted_cdf", + ), + ], +) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_inverse_transform(strategy, encode, expected_inv, quantile_method): + kbd = KBinsDiscretizer( + n_bins=3, strategy=strategy, quantile_method=quantile_method, encode=encode + ) + Xt = kbd.fit_transform(X) + Xinv = kbd.inverse_transform(Xt) + assert_array_almost_equal(expected_inv, Xinv) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_transform_outside_fit_range(strategy): + X = np.array([0, 1, 2, 3])[:, None] + + if strategy == "quantile": + kbd = KBinsDiscretizer( + n_bins=4, + strategy=strategy, + encode="ordinal", + quantile_method="averaged_inverted_cdf", + ) + else: + kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode="ordinal") + kbd.fit(X) + + X2 = np.array([-2, 5])[:, None] + X2t = kbd.transform(X2) + assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_) + assert_array_equal(X2t.min(axis=0), [0]) + + +def test_overwrite(): + X = np.array([0, 1, 2, 3])[:, None] + X_before = X.copy() + + est = KBinsDiscretizer( + n_bins=3, quantile_method="averaged_inverted_cdf", encode="ordinal" + ) + Xt = est.fit_transform(X) + assert_array_equal(X, X_before) + + Xt_before = Xt.copy() + Xinv = est.inverse_transform(Xt) + assert_array_equal(Xt, Xt_before) + assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]])) + + +@pytest.mark.parametrize( + "strategy, expected_bin_edges, quantile_method", + [ + ("quantile", [0, 1.5, 3], "averaged_inverted_cdf"), + ("kmeans", [0, 1.5, 3], "warn"), + ], +) +def test_redundant_bins(strategy, expected_bin_edges, quantile_method): + X = [[0], [0], [0], [0], [3], [3]] + kbd = KBinsDiscretizer( + n_bins=3, strategy=strategy, quantile_method=quantile_method, subsample=None + ) + warning_message = "Consider decreasing the number of bins." + with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + + assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges) + + +def test_percentile_numeric_stability(): + X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1) + bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95]) + Xt = np.array([0, 0, 4]).reshape(-1, 1) + kbd = KBinsDiscretizer( + n_bins=10, + encode="ordinal", + strategy="quantile", + quantile_method="linear", + ) + ## TODO: change to averaged inverted cdf, but that means we only get bin + ## edges of 0.05 and 0.95 and nothing in between + + warning_message = "Consider decreasing the number of bins." 
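+    # Editor's note -- illustrative sketch, not part of the upstream test:
+    # with quantile_method="linear", the candidate edges for this column are
+    #     np.percentile([0.05, 0.05, 0.95], np.linspace(0, 100, 11))
+    # i.e. six repeats of 0.05 followed by 0.23, 0.41, 0.59, 0.77, 0.95;
+    # collapsing the repeated 0.05 edges leaves the six `bin_edges` asserted
+    # below and triggers the warning matched here.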
+ with pytest.warns(UserWarning, match=warning_message): + kbd.fit(X) + + assert_array_almost_equal(kbd.bin_edges_[0], bin_edges) + assert_array_almost_equal(kbd.transform(X), Xt) + + +@pytest.mark.parametrize("in_dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("out_dtype", [None, np.float32, np.float64]) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_consistent_dtype(in_dtype, out_dtype, encode): + X_input = np.array(X, dtype=in_dtype) + kbd = KBinsDiscretizer( + n_bins=3, + encode=encode, + quantile_method="averaged_inverted_cdf", + dtype=out_dtype, + ) + kbd.fit(X_input) + + # test output dtype + if out_dtype is not None: + expected_dtype = out_dtype + elif out_dtype is None and X_input.dtype == np.float16: + # wrong numeric input dtype are cast in np.float64 + expected_dtype = np.float64 + else: + expected_dtype = X_input.dtype + Xt = kbd.transform(X_input) + assert Xt.dtype == expected_dtype + + +@pytest.mark.parametrize("input_dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("encode", ["ordinal", "onehot", "onehot-dense"]) +def test_32_equal_64(input_dtype, encode): + # TODO this check is redundant with common checks and can be removed + # once #16290 is merged + X_input = np.array(X, dtype=input_dtype) + + # 32 bit output + kbd_32 = KBinsDiscretizer( + n_bins=3, + encode=encode, + quantile_method="averaged_inverted_cdf", + dtype=np.float32, + ) + kbd_32.fit(X_input) + Xt_32 = kbd_32.transform(X_input) + + # 64 bit output + kbd_64 = KBinsDiscretizer( + n_bins=3, + encode=encode, + quantile_method="averaged_inverted_cdf", + dtype=np.float64, + ) + kbd_64.fit(X_input) + Xt_64 = kbd_64.transform(X_input) + + assert_allclose_dense_sparse(Xt_32, Xt_64) + + +def test_kbinsdiscretizer_subsample_default(): + # Since the size of X is small (< 2e5), subsampling will not take place. + X = np.array([-2, 1.5, -4, -1]).reshape(-1, 1) + kbd_default = KBinsDiscretizer( + n_bins=10, + encode="ordinal", + strategy="quantile", + quantile_method="averaged_inverted_cdf", + ) + kbd_default.fit(X) + + kbd_without_subsampling = clone(kbd_default) + kbd_without_subsampling.set_params(subsample=None) + kbd_without_subsampling.fit(X) + + for bin_kbd_default, bin_kbd_with_subsampling in zip( + kbd_default.bin_edges_[0], kbd_without_subsampling.bin_edges_[0] + ): + np.testing.assert_allclose(bin_kbd_default, bin_kbd_with_subsampling) + assert kbd_default.bin_edges_.shape == kbd_without_subsampling.bin_edges_.shape + + +@pytest.mark.parametrize( + "encode, expected_names", + [ + ( + "onehot", + [ + f"feat{col_id}_{float(bin_id)}" + for col_id in range(3) + for bin_id in range(4) + ], + ), + ( + "onehot-dense", + [ + f"feat{col_id}_{float(bin_id)}" + for col_id in range(3) + for bin_id in range(4) + ], + ), + ("ordinal", [f"feat{col_id}" for col_id in range(3)]), + ], +) +def test_kbinsdiscrtizer_get_feature_names_out(encode, expected_names): + """Check get_feature_names_out for different settings. 
+ Non-regression test for #22731 + """ + X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]] + + kbd = KBinsDiscretizer( + n_bins=4, encode=encode, quantile_method="averaged_inverted_cdf" + ).fit(X) + Xt = kbd.transform(X) + + input_features = [f"feat{i}" for i in range(3)] + output_names = kbd.get_feature_names_out(input_features) + assert Xt.shape[1] == output_names.shape[0] + + assert_array_equal(output_names, expected_names) + + +@pytest.mark.parametrize("strategy", ["uniform", "kmeans", "quantile"]) +def test_kbinsdiscretizer_subsample(strategy, global_random_seed): + # Check that the bin edges are almost the same when subsampling is used. + X = np.random.RandomState(global_random_seed).random_sample((100000, 1)) + 1 + + if strategy == "quantile": + kbd_subsampling = KBinsDiscretizer( + strategy=strategy, + subsample=50000, + random_state=global_random_seed, + quantile_method="averaged_inverted_cdf", + ) + else: + kbd_subsampling = KBinsDiscretizer( + strategy=strategy, subsample=50000, random_state=global_random_seed + ) + kbd_subsampling.fit(X) + + kbd_no_subsampling = clone(kbd_subsampling) + kbd_no_subsampling.set_params(subsample=None) + kbd_no_subsampling.fit(X) + + # We use a large tolerance because we can't expect the bin edges to be exactly the + # same when subsampling is used. + assert_allclose( + kbd_subsampling.bin_edges_[0], kbd_no_subsampling.bin_edges_[0], rtol=1e-2 + ) + + +def test_quantile_method_future_warnings(): + X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]] + with pytest.warns( + FutureWarning, + match="The current default behavior, quantile_method='linear', will be " + "changed to quantile_method='averaged_inverted_cdf' in " + "scikit-learn version 1.9 to naturally support sample weight " + "equivalence properties by default. Pass " + "quantile_method='averaged_inverted_cdf' explicitly to silence this " + "warning.", + ): + KBinsDiscretizer(strategy="quantile").fit(X) + + +def test_invalid_quantile_method_with_sample_weight(): + X = [[-2, 1, -4], [-1, 2, -3], [0, 3, -2], [1, 4, -1]] + expected_msg = ( + "When fitting with strategy='quantile' and sample weights, " + "quantile_method should either be set to 'averaged_inverted_cdf' or " + "'inverted_cdf', got quantile_method='linear' instead." 
+ ) + with pytest.raises( + ValueError, + match=expected_msg, + ): + KBinsDiscretizer(strategy="quantile", quantile_method="linear").fit( + X, + sample_weight=[1, 1, 2, 2], + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..f843a4f16d17074c7f9414bbc4733a8cd49a7ac8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_encoders.py @@ -0,0 +1,2367 @@ +import re +import warnings + +import numpy as np +import pytest +from scipy import sparse + +from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +from sklearn.utils._missing import is_scalar_nan +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) +from sklearn.utils.fixes import CSR_CONTAINERS + + +def test_one_hot_encoder_sparse_dense(): + # check that sparse and dense will give the same results + + X = np.array([[3, 2, 1], [0, 1, 1]]) + enc_sparse = OneHotEncoder() + enc_dense = OneHotEncoder(sparse_output=False) + + X_trans_sparse = enc_sparse.fit_transform(X) + X_trans_dense = enc_dense.fit_transform(X) + + assert X_trans_sparse.shape == (2, 5) + assert X_trans_dense.shape == (2, 5) + + assert sparse.issparse(X_trans_sparse) + assert not sparse.issparse(X_trans_dense) + + # check outcome + assert_array_equal( + X_trans_sparse.toarray(), [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]] + ) + assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_one_hot_encoder_handle_unknown(handle_unknown): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + X2 = np.array([[4, 1, 1]]) + + # Test that one hot encoder raises error for unknown features + # present during transform. 
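+    # Editor's note -- illustrative clarification, not part of the upstream
+    # test: after fitting, the 7 output columns are
+    #     [x0_0, x0_1, x1_0, x1_2, x2_1, x2_2, x2_3].
+    # For X2 = [[4, 1, 1]] the first two values are unknown categories, so in
+    # the non-error branches below their blocks stay all-zero and only x2_1 is
+    # set, giving the expected row [0, 0, 0, 0, 1, 0, 0].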
+ oh = OneHotEncoder(handle_unknown="error") + oh.fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + oh.transform(X2) + + # Test the ignore option, ignores unknown features (giving all 0's) + oh = OneHotEncoder(handle_unknown=handle_unknown) + oh.fit(X) + X2_passed = X2.copy() + assert_array_equal( + oh.transform(X2_passed).toarray(), + np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]), + ) + # ensure transformed data was not modified in place + assert_allclose(X2, X2_passed) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_one_hot_encoder_handle_unknown_strings(handle_unknown): + X = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1)) + X2 = np.array(["55555", "22"]).reshape((-1, 1)) + # Non Regression test for the issue #12470 + # Test the ignore option, when categories are numpy string dtype + # particularly when the known category strings are larger + # than the unknown category strings + oh = OneHotEncoder(handle_unknown=handle_unknown) + oh.fit(X) + X2_passed = X2.copy() + assert_array_equal( + oh.transform(X2_passed).toarray(), + np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]), + ) + # ensure transformed data was not modified in place + assert_array_equal(X2, X2_passed) + + +@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64]) +@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64]) +def test_one_hot_encoder_dtype(input_dtype, output_dtype): + X = np.asarray([[0, 1]], dtype=input_dtype).T + X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype) + + oh = OneHotEncoder(categories="auto", dtype=output_dtype) + assert_array_equal(oh.fit_transform(X).toarray(), X_expected) + assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected) + + oh = OneHotEncoder(categories="auto", dtype=output_dtype, sparse_output=False) + assert_array_equal(oh.fit_transform(X), X_expected) + assert_array_equal(oh.fit(X).transform(X), X_expected) + + +@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64]) +def test_one_hot_encoder_dtype_pandas(output_dtype): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype) + + oh = OneHotEncoder(dtype=output_dtype) + assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected) + assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected) + + oh = OneHotEncoder(dtype=output_dtype, sparse_output=False) + assert_array_equal(oh.fit_transform(X_df), X_expected) + assert_array_equal(oh.fit(X_df).transform(X_df), X_expected) + + +def test_one_hot_encoder_feature_names(): + enc = OneHotEncoder() + X = [ + ["Male", 1, "girl", 2, 3], + ["Female", 41, "girl", 1, 10], + ["Male", 51, "boy", 12, 3], + ["Male", 91, "girl", 21, 30], + ] + + enc.fit(X) + feature_names = enc.get_feature_names_out() + + assert_array_equal( + [ + "x0_Female", + "x0_Male", + "x1_1", + "x1_41", + "x1_51", + "x1_91", + "x2_boy", + "x2_girl", + "x3_1", + "x3_2", + "x3_12", + "x3_21", + "x4_3", + "x4_10", + "x4_30", + ], + feature_names, + ) + + feature_names2 = enc.get_feature_names_out(["one", "two", "three", "four", "five"]) + + assert_array_equal( + [ + "one_Female", + "one_Male", + "two_1", + "two_41", + "two_51", + "two_91", + "three_boy", + "three_girl", + "four_1", + "four_2", + "four_12", + "four_21", + "five_3", + "five_10", + "five_30", + ], + feature_names2, + ) + + with pytest.raises(ValueError, 
match="input_features should have length"): + enc.get_feature_names_out(["one", "two"]) + + +def test_one_hot_encoder_feature_names_unicode(): + enc = OneHotEncoder() + X = np.array([["c❤t1", "dat2"]], dtype=object).T + enc.fit(X) + feature_names = enc.get_feature_names_out() + assert_array_equal(["x0_c❤t1", "x0_dat2"], feature_names) + feature_names = enc.get_feature_names_out(input_features=["n👍me"]) + assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], feature_names) + + +def test_one_hot_encoder_custom_feature_name_combiner(): + """Check the behaviour of `feature_name_combiner` as a callable.""" + + def name_combiner(feature, category): + return feature + "_" + repr(category) + + enc = OneHotEncoder(feature_name_combiner=name_combiner) + X = np.array([["None", None]], dtype=object).T + enc.fit(X) + feature_names = enc.get_feature_names_out() + assert_array_equal(["x0_'None'", "x0_None"], feature_names) + feature_names = enc.get_feature_names_out(input_features=["a"]) + assert_array_equal(["a_'None'", "a_None"], feature_names) + + def wrong_combiner(feature, category): + # we should be returning a Python string + return 0 + + enc = OneHotEncoder(feature_name_combiner=wrong_combiner).fit(X) + err_msg = ( + "When `feature_name_combiner` is a callable, it should return a Python string." + ) + with pytest.raises(TypeError, match=err_msg): + enc.get_feature_names_out() + + +def test_one_hot_encoder_set_params(): + X = np.array([[1, 2]]).T + oh = OneHotEncoder() + # set params on not yet fitted object + oh.set_params(categories=[[0, 1, 2, 3]]) + assert oh.get_params()["categories"] == [[0, 1, 2, 3]] + assert oh.fit_transform(X).toarray().shape == (2, 4) + # set params on already fitted object + oh.set_params(categories=[[0, 1, 2, 3, 4]]) + assert oh.fit_transform(X).toarray().shape == (2, 5) + + +def check_categorical_onehot(X): + enc = OneHotEncoder(categories="auto") + Xtr1 = enc.fit_transform(X) + + enc = OneHotEncoder(categories="auto", sparse_output=False) + Xtr2 = enc.fit_transform(X) + + assert_allclose(Xtr1.toarray(), Xtr2) + + assert sparse.issparse(Xtr1) and Xtr1.format == "csr" + return Xtr1.toarray() + + +@pytest.mark.parametrize( + "X", + [ + [["def", 1, 55], ["abc", 2, 55]], + np.array([[10, 1, 55], [5, 2, 55]]), + np.array([["b", "A", "cat"], ["a", "B", "cat"]], dtype=object), + np.array([["b", 1, "cat"], ["a", np.nan, "cat"]], dtype=object), + np.array([["b", 1, "cat"], ["a", float("nan"), "cat"]], dtype=object), + np.array([[None, 1, "cat"], ["a", 2, "cat"]], dtype=object), + np.array([[None, 1, None], ["a", np.nan, None]], dtype=object), + np.array([[None, 1, None], ["a", float("nan"), None]], dtype=object), + ], + ids=[ + "mixed", + "numeric", + "object", + "mixed-nan", + "mixed-float-nan", + "mixed-None", + "mixed-None-nan", + "mixed-None-float-nan", + ], +) +def test_one_hot_encoder(X): + Xtr = check_categorical_onehot(np.array(X)[:, [0]]) + assert_allclose(Xtr, [[0, 1], [1, 0]]) + + Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]]) + assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]]) + + Xtr = OneHotEncoder(categories="auto").fit_transform(X) + assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]]) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize("sparse_", [False, True]) +@pytest.mark.parametrize("drop", [None, "first"]) +def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop): + X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]] + enc = OneHotEncoder(sparse_output=sparse_, 
drop=drop)
+    X_tr = enc.fit_transform(X)
+    exp = np.array(X, dtype=object)
+    assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+    X = [[2, 55], [1, 55], [3, 55]]
+    enc = OneHotEncoder(sparse_output=sparse_, categories="auto", drop=drop)
+    X_tr = enc.fit_transform(X)
+    exp = np.array(X)
+    assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+    if drop is None:
+        # with unknown categories
+        # drop is incompatible with handle_unknown=ignore
+        X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
+        enc = OneHotEncoder(
+            sparse_output=sparse_,
+            handle_unknown=handle_unknown,
+            categories=[["abc", "def"], [1, 2], [54, 55, 56]],
+        )
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+        # with an otherwise numerical output, still object if unknown
+        X = [[2, 55], [1, 55], [3, 55]]
+        enc = OneHotEncoder(
+            sparse_output=sparse_,
+            categories=[[1, 2], [54, 56]],
+            handle_unknown=handle_unknown,
+        )
+        X_tr = enc.fit_transform(X)
+        exp = np.array(X, dtype=object)
+        exp[2, 0] = None
+        exp[:, 1] = None
+        assert_array_equal(enc.inverse_transform(X_tr), exp)
+
+    # incorrect shape raises
+    X_tr = np.array([[0, 1, 1], [1, 0, 1]])
+    msg = re.escape("Shape of the passed X data is not correct")
+    with pytest.raises(ValueError, match=msg):
+        enc.inverse_transform(X_tr)
+
+
+@pytest.mark.parametrize("sparse_", [False, True])
+@pytest.mark.parametrize(
+    "X, X_trans",
+    [
+        ([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]),
+        (
+            [["one", "a"], ["two", "a"], ["three", "b"], ["two", "a"]],
+            [[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]],
+        ),
+    ],
+)
+def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
+    X, X_trans, sparse_
+):
+    """Check that `inverse_transform` raises an error with unknown samples, no
+    dropped feature, and `handle_unknown="error"`.
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/14934
+    """
+    enc = OneHotEncoder(sparse_output=sparse_).fit(X)
+    msg = (
+        r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
+        r"handle_unknown='error' because they contain all zeros"
+    )
+
+    if sparse_:
+        # emulate the sparse output produced by OneHotEncoder with sparse_output=True.
+ X_trans = _convert_container(X_trans, "sparse") + with pytest.raises(ValueError, match=msg): + enc.inverse_transform(X_trans) + + +def test_one_hot_encoder_inverse_if_binary(): + X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object) + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + X_tr = ohe.fit_transform(X) + assert_array_equal(ohe.inverse_transform(X_tr), X) + + +@pytest.mark.parametrize("drop", ["if_binary", "first", None]) +@pytest.mark.parametrize("reset_drop", ["if_binary", "first", None]) +def test_one_hot_encoder_drop_reset(drop, reset_drop): + # check that resetting drop option without refitting does not throw an error + X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object) + ohe = OneHotEncoder(drop=drop, sparse_output=False) + ohe.fit(X) + X_tr = ohe.transform(X) + feature_names = ohe.get_feature_names_out() + ohe.set_params(drop=reset_drop) + assert_array_equal(ohe.inverse_transform(X_tr), X) + assert_allclose(ohe.transform(X), X_tr) + assert_array_equal(ohe.get_feature_names_out(), feature_names) + + +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +@pytest.mark.parametrize("X", [[1, 2], np.array([3.0, 4.0])]) +def test_X_is_not_1D(X, method): + oh = OneHotEncoder() + + msg = "Expected 2D array, got 1D array instead" + with pytest.raises(ValueError, match=msg): + getattr(oh, method)(X) + + +@pytest.mark.parametrize("method", ["fit", "fit_transform"]) +def test_X_is_not_1D_pandas(method): + pd = pytest.importorskip("pandas") + X = pd.Series([6, 3, 4, 6]) + oh = OneHotEncoder() + + msg = f"Expected a 2-dimensional container but got {type(X)} instead." + with pytest.raises(ValueError, match=msg): + getattr(oh, method)(X) + + +@pytest.mark.parametrize( + "X, cat_exp, cat_dtype", + [ + ([["abc", 55], ["def", 55]], [["abc", "def"], [55]], np.object_), + (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer), + ( + np.array([["A", "cat"], ["B", "cat"]], dtype=object), + [["A", "B"], ["cat"]], + np.object_, + ), + (np.array([["A", "cat"], ["B", "cat"]]), [["A", "B"], ["cat"]], np.str_), + (np.array([[1, 2], [np.nan, 2]]), [[1, np.nan], [2]], np.float64), + ( + np.array([["A", np.nan], [None, np.nan]], dtype=object), + [["A", None], [np.nan]], + np.object_, + ), + ( + np.array([["A", float("nan")], [None, float("nan")]], dtype=object), + [["A", None], [float("nan")]], + np.object_, + ), + ], + ids=[ + "mixed", + "numeric", + "object", + "string", + "missing-float", + "missing-np.nan-object", + "missing-float-nan-object", + ], +) +def test_one_hot_encoder_categories(X, cat_exp, cat_dtype): + # order of categories should not depend on order of samples + for Xi in [X, X[::-1]]: + enc = OneHotEncoder(categories="auto") + enc.fit(Xi) + # assert enc.categories == 'auto' + assert isinstance(enc.categories_, list) + for res, exp in zip(enc.categories_, cat_exp): + res_list = res.tolist() + if is_scalar_nan(exp[-1]): + assert is_scalar_nan(res_list[-1]) + assert res_list[:-1] == exp[:-1] + else: + assert res.tolist() == exp + assert np.issubdtype(res.dtype, cat_dtype) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [["a", "b", "c"]], + np.object_, + ), + ( + np.array([[1, 2]], dtype="int64").T, + np.array([[1, 4]], dtype="int64").T, + [[1, 2, 3]], + np.int64, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], 
dtype=object).T, + [np.array(["a", "b", "c"])], + np.object_, + ), + ( + np.array([[None, "a"]], dtype=object).T, + np.array([[None, "b"]], dtype=object).T, + [[None, "a", "z"]], + object, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", np.nan]], dtype=object).T, + [["a", "b", "z"]], + object, + ), + ( + np.array([["a", None]], dtype=object).T, + np.array([["a", np.nan]], dtype=object).T, + [["a", None, "z"]], + object, + ), + ], + ids=[ + "object", + "numeric", + "object-string", + "object-string-none", + "object-string-nan", + "object-None-and-nan", + ], +) +def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, handle_unknown): + enc = OneHotEncoder(categories=cats) + exp = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + enc = OneHotEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.fit(X2) + enc = OneHotEncoder(categories=cats, handle_unknown=handle_unknown) + exp = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp) + + +def test_one_hot_encoder_unsorted_categories(): + X = np.array([["a", "b"]], dtype=object).T + + enc = OneHotEncoder(categories=[["b", "a", "c"]]) + exp = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]]) + assert_array_equal(enc.fit(X).transform(X).toarray(), exp) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ["b", "a", "c"] + assert np.issubdtype(enc.categories_[0].dtype, np.object_) + + # unsorted passed categories still raise for numerical values + X = np.array([[1, 2]]).T + enc = OneHotEncoder(categories=[[2, 1, 3]]) + msg = "Unsorted categories are not supported" + with pytest.raises(ValueError, match=msg): + enc.fit_transform(X) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_nan_ending_specified_categories(Encoder): + """Test encoder for specified categories that nan is at the end. 
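+    Here np.nan is placed in the middle of the user-provided categories, so
+    fitting is expected to raise a ValueError.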
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27088 + """ + cats = [np.array([0, np.nan, 1])] + enc = Encoder(categories=cats) + X = np.array([[0, 1]], dtype=object).T + with pytest.raises(ValueError, match="Nan should be the last element"): + enc.fit(X) + + +def test_one_hot_encoder_specified_categories_mixed_columns(): + # multiple columns + X = np.array([["a", "b"], [0, 2]], dtype=object).T + enc = OneHotEncoder(categories=[["a", "b", "c"], [0, 1, 2]]) + exp = np.array([[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]]) + assert_array_equal(enc.fit_transform(X).toarray(), exp) + assert enc.categories_[0].tolist() == ["a", "b", "c"] + assert np.issubdtype(enc.categories_[0].dtype, np.object_) + assert enc.categories_[1].tolist() == [0, 1, 2] + # integer categories but from object dtype data + assert np.issubdtype(enc.categories_[1].dtype, np.object_) + + +def test_one_hot_encoder_pandas(): + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + + Xtr = check_categorical_onehot(X_df) + assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]]) + + +@pytest.mark.parametrize( + "drop, expected_names", + [ + ("first", ["x0_c", "x2_b"]), + ("if_binary", ["x0_c", "x1_2", "x2_b"]), + (["c", 2, "b"], ["x0_b", "x2_a"]), + ], + ids=["first", "binary", "manual"], +) +def test_one_hot_encoder_feature_names_drop(drop, expected_names): + X = [["c", 2, "a"], ["b", 2, "b"]] + + ohe = OneHotEncoder(drop=drop) + ohe.fit(X) + feature_names = ohe.get_feature_names_out() + assert_array_equal(expected_names, feature_names) + + +def test_one_hot_encoder_drop_equals_if_binary(): + # Canonical case + X = [[10, "yes"], [20, "no"], [30, "yes"]] + expected = np.array( + [[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]] + ) + expected_drop_idx = np.array([None, 0]) + + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + result = ohe.fit_transform(X) + assert_array_equal(ohe.drop_idx_, expected_drop_idx) + assert_allclose(result, expected) + + # with only one cat, the behaviour is equivalent to drop=None + X = [["true", "a"], ["false", "a"], ["false", "a"]] + expected = np.array([[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + expected_drop_idx = np.array([0, None]) + + ohe = OneHotEncoder(drop="if_binary", sparse_output=False) + result = ohe.fit_transform(X) + assert_array_equal(ohe.drop_idx_, expected_drop_idx) + assert_allclose(result, expected) + + +@pytest.mark.parametrize( + "X", + [ + [["abc", 2, 55], ["def", 1, 55]], + np.array([[10, 2, 55], [20, 1, 55]]), + np.array([["a", "B", "cat"], ["b", "A", "cat"]], dtype=object), + ], + ids=["mixed", "numeric", "object"], +) +def test_ordinal_encoder(X): + enc = OrdinalEncoder() + exp = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64") + assert_array_equal(enc.fit_transform(X), exp.astype("float64")) + enc = OrdinalEncoder(dtype="int64") + assert_array_equal(enc.fit_transform(X), exp) + + +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [["a", "b", "c"]], + np.object_, + ), + ( + np.array([[1, 2]], dtype="int64").T, + np.array([[1, 4]], dtype="int64").T, + [[1, 2, 3]], + np.int64, + ), + ( + np.array([["a", "b"]], dtype=object).T, + np.array([["a", "d"]], dtype=object).T, + [np.array(["a", "b", "c"])], + np.object_, + ), + ], + ids=["object", "numeric", "object-string-cat"], +) +def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype): + enc = 
OrdinalEncoder(categories=cats) + exp = np.array([[0.0], [1.0]]) + assert_array_equal(enc.fit_transform(X), exp) + assert list(enc.categories[0]) == list(cats[0]) + assert enc.categories_[0].tolist() == list(cats[0]) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert enc.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories should already + # raise when fitting + enc = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + enc.fit(X2) + + +def test_ordinal_encoder_inverse(): + X = [["abc", 2, 55], ["def", 1, 55]] + enc = OrdinalEncoder() + X_tr = enc.fit_transform(X) + exp = np.array(X, dtype=object) + assert_array_equal(enc.inverse_transform(X_tr), exp) + + # incorrect shape raises + X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]]) + msg = re.escape("Shape of the passed X data is not correct") + with pytest.raises(ValueError, match=msg): + enc.inverse_transform(X_tr) + + +def test_ordinal_encoder_handle_unknowns_string(): + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2) + X_fit = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object) + X_trans = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object) + enc.fit(X_fit) + + X_trans_enc = enc.transform(X_trans) + exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64") + assert_array_equal(X_trans_enc, exp) + + X_trans_inv = enc.inverse_transform(X_trans_enc) + inv_exp = np.array([["c", None], [None, "y"], ["a", "x"]], dtype=object) + assert_array_equal(X_trans_inv, inv_exp) + + +@pytest.mark.parametrize("dtype", [float, int]) +def test_ordinal_encoder_handle_unknowns_numeric(dtype): + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999) + X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype) + X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype) + enc.fit(X_fit) + + X_trans_enc = enc.transform(X_trans) + exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64") + assert_array_equal(X_trans_enc, exp) + + X_trans_inv = enc.inverse_transform(X_trans_enc) + inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object) + assert_array_equal(X_trans_inv, inv_exp) + + +def test_ordinal_encoder_handle_unknowns_nan(): + # Make sure unknown_value=np.nan properly works + + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan) + + X_fit = np.array([[1], [2], [3]]) + enc.fit(X_fit) + X_trans = enc.transform([[1], [2], [4]]) + assert_array_equal(X_trans, [[0], [1], [np.nan]]) + + +def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype(): + # Make sure an error is raised when unknown_value=np.nan and the dtype + # isn't a float dtype + enc = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int + ) + + X_fit = np.array([[1], [2], [3]]) + with pytest.raises(ValueError, match="dtype parameter should be a float dtype"): + enc.fit(X_fit) + + +def test_ordinal_encoder_raise_categories_shape(): + X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T + cats = ["Low", "Medium", "High"] + enc = OrdinalEncoder(categories=cats) + msg = "Shape mismatch: if categories is an array," + + with pytest.raises(ValueError, match=msg): + enc.fit(X) + + +def test_encoder_dtypes(): + # check that dtypes are preserved when determining categories + enc = OneHotEncoder(categories="auto") + exp = np.array([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64") + + for X in [ 
+ np.array([[1, 2], [3, 4]], dtype="int64"), + np.array([[1, 2], [3, 4]], dtype="float64"), + np.array([["a", "b"], ["c", "d"]]), # str dtype + np.array([[b"a", b"b"], [b"c", b"d"]]), # bytes dtype + np.array([[1, "a"], [3, "b"]], dtype="object"), + ]: + enc.fit(X) + assert all([enc.categories_[i].dtype == X.dtype for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, 2], [3, 4]] + enc.fit(X) + assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = [[1, "a"], [3, "b"]] + enc.fit(X) + assert all([enc.categories_[i].dtype == "object" for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_encoder_dtypes_pandas(): + # check dtype (similar to test_categorical_encoder_dtypes for dataframes) + pd = pytest.importorskip("pandas") + + enc = OneHotEncoder(categories="auto") + exp = np.array( + [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]], + dtype="float64", + ) + + X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64") + enc.fit(X) + assert all([enc.categories_[i].dtype == "int64" for i in range(2)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]}) + expected_cat_type = ["int64", "object", "float64"] + enc.fit(X) + assert all([enc.categories_[i].dtype == expected_cat_type[i] for i in range(3)]) + assert_array_equal(enc.transform(X).toarray(), exp) + + +def test_one_hot_encoder_warning(): + enc = OneHotEncoder() + X = [["Male", 1], ["Female", 3]] + with warnings.catch_warnings(): + warnings.simplefilter("error") + enc.fit_transform(X) + + +@pytest.mark.parametrize("drop", ["if_binary", "first"]) +def test_ohe_handle_unknown_warn(drop): + """Check handle_unknown='warn' works correctly.""" + + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop=drop, + sparse_output=False, + handle_unknown="warn", + categories=[["b", "a"], [1, 2]], + ) + ohe.fit(X) + + X_test = [["c", 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0\] during transform. 
" + r"These unknown categories will be encoded as all zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + +@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")]) +def test_one_hot_encoder_drop_manual(missing_value): + cats_to_drop = ["def", 12, 3, 56, missing_value] + enc = OneHotEncoder(drop=cats_to_drop) + X = [ + ["abc", 12, 2, 55, "a"], + ["def", 12, 1, 55, "a"], + ["def", 12, 3, 56, missing_value], + ] + trans = enc.fit_transform(X).toarray() + exp = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]] + assert_array_equal(trans, exp) + assert enc.drop is cats_to_drop + + dropped_cats = [ + cat[feature] for cat, feature in zip(enc.categories_, enc.drop_idx_) + ] + X_inv_trans = enc.inverse_transform(trans) + X_array = np.array(X, dtype=object) + + # last value is np.nan + if is_scalar_nan(cats_to_drop[-1]): + assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1]) + assert is_scalar_nan(dropped_cats[-1]) + assert is_scalar_nan(cats_to_drop[-1]) + # do not include the last column which includes missing values + assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1]) + + # check last column is the missing value + assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1]) + assert is_scalar_nan(X_array[-1, -1]) + assert is_scalar_nan(X_inv_trans[-1, -1]) + else: + assert_array_equal(dropped_cats, cats_to_drop) + assert_array_equal(X_array, X_inv_trans) + + +@pytest.mark.parametrize("drop", [["abc", 3], ["abc", 3, 41, "a"]]) +def test_invalid_drop_length(drop): + enc = OneHotEncoder(drop=drop) + err_msg = "`drop` should have length equal to the number" + with pytest.raises(ValueError, match=err_msg): + enc.fit([["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]]) + + +@pytest.mark.parametrize("density", [True, False], ids=["sparse", "dense"]) +@pytest.mark.parametrize("drop", ["first", ["a", 2, "b"]], ids=["first", "manual"]) +def test_categories(density, drop): + ohe_base = OneHotEncoder(sparse_output=density) + ohe_test = OneHotEncoder(sparse_output=density, drop=drop) + X = [["c", 1, "a"], ["a", 2, "b"]] + ohe_base.fit(X) + ohe_test.fit(X) + assert_array_equal(ohe_base.categories_, ohe_test.categories_) + if drop == "first": + assert_array_equal(ohe_test.drop_idx_, 0) + else: + for drop_cat, drop_idx, cat_list in zip( + drop, ohe_test.drop_idx_, ohe_test.categories_ + ): + assert cat_list[int(drop_idx)] == drop_cat + assert isinstance(ohe_test.drop_idx_, np.ndarray) + assert ohe_test.drop_idx_.dtype == object + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoders_has_categorical_tags(Encoder): + assert Encoder().__sklearn_tags__().input_tags.categorical + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 2}, + {"min_frequency": 11}, + {"min_frequency": 0.29}, + {"max_categories": 2, "min_frequency": 6}, + {"max_categories": 4, "min_frequency": 12}, + ], +) +@pytest.mark.parametrize("categories", ["auto", [["a", "b", "c", "d"]]]) +def test_ohe_infrequent_two_levels(kwargs, categories): + """Test that different parameters for combine 'a', 'c', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + categories=categories, + handle_unknown="infrequent_if_exist", + sparse_output=False, + **kwargs, + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "c", "d"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + 
expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names) + + +@pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]]) +def test_ohe_infrequent_two_levels_drop_frequent(drop): + """Test two levels and dropping the frequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=2, + drop=drop, + ).fit(X_train) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "b" + + X_test = np.array([["b"], ["c"]]) + X_trans = ohe.transform(X_test) + assert_allclose([[0], [1]], X_trans) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_infrequent_sklearn"], feature_names) + + X_inverse = ohe.inverse_transform(X_trans) + assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse) + + +@pytest.mark.parametrize("drop", [["a"], ["d"]]) +def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop): + """Test two levels and dropping any infrequent category removes the + whole infrequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=2, + drop=drop, + ) + + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 3}, + {"min_frequency": 6}, + {"min_frequency": 9}, + {"min_frequency": 0.24}, + {"min_frequency": 0.16}, + {"max_categories": 3, "min_frequency": 8}, + {"max_categories": 4, "min_frequency": 6}, + ], +) +def test_ohe_infrequent_three_levels(kwargs): + """Test that different parameters for combing 'a', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + feature_names = ohe.get_feature_names_out() + assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names) + + +@pytest.mark.parametrize("drop", ["first", ["b"]]) +def test_ohe_infrequent_three_levels_drop_frequent(drop): + """Test three levels and dropping the frequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=3, + drop=drop, + ).fit(X_train) + + X_test = np.array([["b"], ["c"], ["d"]]) + assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test)) + + # Check handle_unknown="ignore" + 
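+    # With a category dropped, unknown values are encoded as all zeros, the same
+    # encoding as the dropped category itself, hence the warning checked below.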
ohe.set_params(handle_unknown="ignore").fit(X_train) + msg = "Found unknown categories" + with pytest.warns(UserWarning, match=msg): + X_trans = ohe.transform([["b"], ["e"]]) + + assert_allclose([[0, 0], [0, 0]], X_trans) + + +@pytest.mark.parametrize("drop", [["a"], ["d"]]) +def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop): + """Test three levels and dropping the infrequent category.""" + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", + sparse_output=False, + max_categories=3, + drop=drop, + ) + + msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent" + with pytest.raises(ValueError, match=msg): + ohe.fit(X_train) + + +def test_ohe_infrequent_handle_unknown_error(): + """Test that different parameters for combining 'a', and 'd' into + the infrequent category works as expected.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ohe = OneHotEncoder( + handle_unknown="error", sparse_output=False, max_categories=3 + ).fit(X_train) + assert_array_equal(ohe.infrequent_categories_, [["a", "d"]]) + + # all categories are known + X_test = [["b"], ["a"], ["c"], ["d"]] + expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'bad' is not known and will error + X_test = [["bad"]] + msg = r"Found unknown categories \['bad'\] in column 0" + with pytest.raises(ValueError, match=msg): + ohe.transform(X_test) + + +@pytest.mark.parametrize( + "kwargs", [{"max_categories": 3, "min_frequency": 1}, {"min_frequency": 4}] +) +def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs): + """'a' is the only frequent category, all other categories are infrequent.""" + + X_train = np.array([["a"] * 5 + ["e"] * 30], dtype=object).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + **kwargs, + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'a' is dropped + drops = ["first", "if_binary", ["a"]] + X_test = [["a"], ["c"]] + for drop in drops: + ohe.set_params(drop=drop).fit(X_train) + assert_allclose([[0], [1]], ohe.transform(X_test)) + + +def test_ohe_infrequent_two_levels_user_cats(): + """Test that the order of the categories provided by a user is respected.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + max_categories=2, + ).fit(X_train) + + assert_array_equal(ohe.infrequent_categories_, [["c", "d", "a"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` + expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_three_levels_user_cats(): + """Test that the order of the categories provided by a user is respected. 
+ In this case 'c' is encoded as the first category and 'b' is encoded + as the second one.""" + + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ohe = OneHotEncoder( + categories=[["c", "d", "b", "a"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + max_categories=3, + ).fit(X_train) + + assert_array_equal(ohe.infrequent_categories_, [["d", "a"]]) + + X_test = [["b"], ["a"], ["c"], ["d"], ["e"]] + expected = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 0, 1]]) + + X_trans = ohe.transform(X_test) + assert_allclose(expected, X_trans) + + # 'infrequent' is used to denote the infrequent categories for + # `inverse_transform` + expected_inv = [ + ["b"], + ["infrequent_sklearn"], + ["c"], + ["infrequent_sklearn"], + ["infrequent_sklearn"], + ] + X_inv = ohe.inverse_transform(X_trans) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_mixed(): + """Test infrequent categories where feature 0 has infrequent categories, + and feature 1 does not.""" + + # X[:, 0] 1 and 2 are infrequent + # X[:, 1] nothing is infrequent + X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1]] + + ohe = OneHotEncoder(max_categories=3, drop="if_binary", sparse_output=False) + ohe.fit(X) + + X_test = [[3, 0], [1, 1]] + X_trans = ohe.transform(X_test) + + # feature 1 is binary so it drops a category 0 + assert_allclose(X_trans, [[0, 1, 0, 0], [0, 0, 1, 1]]) + + +def test_ohe_infrequent_multiple_categories(): + """Test infrequent categories with feature matrix with 3 features.""" + + X = np.c_[ + [0, 1, 3, 3, 3, 3, 2, 0, 3], + [0, 0, 5, 1, 1, 10, 5, 5, 0], + [1, 0, 1, 0, 1, 0, 1, 0, 1], + ] + + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="infrequent_if_exist" + ) + # X[:, 0] 1 and 2 are infrequent + # X[:, 1] 1 and 10 are infrequent + # X[:, 2] nothing is infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_array_equal(ohe.infrequent_categories_[0], [1, 2]) + assert_array_equal(ohe.infrequent_categories_[1], [1, 10]) + assert_array_equal(ohe.infrequent_categories_[2], None) + + # 'infrequent' is used to denote the infrequent categories + # For the first column, 1 and 2 have the same frequency. 
In this case, + # 1 will be chosen to be the feature name because is smaller lexiconically + feature_names = ohe.get_feature_names_out() + assert_array_equal( + [ + "x0_0", + "x0_3", + "x0_infrequent_sklearn", + "x1_0", + "x1_5", + "x1_infrequent_sklearn", + "x2_0", + "x2_1", + ], + feature_names, + ) + + expected = [ + [1, 0, 0, 1, 0, 0, 0, 1], + [0, 0, 1, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 1, 0, 0, 0, 1, 0, 1], + [0, 1, 0, 0, 0, 1, 1, 0], + [0, 0, 1, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 1], + ] + + assert_allclose(expected, X_trans) + + X_test = [[3, 1, 2], [4, 0, 3]] + + X_test_trans = ohe.transform(X_test) + + # X[:, 2] does not have an infrequent category, thus it is encoded as all + # zeros + expected = [[0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0]] + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]], dtype=object + ) + assert_array_equal(expected_inv, X_inv) + + # error for unknown categories + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="error" + ).fit(X) + with pytest.raises(ValueError, match="Found unknown categories"): + ohe.transform(X_test) + + # only infrequent or known categories + X_test = [[1, 1, 1], [3, 10, 0]] + X_test_trans = ohe.transform(X_test) + + expected = [[0, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]] + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + + expected_inv = np.array( + [["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]], + dtype=object, + ) + assert_array_equal(expected_inv, X_inv) + + +def test_ohe_infrequent_multiple_categories_dtypes(): + """Test infrequent categories with a pandas dataframe with multiple dtypes.""" + + pd = pytest.importorskip("pandas") + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + }, + columns=["str", "int"], + ) + + ohe = OneHotEncoder( + categories="auto", max_categories=3, handle_unknown="infrequent_if_exist" + ) + # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be + # considered infrequent because they are greater + + # X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1. 
+ # 0, 3, 12 will be considered infrequent + + X_trans = ohe.fit_transform(X).toarray() + assert_array_equal(ohe.infrequent_categories_[0], ["a", "b"]) + assert_array_equal(ohe.infrequent_categories_[1], [0, 3, 12]) + + expected = [ + [0, 0, 1, 1, 0, 0], + [0, 1, 0, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 0, 1], + [0, 0, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 0], + ] + + assert_allclose(expected, X_trans) + + X_test = pd.DataFrame({"str": ["b", "f"], "int": [14, 12]}, columns=["str", "int"]) + + expected = [[0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]] + X_test_trans = ohe.transform(X_test) + assert_allclose(expected, X_test_trans.toarray()) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]], + dtype=object, + ) + assert_array_equal(expected_inv, X_inv) + + # only infrequent or known categories + X_test = pd.DataFrame({"str": ["c", "b"], "int": [12, 5]}, columns=["str", "int"]) + X_test_trans = ohe.transform(X_test).toarray() + expected = [[1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 0, 0]] + assert_allclose(expected, X_test_trans) + + X_inv = ohe.inverse_transform(X_test_trans) + expected_inv = np.array( + [["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object + ) + assert_array_equal(expected_inv, X_inv) + + +@pytest.mark.parametrize("kwargs", [{"min_frequency": 21, "max_categories": 1}]) +def test_ohe_infrequent_one_level_errors(kwargs): + """All user provided categories are infrequent.""" + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T + + ohe = OneHotEncoder( + handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs + ) + ohe.fit(X_train) + + X_trans = ohe.transform([["a"]]) + assert_allclose(X_trans, [[1]]) + + +@pytest.mark.parametrize("kwargs", [{"min_frequency": 2, "max_categories": 3}]) +def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs): + """All user provided categories are infrequent.""" + + X_train = np.array([["e"] * 3], dtype=object).T + ohe = OneHotEncoder( + categories=[["c", "d", "a", "b"]], + sparse_output=False, + handle_unknown="infrequent_if_exist", + **kwargs, + ).fit(X_train) + + X_trans = ohe.transform([["a"], ["e"]]) + assert_allclose(X_trans, [[1], [1]]) + + +# deliberately omit 'OS' as an invalid combo +@pytest.mark.parametrize( + "input_dtype, category_dtype", ["OO", "OU", "UO", "UU", "SO", "SU", "SS"] +) +@pytest.mark.parametrize("array_type", ["list", "array", "dataframe"]) +def test_encoders_string_categories(input_dtype, category_dtype, array_type): + """Check that encoding work with object, unicode, and byte string dtypes. 
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/15616 + https://github.com/scikit-learn/scikit-learn/issues/15726 + https://github.com/scikit-learn/scikit-learn/issues/19677 + """ + + X = np.array([["b"], ["a"]], dtype=input_dtype) + categories = [np.array(["b", "a"], dtype=category_dtype)] + ohe = OneHotEncoder(categories=categories, sparse_output=False).fit(X) + + X_test = _convert_container( + [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype + ) + X_trans = ohe.transform(X_test) + + expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]]) + assert_allclose(X_trans, expected) + + oe = OrdinalEncoder(categories=categories).fit(X) + X_trans = oe.transform(X_test) + + expected = np.array([[1], [1], [0], [1]]) + assert_array_equal(X_trans, expected) + + +def test_mixed_string_bytes_categoricals(): + """Check that this mixture of predefined categories and X raises an error. + + Categories defined as bytes can not easily be compared to data that is + a string. + """ + # data as unicode + X = np.array([["b"], ["a"]], dtype="U") + # predefined categories as bytes + categories = [np.array(["b", "a"], dtype="S")] + ohe = OneHotEncoder(categories=categories, sparse_output=False) + + msg = re.escape( + "In column 0, the predefined categories have type 'bytes' which is incompatible" + " with values of type 'str_'." + ) + + with pytest.raises(ValueError, match=msg): + ohe.fit(X) + + +@pytest.mark.parametrize("missing_value", [np.nan, None]) +def test_ohe_missing_values_get_feature_names(missing_value): + # encoder with missing values with object dtypes + X = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T + ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(X) + names = ohe.get_feature_names_out() + assert_array_equal(names, ["x0_a", "x0_b", f"x0_{missing_value}"]) + + +def test_ohe_missing_value_support_pandas(): + # check support for pandas with mixed dtypes and missing values + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + { + "col1": ["dog", "cat", None, "cat"], + "col2": np.array([3, 0, 4, np.nan], dtype=float), + }, + columns=["col1", "col2"], + ) + expected_df_trans = np.array( + [ + [0, 1, 0, 0, 1, 0, 0], + [1, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 1], + ] + ) + + Xtr = check_categorical_onehot(df) + assert_allclose(Xtr, expected_df_trans) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"]) +def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown): + # checks pandas dataframe with categorical features + pd = pytest.importorskip("pandas") + + pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan + + df = pd.DataFrame( + { + "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"), + } + ) + expected_df_trans = np.array( + [ + [0, 0, 1, 0], + [1, 0, 0, 0], + [0, 0, 0, 1], + [0, 1, 0, 0], + [1, 0, 0, 0], + ] + ) + + ohe = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown) + df_trans = ohe.fit_transform(df) + assert_allclose(expected_df_trans, df_trans) + + assert len(ohe.categories_) == 1 + assert_array_equal(ohe.categories_[0][:-1], ["a", "b", "c"]) + assert np.isnan(ohe.categories_[0][-1]) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown): + """Check drop='first' and 
handle_unknown='ignore'/'infrequent_if_exist' + during transform.""" + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="first", sparse_output=False, handle_unknown=handle_unknown + ) + X_trans = ohe.fit_transform(X) + + X_expected = np.array( + [ + [0, 0, 0], + [1, 0, 1], + [1, 1, 0], + ] + ) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [["c", 3]] + X_expected = np.array([[0, 0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([["a", 0]], dtype=object)) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown): + """Check drop='if_binary' and handle_unknown='ignore' during transform.""" + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="if_binary", sparse_output=False, handle_unknown=handle_unknown + ) + X_trans = ohe.fit_transform(X) + + X_expected = np.array( + [ + [0, 1, 0, 0], + [1, 0, 0, 1], + [1, 0, 1, 0], + ] + ) + assert_allclose(X_trans, X_expected) + + # Both categories are unknown + X_test = [["c", 3]] + X_expected = np.array([[0, 0, 0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0, 1\] during " + "transform. These unknown categories will be encoded as all " + "zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + # inverse_transform maps to None + X_inv = ohe.inverse_transform(X_expected) + assert_array_equal(X_inv, np.array([["a", None]], dtype=object)) + + +@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist", "warn"]) +def test_ohe_drop_first_explicit_categories(handle_unknown): + """Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist' + during fit with categories passed in.""" + + X = [["a", 0], ["b", 2], ["b", 1]] + + ohe = OneHotEncoder( + drop="first", + sparse_output=False, + handle_unknown=handle_unknown, + categories=[["b", "a"], [1, 2]], + ) + ohe.fit(X) + + X_test = [["c", 1]] + X_expected = np.array([[0, 0]]) + + warn_msg = ( + r"Found unknown categories in columns \[0\] during transform. " + r"These unknown categories will be encoded as all zeros" + ) + with pytest.warns(UserWarning, match=warn_msg): + X_trans = ohe.transform(X_test) + assert_allclose(X_trans, X_expected) + + +def test_ohe_more_informative_error_message(): + """Raise informative error message when pandas output and sparse_output=True.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"]) + + ohe = OneHotEncoder(sparse_output=True) + ohe.set_output(transform="pandas") + + msg = ( + "Pandas output does not support sparse data. 
Set " + "sparse_output=False to output pandas dataframes or disable Pandas output" + ) + with pytest.raises(ValueError, match=msg): + ohe.fit_transform(df) + + ohe.fit(df) + with pytest.raises(ValueError, match=msg): + ohe.transform(df) + + +def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype(): + """Test ordinal encoder with nan passthrough fails when dtype=np.int32.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T + oe = OrdinalEncoder(dtype=np.int32) + + msg = ( + r"There are missing values in features \[0\]. For OrdinalEncoder " + f"to encode missing values with dtype: {np.int32}" + ) + with pytest.raises(ValueError, match=msg): + oe.fit(X) + + +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value): + """Test ordinal encoder with nan on float dtypes.""" + + X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X) + + assert len(oe.categories_) == 1 + + assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan]) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]]) + + X_inverse = oe.inverse_transform(X_trans) + assert_allclose(X_inverse, X) + + +@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"]) +@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2]) +def test_ordinal_encoder_missing_value_support_pandas_categorical( + pd_nan_type, encoded_missing_value +): + """Check ordinal encoder is compatible with pandas.""" + # checks pandas dataframe with categorical features + pd = pytest.importorskip("pandas") + + pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan + + df = pd.DataFrame( + { + "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"), + } + ) + + oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df) + assert len(oe.categories_) == 1 + assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"]) + assert np.isnan(oe.categories_[0][-1]) + + df_trans = oe.transform(df) + + assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]]) + + X_inverse = oe.inverse_transform(df_trans) + assert X_inverse.shape == (5, 1) + assert_array_equal(X_inverse[:2, 0], ["c", "a"]) + assert_array_equal(X_inverse[3:, 0], ["b", "a"]) + assert np.isnan(X_inverse[2, 0]) + + +@pytest.mark.parametrize( + "X, X2, cats, cat_dtype", + [ + ( + ( + np.array([["a", np.nan]], dtype=object).T, + np.array([["a", "b"]], dtype=object).T, + [np.array(["a", "d", np.nan], dtype=object)], + np.object_, + ) + ), + ( + ( + np.array([["a", np.nan]], dtype=object).T, + np.array([["a", "b"]], dtype=object).T, + [np.array(["a", "d", np.nan], dtype=object)], + np.object_, + ) + ), + ( + ( + np.array([[2.0, np.nan]], dtype=np.float64).T, + np.array([[3.0]], dtype=np.float64).T, + [np.array([2.0, 4.0, np.nan])], + np.float64, + ) + ), + ], + ids=[ + "object-None-missing-value", + "object-nan-missing_value", + "numeric-missing-value", + ], +) +def test_ordinal_encoder_specified_categories_missing_passthrough( + X, X2, cats, cat_dtype +): + """Test ordinal encoder for specified categories.""" + oe = OrdinalEncoder(categories=cats) + exp = np.array([[0.0], [np.nan]]) + assert_array_equal(oe.fit_transform(X), exp) + # manually specified categories should have same dtype as + # the data when coerced from lists + assert oe.categories_[0].dtype == cat_dtype + + # when specifying categories manually, unknown categories 
should already + # raise when fitting + oe = OrdinalEncoder(categories=cats) + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X2) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_duplicate_specified_categories(Encoder): + """Test encoder for specified categories have duplicate values. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27088 + """ + cats = [np.array(["a", "b", "a"], dtype=object)] + enc = Encoder(categories=cats) + X = np.array([["a", "b"]], dtype=object).T + with pytest.raises( + ValueError, match="the predefined categories contain duplicate elements." + ): + enc.fit(X) + + +@pytest.mark.parametrize( + "X, expected_X_trans, X_test", + [ + ( + np.array([[1.0, np.nan, 3.0]]).T, + np.array([[0.0, np.nan, 1.0]]).T, + np.array([[4.0]]), + ), + ( + np.array([[1.0, 4.0, 3.0]]).T, + np.array([[0.0, 2.0, 1.0]]).T, + np.array([[np.nan]]), + ), + ( + np.array([["c", np.nan, "b"]], dtype=object).T, + np.array([[1.0, np.nan, 0.0]]).T, + np.array([["d"]], dtype=object), + ), + ( + np.array([["c", "a", "b"]], dtype=object).T, + np.array([[2.0, 0.0, 1.0]]).T, + np.array([[np.nan]], dtype=object), + ), + ], +) +def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test): + """Test the interaction between missing values and handle_unknown""" + + oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) + + X_trans = oe.fit_transform(X) + assert_allclose(X_trans, expected_X_trans) + + assert_allclose(oe.transform(X_test), [[-1.0]]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_ordinal_encoder_sparse(csr_container): + """Check that we raise proper error with sparse input in OrdinalEncoder. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19878 + """ + X = np.array([[3, 2, 1], [0, 1, 1]]) + X_sparse = csr_container(X) + + encoder = OrdinalEncoder() + + err_msg = "Sparse data was passed, but dense data is required" + with pytest.raises(TypeError, match=err_msg): + encoder.fit(X_sparse) + with pytest.raises(TypeError, match=err_msg): + encoder.fit_transform(X_sparse) + + X_trans = encoder.fit_transform(X) + X_trans_sparse = csr_container(X_trans) + with pytest.raises(TypeError, match=err_msg): + encoder.inverse_transform(X_trans_sparse) + + +def test_ordinal_encoder_fit_with_unseen_category(): + """Check OrdinalEncoder.fit works with unseen category when + `handle_unknown="use_encoded_value"`. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19872 + """ + X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis] + oe = OrdinalEncoder( + categories=[[-1, 0, 1]], handle_unknown="use_encoded_value", unknown_value=-999 + ) + oe.fit(X) + + oe = OrdinalEncoder(categories=[[-1, 0, 1]], handle_unknown="error") + with pytest.raises(ValueError, match="Found unknown categories"): + oe.fit(X) + + +@pytest.mark.parametrize( + "X_train", + [ + [["AA", "B"]], + np.array([["AA", "B"]], dtype="O"), + np.array([["AA", "B"]], dtype="U"), + ], +) +@pytest.mark.parametrize( + "X_test", + [ + [["A", "B"]], + np.array([["A", "B"]], dtype="O"), + np.array([["A", "B"]], dtype="U"), + ], +) +def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test): + """Checks that `OrdinalEncoder` transforms string dtypes. 
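+    The unknown test string 'A' is shorter than the training string 'AA', and
+    list, object-dtype and fixed-width unicode containers are all covered.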
+ Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/19872 + """ + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9) + enc.fit(X_train) + + X_trans = enc.transform(X_test) + assert_allclose(X_trans, [[-9, 0]]) + + +def test_ordinal_encoder_python_integer(): + """Check that `OrdinalEncoder` accepts Python integers that are potentially + larger than 64 bits. + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/20721 + """ + X = np.array( + [ + 44253463435747313673, + 9867966753463435747313673, + 44253462342215747313673, + 442534634357764313673, + ] + ).reshape(-1, 1) + encoder = OrdinalEncoder().fit(X) + assert_array_equal(encoder.categories_, np.sort(X, axis=0).T) + X_trans = encoder.transform(X) + assert_array_equal(X_trans, [[0], [3], [2], [1]]) + + +def test_ordinal_encoder_features_names_out_pandas(): + """Check feature names out is same as the input.""" + pd = pytest.importorskip("pandas") + + names = ["b", "c", "a"] + X = pd.DataFrame([[1, 2, 3]], columns=names) + enc = OrdinalEncoder().fit(X) + + feature_names_out = enc.get_feature_names_out() + assert_array_equal(names, feature_names_out) + + +def test_ordinal_encoder_unknown_missing_interaction(): + """Check interactions between encode_unknown and missing value encoding.""" + + X = np.array([["a"], ["b"], [np.nan]], dtype=object) + + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=-3, + ).fit(X) + + X_trans = oe.transform(X) + assert_allclose(X_trans, [[0], [1], [-3]]) + + # "c" is unknown and is mapped to np.nan + # "None" is a missing value and is set to -3 + X_test = np.array([["c"], [np.nan]], dtype=object) + X_test_trans = oe.transform(X_test) + assert_allclose(X_test_trans, [[np.nan], [-3]]) + + # Non-regression test for #24082 + X_roundtrip = oe.inverse_transform(X_test_trans) + + # np.nan is unknown so it maps to None + assert X_roundtrip[0][0] is None + + # -3 is the encoded missing value so it maps back to nan + assert np.isnan(X_roundtrip[1][0]) + + +@pytest.mark.parametrize("with_pandas", [True, False]) +def test_ordinal_encoder_encoded_missing_value_error(with_pandas): + """Check OrdinalEncoder errors when encoded_missing_value is used by + an known category.""" + X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object) + + # The 0-th feature has no missing values so it is not included in the list of + # features + error_msg = ( + r"encoded_missing_value \(1\) is already used to encode a known category " + r"in features: " + ) + + if with_pandas: + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X, columns=["letter", "pet"]) + error_msg = error_msg + r"\['pet'\]" + else: + error_msg = error_msg + r"\[1\]" + + oe = OrdinalEncoder(encoded_missing_value=1) + + with pytest.raises(ValueError, match=error_msg): + oe.fit(X) + + +@pytest.mark.parametrize( + "X_train, X_test_trans_expected, X_roundtrip_expected", + [ + ( + # missing value is not in training set + # inverse transform will considering encoded nan as unknown + np.array([["a"], ["1"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [None], [None]], dtype=object), + ), + ( + # missing value in training set, + # inverse transform will considering encoded nan as missing + np.array([[np.nan], ["1"], ["a"]], dtype=object), + [[0], [np.nan], [np.nan]], + np.asarray([["1"], [np.nan], [np.nan]], dtype=object), + ), + ], +) +def test_ordinal_encoder_unknown_missing_interaction_both_nan( + X_train, 
X_test_trans_expected, X_roundtrip_expected +): + """Check transform when unknown_value and encoded_missing_value is nan. + + Non-regression test for #24082. + """ + oe = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=np.nan, + encoded_missing_value=np.nan, + ).fit(X_train) + + X_test = np.array([["1"], [np.nan], ["b"]]) + X_test_trans = oe.transform(X_test) + + # both nan and unknown are encoded as nan + assert_allclose(X_test_trans, X_test_trans_expected) + X_roundtrip = oe.inverse_transform(X_test_trans) + + n_samples = X_roundtrip_expected.shape[0] + for i in range(n_samples): + expected_val = X_roundtrip_expected[i, 0] + val = X_roundtrip[i, 0] + + if expected_val is None: + assert val is None + elif is_scalar_nan(expected_val): + assert np.isnan(val) + else: + assert val == expected_val + + +def test_one_hot_encoder_set_output(): + """Check OneHotEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + ohe = OneHotEncoder() + + ohe.set_output(transform="pandas") + + match = "Pandas output does not support sparse data. Set sparse_output=False" + with pytest.raises(ValueError, match=match): + ohe.fit_transform(X_df) + + ohe_default = OneHotEncoder(sparse_output=False).set_output(transform="default") + ohe_pandas = OneHotEncoder(sparse_output=False).set_output(transform="pandas") + + X_default = ohe_default.fit_transform(X_df) + X_pandas = ohe_pandas.fit_transform(X_df) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(ohe_pandas.get_feature_names_out(), X_pandas.columns) + + +def test_ordinal_set_output(): + """Check OrdinalEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]}) + + ord_default = OrdinalEncoder().set_output(transform="default") + ord_pandas = OrdinalEncoder().set_output(transform="pandas") + + X_default = ord_default.fit_transform(X_df) + X_pandas = ord_pandas.fit_transform(X_df) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(ord_pandas.get_feature_names_out(), X_pandas.columns) + + +def test_predefined_categories_dtype(): + """Check that the categories_ dtype is `object` for string categories + + Regression test for gh-25171. + """ + categories = [["as", "mmas", "eas", "ras", "acs"], ["1", "2"]] + + enc = OneHotEncoder(categories=categories) + + enc.fit([["as", "1"]]) + + assert len(categories) == len(enc.categories_) + for n, cat in enumerate(enc.categories_): + assert cat.dtype == object + assert_array_equal(categories[n], cat) + + +def test_ordinal_encoder_missing_unknown_encoding_max(): + """Check missing value or unknown encoding can equal the cardinality.""" + X = np.array([["dog"], ["cat"], [np.nan]], dtype=object) + X_trans = OrdinalEncoder(encoded_missing_value=2).fit_transform(X) + assert_allclose(X_trans, [[1], [0], [2]]) + + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2).fit(X) + X_test = np.array([["snake"]]) + X_trans = enc.transform(X_test) + assert_allclose(X_trans, [[2]]) + + +def test_drop_idx_infrequent_categories(): + """Check drop_idx is defined correctly with infrequent categories. + + Non-regression test for gh-25550. 
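+    `drop_idx_` indexes into `categories_`, and only a category that remains
+    frequent can be dropped when infrequent grouping is active.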
+ """ + X = np.array( + [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object + ).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X) + assert_array_equal( + ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"] + ) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "b" + + X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X) + assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"]) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "c" + + X = np.array( + [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object + ).T + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X) + assert_array_equal( + ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"] + ) + assert ohe.categories_[0][ohe.drop_idx_[0]] == "d" + + ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X) + assert_array_equal( + ohe.get_feature_names_out(), + ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"], + ) + assert ohe.drop_idx_ is None + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 3}, + {"min_frequency": 6}, + {"min_frequency": 9}, + {"min_frequency": 0.24}, + {"min_frequency": 0.16}, + {"max_categories": 3, "min_frequency": 8}, + {"max_categories": 4, "min_frequency": 6}, + ], +) +def test_ordinal_encoder_infrequent_three_levels(kwargs): + """Test parameters for grouping 'a', and 'd' into the infrequent category.""" + + X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T + ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1, **kwargs + ).fit(X_train) + assert_array_equal(ordinal.categories_, [["a", "b", "c", "d"]]) + assert_array_equal(ordinal.infrequent_categories_, [["a", "d"]]) + + X_test = [["a"], ["b"], ["c"], ["d"], ["z"]] + expected_trans = [[2], [0], [1], [2], [-1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = [ + ["infrequent_sklearn"], + ["b"], + ["c"], + ["infrequent_sklearn"], + [None], + ] + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_three_levels_user_cats(): + """Test that the order of the categories provided by a user is respected. + + In this case 'c' is encoded as the first category and 'b' is encoded + as the second one. 
+ """ + + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + ordinal = OrdinalEncoder( + categories=[["c", "d", "b", "a"]], + max_categories=3, + handle_unknown="use_encoded_value", + unknown_value=-1, + ).fit(X_train) + assert_array_equal(ordinal.categories_, [["c", "d", "b", "a"]]) + assert_array_equal(ordinal.infrequent_categories_, [["d", "a"]]) + + X_test = [["a"], ["b"], ["c"], ["d"], ["z"]] + expected_trans = [[2], [1], [0], [2], [-1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = [ + ["infrequent_sklearn"], + ["b"], + ["c"], + ["infrequent_sklearn"], + [None], + ] + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_mixed(): + """Test when feature 0 has infrequent categories and feature 1 does not.""" + + X = np.column_stack(([0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1])) + + ordinal = OrdinalEncoder(max_categories=3).fit(X) + + assert_array_equal(ordinal.infrequent_categories_[0], [1, 2]) + assert ordinal.infrequent_categories_[1] is None + + X_test = [[3, 0], [1, 1]] + expected_trans = [[1, 0], [2, 1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + X_inverse = ordinal.inverse_transform(X_trans) + expected_inverse = np.array([[3, 0], ["infrequent_sklearn", 1]], dtype=object) + assert_array_equal(X_inverse, expected_inverse) + + +def test_ordinal_encoder_infrequent_multiple_categories_dtypes(): + """Test infrequent categories with a pandas DataFrame with multiple dtypes.""" + + pd = pytest.importorskip("pandas") + categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"]) + X = pd.DataFrame( + { + "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"], + "int": [5, 3, 0, 10, 10, 12, 0, 3, 5], + "categorical": pd.Series( + ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + + ordinal = OrdinalEncoder(max_categories=3).fit(X) + # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be + # considered infrequent because they appear first when sorted + + # X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1. + # 0, 3, 12 will be considered infrequent because they appear first when + # sorted. 
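+    # (max_categories=3 counts the infrequent bucket itself, so each feature
+    # keeps its 2 most frequent categories and groups the rest together)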
+ + # X[:, 2] "snake" and "bird" or infrequent + + assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"]) + assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12]) + assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"]) + + X_test = pd.DataFrame( + { + "str": ["a", "b", "f", "c"], + "int": [12, 0, 10, 5], + "categorical": pd.Series( + ["cat"] + ["snake"] + ["bird"] + ["dog"], + dtype=categorical_dtype, + ), + }, + columns=["str", "int", "categorical"], + ) + expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + +def test_ordinal_encoder_infrequent_custom_mapping(): + """Check behavior of unknown_value and encoded_missing_value with infrequent.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], dtype=object + ).T + + ordinal = OrdinalEncoder( + handle_unknown="use_encoded_value", + unknown_value=2, + max_categories=2, + encoded_missing_value=3, + ).fit(X_train) + assert_array_equal(ordinal.infrequent_categories_, [["a", "c", "d"]]) + + X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object) + expected_trans = [[1], [0], [1], [1], [2], [3]] + + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, expected_trans) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 6}, + {"min_frequency": 2}, + ], +) +def test_ordinal_encoder_all_frequent(kwargs): + """All categories are considered frequent have same encoding as default encoder.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + + adjusted_encoder = OrdinalEncoder( + **kwargs, handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + default_encoder = OrdinalEncoder( + handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + + assert_allclose( + adjusted_encoder.transform(X_test), default_encoder.transform(X_test) + ) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"max_categories": 1}, + {"min_frequency": 100}, + ], +) +def test_ordinal_encoder_all_infrequent(kwargs): + """When all categories are infrequent, they are all encoded as zero.""" + X_train = np.array( + [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object + ).T + encoder = OrdinalEncoder( + **kwargs, handle_unknown="use_encoded_value", unknown_value=-1 + ).fit(X_train) + + X_test = [["a"], ["b"], ["c"], ["d"], ["e"]] + assert_allclose(encoder.transform(X_test), [[0], [0], [0], [0], [-1]]) + + +def test_ordinal_encoder_missing_appears_frequent(): + """Check behavior when missing value appears frequently.""" + X = np.array( + [[np.nan] * 20 + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]], + dtype=object, + ).T + ordinal = OrdinalEncoder(max_categories=3).fit(X) + + X_test = np.array([["snake", "cat", "dog", np.nan]], dtype=object).T + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, [[2], [0], [1], [np.nan]]) + + +def test_ordinal_encoder_missing_appears_infrequent(): + """Check behavior when missing value appears infrequently.""" + + # feature 0 has infrequent categories + # feature 1 has no infrequent categories + X = np.array( + [ + [np.nan] + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"], + ["red"] * 9 + ["green"] * 9, + ], + dtype=object, + ).T + ordinal = OrdinalEncoder(min_frequency=4).fit(X) + + X_test = np.array( + [ + ["snake", "red"], + ["deer", "green"], + [np.nan, "green"], + ["dog", "green"], 
+ ["cat", "red"], + ], + dtype=object, + ) + X_trans = ordinal.transform(X_test) + assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]]) + + +@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder]) +def test_encoder_not_fitted(Encoder): + """Check that we raise a `NotFittedError` by calling transform before fit with + the encoders. + + One could expect that the passing the `categories` argument to the encoder + would make it stateless. However, `fit` is making a couple of check, such as the + position of `np.nan`. + """ + X = np.array([["A"], ["B"], ["C"]], dtype=object) + encoder = Encoder(categories=[["A", "B", "C"]]) + with pytest.raises(NotFittedError): + encoder.transform(X) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_function_transformer.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_function_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6bfb5d1367c8da36e2ce829a28a02ff253b34801 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_function_transformer.py @@ -0,0 +1,579 @@ +import warnings + +import numpy as np +import pytest + +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, StandardScaler +from sklearn.utils._testing import ( + _convert_container, + assert_allclose_dense_sparse, + assert_array_equal, +) +from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS + + +def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X): + def _func(X, *args, **kwargs): + args_store.append(X) + args_store.extend(args) + kwargs_store.update(kwargs) + return func(X) + + return _func + + +def test_delegate_to_func(): + # (args|kwargs)_store will hold the positional and keyword arguments + # passed to the function inside the FunctionTransformer. + args_store = [] + kwargs_store = {} + X = np.arange(10).reshape((5, 2)) + assert_array_equal( + FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X), + X, + "transform should have returned X unchanged", + ) + + # The function should only have received X. + assert args_store == [X], ( + "Incorrect positional arguments passed to func: {args}".format(args=args_store) + ) + + assert not kwargs_store, ( + "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store) + ) + + # reset the argument stores. + args_store[:] = [] + kwargs_store.clear() + transformed = FunctionTransformer( + _make_func(args_store, kwargs_store), + ).transform(X) + + assert_array_equal( + transformed, X, err_msg="transform should have returned X unchanged" + ) + + # The function should have received X + assert args_store == [X], ( + "Incorrect positional arguments passed to func: {args}".format(args=args_store) + ) + + assert not kwargs_store, ( + "Unexpected keyword arguments passed to func: {args}".format(args=kwargs_store) + ) + + +def test_np_log(): + X = np.arange(10).reshape((5, 2)) + + # Test that the numpy.log example still works. 
+ assert_array_equal( + FunctionTransformer(np.log1p).transform(X), + np.log1p(X), + ) + + +def test_kw_arg(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=3)) + + +def test_kw_arg_update(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + F.kw_args["decimals"] = 1 + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=1)) + + +def test_kw_arg_reset(): + X = np.linspace(0, 1, num=10).reshape((5, 2)) + + F = FunctionTransformer(np.around, kw_args=dict(decimals=3)) + + F.kw_args = dict(decimals=1) + + # Test that rounding is correct + assert_array_equal(F.transform(X), np.around(X, decimals=1)) + + +def test_inverse_transform(): + X = np.array([1, 4, 9, 16]).reshape((2, 2)) + + # Test that inverse_transform works correctly + F = FunctionTransformer( + func=np.sqrt, + inverse_func=np.around, + inv_kw_args=dict(decimals=3), + ) + assert_array_equal( + F.inverse_transform(F.transform(X)), + np.around(np.sqrt(X), decimals=3), + ) + + +@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) +def test_check_inverse(sparse_container): + X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2)) + if sparse_container is not None: + X = sparse_container(X) + + trans = FunctionTransformer( + func=np.sqrt, + inverse_func=np.around, + accept_sparse=sparse_container is not None, + check_inverse=True, + validate=True, + ) + warning_message = ( + "The provided functions are not strictly" + " inverse of each other. If you are sure you" + " want to proceed regardless, set" + " 'check_inverse=False'." + ) + with pytest.warns(UserWarning, match=warning_message): + trans.fit(X) + + trans = FunctionTransformer( + func=np.expm1, + inverse_func=np.log1p, + accept_sparse=sparse_container is not None, + check_inverse=True, + validate=True, + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + Xt = trans.fit_transform(X) + + assert_allclose_dense_sparse(X, trans.inverse_transform(Xt)) + + +def test_check_inverse_func_or_inverse_not_provided(): + # check that we don't check inverse when one of the func or inverse is not + # provided. 
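+    # (warnings.simplefilter("error", UserWarning) below turns any such warning
+    # into a test failure)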
+ X = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2)) + + trans = FunctionTransformer( + func=np.expm1, inverse_func=None, check_inverse=True, validate=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + trans.fit(X) + trans = FunctionTransformer( + func=None, inverse_func=np.expm1, check_inverse=True, validate=True + ) + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + trans.fit(X) + + +def test_function_transformer_frame(): + pd = pytest.importorskip("pandas") + X_df = pd.DataFrame(np.random.randn(100, 10)) + transformer = FunctionTransformer() + X_df_trans = transformer.fit_transform(X_df) + assert hasattr(X_df_trans, "loc") + + +@pytest.mark.parametrize("X_type", ["array", "series"]) +def test_function_transformer_raise_error_with_mixed_dtype(X_type): + """Check that `FunctionTransformer.check_inverse` raises error on mixed dtype.""" + mapping = {"one": 1, "two": 2, "three": 3, 5: "five", 6: "six"} + inverse_mapping = {value: key for key, value in mapping.items()} + dtype = "object" + + data = ["one", "two", "three", "one", "one", 5, 6] + data = _convert_container(data, X_type, columns_name=["value"], dtype=dtype) + + def func(X): + return np.array([mapping[X[i]] for i in range(X.size)], dtype=object) + + def inverse_func(X): + return _convert_container( + [inverse_mapping[x] for x in X], + X_type, + columns_name=["value"], + dtype=dtype, + ) + + transformer = FunctionTransformer( + func=func, inverse_func=inverse_func, validate=False, check_inverse=True + ) + + msg = "'check_inverse' is only supported when all the elements in `X` is numerical." + with pytest.raises(ValueError, match=msg): + transformer.fit(data) + + +def test_function_transformer_support_all_nummerical_dataframes_check_inverse_True(): + """Check support for dataframes with only numerical values.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = FunctionTransformer( + func=lambda x: x + 2, inverse_func=lambda x: x - 2, check_inverse=True + ) + + # Does not raise an error + df_out = transformer.fit_transform(df) + assert_allclose_dense_sparse(df_out, df + 2) + + +def test_function_transformer_with_dataframe_and_check_inverse_True(): + """Check error is raised when check_inverse=True. + + Non-regresion test for gh-25261. + """ + pd = pytest.importorskip("pandas") + transformer = FunctionTransformer( + func=lambda x: x, inverse_func=lambda x: x, check_inverse=True + ) + + df_mixed = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + msg = "'check_inverse' is only supported when all the elements in `X` is numerical." 
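+    # the mixed numeric/string frame cannot be validated numerically, so fit must raise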
+ with pytest.raises(ValueError, match=msg): + transformer.fit(df_mixed) + + +@pytest.mark.parametrize( + "X, feature_names_out, input_features, expected", + [ + ( + # NumPy inputs, default behavior: generate names + np.random.rand(100, 3), + "one-to-one", + None, + ("x0", "x1", "x2"), + ), + ( + # Pandas input, default behavior: use input feature names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + "one-to-one", + None, + ("a", "b"), + ), + ( + # NumPy input, feature_names_out=callable + np.random.rand(100, 3), + lambda transformer, input_features: ("a", "b"), + None, + ("a", "b"), + ), + ( + # Pandas input, feature_names_out=callable + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: ("c", "d", "e"), + None, + ("c", "d", "e"), + ), + ( + # NumPy input, feature_names_out=callable – default input_features + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("a",), + None, + ("x0", "x1", "x2", "a"), + ), + ( + # Pandas input, feature_names_out=callable – default input_features + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + None, + ("a", "b", "c"), + ), + ( + # NumPy input, input_features=list of names + np.random.rand(100, 3), + "one-to-one", + ("a", "b", "c"), + ("a", "b", "c"), + ), + ( + # Pandas input, input_features=list of names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + "one-to-one", + ("a", "b"), # must match feature_names_in_ + ("a", "b"), + ), + ( + # NumPy input, feature_names_out=callable, input_features=list + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("d",), + ("a", "b", "c"), + ("a", "b", "c", "d"), + ), + ( + # Pandas input, feature_names_out=callable, input_features=list + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + ("a", "b"), # must match feature_names_in_ + ("a", "b", "c"), + ), + ], +) +@pytest.mark.parametrize("validate", [True, False]) +def test_function_transformer_get_feature_names_out( + X, feature_names_out, input_features, expected, validate +): + if isinstance(X, dict): + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X) + + transformer = FunctionTransformer( + feature_names_out=feature_names_out, validate=validate + ) + transformer.fit(X) + names = transformer.get_feature_names_out(input_features) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected) + + +def test_function_transformer_get_feature_names_out_without_validation(): + transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False) + X = np.random.rand(100, 2) + transformer.fit_transform(X) + + names = transformer.get_feature_names_out(("a", "b")) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b")) + + +def test_function_transformer_feature_names_out_is_None(): + transformer = FunctionTransformer() + X = np.random.rand(100, 2) + transformer.fit_transform(X) + + msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'" + with pytest.raises(AttributeError, match=msg): + transformer.get_feature_names_out() + + +def test_function_transformer_feature_names_out_uses_estimator(): + def add_n_random_features(X, n): + return np.concatenate([X, np.random.rand(len(X), n)], axis=1) + + def feature_names_out(transformer, input_features): + n = 
transformer.kw_args["n"] + return list(input_features) + [f"rnd{i}" for i in range(n)] + + transformer = FunctionTransformer( + func=add_n_random_features, + feature_names_out=feature_names_out, + kw_args=dict(n=3), + validate=True, + ) + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)}) + transformer.fit_transform(df) + names = transformer.get_feature_names_out() + + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2")) + + +def test_function_transformer_validate_inverse(): + """Test that function transformer does not reset estimator in + `inverse_transform`.""" + + def add_constant_feature(X): + X_one = np.ones((X.shape[0], 1)) + return np.concatenate((X, X_one), axis=1) + + def inverse_add_constant(X): + return X[:, :-1] + + X = np.array([[1, 2], [3, 4], [3, 4]]) + trans = FunctionTransformer( + func=add_constant_feature, + inverse_func=inverse_add_constant, + validate=True, + ) + X_trans = trans.fit_transform(X) + assert trans.n_features_in_ == X.shape[1] + + trans.inverse_transform(X_trans) + assert trans.n_features_in_ == X.shape[1] + + +@pytest.mark.parametrize( + "feature_names_out, expected", + [ + ("one-to-one", ["pet", "color"]), + [lambda est, names: [f"{n}_out" for n in names], ["pet_out", "color_out"]], + ], +) +@pytest.mark.parametrize("in_pipeline", [True, False]) +def test_get_feature_names_out_dataframe_with_string_data( + feature_names_out, expected, in_pipeline +): + """Check that get_feature_names_out works with DataFrames with string data.""" + pd = pytest.importorskip("pandas") + X = pd.DataFrame({"pet": ["dog", "cat"], "color": ["red", "green"]}) + + def func(X): + if feature_names_out == "one-to-one": + return X + else: + name = feature_names_out(None, X.columns) + return X.rename(columns=dict(zip(X.columns, name))) + + transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out) + if in_pipeline: + transformer = make_pipeline(transformer) + + X_trans = transformer.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + + names = transformer.get_feature_names_out() + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected) + + +def test_set_output_func(): + """Check behavior of set_output with different settings.""" + pd = pytest.importorskip("pandas") + + X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + ft = FunctionTransformer(np.log, feature_names_out="one-to-one") + + # no warning is raised when feature_names_out is defined + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + ft.set_output(transform="pandas") + + X_trans = ft.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ["a", "b"]) + + ft = FunctionTransformer(lambda x: 2 * x) + ft.set_output(transform="pandas") + + # no warning is raised when func returns a panda dataframe + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + X_trans = ft.fit_transform(X) + assert isinstance(X_trans, pd.DataFrame) + assert_array_equal(X_trans.columns, ["a", "b"]) + + # Warning is raised when func returns a ndarray + ft_np = FunctionTransformer(lambda x: np.asarray(x)) + + for transform in ("pandas", "polars"): + ft_np.set_output(transform=transform) + msg = ( + f"When `set_output` is configured to be '{transform}'.*{transform} " + "DataFrame.*" + ) + with pytest.warns(UserWarning, match=msg): + 
ft_np.fit_transform(X) + + # default transform does not warn + ft_np.set_output(transform="default") + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + ft_np.fit_transform(X) + + +def test_consistence_column_name_between_steps(): + """Check that we have a consistence between the feature names out of + `FunctionTransformer` and the feature names in of the next step in the pipeline. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/27695 + """ + pd = pytest.importorskip("pandas") + + def with_suffix(_, names): + return [name + "__log" for name in names] + + pipeline = make_pipeline( + FunctionTransformer(np.log1p, feature_names_out=with_suffix), StandardScaler() + ) + + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["a", "b"]) + X_trans = pipeline.fit_transform(df) + assert pipeline.get_feature_names_out().tolist() == ["a__log", "b__log"] + # StandardScaler will convert to a numpy array + assert isinstance(X_trans, np.ndarray) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize("transform_output", ["default", "pandas", "polars"]) +def test_function_transformer_overwrite_column_names(dataframe_lib, transform_output): + """Check that we overwrite the column names when we should.""" + lib = pytest.importorskip(dataframe_lib) + if transform_output != "numpy": + pytest.importorskip(transform_output) + + df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + def with_suffix(_, names): + return [name + "__log" for name in names] + + transformer = FunctionTransformer(feature_names_out=with_suffix).set_output( + transform=transform_output + ) + X_trans = transformer.fit_transform(df) + assert_array_equal(np.asarray(X_trans), np.asarray(df)) + + feature_names = transformer.get_feature_names_out() + assert list(X_trans.columns) == with_suffix(None, df.columns) + assert feature_names.tolist() == with_suffix(None, df.columns) + + +@pytest.mark.parametrize( + "feature_names_out", + ["one-to-one", lambda _, names: [f"{name}_log" for name in names]], +) +def test_function_transformer_overwrite_column_names_numerical(feature_names_out): + """Check the same as `test_function_transformer_overwrite_column_names` + but for the specific case of pandas where column names can be numerical.""" + pd = pytest.importorskip("pandas") + + df = pd.DataFrame({0: [1, 2, 3], 1: [10, 20, 100]}) + + transformer = FunctionTransformer(feature_names_out=feature_names_out) + X_trans = transformer.fit_transform(df) + assert_array_equal(np.asarray(X_trans), np.asarray(df)) + + feature_names = transformer.get_feature_names_out() + assert list(X_trans.columns) == list(feature_names) + + +@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"]) +@pytest.mark.parametrize( + "feature_names_out", + ["one-to-one", lambda _, names: [f"{name}_log" for name in names]], +) +def test_function_transformer_error_column_inconsistent( + dataframe_lib, feature_names_out +): + """Check that we raise an error when `func` returns a dataframe with new + column names that become inconsistent with `get_feature_names_out`.""" + lib = pytest.importorskip(dataframe_lib) + + df = lib.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]}) + + def func(df): + if dataframe_lib == "pandas": + return df.rename(columns={"a": "c"}) + else: + return df.rename({"a": "c"}) + + transformer = FunctionTransformer(func=func, feature_names_out=feature_names_out) + err_msg = "The output generated by `func` have different column names" + with 
pytest.raises(ValueError, match=err_msg): + transformer.fit_transform(df).columns diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_label.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_label.py new file mode 100644 index 0000000000000000000000000000000000000000..053b474e675bca761b035953b30c495892e2d46a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_label.py @@ -0,0 +1,748 @@ +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn import config_context, datasets +from sklearn.preprocessing._label import ( + LabelBinarizer, + LabelEncoder, + MultiLabelBinarizer, + _inverse_binarize_multiclass, + _inverse_binarize_thresholding, + label_binarize, +) +from sklearn.utils._array_api import ( + _convert_to_numpy, + _get_namespace_device_dtype_ids, + get_namespace, + yield_namespace_device_dtype_combinations, +) +from sklearn.utils._testing import ( + _array_api_for_tests, + assert_array_equal, +) +from sklearn.utils.fixes import ( + COO_CONTAINERS, + CSC_CONTAINERS, + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import _to_object_array + +iris = datasets.load_iris() + + +def toarray(a): + if hasattr(a, "toarray"): + a = a.toarray() + return a + + +def test_label_binarizer(): + # one-class case defaults to negative label + # For dense case: + inp = ["pos", "pos", "pos", "pos"] + lb = LabelBinarizer(sparse_output=False) + expected = np.array([[0, 0, 0, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + # For sparse case: + lb = LabelBinarizer(sparse_output=True) + got = lb.fit_transform(inp) + assert issparse(got) + assert_array_equal(lb.classes_, ["pos"]) + assert_array_equal(expected, got.toarray()) + assert_array_equal(lb.inverse_transform(got.toarray()), inp) + + lb = LabelBinarizer(sparse_output=False) + # two-class case + inp = ["neg", "pos", "pos", "neg"] + expected = np.array([[0, 1, 1, 0]]).T + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["neg", "pos"]) + assert_array_equal(expected, got) + + to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]]) + assert_array_equal(lb.inverse_transform(to_invert), inp) + + # multi-class case + inp = ["spam", "ham", "eggs", "ham", "0"] + expected = np.array( + [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]] + ) + got = lb.fit_transform(inp) + assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"]) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +def test_label_binarizer_unseen_labels(): + lb = LabelBinarizer() + + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + got = lb.fit_transform(["b", "d", "e"]) + assert_array_equal(expected, got) + + expected = np.array( + [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]] + ) + got = lb.transform(["a", "b", "c", "d", "e", "f"]) + assert_array_equal(expected, got) + + +def test_label_binarizer_set_label_encoding(): + lb = LabelBinarizer(neg_label=-2, pos_label=0) + + # two-class case with pos_label=0 + inp = np.array([0, 1, 1, 0]) + expected = np.array([[-2, 0, 0, -2]]).T + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + lb = LabelBinarizer(neg_label=-2, pos_label=2) + + # multi-class case + 
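+    # each row is filled with neg_label (-2) except for pos_label (+2) in the
+    # column of the sample's class (classes are sorted: 0, 1, 2, 3)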
inp = np.array([3, 2, 1, 2, 0]) + expected = np.array( + [ + [-2, -2, -2, +2], + [-2, -2, +2, -2], + [-2, +2, -2, -2], + [-2, -2, +2, -2], + [+2, -2, -2, -2], + ] + ) + got = lb.fit_transform(inp) + assert_array_equal(expected, got) + assert_array_equal(lb.inverse_transform(got), inp) + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +@pytest.mark.parametrize("unique_first", [True, False]) +def test_label_binarizer_pandas_nullable(dtype, unique_first): + """Checks that LabelBinarizer works with pandas nullable dtypes. + + Non-regression test for gh-25637. + """ + pd = pytest.importorskip("pandas") + + y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype) + if unique_first: + # Calling unique creates a pandas array which has a different interface + # compared to a pandas Series. Specifically, pandas arrays do not have "iloc". + y_true = y_true.unique() + lb = LabelBinarizer().fit(y_true) + y_out = lb.transform([1, 0]) + + assert_array_equal(y_out, [[1], [0]]) + + +def test_label_binarizer_errors(): + # Check that invalid arguments yield ValueError + one_class = np.array([0, 0, 0, 0]) + lb = LabelBinarizer().fit(one_class) + + multi_label = [(2, 3), (0,), (0, 2)] + err_msg = "You appear to be using a legacy multi-label data representation." + with pytest.raises(ValueError, match=err_msg): + lb.transform(multi_label) + + lb = LabelBinarizer() + err_msg = "This LabelBinarizer instance is not fitted yet" + with pytest.raises(ValueError, match=err_msg): + lb.transform([]) + with pytest.raises(ValueError, match=err_msg): + lb.inverse_transform([]) + + input_labels = [0, 1, 0, 1] + err_msg = "neg_label=2 must be strictly less than pos_label=1." + lb = LabelBinarizer(neg_label=2, pos_label=1) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = "neg_label=2 must be strictly less than pos_label=2." 
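+    # equal neg_label and pos_label are rejected as well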
+ lb = LabelBinarizer(neg_label=2, pos_label=2) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + err_msg = ( + "Sparse binarization is only supported with non zero pos_label and zero " + "neg_label, got pos_label=2 and neg_label=1" + ) + lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True) + with pytest.raises(ValueError, match=err_msg): + lb.fit(input_labels) + + # Sequence of seq type should raise ValueError + y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] + err_msg = "You appear to be using a legacy multi-label data representation" + with pytest.raises(ValueError, match=err_msg): + LabelBinarizer().fit_transform(y_seq_of_seqs) + + # Fail on the dimension of 'binary' + err_msg = "output_type='binary', but y.shape" + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=np.array([[1, 2, 3], [2, 1, 3]]), + output_type="binary", + classes=[1, 2, 3], + threshold=0, + ) + + # Fail on multioutput data + err_msg = "Multioutput target data is not supported with label binarization" + with pytest.raises(ValueError, match=err_msg): + LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) + with pytest.raises(ValueError, match=err_msg): + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_label_binarizer_sparse_errors(csr_container): + # Fail on y_type + err_msg = "foo format is not supported" + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=csr_container([[1, 2], [2, 1]]), + output_type="foo", + classes=[1, 2], + threshold=0, + ) + + # Fail on the number of classes + err_msg = "The number of class is not equal to the number of dimension of y." + with pytest.raises(ValueError, match=err_msg): + _inverse_binarize_thresholding( + y=csr_container([[1, 2], [2, 1]]), + output_type="foo", + classes=[1, 2, 3], + threshold=0, + ) + + +@pytest.mark.parametrize( + "values, classes, unknown", + [ + ( + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array([1, 2, 3], dtype="int64"), + np.array([4], dtype="int64"), + ), + ( + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["a", "b", "c"], dtype=object), + np.array(["d"], dtype=object), + ), + ( + np.array(["b", "a", "c", "a", "c"]), + np.array(["a", "b", "c"]), + np.array(["d"]), + ), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder(values, classes, unknown): + # Test LabelEncoder's transform, fit_transform and + # inverse_transform methods + le = LabelEncoder() + le.fit(values) + assert_array_equal(le.classes_, classes) + assert_array_equal(le.transform(values), [1, 0, 2, 0, 2]) + assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values) + le = LabelEncoder() + ret = le.fit_transform(values) + assert_array_equal(ret, [1, 0, 2, 0, 2]) + + with pytest.raises(ValueError, match="unseen labels"): + le.transform(unknown) + + +def test_label_encoder_negative_ints(): + le = LabelEncoder() + le.fit([1, 1, 4, 5, -1, 0]) + assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) + assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0]) + assert_array_equal( + le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1] + ) + with pytest.raises(ValueError): + le.transform([0, 6]) + + +@pytest.mark.parametrize("dtype", ["str", "object"]) +def test_label_encoder_str_bad_shape(dtype): + le = LabelEncoder() + le.fit(np.array(["apple", "orange"], dtype=dtype)) + msg = "should be a 1d array" + with pytest.raises(ValueError, 
match=msg): + le.transform("apple") + + +def test_label_encoder_errors(): + # Check that invalid arguments yield ValueError + le = LabelEncoder() + with pytest.raises(ValueError): + le.transform([]) + with pytest.raises(ValueError): + le.inverse_transform([]) + + # Fail on unseen labels + le = LabelEncoder() + le.fit([1, 2, 3, -1, 1]) + msg = "contains previously unseen labels" + with pytest.raises(ValueError, match=msg): + le.inverse_transform([-2]) + with pytest.raises(ValueError, match=msg): + le.inverse_transform([-2, -3, -4]) + + # Fail on inverse_transform("") + msg = r"should be a 1d array.+shape \(\)" + with pytest.raises(ValueError, match=msg): + le.inverse_transform("") + + +@pytest.mark.parametrize( + "values", + [ + np.array([2, 1, 3, 1, 3], dtype="int64"), + np.array(["b", "a", "c", "a", "c"], dtype=object), + np.array(["b", "a", "c", "a", "c"]), + ], + ids=["int64", "object", "str"], +) +def test_label_encoder_empty_array(values): + le = LabelEncoder() + le.fit(values) + # test empty transform + transformed = le.transform([]) + assert_array_equal(np.array([]), transformed) + # test empty inverse transform + inverse_transformed = le.inverse_transform([]) + assert_array_equal(np.array([]), inverse_transformed) + + +def test_sparse_output_multilabel_binarizer(): + # test input as iterable of iterables + inputs = [ + lambda: [(2, 3), (1,), (1, 2)], + lambda: ({2, 3}, {1}, {1, 2}), + lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + + inverse = inputs[0]() + for sparse_output in [True, False]: + for inp in inputs: + # With fit_transform + mlb = MultiLabelBinarizer(sparse_output=sparse_output) + got = mlb.fit_transform(inp()) + assert issparse(got) == sparse_output + if sparse_output: + # verify CSR assumption that indices and indptr have same dtype + assert got.indices.dtype == got.indptr.dtype + got = got.toarray() + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + # With fit + mlb = MultiLabelBinarizer(sparse_output=sparse_output) + got = mlb.fit(inp()).transform(inp()) + assert issparse(got) == sparse_output + if sparse_output: + # verify CSR assumption that indices and indptr have same dtype + assert got.indices.dtype == got.indptr.dtype + got = got.toarray() + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_output_multilabel_binarizer_errors(csr_container): + inp = iter([iter((2, 3)), iter((1,)), {1, 2}]) + mlb = MultiLabelBinarizer(sparse_output=False) + mlb.fit(inp) + with pytest.raises(ValueError): + mlb.inverse_transform( + csr_container(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]])) + ) + + +def test_multilabel_binarizer(): + # test input as iterable of iterables + inputs = [ + lambda: [(2, 3), (1,), (1, 2)], + lambda: ({2, 3}, {1}, {1, 2}), + lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + inverse = inputs[0]() + for inp in inputs: + # With fit_transform + mlb = MultiLabelBinarizer() + got = mlb.fit_transform(inp()) + assert_array_equal(indicator_mat, got) + assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + # With fit + mlb = MultiLabelBinarizer() + got = mlb.fit(inp()).transform(inp()) + assert_array_equal(indicator_mat, got) + 
assert_array_equal([1, 2, 3], mlb.classes_) + assert mlb.inverse_transform(got) == inverse + + +def test_multilabel_binarizer_empty_sample(): + mlb = MultiLabelBinarizer() + y = [[1, 2], [1], []] + Y = np.array([[1, 1], [1, 0], [0, 0]]) + assert_array_equal(mlb.fit_transform(y), Y) + + +def test_multilabel_binarizer_unknown_class(): + mlb = MultiLabelBinarizer() + y = [[1, 2]] + Y = np.array([[1, 0], [0, 1]]) + warning_message = "unknown class.* will be ignored" + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) + + Y = np.array([[1, 0, 0], [0, 1, 0]]) + mlb = MultiLabelBinarizer(classes=[1, 2, 3]) + with pytest.warns(UserWarning, match=warning_message): + matrix = mlb.fit(y).transform([[4, 1], [2, 0]]) + assert_array_equal(matrix, Y) + + +def test_multilabel_binarizer_given_classes(): + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]]) + # fit_transform() + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, [1, 3, 2]) + + # fit().transform() + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, [1, 3, 2]) + + # ensure works with extra class + mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2]) + assert_array_equal( + mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat)) + ) + assert_array_equal(mlb.classes_, [4, 1, 3, 2]) + + # ensure fit is no-op as iterable is not consumed + inp = iter(inp) + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + + # ensure a ValueError is thrown if given duplicate classes + err_msg = ( + "The classes argument contains duplicate classes. Remove " + "these duplicates before passing them to MultiLabelBinarizer." 
+ ) + mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3]) + with pytest.raises(ValueError, match=err_msg): + mlb.fit(inp) + + +def test_multilabel_binarizer_multiple_calls(): + inp = [(2, 3), (1,), (1, 2)] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]]) + + indicator_mat2 = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + + # first call + mlb = MultiLabelBinarizer(classes=[1, 3, 2]) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + # second call change class + mlb.classes = [1, 2, 3] + assert_array_equal(mlb.fit_transform(inp), indicator_mat2) + + +def test_multilabel_binarizer_same_length_sequence(): + # Ensure sequences of the same length are not interpreted as a 2-d array + inp = [[1], [0], [2]] + indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]]) + # fit_transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.inverse_transform(indicator_mat), inp) + + # fit().transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.inverse_transform(indicator_mat), inp) + + +def test_multilabel_binarizer_non_integer_labels(): + tuple_classes = _to_object_array([(1,), (2,), (3,)]) + inputs = [ + ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]), + ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]), + ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes), + ] + indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) + for inp, classes in inputs: + # fit_transform() + mlb = MultiLabelBinarizer() + inp = np.array(inp, dtype=object) + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, classes) + indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object) + assert_array_equal(indicator_mat_inv, inp) + + # fit().transform() + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) + assert_array_equal(mlb.classes_, classes) + indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object) + assert_array_equal(indicator_mat_inv, inp) + + mlb = MultiLabelBinarizer() + with pytest.raises(TypeError): + mlb.fit_transform([({}), ({}, {"a": "b"})]) + + +def test_multilabel_binarizer_non_unique(): + inp = [(1, 1, 1, 0)] + indicator_mat = np.array([[1, 1]]) + mlb = MultiLabelBinarizer() + assert_array_equal(mlb.fit_transform(inp), indicator_mat) + + +def test_multilabel_binarizer_inverse_validation(): + inp = [(1, 1, 1, 0)] + mlb = MultiLabelBinarizer() + mlb.fit_transform(inp) + # Not binary + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1, 3]])) + # The following binary cases are fine, however + mlb.inverse_transform(np.array([[0, 0]])) + mlb.inverse_transform(np.array([[1, 1]])) + mlb.inverse_transform(np.array([[1, 0]])) + + # Wrong shape + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1]])) + with pytest.raises(ValueError): + mlb.inverse_transform(np.array([[1, 1, 1]])) + + +def test_label_binarize_with_class_order(): + out = label_binarize([1, 6], classes=[1, 2, 4, 6]) + expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]]) + assert_array_equal(out, expected) + + # Modified class order + out = label_binarize([1, 6], classes=[1, 6, 4, 2]) + expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]]) + assert_array_equal(out, expected) + + out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1]) + expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]]) + 
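+    # columns follow the user-provided classes order [3, 2, 0, 1], not the sorted labels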
assert_array_equal(out, expected) + + +def check_binarized_results(y, classes, pos_label, neg_label, expected): + for sparse_output in [True, False]: + if (pos_label == 0 or neg_label != 0) and sparse_output: + with pytest.raises(ValueError): + label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + continue + + # check label_binarize + binarized = label_binarize( + y, + classes=classes, + neg_label=neg_label, + pos_label=pos_label, + sparse_output=sparse_output, + ) + assert_array_equal(toarray(binarized), expected) + assert issparse(binarized) == sparse_output + + # check inverse + y_type = type_of_target(y) + if y_type == "multiclass": + inversed = _inverse_binarize_multiclass(binarized, classes=classes) + + else: + inversed = _inverse_binarize_thresholding( + binarized, + output_type=y_type, + classes=classes, + threshold=((neg_label + pos_label) / 2.0), + ) + + assert_array_equal(toarray(inversed), toarray(y)) + + # Check label binarizer + lb = LabelBinarizer( + neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output + ) + binarized = lb.fit_transform(y) + assert_array_equal(toarray(binarized), expected) + assert issparse(binarized) == sparse_output + inverse_output = lb.inverse_transform(binarized) + assert_array_equal(toarray(inverse_output), toarray(y)) + assert issparse(inverse_output) == issparse(y) + + +def test_label_binarize_binary(): + y = [0, 1, 0] + classes = [0, 1] + pos_label = 2 + neg_label = -1 + expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + # Binary case where sparse_output = True will not result in a ValueError + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + +def test_label_binarize_multiclass(): + y = [0, 1, 2] + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = 2 * np.eye(3) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +@pytest.mark.parametrize( + "arr_type", + [np.array] + + COO_CONTAINERS + + CSC_CONTAINERS + + CSR_CONTAINERS + + DOK_CONTAINERS + + LIL_CONTAINERS, +) +def test_label_binarize_multilabel(arr_type): + y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]]) + classes = [0, 1, 2] + pos_label = 2 + neg_label = 0 + expected = pos_label * y_ind + y = arr_type(y_ind) + + check_binarized_results(y, classes, pos_label, neg_label, expected) + + with pytest.raises(ValueError): + label_binarize( + y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True + ) + + +def test_invalid_input_label_binarize(): + with pytest.raises(ValueError): + label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1) + with pytest.raises(ValueError, match="continuous target data is not "): + label_binarize([1.2, 2.7], classes=[0, 1]) + with pytest.raises(ValueError, match="mismatch with the labels"): + label_binarize([[1, 3]], classes=[1, 2, 3]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_inverse_binarize_multiclass(csr_container): + got = _inverse_binarize_multiclass( + csr_container([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3) + ) + assert_array_equal(got, np.array([1, 1, 0])) + + +def 
test_nan_label_encoder(): + """Check that label encoder encodes nans in transform. + + Non-regression test for #22628. + """ + le = LabelEncoder() + le.fit(["a", "a", "b", np.nan]) + + y_trans = le.transform([np.nan]) + assert_array_equal(y_trans, [2]) + + +@pytest.mark.parametrize( + "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()] +) +def test_label_encoders_do_not_have_set_output(encoder): + """Check that label encoders do not define set_output and work with y as a kwarg. + + Non-regression test for #26854. + """ + assert not hasattr(encoder, "set_output") + y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"]) + y_encoded_positional = encoder.fit_transform(["a", "b", "c"]) + assert_array_equal(y_encoded_with_kwarg, y_encoded_positional) + + +@pytest.mark.parametrize( + "array_namespace, device, dtype", + yield_namespace_device_dtype_combinations(), + ids=_get_namespace_device_dtype_ids, +) +@pytest.mark.parametrize( + "y", + [ + np.array([2, 1, 3, 1, 3]), + np.array([1, 1, 4, 5, -1, 0]), + np.array([3, 5, 9, 5, 9, 3]), + ], +) +def test_label_encoder_array_api_compliance(y, array_namespace, device, dtype): + xp = _array_api_for_tests(array_namespace, device) + xp_y = xp.asarray(y, device=device) + with config_context(array_api_dispatch=True): + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_label = xp_label.fit(xp_y) + xp_transformed = xp_label.transform(xp_y) + xp_inv_transformed = xp_label.inverse_transform(xp_transformed) + np_label = np_label.fit(y) + np_transformed = np_label.transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_inv_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_inv_transformed, xp), y) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) + + xp_label = LabelEncoder() + np_label = LabelEncoder() + xp_transformed = xp_label.fit_transform(xp_y) + np_transformed = np_label.fit_transform(y) + assert get_namespace(xp_transformed)[0].__name__ == xp.__name__ + assert get_namespace(xp_label.classes_)[0].__name__ == xp.__name__ + assert_array_equal(_convert_to_numpy(xp_transformed, xp), np_transformed) + assert_array_equal(_convert_to_numpy(xp_label.classes_, xp), np_label.classes_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_polynomial.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_polynomial.py new file mode 100644 index 0000000000000000000000000000000000000000..640bf5705baad6ee644ba81942791864f9587f60 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_polynomial.py @@ -0,0 +1,1230 @@ +import sys + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal +from scipy import sparse +from scipy.interpolate import BSpline +from scipy.sparse import random as sparse_random + +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import ( + KBinsDiscretizer, + PolynomialFeatures, + SplineTransformer, +) +from sklearn.preprocessing._csr_polynomial_expansion import ( + _get_sizeof_LARGEST_INT_t, +) +from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils.fixes import ( + CSC_CONTAINERS, + CSR_CONTAINERS, + parse_version, + sp_version, +) + + 
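+# A quick orientation for the two transformers exercised below (a minimal
+# sketch, assuming scikit-learn's default parameters):
+#
+#     import numpy as np
+#     from sklearn.preprocessing import PolynomialFeatures
+#
+#     PolynomialFeatures(degree=2).fit_transform(np.array([[2.0]]))
+#     # -> array([[1., 2., 4.]])  # bias term, x, x**2
+#
+# SplineTransformer instead expands each feature into B-spline basis functions
+# which, inside the knot range, sum to 1 per row (see
+# test_spline_transformer_unity_decomposition below).
+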
+@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer)) +def test_polynomial_and_spline_array_order(est): + """Test that output array has the given order.""" + X = np.arange(10).reshape(5, 2) + + def is_c_contiguous(a): + return np.isfortran(a.T) + + assert is_c_contiguous(est().fit_transform(X)) + assert is_c_contiguous(est(order="C").fit_transform(X)) + assert np.isfortran(est(order="F").fit_transform(X)) + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"knots": [[1]]}, r"Number of knots, knots.shape\[0\], must be >= 2."), + ({"knots": [[1, 1], [2, 2]]}, r"knots.shape\[1\] == n_features is violated"), + ({"knots": [[1], [0]]}, "knots must be sorted without duplicates."), + ], +) +def test_spline_transformer_input_validation(params, err_msg): + """Test that we raise errors for invalid input in SplineTransformer.""" + X = [[1], [2]] + + with pytest.raises(ValueError, match=err_msg): + SplineTransformer(**params).fit(X) + + +@pytest.mark.parametrize("extrapolation", ["continue", "periodic"]) +def test_spline_transformer_integer_knots(extrapolation): + """Test that SplineTransformer accepts integer value knot positions.""" + X = np.arange(20).reshape(10, 2) + knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]] + _ = SplineTransformer( + degree=3, knots=knots, extrapolation=extrapolation + ).fit_transform(X) + + +def test_spline_transformer_feature_names(): + """Test that SplineTransformer generates correct features name.""" + X = np.arange(20).reshape(10, 2) + splt = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X) + feature_names = splt.get_feature_names_out() + assert_array_equal( + feature_names, + [ + "x0_sp_0", + "x0_sp_1", + "x0_sp_2", + "x0_sp_3", + "x0_sp_4", + "x1_sp_0", + "x1_sp_1", + "x1_sp_2", + "x1_sp_3", + "x1_sp_4", + ], + ) + + splt = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X) + feature_names = splt.get_feature_names_out(["a", "b"]) + assert_array_equal( + feature_names, + [ + "a_sp_0", + "a_sp_1", + "a_sp_2", + "a_sp_3", + "b_sp_0", + "b_sp_1", + "b_sp_2", + "b_sp_3", + ], + ) + + +@pytest.mark.parametrize( + "extrapolation", + ["constant", "linear", "continue", "periodic"], +) +@pytest.mark.parametrize("degree", [2, 3]) +def test_split_transform_feature_names_extrapolation_degree(extrapolation, degree): + """Test feature names are correct for different extrapolations and degree. + + Non-regression test for gh-25292. + """ + X = np.arange(20).reshape(10, 2) + splt = SplineTransformer(degree=degree, extrapolation=extrapolation).fit(X) + feature_names = splt.get_feature_names_out(["a", "b"]) + assert len(feature_names) == splt.n_features_out_ + + X_trans = splt.transform(X) + assert X_trans.shape[1] == len(feature_names) + + +@pytest.mark.parametrize("degree", range(1, 5)) +@pytest.mark.parametrize("n_knots", range(3, 5)) +@pytest.mark.parametrize("knots", ["uniform", "quantile"]) +@pytest.mark.parametrize("extrapolation", ["constant", "periodic"]) +def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation): + """Test that B-splines are indeed a decomposition of unity. + + Splines basis functions must sum up to 1 per row, if we stay in between boundaries. + """ + X = np.linspace(0, 1, 100)[:, None] + # make the boundaries 0 and 1 part of X_train, for sure. 
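+    # np.r_ stacks the rows: [[0]], every second row of X, then [[1]], so the
+    # knots computed from X_train span the whole [0, 1] range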
+ X_train = np.r_[[[0]], X[::2, :], [[1]]] + X_test = X[1::2, :] + + if extrapolation == "periodic": + n_knots = n_knots + degree # periodic splines require degree < n_knots + + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + knots=knots, + include_bias=True, + extrapolation=extrapolation, + ) + splt.fit(X_train) + for X in [X_train, X_test]: + assert_allclose(np.sum(splt.transform(X), axis=1), 1) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_linear_regression(bias, intercept): + """Test that B-splines fit a sinusodial curve pretty well.""" + X = np.linspace(0, 10, 100)[:, None] + y = np.sin(X[:, 0]) + 2 # +2 to avoid the value 0 in assert_allclose + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=15, + degree=3, + include_bias=bias, + extrapolation="constant", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict(X), y, rtol=1e-3) + + +@pytest.mark.parametrize( + ["knots", "n_knots", "sample_weight", "expected_knots"], + [ + ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])), + ( + "uniform", + 3, + np.array([0, 0, 1, 1, 0, 3, 1]), + np.array([[2, 2], [4, 8], [6, 14]]), + ), + ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])), + ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])), + ( + "quantile", + 3, + np.array([0, 0, 1, 1, 0, 3, 1]), + np.array([[2, 2], [5, 8], [6, 14]]), + ), + ], +) +def test_spline_transformer_get_base_knot_positions( + knots, n_knots, sample_weight, expected_knots +): + """Check the behaviour to find knot positions with and without sample_weight.""" + X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]]) + base_knots = SplineTransformer._get_base_knot_positions( + X=X, knots=knots, n_knots=n_knots, sample_weight=sample_weight + ) + assert_allclose(base_knots, expected_knots) + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +def test_spline_transformer_periodic_linear_regression(bias, intercept): + """Test that B-splines fit a periodic curve pretty well.""" + + # "+ 3" to avoid the value 0 in assert_allclose + def f(x): + return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3 + + X = np.linspace(0, 1, 101)[:, None] + pipe = Pipeline( + steps=[ + ( + "spline", + SplineTransformer( + n_knots=20, + degree=3, + include_bias=bias, + extrapolation="periodic", + ), + ), + ("ols", LinearRegression(fit_intercept=intercept)), + ] + ) + pipe.fit(X, f(X[:, 0])) + + # Generate larger array to check periodic extrapolation + X_ = np.linspace(-1, 2, 301)[:, None] + predictions = pipe.predict(X_) + assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01) + assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3) + + +def test_spline_transformer_periodic_spline_backport(): + """Test that the backport of extrapolate="periodic" works correctly""" + X = np.linspace(-2, 3.5, 10)[:, None] + degree = 2 + + # Use periodic extrapolation backport in SplineTransformer + transformer = SplineTransformer( + degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]] + ) + Xt = transformer.fit_transform(X) + + # Use periodic extrapolation in BSpline + coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]]) + spl = BSpline(np.arange(-3, 4), coef, degree, "periodic") + Xspl = spl(X[:, 0]) + assert_allclose(Xt, Xspl) + + +def test_spline_transformer_periodic_splines_periodicity(): + """Test if shifted 
knots result in the same transformation up to permutation.""" + X = np.linspace(0, 10, 101)[:, None] + + transformer_1 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]], + ) + + transformer_2 = SplineTransformer( + degree=3, + extrapolation="periodic", + knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]], + ) + + Xt_1 = transformer_1.fit_transform(X) + Xt_2 = transformer_2.fit_transform(X) + + assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]]) + + +@pytest.mark.parametrize("degree", [3, 5]) +def test_spline_transformer_periodic_splines_smoothness(degree): + """Test that spline transformation is smooth at first / last knot.""" + X = np.linspace(-2, 10, 10_000)[:, None] + + transformer = SplineTransformer( + degree=degree, + extrapolation="periodic", + knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]], + ) + Xt = transformer.fit_transform(X) + + delta = (X.max() - X.min()) / len(X) + tol = 10 * delta + + dXt = Xt + # We expect splines of degree `degree` to be (`degree`-1) times + # continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th + # derivative should be continuous. This is the case if the (d+1)-th + # numerical derivative is reasonably small (smaller than `tol` in absolute + # value). We thus compute d-th numeric derivatives for d = 1, ..., `degree` + # and compare them to `tol`. + # + # Note that the 0-th derivative is the function itself, such that we are + # also checking its continuity. + for d in range(1, degree + 1): + # Check continuity of the (d-1)-th derivative + diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() < tol + # Compute d-th numeric derivative + dXt = diff / delta + + # As degree `degree` splines are not `degree` times continuously + # differentiable at the knots, the `degree + 1`-th numeric derivative + # should have spikes at the knots. 
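+    # after the loop dXt holds the degree-th numerical derivative; one more
+    # finite difference exposes the jumps at the knots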
+ diff = np.diff(dXt, axis=0) + assert np.abs(diff).max() > 1 + + +@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)]) +@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5]) +def test_spline_transformer_extrapolation(bias, intercept, degree): + """Test that B-spline extrapolation works correctly.""" + # we use a straight line for that + X = np.linspace(-1, 1, 100)[:, None] + y = X.squeeze() + + # 'constant' + pipe = Pipeline( + [ + [ + "spline", + SplineTransformer( + n_knots=4, + degree=degree, + include_bias=bias, + extrapolation="constant", + ), + ], + ["ols", LinearRegression(fit_intercept=intercept)], + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict([[-10], [5]]), [-1, 1]) + + # 'linear' + pipe = Pipeline( + [ + [ + "spline", + SplineTransformer( + n_knots=4, + degree=degree, + include_bias=bias, + extrapolation="linear", + ), + ], + ["ols", LinearRegression(fit_intercept=intercept)], + ] + ) + pipe.fit(X, y) + assert_allclose(pipe.predict([[-10], [5]]), [-10, 5]) + + # 'error' + splt = SplineTransformer( + n_knots=4, degree=degree, include_bias=bias, extrapolation="error" + ) + splt.fit(X) + msg = "X contains values beyond the limits of the knots" + with pytest.raises(ValueError, match=msg): + splt.transform([[-10]]) + with pytest.raises(ValueError, match=msg): + splt.transform([[5]]) + + +def test_spline_transformer_kbindiscretizer(global_random_seed): + """Test that a B-spline of degree=0 is equivalent to KBinsDiscretizer.""" + rng = np.random.RandomState(global_random_seed) + X = rng.randn(200).reshape(200, 1) + n_bins = 5 + n_knots = n_bins + 1 + + splt = SplineTransformer( + n_knots=n_knots, degree=0, knots="quantile", include_bias=True + ) + splines = splt.fit_transform(X) + + kbd = KBinsDiscretizer( + n_bins=n_bins, + encode="onehot-dense", + strategy="quantile", + quantile_method="averaged_inverted_cdf", + ) + kbins = kbd.fit_transform(X) + + # Though they should be exactly equal, we test approximately with high + # accuracy. 
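+    # The equivalence holds because each degree-0 B-spline basis function is the
+    # indicator of one inter-knot interval, and with knots="quantile" the
+    # n_knots - 1 = n_bins intervals coincide with the quantile bin edges used by
+    # KBinsDiscretizer here, so both transforms yield the same one-hot indicators.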
+ assert_allclose(splines, kbins, rtol=1e-13) + + +@pytest.mark.parametrize("degree", range(1, 3)) +@pytest.mark.parametrize("knots", ["uniform", "quantile"]) +@pytest.mark.parametrize( + "extrapolation", ["error", "constant", "linear", "continue", "periodic"] +) +@pytest.mark.parametrize("include_bias", [False, True]) +def test_spline_transformer_sparse_output( + degree, knots, extrapolation, include_bias, global_random_seed +): + rng = np.random.RandomState(global_random_seed) + X = rng.randn(200).reshape(40, 5) + + splt_dense = SplineTransformer( + degree=degree, + knots=knots, + extrapolation=extrapolation, + include_bias=include_bias, + sparse_output=False, + ) + splt_sparse = SplineTransformer( + degree=degree, + knots=knots, + extrapolation=extrapolation, + include_bias=include_bias, + sparse_output=True, + ) + + splt_dense.fit(X) + splt_sparse.fit(X) + + X_trans_sparse = splt_sparse.transform(X) + X_trans_dense = splt_dense.transform(X) + assert sparse.issparse(X_trans_sparse) and X_trans_sparse.format == "csr" + assert_allclose(X_trans_dense, X_trans_sparse.toarray()) + + # extrapolation regime + X_min = np.amin(X, axis=0) + X_max = np.amax(X, axis=0) + X_extra = np.r_[ + np.linspace(X_min - 5, X_min, 10), np.linspace(X_max, X_max + 5, 10) + ] + if extrapolation == "error": + msg = "X contains values beyond the limits of the knots" + with pytest.raises(ValueError, match=msg): + splt_dense.transform(X_extra) + msg = "Out of bounds" + with pytest.raises(ValueError, match=msg): + splt_sparse.transform(X_extra) + else: + assert_allclose( + splt_dense.transform(X_extra), splt_sparse.transform(X_extra).toarray() + ) + + +@pytest.mark.parametrize("n_knots", [5, 10]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("degree", [3, 4]) +@pytest.mark.parametrize( + "extrapolation", ["error", "constant", "linear", "continue", "periodic"] +) +@pytest.mark.parametrize("sparse_output", [False, True]) +def test_spline_transformer_n_features_out( + n_knots, include_bias, degree, extrapolation, sparse_output +): + """Test that transform results in n_features_out_ features.""" + splt = SplineTransformer( + n_knots=n_knots, + degree=degree, + include_bias=include_bias, + extrapolation=extrapolation, + sparse_output=sparse_output, + ) + X = np.linspace(0, 1, 10)[:, None] + splt.fit(X) + + assert splt.transform(X).shape[1] == splt.n_features_out_ + + +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"), + ({"degree": (1, 2, 3)}, r"int or tuple \(min_degree, max_degree\)"), + ], +) +def test_polynomial_features_input_validation(params, err_msg): + """Test that we raise errors for invalid input in PolynomialFeatures.""" + X = [[1], [2]] + + with pytest.raises(ValueError, match=err_msg): + PolynomialFeatures(**params).fit(X) + + +@pytest.fixture() +def single_feature_degree3(): + X = np.arange(6)[:, np.newaxis] + P = np.hstack([np.ones_like(X), X, X**2, X**3]) + return X, P + + +@pytest.mark.parametrize( + "degree, include_bias, interaction_only, indices", + [ + (3, True, False, slice(None, None)), + (3, False, False, slice(1, None)), + (3, True, True, [0, 1]), + (3, False, True, [1]), + ((2, 3), True, False, [0, 2, 3]), + ((2, 3), False, False, [2, 3]), + ((2, 3), True, True, [0]), + ((2, 3), False, True, []), + ], +) +@pytest.mark.parametrize("X_container", [None] + 
CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_one_feature( + single_feature_degree3, + degree, + include_bias, + interaction_only, + indices, + X_container, +): + """Test PolynomialFeatures on single feature up to degree 3.""" + X, P = single_feature_degree3 + if X_container is not None: + X = X_container(X) + tf = PolynomialFeatures( + degree=degree, include_bias=include_bias, interaction_only=interaction_only + ).fit(X) + out = tf.transform(X) + if X_container is not None: + out = out.toarray() + assert_allclose(out, P[:, indices]) + if tf.n_output_features_ > 0: + assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) + + +@pytest.fixture() +def two_features_degree3(): + X = np.arange(6).reshape((3, 2)) + x1 = X[:, :1] + x2 = X[:, 1:] + P = np.hstack( + [ + x1**0 * x2**0, # 0 + x1**1 * x2**0, # 1 + x1**0 * x2**1, # 2 + x1**2 * x2**0, # 3 + x1**1 * x2**1, # 4 + x1**0 * x2**2, # 5 + x1**3 * x2**0, # 6 + x1**2 * x2**1, # 7 + x1**1 * x2**2, # 8 + x1**0 * x2**3, # 9 + ] + ) + return X, P + + +@pytest.mark.parametrize( + "degree, include_bias, interaction_only, indices", + [ + (2, True, False, slice(0, 6)), + (2, False, False, slice(1, 6)), + (2, True, True, [0, 1, 2, 4]), + (2, False, True, [1, 2, 4]), + ((2, 2), True, False, [0, 3, 4, 5]), + ((2, 2), False, False, [3, 4, 5]), + ((2, 2), True, True, [0, 4]), + ((2, 2), False, True, [4]), + (3, True, False, slice(None, None)), + (3, False, False, slice(1, None)), + (3, True, True, [0, 1, 2, 4]), + (3, False, True, [1, 2, 4]), + ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]), + ((2, 3), False, False, slice(3, None)), + ((2, 3), True, True, [0, 4]), + ((2, 3), False, True, [4]), + ((3, 3), True, False, [0, 6, 7, 8, 9]), + ((3, 3), False, False, [6, 7, 8, 9]), + ((3, 3), True, True, [0]), + ((3, 3), False, True, []), # would need 3 input features + ], +) +@pytest.mark.parametrize("X_container", [None] + CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_two_features( + two_features_degree3, + degree, + include_bias, + interaction_only, + indices, + X_container, +): + """Test PolynomialFeatures on 2 features up to degree 3.""" + X, P = two_features_degree3 + if X_container is not None: + X = X_container(X) + tf = PolynomialFeatures( + degree=degree, include_bias=include_bias, interaction_only=interaction_only + ).fit(X) + out = tf.transform(X) + if X_container is not None: + out = out.toarray() + assert_allclose(out, P[:, indices]) + if tf.n_output_features_ > 0: + assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_) + + +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names_out() + assert_array_equal( + ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal( + [ + "a", + "b", + "c", + "a^2", + "a b", + "a c", + "b^2", + "b c", + "c^2", + "a^3", + "a^2 b", + "a^2 c", + "a b^2", + "a b c", + "a c^2", + "b^3", + "b^2 c", + "b c^2", + "c^3", + ], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal( + [ + "a^2", + "a b", + "a c", + "b^2", + "b 
c", + "c^2", + "a^3", + "a^2 b", + "a^2 c", + "a b^2", + "a b c", + "a c^2", + "b^3", + "b^2 c", + "b c^2", + "c^3", + ], + feature_names, + ) + assert len(feature_names) == poly.transform(X).shape[1] + + poly = PolynomialFeatures( + degree=(3, 3), include_bias=True, interaction_only=True + ).fit(X) + feature_names = poly.get_feature_names_out(["a", "b", "c"]) + assert_array_equal(["1", "a b c"], feature_names) + assert len(feature_names) == poly.transform(X).shape[1] + + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names_out(["\u0001F40D", "\u262e", "\u05d0"]) + assert_array_equal(["1", "\u0001F40D", "\u262e", "\u05d0"], feature_names) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + (4, False, False, np.float64), + (4, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csc_container", CSC_CONTAINERS) +def test_polynomial_features_csc_X( + deg, include_bias, interaction_only, dtype, csc_container, global_random_seed +): + rng = np.random.RandomState(global_random_seed) + X = rng.randint(0, 2, (100, 2)) + X_csc = csc_container(X) + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csc = est.fit_transform(X_csc.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert sparse.issparse(Xt_csc) and Xt_csc.format == "csc" + assert Xt_csc.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csc.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X( + deg, include_bias, interaction_only, dtype, csr_container, global_random_seed +): + rng = np.random.RandomState(global_random_seed) + X = rng.randint(0, 2, (100, 2)) + X_csr = csr_container(X) + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize("n_features", [1, 4, 5]) +@pytest.mark.parametrize( + "min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)] +) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_num_combinations( + n_features, min_degree, max_degree, interaction_only, include_bias, csr_container +): + """ + Test that n_output_features_ is calculated correctly. 
+ """ + x = csr_container(([1], ([0], [n_features - 1]))) + est = PolynomialFeatures( + degree=max_degree, + interaction_only=interaction_only, + include_bias=include_bias, + ) + est.fit(x) + num_combos = est.n_output_features_ + + combos = PolynomialFeatures._combinations( + n_features=n_features, + min_degree=0, + max_degree=max_degree, + interaction_only=interaction_only, + include_bias=include_bias, + ) + assert num_combos == sum([1 for _ in combos]) + + +@pytest.mark.parametrize( + ["deg", "include_bias", "interaction_only", "dtype"], + [ + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_floats( + deg, include_bias, interaction_only, dtype, csr_container, global_random_seed +): + X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=global_random_seed)) + X = X_csr.toarray() + + est = PolynomialFeatures( + deg, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["zero_row_index", "deg", "interaction_only"], + [ + (0, 2, True), + (1, 2, True), + (2, 2, True), + (0, 3, True), + (1, 3, True), + (2, 3, True), + (0, 2, False), + (1, 2, False), + (2, 2, False), + (0, 3, False), + (1, 3, False), + (2, 3, False), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_zero_row( + zero_row_index, deg, interaction_only, csr_container, global_random_seed +): + X_csr = csr_container(sparse_random(3, 10, 1.0, random_state=global_random_seed)) + X_csr[zero_row_index, :] = 0.0 + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +# This degree should always be one more than the highest degree supported by +# _csr_expansion. 
+@pytest.mark.parametrize( + ["include_bias", "interaction_only"], + [(True, True), (True, False), (False, True), (False, False)], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_degree_4( + include_bias, interaction_only, csr_container, global_random_seed +): + X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=global_random_seed)) + X = X_csr.toarray() + + est = PolynomialFeatures( + 4, include_bias=include_bias, interaction_only=interaction_only + ) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize( + ["deg", "dim", "interaction_only"], + [ + (2, 1, True), + (2, 2, True), + (3, 1, True), + (3, 2, True), + (3, 3, True), + (2, 1, False), + (2, 2, False), + (3, 1, False), + (3, 2, False), + (3, 3, False), + ], +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_polynomial_features_csr_X_dim_edges( + deg, dim, interaction_only, csr_container, global_random_seed +): + X_csr = csr_container( + sparse_random(1000, dim, 0.5, random_state=global_random_seed) + ) + X = X_csr.toarray() + + est = PolynomialFeatures(deg, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr" + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.toarray(), Xt_dense) + + +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_index_overflow_non_regression( + interaction_only, include_bias, csr_container +): + """Check the automatic index dtype promotion to `np.int64` when needed. + + This ensures that sufficiently large input configurations get + properly promoted to use `np.int64` for index and indptr representation + while preserving data integrity. Non-regression test for gh-16803. + + Note that this is only possible for Python runtimes with a 64 bit address + space. On 32 bit platforms, a `ValueError` is raised instead. + """ + + def degree_2_calc(d, i, j): + if interaction_only: + return d * i - (i**2 + 3 * i) // 2 - 1 + j + else: + return d * i - (i**2 + i) // 2 + j + + n_samples = 13 + n_features = 120001 + data_dtype = np.float32 + data = np.arange(1, 5, dtype=np.int64) + row = np.array([n_samples - 2, n_samples - 2, n_samples - 1, n_samples - 1]) + # An int64 dtype is required to avoid overflow error on Windows within the + # `degree_2_calc` function. + col = np.array( + [n_features - 2, n_features - 1, n_features - 2, n_features - 1], dtype=np.int64 + ) + X = csr_container( + (data, (row, col)), + shape=(n_samples, n_features), + dtype=data_dtype, + ) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=2 + ) + + # Calculate the number of combinations a-priori, and if needed check for + # the correct ValueError and terminate the test early. 
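+    # With degree=2 and n_features=120001 the full expansion has roughly
+    # comb(n_features + 2, 2) ~= 7.2e9 output columns (slightly fewer with
+    # interaction_only or without the bias column). This exceeds the int32 index
+    # range, which is what forces the promotion to int64, and it also exceeds
+    # np.intp on 32-bit platforms, where the ValueError branch below is taken.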
+ num_combinations = pf._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=2, + interaction_only=pf.interaction_only, + include_bias=pf.include_bias, + ) + if num_combinations > np.iinfo(np.intp).max: + msg = ( + r"The output that would result from the current configuration would have" + r" \d* features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + return + X_trans = pf.fit_transform(X) + row_nonzero, col_nonzero = X_trans.nonzero() + n_degree_1_features_out = n_features + include_bias + max_degree_2_idx = ( + degree_2_calc(n_features, col[int(not interaction_only)], col[1]) + + n_degree_1_features_out + ) + + # Account for bias of all samples except last one which will be handled + # separately since there are distinct data values before it + data_target = [1] * (n_samples - 2) if include_bias else [] + col_nonzero_target = [0] * (n_samples - 2) if include_bias else [] + + for i in range(2): + x = data[2 * i] + y = data[2 * i + 1] + x_idx = col[2 * i] + y_idx = col[2 * i + 1] + if include_bias: + data_target.append(1) + col_nonzero_target.append(0) + data_target.extend([x, y]) + col_nonzero_target.extend( + [x_idx + int(include_bias), y_idx + int(include_bias)] + ) + if not interaction_only: + data_target.extend([x * x, x * y, y * y]) + col_nonzero_target.extend( + [ + degree_2_calc(n_features, x_idx, x_idx) + n_degree_1_features_out, + degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out, + degree_2_calc(n_features, y_idx, y_idx) + n_degree_1_features_out, + ] + ) + else: + data_target.extend([x * y]) + col_nonzero_target.append( + degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out + ) + + nnz_per_row = int(include_bias) + 3 + 2 * int(not interaction_only) + + assert pf.n_output_features_ == max_degree_2_idx + 1 + assert X_trans.dtype == data_dtype + assert X_trans.shape == (n_samples, max_degree_2_idx + 1) + assert X_trans.indptr.dtype == X_trans.indices.dtype == np.int64 + # Ensure that dtype promotion was actually required: + assert X_trans.indices.max() > np.iinfo(np.int32).max + + row_nonzero_target = list(range(n_samples - 2)) if include_bias else [] + row_nonzero_target.extend( + [n_samples - 2] * nnz_per_row + [n_samples - 1] * nnz_per_row + ) + + assert_allclose(X_trans.data, data_target) + assert_array_equal(row_nonzero, row_nonzero_target) + assert_array_equal(col_nonzero, col_nonzero_target) + + +@pytest.mark.parametrize( + "degree, n_features", + [ + # Needs promotion to int64 when interaction_only=False + (2, 65535), + (3, 2344), + # This guarantees that the intermediate operation when calculating + # output columns would overflow a C-long, hence checks that python- + # longs are being used. + (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)), + (3, 65535), + # This case tests the second clause of the overflow check which + # takes into account the value of `n_features` itself. + (2, int(np.sqrt(np.iinfo(np.int64).max))), + ], +) +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_index_overflow( + degree, n_features, interaction_only, include_bias, csr_container +): + """Tests known edge-cases to the dtype promotion strategy and custom + Cython code, including a current bug in the upstream + `scipy.sparse.hstack`. 
+ """ + data = [1.0] + # Use int32 indices as much as we can + indices_dtype = np.int32 if n_features - 1 <= np.iinfo(np.int32).max else np.int64 + row = np.array([0], dtype=indices_dtype) + col = np.array([n_features - 1], dtype=indices_dtype) + + # First degree index + expected_indices = [ + n_features - 1 + int(include_bias), + ] + # Second degree index + expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0]) + # Third degree index + expected_indices.append( + n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1] + ) + + X = csr_container((data, (row, col))) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=degree + ) + + # Calculate the number of combinations a-priori, and if needed check for + # the correct ValueError and terminate the test early. + num_combinations = pf._num_combinations( + n_features=n_features, + min_degree=0, + max_degree=degree, + interaction_only=pf.interaction_only, + include_bias=pf.include_bias, + ) + if num_combinations > np.iinfo(np.intp).max: + msg = ( + r"The output that would result from the current configuration would have" + r" \d* features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + return + + # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right + # dtype for representing indices and indptr if `n_features` is still + # small enough so that each block matrix's indices and indptr arrays + # can be represented with `np.int32`. We test `n_features==65535` + # since it is guaranteed to run into this bug. + if ( + sp_version < parse_version("1.9.2") + and n_features == 65535 + and degree == 2 + and not interaction_only + ): # pragma: no cover + msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" + with pytest.raises(ValueError, match=msg): + X_trans = pf.fit_transform(X) + return + X_trans = pf.fit_transform(X) + + expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32 + # Terms higher than first degree + non_bias_terms = 1 + (degree - 1) * int(not interaction_only) + expected_nnz = int(include_bias) + non_bias_terms + assert X_trans.dtype == X.dtype + assert X_trans.shape == (1, pf.n_output_features_) + assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype + assert X_trans.nnz == expected_nnz + + if include_bias: + assert X_trans[0, 0] == pytest.approx(1.0) + for idx in range(non_bias_terms): + assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0) + + offset = interaction_only * n_features + if degree == 3: + offset *= 1 + n_features + assert pf.n_output_features_ == expected_indices[degree - 1] + 1 - offset + + +@pytest.mark.parametrize("interaction_only", [True, False]) +@pytest.mark.parametrize("include_bias", [True, False]) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_too_large_to_index( + interaction_only, include_bias, csr_container +): + n_features = np.iinfo(np.int64).max // 2 + data = [1.0] + row = [0] + col = [n_features - 1] + X = csr_container((data, (row, col))) + pf = PolynomialFeatures( + interaction_only=interaction_only, include_bias=include_bias, degree=(2, 2) + ) + msg = ( + r"The output that would result from the current configuration would have \d*" + r" features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit(X) + with pytest.raises(ValueError, match=msg): + pf.fit_transform(X) + + 
+@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) +def test_polynomial_features_behaviour_on_zero_degree(sparse_container): + """Check that PolynomialFeatures raises error when degree=0 and include_bias=False, + and output a single constant column when include_bias=True + """ + X = np.ones((10, 2)) + poly = PolynomialFeatures(degree=0, include_bias=False) + err_msg = ( + "Setting degree to zero and include_bias to False would result in" + " an empty output array." + ) + with pytest.raises(ValueError, match=err_msg): + poly.fit_transform(X) + + poly = PolynomialFeatures(degree=(0, 0), include_bias=False) + err_msg = ( + "Setting both min_degree and max_degree to zero and include_bias to" + " False would result in an empty output array." + ) + with pytest.raises(ValueError, match=err_msg): + poly.fit_transform(X) + + for _X in [X, sparse_container(X)]: + poly = PolynomialFeatures(degree=0, include_bias=True) + output = poly.fit_transform(_X) + # convert to dense array if needed + if sparse.issparse(output): + output = output.toarray() + assert_array_equal(output, np.ones((X.shape[0], 1))) + + +def test_sizeof_LARGEST_INT_t(): + # On Windows, scikit-learn is typically compiled with MSVC that + # does not support int128 arithmetic (at the time of writing): + # https://stackoverflow.com/a/6761962/163740 + if sys.platform == "win32" or ( + sys.maxsize <= 2**32 and sys.platform != "emscripten" + ): + expected_size = 8 + else: + expected_size = 16 + + assert _get_sizeof_LARGEST_INT_t() == expected_size + + +@pytest.mark.xfail( + sys.platform == "win32", + reason=( + "On Windows, scikit-learn is typically compiled with MSVC that does not support" + " int128 arithmetic (at the time of writing)" + ), + run=True, +) +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_csr_polynomial_expansion_windows_fail(csr_container): + # Minimum needed to ensure integer overflow occurs while guaranteeing an + # int64-indexable output. 
+ n_features = int(np.iinfo(np.int64).max ** (1 / 3) + 3) + data = [1.0] + row = [0] + col = [n_features - 1] + + # First degree index + expected_indices = [ + n_features - 1, + ] + # Second degree index + expected_indices.append( + int(n_features * (n_features + 1) // 2 + expected_indices[0]) + ) + # Third degree index + expected_indices.append( + int(n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]) + ) + + X = csr_container((data, (row, col))) + pf = PolynomialFeatures(interaction_only=False, include_bias=False, degree=3) + if sys.maxsize <= 2**32: + msg = ( + r"The output that would result from the current configuration would" + r" have \d*" + r" features which is too large to be indexed" + ) + with pytest.raises(ValueError, match=msg): + pf.fit_transform(X) + else: + X_trans = pf.fit_transform(X) + for idx in range(3): + assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0) diff --git a/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_target_encoder.py b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_target_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..536f2e031bf771dab7d73b7f4d5447b155c53ec3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/preprocessing/tests/test_target_encoder.py @@ -0,0 +1,714 @@ +import re + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Ridge +from sklearn.model_selection import ( + KFold, + ShuffleSplit, + StratifiedKFold, + cross_val_score, + train_test_split, +) +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + KBinsDiscretizer, + LabelBinarizer, + LabelEncoder, + TargetEncoder, +) + + +def _encode_target(X_ordinal, y_numeric, n_categories, smooth): + """Simple Python implementation of target encoding.""" + cur_encodings = np.zeros(n_categories, dtype=np.float64) + y_mean = np.mean(y_numeric) + + if smooth == "auto": + y_variance = np.var(y_numeric) + for c in range(n_categories): + y_subset = y_numeric[X_ordinal == c] + n_i = y_subset.shape[0] + + if n_i == 0: + cur_encodings[c] = y_mean + continue + + y_subset_variance = np.var(y_subset) + m = y_subset_variance / y_variance + lambda_ = n_i / (n_i + m) + + cur_encodings[c] = lambda_ * np.mean(y_subset) + (1 - lambda_) * y_mean + return cur_encodings + else: # float + for c in range(n_categories): + y_subset = y_numeric[X_ordinal == c] + current_sum = np.sum(y_subset) + y_mean * smooth + current_cnt = y_subset.shape[0] + smooth + cur_encodings[c] = current_sum / current_cnt + return cur_encodings + + +@pytest.mark.parametrize( + "categories, unknown_value", + [ + ([np.array([0, 1, 2], dtype=np.int64)], 4), + ([np.array([1.0, 3.0, np.nan], dtype=np.float64)], 6.0), + ([np.array(["cat", "dog", "snake"], dtype=object)], "bear"), + ("auto", 3), + ], +) +@pytest.mark.parametrize("smooth", [5.0, "auto"]) +@pytest.mark.parametrize("target_type", ["binary", "continuous"]) +def test_encoding(categories, unknown_value, global_random_seed, smooth, target_type): + """Check encoding for binary and continuous targets. + + Compare the values returned by `TargetEncoder.fit_transform` against the + expected encodings for cv splits from a naive reference Python + implementation in _encode_target. 
+ """ + + n_categories = 3 + X_train_int_array = np.array([[0] * 20 + [1] * 30 + [2] * 40], dtype=np.int64).T + X_test_int_array = np.array([[0, 1, 2]], dtype=np.int64).T + n_samples = X_train_int_array.shape[0] + + if categories == "auto": + X_train = X_train_int_array + X_test = X_test_int_array + else: + X_train = categories[0][X_train_int_array] + X_test = categories[0][X_test_int_array] + + X_test = np.concatenate((X_test, [[unknown_value]])) + + data_rng = np.random.RandomState(global_random_seed) + n_splits = 3 + if target_type == "binary": + y_numeric = data_rng.randint(low=0, high=2, size=n_samples) + target_names = np.array(["cat", "dog"], dtype=object) + y_train = target_names[y_numeric] + + else: + assert target_type == "continuous" + y_numeric = data_rng.uniform(low=-10, high=20, size=n_samples) + y_train = y_numeric + + shuffled_idx = data_rng.permutation(n_samples) + X_train_int_array = X_train_int_array[shuffled_idx] + X_train = X_train[shuffled_idx] + y_train = y_train[shuffled_idx] + y_numeric = y_numeric[shuffled_idx] + + # Define our CV splitting strategy + if target_type == "binary": + cv = StratifiedKFold( + n_splits=n_splits, random_state=global_random_seed, shuffle=True + ) + else: + cv = KFold(n_splits=n_splits, random_state=global_random_seed, shuffle=True) + + # Compute the expected values using our reference Python implementation of + # target encoding: + expected_X_fit_transform = np.empty_like(X_train_int_array, dtype=np.float64) + + for train_idx, test_idx in cv.split(X_train_int_array, y_train): + X_, y_ = X_train_int_array[train_idx, 0], y_numeric[train_idx] + cur_encodings = _encode_target(X_, y_, n_categories, smooth) + expected_X_fit_transform[test_idx, 0] = cur_encodings[ + X_train_int_array[test_idx, 0] + ] + + # Check that we can obtain the same encodings by calling `fit_transform` on + # the estimator with the same CV parameters: + target_encoder = TargetEncoder( + smooth=smooth, + categories=categories, + cv=n_splits, + random_state=global_random_seed, + ) + + X_fit_transform = target_encoder.fit_transform(X_train, y_train) + + assert target_encoder.target_type_ == target_type + assert_allclose(X_fit_transform, expected_X_fit_transform) + assert len(target_encoder.encodings_) == 1 + if target_type == "binary": + assert_array_equal(target_encoder.classes_, target_names) + else: + assert target_encoder.classes_ is None + + # compute encodings for all data to validate `transform` + y_mean = np.mean(y_numeric) + expected_encodings = _encode_target( + X_train_int_array[:, 0], y_numeric, n_categories, smooth + ) + assert_allclose(target_encoder.encodings_[0], expected_encodings) + assert target_encoder.target_mean_ == pytest.approx(y_mean) + + # Transform on test data, the last value is unknown so it is encoded as the target + # mean + expected_X_test_transform = np.concatenate( + (expected_encodings, np.array([y_mean])) + ).reshape(-1, 1) + + X_test_transform = target_encoder.transform(X_test) + assert_allclose(X_test_transform, expected_X_test_transform) + + +@pytest.mark.parametrize( + "categories, unknown_values", + [ + ([np.array([0, 1, 2], dtype=np.int64)], "auto"), + ([np.array(["cat", "dog", "snake"], dtype=object)], ["bear", "rabbit"]), + ], +) +@pytest.mark.parametrize( + "target_labels", [np.array([1, 2, 3]), np.array(["a", "b", "c"])] +) +@pytest.mark.parametrize("smooth", [5.0, "auto"]) +def test_encoding_multiclass( + global_random_seed, categories, unknown_values, target_labels, smooth +): + """Check encoding for multiclass targets.""" + 
rng = np.random.RandomState(global_random_seed) + + n_samples = 80 + n_features = 2 + feat_1_int = np.array(rng.randint(low=0, high=2, size=n_samples)) + feat_2_int = np.array(rng.randint(low=0, high=3, size=n_samples)) + feat_1 = categories[0][feat_1_int] + feat_2 = categories[0][feat_2_int] + X_train = np.column_stack((feat_1, feat_2)) + X_train_int = np.column_stack((feat_1_int, feat_2_int)) + categories_ = [[0, 1], [0, 1, 2]] + + n_classes = 3 + y_train_int = np.array(rng.randint(low=0, high=n_classes, size=n_samples)) + y_train = target_labels[y_train_int] + y_train_enc = LabelBinarizer().fit_transform(y_train) + + n_splits = 3 + cv = StratifiedKFold( + n_splits=n_splits, random_state=global_random_seed, shuffle=True + ) + + # Manually compute encodings for cv splits to validate `fit_transform` + expected_X_fit_transform = np.empty( + (X_train_int.shape[0], X_train_int.shape[1] * n_classes), + dtype=np.float64, + ) + for f_idx, cats in enumerate(categories_): + for c_idx in range(n_classes): + for train_idx, test_idx in cv.split(X_train, y_train): + y_class = y_train_enc[:, c_idx] + X_, y_ = X_train_int[train_idx, f_idx], y_class[train_idx] + current_encoding = _encode_target(X_, y_, len(cats), smooth) + # f_idx: 0, 0, 0, 1, 1, 1 + # c_idx: 0, 1, 2, 0, 1, 2 + # exp_idx: 0, 1, 2, 3, 4, 5 + exp_idx = c_idx + (f_idx * n_classes) + expected_X_fit_transform[test_idx, exp_idx] = current_encoding[ + X_train_int[test_idx, f_idx] + ] + + target_encoder = TargetEncoder( + smooth=smooth, + cv=n_splits, + random_state=global_random_seed, + ) + X_fit_transform = target_encoder.fit_transform(X_train, y_train) + + assert target_encoder.target_type_ == "multiclass" + assert_allclose(X_fit_transform, expected_X_fit_transform) + + # Manually compute encoding to validate `transform` + expected_encodings = [] + for f_idx, cats in enumerate(categories_): + for c_idx in range(n_classes): + y_class = y_train_enc[:, c_idx] + current_encoding = _encode_target( + X_train_int[:, f_idx], y_class, len(cats), smooth + ) + expected_encodings.append(current_encoding) + + assert len(target_encoder.encodings_) == n_features * n_classes + for i in range(n_features * n_classes): + assert_allclose(target_encoder.encodings_[i], expected_encodings[i]) + assert_array_equal(target_encoder.classes_, target_labels) + + # Include unknown values at the end + X_test_int = np.array([[0, 1], [1, 2], [4, 5]]) + if unknown_values == "auto": + X_test = X_test_int + else: + X_test = np.empty_like(X_test_int[:-1, :], dtype=object) + for column_idx in range(X_test_int.shape[1]): + X_test[:, column_idx] = categories[0][X_test_int[:-1, column_idx]] + # Add unknown values at end + X_test = np.vstack((X_test, unknown_values)) + + y_mean = np.mean(y_train_enc, axis=0) + expected_X_test_transform = np.empty( + (X_test_int.shape[0], X_test_int.shape[1] * n_classes), + dtype=np.float64, + ) + n_rows = X_test_int.shape[0] + f_idx = [0, 0, 0, 1, 1, 1] + # Last row are unknowns, dealt with later + for row_idx in range(n_rows - 1): + for i, enc in enumerate(expected_encodings): + expected_X_test_transform[row_idx, i] = enc[X_test_int[row_idx, f_idx[i]]] + + # Unknowns encoded as target mean for each class + # `y_mean` contains target mean for each class, thus cycle through mean of + # each class, `n_features` times + mean_idx = [0, 1, 2, 0, 1, 2] + for i in range(n_classes * n_features): + expected_X_test_transform[n_rows - 1, i] = y_mean[mean_idx[i]] + + X_test_transform = target_encoder.transform(X_test) + assert_allclose(X_test_transform, 
expected_X_test_transform) + + +@pytest.mark.parametrize( + "X, categories", + [ + ( + np.array([[0] * 10 + [1] * 10 + [3]], dtype=np.int64).T, # 3 is unknown + [[0, 1, 2]], + ), + ( + np.array( + [["cat"] * 10 + ["dog"] * 10 + ["snake"]], dtype=object + ).T, # snake is unknown + [["dog", "cat", "cow"]], + ), + ], +) +@pytest.mark.parametrize("smooth", [4.0, "auto"]) +def test_custom_categories(X, categories, smooth): + """Custom categories with unknown categories that are not in training data.""" + rng = np.random.RandomState(0) + y = rng.uniform(low=-10, high=20, size=X.shape[0]) + enc = TargetEncoder(categories=categories, smooth=smooth, random_state=0).fit(X, y) + + # The last element is unknown and encoded as the mean + y_mean = y.mean() + X_trans = enc.transform(X[-1:]) + assert X_trans[0, 0] == pytest.approx(y_mean) + + assert len(enc.encodings_) == 1 + # custom category that is not in training data + assert enc.encodings_[0][-1] == pytest.approx(y_mean) + + +@pytest.mark.parametrize( + "y, msg", + [ + ([1, 2, 0, 1], "Found input variables with inconsistent"), + ( + np.array([[1, 2, 0], [1, 2, 3]]).T, + "Target type was inferred to be 'multiclass-multioutput'", + ), + ], +) +def test_errors(y, msg): + """Check invalidate input.""" + X = np.array([[1, 0, 1]]).T + + enc = TargetEncoder() + with pytest.raises(ValueError, match=msg): + enc.fit_transform(X, y) + + +def test_use_regression_target(): + """Check inferred and specified `target_type` on regression target.""" + X = np.array([[0, 1, 0, 1, 0, 1]]).T + y = np.array([1.0, 2.0, 3.0, 2.0, 3.0, 4.0]) + + enc = TargetEncoder(cv=2) + with pytest.warns( + UserWarning, + match=re.escape( + "The least populated class in y has only 1 members, which is less than" + " n_splits=2." + ), + ): + enc.fit_transform(X, y) + assert enc.target_type_ == "multiclass" + + enc = TargetEncoder(cv=2, target_type="continuous") + enc.fit_transform(X, y) + assert enc.target_type_ == "continuous" + + +@pytest.mark.parametrize( + "y, feature_names", + [ + ([1, 2] * 10, ["A", "B"]), + ([1, 2, 3] * 6 + [1, 2], ["A_1", "A_2", "A_3", "B_1", "B_2", "B_3"]), + ( + ["y1", "y2", "y3"] * 6 + ["y1", "y2"], + ["A_y1", "A_y2", "A_y3", "B_y1", "B_y2", "B_y3"], + ), + ], +) +def test_feature_names_out_set_output(y, feature_names): + """Check TargetEncoder works with set_output.""" + pd = pytest.importorskip("pandas") + + X_df = pd.DataFrame({"A": ["a", "b"] * 10, "B": [1, 2] * 10}) + + enc_default = TargetEncoder(cv=2, smooth=3.0, random_state=0) + enc_default.set_output(transform="default") + enc_pandas = TargetEncoder(cv=2, smooth=3.0, random_state=0) + enc_pandas.set_output(transform="pandas") + + X_default = enc_default.fit_transform(X_df, y) + X_pandas = enc_pandas.fit_transform(X_df, y) + + assert_allclose(X_pandas.to_numpy(), X_default) + assert_array_equal(enc_pandas.get_feature_names_out(), feature_names) + assert_array_equal(enc_pandas.get_feature_names_out(), X_pandas.columns) + + +@pytest.mark.parametrize("to_pandas", [True, False]) +@pytest.mark.parametrize("smooth", [1.0, "auto"]) +@pytest.mark.parametrize("target_type", ["binary-ints", "binary-str", "continuous"]) +def test_multiple_features_quick(to_pandas, smooth, target_type): + """Check target encoder with multiple features.""" + X_ordinal = np.array( + [[1, 1], [0, 1], [1, 1], [2, 1], [1, 0], [0, 1], [1, 0], [0, 0]], dtype=np.int64 + ) + if target_type == "binary-str": + y_train = np.array(["a", "b", "a", "a", "b", "b", "a", "b"]) + y_integer = LabelEncoder().fit_transform(y_train) + cv = 
StratifiedKFold(2, random_state=0, shuffle=True) + elif target_type == "binary-ints": + y_train = np.array([3, 4, 3, 3, 3, 4, 4, 4]) + y_integer = LabelEncoder().fit_transform(y_train) + cv = StratifiedKFold(2, random_state=0, shuffle=True) + else: + y_train = np.array([3.0, 5.1, 2.4, 3.5, 4.1, 5.5, 10.3, 7.3], dtype=np.float32) + y_integer = y_train + cv = KFold(2, random_state=0, shuffle=True) + y_mean = np.mean(y_integer) + categories = [[0, 1, 2], [0, 1]] + + X_test = np.array( + [ + [0, 1], + [3, 0], # 3 is unknown + [1, 10], # 10 is unknown + ], + dtype=np.int64, + ) + + if to_pandas: + pd = pytest.importorskip("pandas") + # convert second feature to an object + X_train = pd.DataFrame( + { + "feat0": X_ordinal[:, 0], + "feat1": np.array(["cat", "dog"], dtype=object)[X_ordinal[:, 1]], + } + ) + # "snake" is unknown + X_test = pd.DataFrame({"feat0": X_test[:, 0], "feat1": ["dog", "cat", "snake"]}) + else: + X_train = X_ordinal + + # manually compute encoding for fit_transform + expected_X_fit_transform = np.empty_like(X_ordinal, dtype=np.float64) + for f_idx, cats in enumerate(categories): + for train_idx, test_idx in cv.split(X_ordinal, y_integer): + X_, y_ = X_ordinal[train_idx, f_idx], y_integer[train_idx] + current_encoding = _encode_target(X_, y_, len(cats), smooth) + expected_X_fit_transform[test_idx, f_idx] = current_encoding[ + X_ordinal[test_idx, f_idx] + ] + + # manually compute encoding for transform + expected_encodings = [] + for f_idx, cats in enumerate(categories): + current_encoding = _encode_target( + X_ordinal[:, f_idx], y_integer, len(cats), smooth + ) + expected_encodings.append(current_encoding) + + expected_X_test_transform = np.array( + [ + [expected_encodings[0][0], expected_encodings[1][1]], + [y_mean, expected_encodings[1][0]], + [expected_encodings[0][1], y_mean], + ], + dtype=np.float64, + ) + + enc = TargetEncoder(smooth=smooth, cv=2, random_state=0) + X_fit_transform = enc.fit_transform(X_train, y_train) + assert_allclose(X_fit_transform, expected_X_fit_transform) + + assert len(enc.encodings_) == 2 + for i in range(2): + assert_allclose(enc.encodings_[i], expected_encodings[i]) + + X_test_transform = enc.transform(X_test) + assert_allclose(X_test_transform, expected_X_test_transform) + + +@pytest.mark.parametrize( + "y, y_mean", + [ + (np.array([3.4] * 20), 3.4), + (np.array([0] * 20), 0), + (np.array(["a"] * 20, dtype=object), 0), + ], + ids=["continuous", "binary", "binary-string"], +) +@pytest.mark.parametrize("smooth", ["auto", 4.0, 0.0]) +def test_constant_target_and_feature(y, y_mean, smooth): + """Check edge case where feature and target is constant.""" + X = np.array([[1] * 20]).T + n_samples = X.shape[0] + + enc = TargetEncoder(cv=2, smooth=smooth, random_state=0) + X_trans = enc.fit_transform(X, y) + assert_allclose(X_trans, np.repeat([[y_mean]], n_samples, axis=0)) + assert enc.encodings_[0][0] == pytest.approx(y_mean) + assert enc.target_mean_ == pytest.approx(y_mean) + + X_test = np.array([[1], [0]]) + X_test_trans = enc.transform(X_test) + assert_allclose(X_test_trans, np.repeat([[y_mean]], 2, axis=0)) + + +def test_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not( + global_random_seed, +): + cardinality = 30 # not too large, otherwise we need a very large n_samples + n_samples = 3000 + rng = np.random.RandomState(global_random_seed) + y_train = rng.normal(size=n_samples) + X_train = rng.randint(0, cardinality, size=n_samples).reshape(-1, 1) + + # Sort by y_train to attempt to cause a leak + y_sorted_indices = 
y_train.argsort() + y_train = y_train[y_sorted_indices] + X_train = X_train[y_sorted_indices] + + target_encoder = TargetEncoder(shuffle=True, random_state=global_random_seed) + X_encoded_train_shuffled = target_encoder.fit_transform(X_train, y_train) + + target_encoder = TargetEncoder(shuffle=False) + X_encoded_train_no_shuffled = target_encoder.fit_transform(X_train, y_train) + + # Check that no information about y_train has leaked into X_train: + regressor = RandomForestRegressor( + n_estimators=10, min_samples_leaf=20, random_state=global_random_seed + ) + + # It's impossible to learn a good predictive model on the training set when + # using the original representation X_train or the target encoded + # representation with shuffled inner CV. For the latter, no information + # about y_train has inadvertently leaked into the prior used to generate + # `X_encoded_train_shuffled`: + cv = ShuffleSplit(n_splits=50, random_state=global_random_seed) + assert cross_val_score(regressor, X_train, y_train, cv=cv).mean() < 0.1 + assert ( + cross_val_score(regressor, X_encoded_train_shuffled, y_train, cv=cv).mean() + < 0.1 + ) + + # Without the inner CV shuffling, a lot of information about y_train goes into the + # the per-fold y_train.mean() priors: shrinkage is no longer effective in this + # case and would no longer be able to prevent downstream over-fitting. + assert ( + cross_val_score(regressor, X_encoded_train_no_shuffled, y_train, cv=cv).mean() + > 0.5 + ) + + +def test_smooth_zero(): + """Check edge case with zero smoothing and cv does not contain category.""" + X = np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]).T + y = np.array([2.1, 4.3, 1.2, 3.1, 1.0, 9.0, 10.3, 14.2, 13.3, 15.0]) + + enc = TargetEncoder(smooth=0.0, shuffle=False, cv=2) + X_trans = enc.fit_transform(X, y) + + # With cv = 2, category 0 does not exist in the second half, thus + # it will be encoded as the mean of the second half + assert_allclose(X_trans[0], np.mean(y[5:])) + + # category 1 does not exist in the first half, thus it will be encoded as + # the mean of the first half + assert_allclose(X_trans[-1], np.mean(y[:5])) + + +@pytest.mark.parametrize("smooth", [0.0, 1e3, "auto"]) +def test_invariance_of_encoding_under_label_permutation(smooth, global_random_seed): + # Check that the encoding does not depend on the integer of the value of + # the integer labels. This is quite a trivial property but it is helpful + # to understand the following test. + rng = np.random.RandomState(global_random_seed) + + # Random y and informative categorical X to make the test non-trivial when + # using smoothing. 
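+    # X is built by quantile-binning y itself, so each ordinal category maps to a
+    # narrow range of y values; the labels are then shuffled further below and the
+    # resulting encoding must be unchanged up to that relabeling.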
+ y = rng.normal(size=1000) + n_categories = 30 + X = KBinsDiscretizer( + n_bins=n_categories, quantile_method="averaged_inverted_cdf", encode="ordinal" + ).fit_transform(y.reshape(-1, 1)) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, random_state=global_random_seed + ) + + # Shuffle the labels to make sure that the encoding is invariant to the + # permutation of the labels + permutated_labels = rng.permutation(n_categories) + X_train_permuted = permutated_labels[X_train.astype(np.int32)] + X_test_permuted = permutated_labels[X_test.astype(np.int32)] + + target_encoder = TargetEncoder(smooth=smooth, random_state=global_random_seed) + X_train_encoded = target_encoder.fit_transform(X_train, y_train) + X_test_encoded = target_encoder.transform(X_test) + + X_train_permuted_encoded = target_encoder.fit_transform(X_train_permuted, y_train) + X_test_permuted_encoded = target_encoder.transform(X_test_permuted) + + assert_allclose(X_train_encoded, X_train_permuted_encoded) + assert_allclose(X_test_encoded, X_test_permuted_encoded) + + +@pytest.mark.parametrize("smooth", [0.0, "auto"]) +def test_target_encoding_for_linear_regression(smooth, global_random_seed): + # Check some expected statistical properties when fitting a linear + # regression model on target encoded features depending on their relation + # with that target. + + # In this test, we use the Ridge class with the "lsqr" solver and a little + # bit of regularization to implement a linear regression model that + # converges quickly for large `n_samples` and robustly in case of + # correlated features. Since we will fit this model on a mean centered + # target, we do not need to fit an intercept and this will help simplify + # the analysis with respect to the expected coefficients. + linear_regression = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False) + + # Construct a random target variable. We need a large number of samples for + # this test to be stable across all values of the random seed. + n_samples = 50_000 + rng = np.random.RandomState(global_random_seed) + y = rng.randn(n_samples) + + # Generate a single informative ordinal feature with medium cardinality. + # Inject some irreducible noise to make it harder for a multivariate model + # to identify the informative feature from other pure noise features. + noise = 0.8 * rng.randn(n_samples) + n_categories = 100 + X_informative = KBinsDiscretizer( + n_bins=n_categories, + encode="ordinal", + strategy="uniform", + random_state=rng, + ).fit_transform((y + noise).reshape(-1, 1)) + + # Let's permute the labels to hide the fact that this feature is + # informative to naive linear regression model trained on the raw ordinal + # values. As highlighted in the previous test, the target encoding should be + # invariant to such a permutation. + permutated_labels = rng.permutation(n_categories) + X_informative = permutated_labels[X_informative.astype(np.int32)] + + # Generate a shuffled copy of the informative feature to destroy the + # relationship with the target. + X_shuffled = rng.permutation(X_informative) + + # Also include a very high cardinality categorical feature that is by + # itself independent of the target variable: target encoding such a feature + # without internal cross-validation should cause catastrophic overfitting + # for the downstream regressor, even with shrinkage. This kind of features + # typically represents near unique identifiers of samples. 
In general they + # should be removed from a machine learning datasets but here we want to + # study the ability of the default behavior of TargetEncoder to mitigate + # them automatically. + X_near_unique_categories = rng.choice( + int(0.9 * n_samples), size=n_samples, replace=True + ).reshape(-1, 1) + + # Assemble the dataset and do a train-test split: + X = np.concatenate( + [X_informative, X_shuffled, X_near_unique_categories], + axis=1, + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + # Let's first check that a linear regression model trained on the raw + # features underfits because of the meaning-less ordinal encoding of the + # labels. + raw_model = linear_regression.fit(X_train, y_train) + assert raw_model.score(X_train, y_train) < 0.1 + assert raw_model.score(X_test, y_test) < 0.1 + + # Now do the same with target encoding using the internal CV mechanism + # implemented when using fit_transform. + model_with_cv = make_pipeline( + TargetEncoder(smooth=smooth, random_state=rng), linear_regression + ).fit(X_train, y_train) + + # This model should be able to fit the data well and also generalise to the + # test data (assuming that the binning is fine-grained enough). The R2 + # scores are not perfect because of the noise injected during the + # generation of the unique informative feature. + coef = model_with_cv[-1].coef_ + assert model_with_cv.score(X_train, y_train) > 0.5, coef + assert model_with_cv.score(X_test, y_test) > 0.5, coef + + # The target encoder recovers the linear relationship with slope 1 between + # the target encoded unique informative predictor and the target. Since the + # target encoding of the 2 other features is not informative thanks to the + # use of internal cross-validation, the multivariate linear regressor + # assigns a coef of 1 to the first feature and 0 to the other 2. + assert coef[0] == pytest.approx(1, abs=1e-2) + assert (np.abs(coef[1:]) < 0.2).all() + + # Let's now disable the internal cross-validation by calling fit and then + # transform separately on the training set: + target_encoder = TargetEncoder(smooth=smooth, random_state=rng).fit( + X_train, y_train + ) + X_enc_no_cv_train = target_encoder.transform(X_train) + X_enc_no_cv_test = target_encoder.transform(X_test) + model_no_cv = linear_regression.fit(X_enc_no_cv_train, y_train) + + # The linear regression model should always overfit because it assigns + # too much weight to the extremely high cardinality feature relatively to + # the informative feature. Note that this is the case even when using + # the empirical Bayes smoothing which is not enough to prevent such + # overfitting alone. + coef = model_no_cv.coef_ + assert model_no_cv.score(X_enc_no_cv_train, y_train) > 0.7, coef + assert model_no_cv.score(X_enc_no_cv_test, y_test) < 0.5, coef + + # The model overfits because it assigns too much weight to the high + # cardinality yet non-informative feature instead of the lower + # cardinality yet informative feature: + assert abs(coef[0]) < abs(coef[2]) + + +def test_pandas_copy_on_write(): + """ + Test target-encoder cython code when y is read-only. + + The numpy array underlying df["y"] is read-only when copy-on-write is enabled. + Non-regression test for gh-27879. 
+ """ + pd = pytest.importorskip("pandas", minversion="2.0") + with pd.option_context("mode.copy_on_write", True): + df = pd.DataFrame({"x": ["a", "b", "b"], "y": [4.0, 5.0, 6.0]}) + TargetEncoder(target_type="continuous").fit(df[["x"]], df["y"]) diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..453cd5edc348bf1a0d957e011cd2fa85fee9b34a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/__init__.py @@ -0,0 +1,13 @@ +"""Semi-supervised learning algorithms. + +These algorithms utilize small amounts of labeled data and large amounts of unlabeled +data for classification tasks. +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._label_propagation import LabelPropagation, LabelSpreading +from ._self_training import SelfTrainingClassifier + +__all__ = ["LabelPropagation", "LabelSpreading", "SelfTrainingClassifier"] diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_label_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_label_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..559a17a13d6ae35f4a97a008d6e4c07e4dc77923 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_label_propagation.py @@ -0,0 +1,630 @@ +# coding=utf8 +""" +Label propagation in the context of this module refers to a set of +semi-supervised classification algorithms. At a high level, these algorithms +work by forming a fully-connected graph between all points given and solving +for the steady-state distribution of labels at each point. + +These algorithms perform very well in practice. The cost of running can be very +expensive, at approximately O(N^3) where N is the number of (labeled and +unlabeled) points. The theory (why they perform so well) is motivated by +intuitions from random walk algorithms and geometric relationships in the data. +For more information see the references below. + +Model Features +-------------- +Label clamping: + The algorithm tries to learn distributions of labels over the dataset given + label assignments over an initial subset. In one variant, the algorithm does + not allow for any errors in the initial assignment (hard-clamping) while + in another variant, the algorithm allows for some wiggle room for the initial + assignments, allowing them to change by a fraction alpha in each iteration + (soft-clamping). + +Kernel: + A function which projects a vector into some higher dimensional space. This + implementation supports RBF and KNN kernels. Using the RBF kernel generates + a dense matrix of size O(N^2). KNN kernel will generate a sparse matrix of + size O(k*N) which will run much faster. See the documentation for SVMs for + more info on kernels. + +Examples +-------- +>>> import numpy as np +>>> from sklearn import datasets +>>> from sklearn.semi_supervised import LabelPropagation +>>> label_prop_model = LabelPropagation() +>>> iris = datasets.load_iris() +>>> rng = np.random.RandomState(42) +>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 +>>> labels = np.copy(iris.target) +>>> labels[random_unlabeled_points] = -1 +>>> label_prop_model.fit(iris.data, labels) +LabelPropagation(...) + +Notes +----- +References: +[1] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised +Learning (2006), pp. 
193-216 + +[2] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient +Non-Parametric Function Induction in Semi-Supervised Learning. AISTAT 2005 +""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +from scipy import sparse + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..exceptions import ConvergenceWarning +from ..metrics.pairwise import rbf_kernel +from ..neighbors import NearestNeighbors +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.fixes import laplacian as csgraph_laplacian +from ..utils.multiclass import check_classification_targets +from ..utils.validation import check_is_fitted, validate_data + + +class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for label propagation module. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel. Need to be strictly positive. + + alpha : float, default=1.0 + Clamping factor. + + max_iter : int, default=30 + Change maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + """ + + _parameter_constraints: dict = { + "kernel": [StrOptions({"knn", "rbf"}), callable], + "gamma": [Interval(Real, 0, None, closed="left")], + "n_neighbors": [Interval(Integral, 0, None, closed="neither")], + "alpha": [None, Interval(Real, 0, 1, closed="neither")], + "max_iter": [Interval(Integral, 0, None, closed="neither")], + "tol": [Interval(Real, 0, None, closed="left")], + "n_jobs": [None, Integral], + } + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + alpha=1, + max_iter=30, + tol=1e-3, + n_jobs=None, + ): + self.max_iter = max_iter + self.tol = tol + + # kernel parameters + self.kernel = kernel + self.gamma = gamma + self.n_neighbors = n_neighbors + + # clamping factor + self.alpha = alpha + + self.n_jobs = n_jobs + + def _get_kernel(self, X, y=None): + if self.kernel == "rbf": + if y is None: + return rbf_kernel(X, X, gamma=self.gamma) + else: + return rbf_kernel(X, y, gamma=self.gamma) + elif self.kernel == "knn": + if self.nn_fit is None: + self.nn_fit = NearestNeighbors( + n_neighbors=self.n_neighbors, n_jobs=self.n_jobs + ).fit(X) + if y is None: + return self.nn_fit.kneighbors_graph( + self.nn_fit._fit_X, self.n_neighbors, mode="connectivity" + ) + else: + return self.nn_fit.kneighbors(y, return_distance=False) + elif callable(self.kernel): + if y is None: + return self.kernel(X, X) + else: + return self.kernel(X, y) + + @abstractmethod + def _build_graph(self): + raise NotImplementedError( + "Graph construction must be implemented to fit a label propagation model." 
+ ) + + def predict(self, X): + """Perform inductive inference across the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + y : ndarray of shape (n_samples,) + Predictions for input data. + """ + # Note: since `predict` does not accept semi-supervised labels as input, + # `fit(X, y).predict(X) != fit(X, y).transduction_`. + # Hence, `fit_predict` is not implemented. + # See https://github.com/scikit-learn/scikit-learn/pull/24898 + probas = self.predict_proba(X) + return self.classes_[np.argmax(probas, axis=1)].ravel() + + def predict_proba(self, X): + """Predict probability for each possible outcome. + + Compute the probability estimates for each single sample in X + and each possible outcome seen during training (categorical + distribution). + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + probabilities : ndarray of shape (n_samples, n_classes) + Normalized probability distributions across + class labels. + """ + check_is_fitted(self) + + X_2d = validate_data( + self, + X, + accept_sparse=["csc", "csr", "coo", "dok", "bsr", "lil", "dia"], + reset=False, + ) + weight_matrices = self._get_kernel(self.X_, X_2d) + if self.kernel == "knn": + probabilities = np.array( + [ + np.sum(self.label_distributions_[weight_matrix], axis=0) + for weight_matrix in weight_matrices + ] + ) + else: + weight_matrices = weight_matrices.T + probabilities = safe_sparse_dot(weight_matrices, self.label_distributions_) + normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T + probabilities /= normalizer + return probabilities + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y): + """Fit a semi-supervised label propagation model to X. + + The input samples (labeled and unlabeled) are provided by matrix X, + and target labels are provided by matrix y. We conventionally apply the + label -1 to unlabeled samples in matrix y in a semi-supervised + classification. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target class values with unlabeled points marked as -1. + All unlabeled samples will be transductively assigned labels + internally, which are stored in `transduction_`. + + Returns + ------- + self : object + Returns the instance itself. 
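For the rbf kernel, the predict_proba method above reduces to a kernel-weighted average of the fitted per-sample label distributions followed by row normalization. A rough numpy restatement under that assumption (function and argument names are hypothetical):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def predict_proba_sketch(X_new, X_train, label_distributions, gamma=20):
    # (n_new, n_train) kernel weights between new and training points
    W = rbf_kernel(X_new, X_train, gamma=gamma)
    # kernel-weighted average of the transduced class distributions
    probs = W @ label_distributions
    return probs / probs.sum(axis=1, keepdims=True)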
+ """ + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc"], + reset=True, + ) + self.X_ = X + check_classification_targets(y) + + # actual graph construction (implementations should override this) + graph_matrix = self._build_graph() + + # label construction + # construct a categorical distribution for classification only + classes = np.unique(y) + classes = classes[classes != -1] + self.classes_ = classes + + n_samples, n_classes = len(y), len(classes) + + y = np.asarray(y) + unlabeled = y == -1 + + # initialize distributions + self.label_distributions_ = np.zeros((n_samples, n_classes)) + for label in classes: + self.label_distributions_[y == label, classes == label] = 1 + + y_static = np.copy(self.label_distributions_) + if self._variant == "propagation": + # LabelPropagation + y_static[unlabeled] = 0 + else: + # LabelSpreading + y_static *= 1 - self.alpha + + l_previous = np.zeros((self.X_.shape[0], n_classes)) + + unlabeled = unlabeled[:, np.newaxis] + if sparse.issparse(graph_matrix): + graph_matrix = graph_matrix.tocsr() + + for self.n_iter_ in range(self.max_iter): + if np.abs(self.label_distributions_ - l_previous).sum() < self.tol: + break + + l_previous = self.label_distributions_ + self.label_distributions_ = safe_sparse_dot( + graph_matrix, self.label_distributions_ + ) + + if self._variant == "propagation": + normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer[normalizer == 0] = 1 + self.label_distributions_ /= normalizer + self.label_distributions_ = np.where( + unlabeled, self.label_distributions_, y_static + ) + else: + # clamp + self.label_distributions_ = ( + np.multiply(self.alpha, self.label_distributions_) + y_static + ) + else: + warnings.warn( + "max_iter=%d was reached without convergence." % self.max_iter, + category=ConvergenceWarning, + ) + self.n_iter_ += 1 + + normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer[normalizer == 0] = 1 + self.label_distributions_ /= normalizer + + # set the transduction item + transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)] + self.transduction_ = transduction.ravel() + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class LabelPropagation(BaseLabelPropagation): + """Label Propagation classifier. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel which need to be strictly positive. + + max_iter : int, default=1000 + Change maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + X_ : {array-like, sparse matrix} of shape (n_samples, n_features) + Input array. + + classes_ : ndarray of shape (n_classes,) + The distinct labels used in classifying instances. 
+ + label_distributions_ : ndarray of shape (n_samples, n_classes) + Categorical distribution for each item. + + transduction_ : ndarray of shape (n_samples) + Label assigned to each item during :term:`fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + LabelSpreading : Alternate label propagation strategy more robust to noise. + + References + ---------- + Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled data + with label propagation. Technical Report CMU-CALD-02-107, Carnegie Mellon + University, 2002 http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import LabelPropagation + >>> label_prop_model = LabelPropagation() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelPropagation(...) + """ + + _variant = "propagation" + + _parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints} + _parameter_constraints.pop("alpha") + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + max_iter=1000, + tol=1e-3, + n_jobs=None, + ): + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + max_iter=max_iter, + tol=tol, + n_jobs=n_jobs, + alpha=None, + ) + + def _build_graph(self): + """Matrix representing a fully connected graph between each sample + + This basic implementation creates a non-stochastic affinity matrix, so + class distributions will exceed 1 (normalization may be desired). + """ + if self.kernel == "knn": + self.nn_fit = None + affinity_matrix = self._get_kernel(self.X_) + normalizer = affinity_matrix.sum(axis=0) + if sparse.issparse(affinity_matrix): + affinity_matrix.data /= np.diag(np.array(normalizer)) + else: + affinity_matrix /= normalizer[:, np.newaxis] + return affinity_matrix + + def fit(self, X, y): + """Fit a semi-supervised label propagation model to X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target class values with unlabeled points marked as -1. + All unlabeled samples will be transductively assigned labels + internally, which are stored in `transduction_`. + + Returns + ------- + self : object + Returns the instance itself. + """ + return super().fit(X, y) + + +class LabelSpreading(BaseLabelPropagation): + """LabelSpreading model for semi-supervised learning. + + This model is similar to the basic Label Propagation algorithm, + but uses affinity matrix based on the normalized graph Laplacian + and soft clamping across the labels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'knn', 'rbf'} or callable, default='rbf' + String identifier for kernel function to use or the kernel function + itself. Only 'rbf' and 'knn' strings are valid inputs. 
The function + passed should take two inputs, each of shape (n_samples, n_features), + and return a (n_samples, n_samples) shaped weight matrix. + + gamma : float, default=20 + Parameter for rbf kernel. + + n_neighbors : int, default=7 + Parameter for knn kernel which is a strictly positive integer. + + alpha : float, default=0.2 + Clamping factor. A value in (0, 1) that specifies the relative amount + that an instance should adopt the information from its neighbors as + opposed to its initial label. + alpha=0 means keeping the initial label information; alpha=1 means + replacing all initial information. + + max_iter : int, default=30 + Maximum number of iterations allowed. + + tol : float, default=1e-3 + Convergence tolerance: threshold to consider the system at steady + state. + + n_jobs : int, default=None + The number of parallel jobs to run. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + + Attributes + ---------- + X_ : ndarray of shape (n_samples, n_features) + Input array. + + classes_ : ndarray of shape (n_classes,) + The distinct labels used in classifying instances. + + label_distributions_ : ndarray of shape (n_samples, n_classes) + Categorical distribution for each item. + + transduction_ : ndarray of shape (n_samples,) + Label assigned to each item during :term:`fit`. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run. + + See Also + -------- + LabelPropagation : Unregularized graph based semi-supervised learning. + + References + ---------- + `Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston, + Bernhard Schoelkopf. Learning with local and global consistency (2004) + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import LabelSpreading + >>> label_prop_model = LabelSpreading() + >>> iris = datasets.load_iris() + >>> rng = np.random.RandomState(42) + >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3 + >>> labels = np.copy(iris.target) + >>> labels[random_unlabeled_points] = -1 + >>> label_prop_model.fit(iris.data, labels) + LabelSpreading(...) 
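As a concrete reading of the alpha parameter documented above: values near 0 keep the seeded labels nearly fixed, values near 1 let the graph overwrite them. A small usage sketch with illustrative toy data:

import numpy as np
from sklearn.semi_supervised import LabelSpreading

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0], [2.5, 2.5]])
y = np.array([0, -1, 1, -1, -1])  # -1 marks the unlabeled points

conservative = LabelSpreading(alpha=0.05, gamma=1.0).fit(X, y)
flexible = LabelSpreading(alpha=0.9, gamma=1.0).fit(X, y)

print(conservative.transduction_)            # labels assigned to every sample
print(flexible.label_distributions_.round(2))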
+ """ + + _variant = "spreading" + + _parameter_constraints: dict = {**BaseLabelPropagation._parameter_constraints} + _parameter_constraints["alpha"] = [Interval(Real, 0, 1, closed="neither")] + + def __init__( + self, + kernel="rbf", + *, + gamma=20, + n_neighbors=7, + alpha=0.2, + max_iter=30, + tol=1e-3, + n_jobs=None, + ): + # this one has different base parameters + super().__init__( + kernel=kernel, + gamma=gamma, + n_neighbors=n_neighbors, + alpha=alpha, + max_iter=max_iter, + tol=tol, + n_jobs=n_jobs, + ) + + def _build_graph(self): + """Graph matrix for Label Spreading computes the graph laplacian""" + # compute affinity matrix (or gram matrix) + if self.kernel == "knn": + self.nn_fit = None + n_samples = self.X_.shape[0] + affinity_matrix = self._get_kernel(self.X_) + laplacian = csgraph_laplacian(affinity_matrix, normed=True) + laplacian = -laplacian + if sparse.issparse(laplacian): + diag_mask = laplacian.row == laplacian.col + laplacian.data[diag_mask] = 0.0 + else: + laplacian.flat[:: n_samples + 1] = 0.0 # set diag to 0.0 + return laplacian diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_self_training.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_self_training.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe6f57d6c1ed281748e7223554a103a52a01334 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/_self_training.py @@ -0,0 +1,625 @@ +import warnings +from numbers import Integral, Real +from warnings import warn + +import numpy as np + +from ..base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + _fit_context, + clone, +) +from ..utils import Bunch, get_tags, safe_mask +from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) +from ..utils.metaestimators import available_if +from ..utils.validation import _estimator_has, check_is_fitted, validate_data + +__all__ = ["SelfTrainingClassifier"] + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + + +class SelfTrainingClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator): + """Self-training classifier. + + This :term:`metaestimator` allows a given supervised classifier to function as a + semi-supervised classifier, allowing it to learn from unlabeled data. It + does this by iteratively predicting pseudo-labels for the unlabeled data + and adding them to the training set. + + The classifier will continue iterating until either max_iter is reached, or + no pseudo-labels were added to the training set in the previous iteration. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `estimator_` attribute. + + .. versionadded:: 1.6 + `estimator` was added to replace `base_estimator`. + + base_estimator : estimator object + An estimator object implementing `fit` and `predict_proba`. + Invoking the `fit` method will fit a clone of the passed estimator, + which will be stored in the `estimator_` attribute. + + .. deprecated:: 1.6 + `base_estimator` was deprecated in 1.6 and will be removed in 1.8. + Use `estimator` instead. + + threshold : float, default=0.75 + The decision threshold for use with `criterion='threshold'`. 
+ Should be in [0, 1). When using the `'threshold'` criterion, a + :ref:`well calibrated classifier ` should be used. + + criterion : {'threshold', 'k_best'}, default='threshold' + The selection criterion used to select which labels to add to the + training set. If `'threshold'`, pseudo-labels with prediction + probabilities above `threshold` are added to the dataset. If `'k_best'`, + the `k_best` pseudo-labels with highest prediction probabilities are + added to the dataset. When using the 'threshold' criterion, a + :ref:`well calibrated classifier ` should be used. + + k_best : int, default=10 + The amount of samples to add in each iteration. Only used when + `criterion='k_best'`. + + max_iter : int or None, default=10 + Maximum number of iterations allowed. Should be greater than or equal + to 0. If it is `None`, the classifier will continue to predict labels + until no new pseudo-labels are added, or all unlabeled samples have + been labeled. + + verbose : bool, default=False + Enable verbose output. + + Attributes + ---------- + estimator_ : estimator object + The fitted estimator. + + classes_ : ndarray or list of ndarray of shape (n_classes,) + Class labels for each output. (Taken from the trained + `estimator_`). + + transduction_ : ndarray of shape (n_samples,) + The labels used for the final fit of the classifier, including + pseudo-labels added during fit. + + labeled_iter_ : ndarray of shape (n_samples,) + The iteration in which each sample was labeled. When a sample has + iteration 0, the sample was already labeled in the original dataset. + When a sample has iteration -1, the sample was not labeled in any + iteration. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + The number of rounds of self-training, that is the number of times the + base estimator is fitted on relabeled variants of the training set. + + termination_condition_ : {'max_iter', 'no_change', 'all_labeled'} + The reason that fitting was stopped. + + - `'max_iter'`: `n_iter_` reached `max_iter`. + - `'no_change'`: no new labels were predicted. + - `'all_labeled'`: all unlabeled samples were labeled before `max_iter` + was reached. + + See Also + -------- + LabelPropagation : Label propagation classifier. + LabelSpreading : Label spreading model for semi-supervised learning. + + References + ---------- + :doi:`David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling + supervised methods. In Proceedings of the 33rd annual meeting on + Association for Computational Linguistics (ACL '95). Association for + Computational Linguistics, Stroudsburg, PA, USA, 189-196. + <10.3115/981658.981684>` + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> from sklearn.semi_supervised import SelfTrainingClassifier + >>> from sklearn.svm import SVC + >>> rng = np.random.RandomState(42) + >>> iris = datasets.load_iris() + >>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3 + >>> iris.target[random_unlabeled_points] = -1 + >>> svc = SVC(probability=True, gamma="auto") + >>> self_training_model = SelfTrainingClassifier(svc) + >>> self_training_model.fit(iris.data, iris.target) + SelfTrainingClassifier(...) 
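A hedged usage sketch of the criterion='k_best' mode and the bookkeeping attributes (labeled_iter_, termination_condition_) described above; the data and the fraction of hidden labels are illustrative:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import SelfTrainingClassifier

X, y = load_iris(return_X_y=True)
rng = np.random.RandomState(42)
y_semi = np.copy(y)
y_semi[rng.rand(len(y)) < 0.7] = -1  # hide most of the labels

st = SelfTrainingClassifier(
    KNeighborsClassifier(), criterion="k_best", k_best=20, max_iter=None
)
st.fit(X, y_semi)

print(st.termination_condition_)          # e.g. 'all_labeled'
print(np.bincount(st.labeled_iter_ + 1))  # samples labeled per iteration (index 0 = never labeled)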
+ """ + + _parameter_constraints: dict = { + # We don't require `predic_proba` here to allow passing a meta-estimator + # that only exposes `predict_proba` after fitting. + # TODO(1.8) remove None option + "estimator": [None, HasMethods(["fit"])], + # TODO(1.8) remove + "base_estimator": [ + HasMethods(["fit"]), + Hidden(StrOptions({"deprecated"})), + ], + "threshold": [Interval(Real, 0.0, 1.0, closed="left")], + "criterion": [StrOptions({"threshold", "k_best"})], + "k_best": [Interval(Integral, 1, None, closed="left")], + "max_iter": [Interval(Integral, 0, None, closed="left"), None], + "verbose": ["verbose"], + } + + def __init__( + self, + estimator=None, + base_estimator="deprecated", + threshold=0.75, + criterion="threshold", + k_best=10, + max_iter=10, + verbose=False, + ): + self.estimator = estimator + self.threshold = threshold + self.criterion = criterion + self.k_best = k_best + self.max_iter = max_iter + self.verbose = verbose + + # TODO(1.8) remove + self.base_estimator = base_estimator + + def _get_estimator(self): + """Get the estimator. + + Returns + ------- + estimator_ : estimator object + The cloned estimator object. + """ + # TODO(1.8): remove and only keep clone(self.estimator) + if self.estimator is None and self.base_estimator != "deprecated": + estimator_ = clone(self.base_estimator) + + warn( + ( + "`base_estimator` has been deprecated in 1.6 and will be removed" + " in 1.8. Please use `estimator` instead." + ), + FutureWarning, + ) + # TODO(1.8) remove + elif self.estimator is None and self.base_estimator == "deprecated": + raise ValueError( + "You must pass an estimator to SelfTrainingClassifier. Use `estimator`." + ) + elif self.estimator is not None and self.base_estimator != "deprecated": + raise ValueError( + "You must pass only one estimator to SelfTrainingClassifier." + " Use `estimator`." + ) + else: + estimator_ = clone(self.estimator) + return estimator_ + + @_fit_context( + # SelfTrainingClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) + def fit(self, X, y, **params): + """ + Fit self-training classifier using `X`, `y` as training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + y : {array-like, sparse matrix} of shape (n_samples,) + Array representing the labels. Unlabeled samples should have the + label -1. + + **params : dict + Parameters to pass to the underlying estimators. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + self : object + Fitted estimator. + """ + _raise_for_params(params, self, "fit") + + self.estimator_ = self._get_estimator() + + # we need row slicing support for sparse matrices, but costly finiteness check + # can be delegated to the base estimator. + X, y = validate_data( + self, + X, + y, + accept_sparse=["csr", "csc", "lil", "dok"], + ensure_all_finite=False, + ) + + if y.dtype.kind in ["U", "S"]: + raise ValueError( + "y has dtype string. If you wish to predict on " + "string targets, use dtype object, and use -1" + " as the label for unlabeled samples." 
+ ) + + has_label = y != -1 + + if np.all(has_label): + warnings.warn("y contains no unlabeled samples", UserWarning) + + if self.criterion == "k_best" and ( + self.k_best > X.shape[0] - np.sum(has_label) + ): + warnings.warn( + ( + "k_best is larger than the amount of unlabeled " + "samples. All unlabeled samples will be labeled in " + "the first iteration" + ), + UserWarning, + ) + + if _routing_enabled(): + routed_params = process_routing(self, "fit", **params) + else: + routed_params = Bunch(estimator=Bunch(fit={})) + + self.transduction_ = np.copy(y) + self.labeled_iter_ = np.full_like(y, -1) + self.labeled_iter_[has_label] = 0 + + self.n_iter_ = 0 + + while not np.all(has_label) and ( + self.max_iter is None or self.n_iter_ < self.max_iter + ): + self.n_iter_ += 1 + self.estimator_.fit( + X[safe_mask(X, has_label)], + self.transduction_[has_label], + **routed_params.estimator.fit, + ) + + # Predict on the unlabeled samples + prob = self.estimator_.predict_proba(X[safe_mask(X, ~has_label)]) + pred = self.estimator_.classes_[np.argmax(prob, axis=1)] + max_proba = np.max(prob, axis=1) + + # Select new labeled samples + if self.criterion == "threshold": + selected = max_proba > self.threshold + else: + n_to_select = min(self.k_best, max_proba.shape[0]) + if n_to_select == max_proba.shape[0]: + selected = np.ones_like(max_proba, dtype=bool) + else: + # NB these are indices, not a mask + selected = np.argpartition(-max_proba, n_to_select)[:n_to_select] + + # Map selected indices into original array + selected_full = np.nonzero(~has_label)[0][selected] + + # Add newly labeled confident predictions to the dataset + self.transduction_[selected_full] = pred[selected] + has_label[selected_full] = True + self.labeled_iter_[selected_full] = self.n_iter_ + + if selected_full.shape[0] == 0: + # no changed labels + self.termination_condition_ = "no_change" + break + + if self.verbose: + print( + f"End of iteration {self.n_iter_}," + f" added {selected_full.shape[0]} new labels." + ) + + if self.n_iter_ == self.max_iter: + self.termination_condition_ = "max_iter" + if np.all(has_label): + self.termination_condition_ = "all_labeled" + + self.estimator_.fit( + X[safe_mask(X, has_label)], + self.transduction_[has_label], + **routed_params.estimator.fit, + ) + self.classes_ = self.estimator_.classes_ + return self + + @available_if(_estimator_has("predict")) + def predict(self, X, **params): + """Predict the classes of `X`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's ``predict`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples,) + Array with predicted labels. + """ + check_is_fitted(self) + _raise_for_params(params, self, "predict") + + if _routing_enabled(): + # metadata routing is enabled. 
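The selection step above either thresholds the best predicted probability or keeps the k_best most confident pseudo-labels via np.argpartition, and then maps those positions (given relative to the unlabeled subset) back to row indices of the full training set. A standalone numpy illustration with made-up arrays:

import numpy as np

has_label = np.array([True, False, False, True, False])
max_proba = np.array([0.9, 0.4, 0.8])  # confidences for the three unlabeled rows

k_best = 2
selected = np.argpartition(-max_proba, k_best)[:k_best]  # positions within the unlabeled rows
selected_full = np.nonzero(~has_label)[0][selected]      # row indices in the full training set

print(selected_full)  # contains 1 and 4, the rows with confidences 0.9 and 0.8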
+ routed_params = process_routing(self, "predict", **params) + else: + routed_params = Bunch(estimator=Bunch(predict={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict(X, **routed_params.estimator.predict) + + @available_if(_estimator_has("predict_proba")) + def predict_proba(self, X, **params): + """Predict probability for each possible outcome. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``predict_proba`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Array with prediction probabilities. + """ + check_is_fitted(self) + _raise_for_params(params, self, "predict_proba") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "predict_proba", **params) + else: + routed_params = Bunch(estimator=Bunch(predict_proba={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict_proba(X, **routed_params.estimator.predict_proba) + + @available_if(_estimator_has("decision_function")) + def decision_function(self, X, **params): + """Call decision function of the `estimator`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``decision_function`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Result of the decision function of the `estimator`. + """ + check_is_fitted(self) + _raise_for_params(params, self, "decision_function") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "decision_function", **params) + else: + routed_params = Bunch(estimator=Bunch(decision_function={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.decision_function( + X, **routed_params.estimator.decision_function + ) + + @available_if(_estimator_has("predict_log_proba")) + def predict_log_proba(self, X, **params): + """Predict log probability for each possible outcome. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's + ``predict_log_proba`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + y : ndarray of shape (n_samples, n_features) + Array with log prediction probabilities. 
+ """ + check_is_fitted(self) + _raise_for_params(params, self, "predict_log_proba") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "predict_log_proba", **params) + else: + routed_params = Bunch(estimator=Bunch(predict_log_proba={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.predict_log_proba( + X, **routed_params.estimator.predict_log_proba + ) + + @available_if(_estimator_has("score")) + def score(self, X, y, **params): + """Call score on the `estimator`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Array representing the data. + + y : array-like of shape (n_samples,) + Array representing the labels. + + **params : dict of str -> object + Parameters to pass to the underlying estimator's ``score`` method. + + .. versionadded:: 1.6 + Only available if `enable_metadata_routing=True`, + which can be set by using + ``sklearn.set_config(enable_metadata_routing=True)``. + See :ref:`Metadata Routing User Guide ` for + more details. + + Returns + ------- + score : float + Result of calling score on the `estimator`. + """ + check_is_fitted(self) + _raise_for_params(params, self, "score") + + if _routing_enabled(): + # metadata routing is enabled. + routed_params = process_routing(self, "score", **params) + else: + routed_params = Bunch(estimator=Bunch(score={})) + + X = validate_data( + self, + X, + accept_sparse=True, + ensure_all_finite=False, + reset=False, + ) + return self.estimator_.score(X, y, **routed_params.estimator.score) + + def get_metadata_routing(self): + """Get metadata routing of this object. + + Please check :ref:`User Guide ` on how the routing + mechanism works. + + .. versionadded:: 1.6 + + Returns + ------- + routing : MetadataRouter + A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating + routing information. 
+ """ + router = MetadataRouter(owner=self.__class__.__name__) + router.add( + estimator=self.estimator, + method_mapping=( + MethodMapping() + .add(callee="fit", caller="fit") + .add(callee="score", caller="fit") + .add(callee="predict", caller="predict") + .add(callee="predict_proba", caller="predict_proba") + .add(callee="decision_function", caller="decision_function") + .add(callee="predict_log_proba", caller="predict_log_proba") + .add(callee="score", caller="score") + ), + ) + return router + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # TODO(1.8): remove the condition check together with base_estimator + if self.estimator is not None: + tags.input_tags.sparse = get_tags(self.estimator).input_tags.sparse + return tags diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..4b046aa11125032a706b5c984c5dec5caba72594 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_label_propagation.py @@ -0,0 +1,238 @@ +"""test the label propagation module""" + +import warnings + +import numpy as np +import pytest +from scipy.sparse import issparse + +from sklearn.datasets import make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.neighbors import NearestNeighbors +from sklearn.semi_supervised import _label_propagation as label_propagation +from sklearn.utils._testing import ( + _convert_container, + assert_allclose, + assert_array_equal, +) + +CONSTRUCTOR_TYPES = ("array", "sparse_csr", "sparse_csc") + +ESTIMATORS = [ + (label_propagation.LabelPropagation, {"kernel": "rbf"}), + (label_propagation.LabelPropagation, {"kernel": "knn", "n_neighbors": 2}), + ( + label_propagation.LabelPropagation, + {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}, + ), + (label_propagation.LabelSpreading, {"kernel": "rbf"}), + (label_propagation.LabelSpreading, {"kernel": "knn", "n_neighbors": 2}), + ( + label_propagation.LabelSpreading, + {"kernel": lambda x, y: rbf_kernel(x, y, gamma=20)}, + ), +] + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_fit_transduction(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert clf.transduction_[2] == 1 + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_distribution(global_dtype, Estimator, parameters): + if parameters["kernel"] == "knn": + pytest.skip( + "Unstable test for this configuration: changes in k-NN ordering break it." 
+ ) + samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_allclose(clf.label_distributions_[2], [0.5, 0.5], atol=1e-2) + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_predict(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1])) + + +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_predict_proba(global_dtype, Estimator, parameters): + samples = np.asarray([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], dtype=global_dtype) + labels = [0, 1, -1] + clf = Estimator(**parameters).fit(samples, labels) + assert_allclose(clf.predict_proba([[1.0, 1.0]]), np.array([[0.5, 0.5]])) + + +@pytest.mark.parametrize("alpha", [0.1, 0.3, 0.5, 0.7, 0.9]) +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_label_spreading_closed_form(global_dtype, Estimator, parameters, alpha): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X = X.astype(global_dtype, copy=False) + y[::3] = -1 + + gamma = 0.1 + clf = label_propagation.LabelSpreading(gamma=gamma).fit(X, y) + # adopting notation from Zhou et al (2004): + S = clf._build_graph() + Y = np.zeros((len(y), n_classes + 1), dtype=X.dtype) + Y[np.arange(len(y)), y] = 1 + Y = Y[:, :-1] + + expected = np.dot(np.linalg.inv(np.eye(len(S), dtype=S.dtype) - alpha * S), Y) + expected /= expected.sum(axis=1)[:, np.newaxis] + + clf = label_propagation.LabelSpreading( + max_iter=100, alpha=alpha, tol=1e-10, gamma=gamma + ) + clf.fit(X, y) + + assert_allclose(expected, clf.label_distributions_) + + +def test_label_propagation_closed_form(global_dtype): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X = X.astype(global_dtype, copy=False) + y[::3] = -1 + Y = np.zeros((len(y), n_classes + 1)) + Y[np.arange(len(y)), y] = 1 + unlabelled_idx = Y[:, (-1,)].nonzero()[0] + labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0] + + clf = label_propagation.LabelPropagation(max_iter=100, tol=1e-10, gamma=0.1) + clf.fit(X, y) + # adopting notation from Zhu et al 2002 + T_bar = clf._build_graph() + Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx, indexing="ij"))] + Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx, indexing="ij"))] + Y = Y[:, :-1] + Y_l = Y[labelled_idx, :] + Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l) + + expected = Y.copy() + expected[unlabelled_idx, :] = Y_u + expected /= expected.sum(axis=1)[:, np.newaxis] + + assert_allclose(expected, clf.label_distributions_, atol=1e-4) + + +@pytest.mark.parametrize("accepted_sparse_type", ["sparse_csr", "sparse_csc"]) +@pytest.mark.parametrize("index_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("Estimator, parameters", ESTIMATORS) +def test_sparse_input_types( + accepted_sparse_type, index_dtype, dtype, Estimator, parameters +): + # This is non-regression test for #17085 + X = _convert_container([[1.0, 0.0], [0.0, 2.0], [1.0, 3.0]], accepted_sparse_type) + X.data = X.data.astype(dtype, copy=False) + X.indices = X.indices.astype(index_dtype, copy=False) + X.indptr = X.indptr.astype(index_dtype, copy=False) + labels = [0, 1, -1] + clf = 
Estimator(**parameters).fit(X, labels) + assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1])) + + +@pytest.mark.parametrize("constructor_type", CONSTRUCTOR_TYPES) +def test_convergence_speed(constructor_type): + # This is a non-regression test for #5774 + X = _convert_container([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]], constructor_type) + y = np.array([0, 1, -1]) + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=5000) + mdl.fit(X, y) + + # this should converge quickly: + assert mdl.n_iter_ < 10 + assert_array_equal(mdl.predict(X), [0, 1, 1]) + + +def test_convergence_warning(): + # This is a non-regression test for #5774 + X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 2.5]]) + y = np.array([0, 1, -1]) + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=1) + warn_msg = "max_iter=1 was reached without convergence." + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=1) + with pytest.warns(ConvergenceWarning, match=warn_msg): + mdl.fit(X, y) + assert mdl.n_iter_ == mdl.max_iter + + mdl = label_propagation.LabelSpreading(kernel="rbf", max_iter=500) + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + mdl.fit(X, y) + + mdl = label_propagation.LabelPropagation(kernel="rbf", max_iter=500) + with warnings.catch_warnings(): + warnings.simplefilter("error", ConvergenceWarning) + mdl.fit(X, y) + + +@pytest.mark.parametrize( + "LabelPropagationCls", + [label_propagation.LabelSpreading, label_propagation.LabelPropagation], +) +def test_label_propagation_non_zero_normalizer(LabelPropagationCls): + # check that we don't divide by zero in case of null normalizer + # non-regression test for + # https://github.com/scikit-learn/scikit-learn/pull/15946 + # https://github.com/scikit-learn/scikit-learn/issues/9292 + X = np.array([[100.0, 100.0], [100.0, 100.0], [0.0, 0.0], [0.0, 0.0]]) + y = np.array([0, 1, -1, -1]) + mdl = LabelPropagationCls(kernel="knn", max_iter=100, n_neighbors=1) + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + mdl.fit(X, y) + + +def test_predict_sparse_callable_kernel(global_dtype): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric="euclidean", n_jobs=2) + nn.fit(X) + W = -1 * nn.kneighbors_graph(Y, mode="distance").power(2) * gamma + np.exp(W.data, out=W.data) + assert issparse(W) + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, y = make_classification( + n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0, + ) + X = X.astype(global_dtype) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=0 + ) + + model = label_propagation.LabelSpreading(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 + + model = label_propagation.LabelPropagation(kernel=topk_rbf) + model.fit(X_train, y_train) + assert model.score(X_test, y_test) >= 0.9 diff --git a/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_self_training.py b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_self_training.py new file mode 100644 index 0000000000000000000000000000000000000000..02244063994d573537d7194c2837f8e80ffad0c6 --- /dev/null +++ 
b/.venv/lib/python3.12/site-packages/sklearn/semi_supervised/tests/test_self_training.py @@ -0,0 +1,395 @@ +from math import ceil + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from sklearn.datasets import load_iris, make_blobs +from sklearn.ensemble import StackingClassifier +from sklearn.exceptions import NotFittedError +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn.semi_supervised import SelfTrainingClassifier +from sklearn.svm import SVC +from sklearn.tests.test_pipeline import SimpleEstimator +from sklearn.tree import DecisionTreeClassifier + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +# load the iris dataset and randomly permute it +iris = load_iris() +X_train, X_test, y_train, y_test = train_test_split( + iris.data, iris.target, random_state=0 +) + +n_labeled_samples = 50 + +y_train_missing_labels = y_train.copy() +y_train_missing_labels[n_labeled_samples:] = -1 +mapping = {0: "A", 1: "B", 2: "C", -1: "-1"} +y_train_missing_strings = np.vectorize(mapping.get)(y_train_missing_labels).astype( + object +) +y_train_missing_strings[y_train_missing_labels == -1] = -1 + + +def test_warns_k_best(): + st = SelfTrainingClassifier(KNeighborsClassifier(), criterion="k_best", k_best=1000) + with pytest.warns(UserWarning, match="k_best is larger than"): + st.fit(X_train, y_train_missing_labels) + + assert st.termination_condition_ == "all_labeled" + + +@pytest.mark.parametrize( + "estimator", + [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)], +) +@pytest.mark.parametrize("selection_crit", ["threshold", "k_best"]) +def test_classification(estimator, selection_crit): + # Check classification for various parameter settings. + # Also assert that predictions for strings and numerical labels are equal. 
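The fixtures above follow the module's labeling convention: unlabeled samples carry the integer label -1, and string targets therefore have to use object dtype so the -1 marker can coexist with the class names. In short (values illustrative):

import numpy as np

y_numeric = np.array([0, 1, 2, -1, -1])                      # -1 marks unlabeled rows
y_strings = np.array(["A", "B", "C", -1, -1], dtype=object)  # object dtype, not '<U' strings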
+ # Also test for multioutput classification + threshold = 0.75 + max_iter = 10 + st = SelfTrainingClassifier( + estimator, max_iter=max_iter, threshold=threshold, criterion=selection_crit + ) + st.fit(X_train, y_train_missing_labels) + pred = st.predict(X_test) + proba = st.predict_proba(X_test) + + st_string = SelfTrainingClassifier( + estimator, max_iter=max_iter, criterion=selection_crit, threshold=threshold + ) + st_string.fit(X_train, y_train_missing_strings) + pred_string = st_string.predict(X_test) + proba_string = st_string.predict_proba(X_test) + + assert_array_equal(np.vectorize(mapping.get)(pred), pred_string) + assert_array_equal(proba, proba_string) + + assert st.termination_condition_ == st_string.termination_condition_ + # Check consistency between labeled_iter, n_iter and max_iter + labeled = y_train_missing_labels != -1 + # assert that labeled samples have labeled_iter = 0 + assert_array_equal(st.labeled_iter_ == 0, labeled) + # assert that labeled samples do not change label during training + assert_array_equal(y_train_missing_labels[labeled], st.transduction_[labeled]) + + # assert that the max of the iterations is less than the total amount of + # iterations + assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter + assert np.max(st_string.labeled_iter_) <= st_string.n_iter_ <= max_iter + + # check shapes + assert st.labeled_iter_.shape == st.transduction_.shape + assert st_string.labeled_iter_.shape == st_string.transduction_.shape + + +def test_k_best(): + st = SelfTrainingClassifier( + KNeighborsClassifier(n_neighbors=1), + criterion="k_best", + k_best=10, + max_iter=None, + ) + y_train_only_one_label = np.copy(y_train) + y_train_only_one_label[1:] = -1 + n_samples = y_train.shape[0] + + n_expected_iter = ceil((n_samples - 1) / 10) + st.fit(X_train, y_train_only_one_label) + assert st.n_iter_ == n_expected_iter + + # Check labeled_iter_ + assert np.sum(st.labeled_iter_ == 0) == 1 + for i in range(1, n_expected_iter): + assert np.sum(st.labeled_iter_ == i) == 10 + assert np.sum(st.labeled_iter_ == n_expected_iter) == (n_samples - 1) % 10 + assert st.termination_condition_ == "all_labeled" + + +def test_sanity_classification(): + estimator = SVC(gamma="scale", probability=True) + estimator.fit(X_train[n_labeled_samples:], y_train[n_labeled_samples:]) + + st = SelfTrainingClassifier(estimator) + st.fit(X_train, y_train_missing_labels) + + pred1, pred2 = estimator.predict(X_test), st.predict(X_test) + assert not np.array_equal(pred1, pred2) + score_supervised = accuracy_score(estimator.predict(X_test), y_test) + score_self_training = accuracy_score(st.predict(X_test), y_test) + + assert score_self_training > score_supervised + + +def test_none_iter(): + # Check that the all samples were labeled after a 'reasonable' number of + # iterations. + st = SelfTrainingClassifier(KNeighborsClassifier(), threshold=0.55, max_iter=None) + st.fit(X_train, y_train_missing_labels) + + assert st.n_iter_ < 10 + assert st.termination_condition_ == "all_labeled" + + +@pytest.mark.parametrize( + "estimator", + [KNeighborsClassifier(), SVC(gamma="scale", probability=True, random_state=0)], +) +@pytest.mark.parametrize("y", [y_train_missing_labels, y_train_missing_strings]) +def test_zero_iterations(estimator, y): + # Check classification for zero iterations. + # Fitting a SelfTrainingClassifier with zero iterations should give the + # same results as fitting a supervised classifier. + # This also asserts that string arrays work as expected. 
+ + clf1 = SelfTrainingClassifier(estimator, max_iter=0) + + clf1.fit(X_train, y) + + clf2 = estimator.fit(X_train[:n_labeled_samples], y[:n_labeled_samples]) + + assert_array_equal(clf1.predict(X_test), clf2.predict(X_test)) + assert clf1.termination_condition_ == "max_iter" + + +def test_prefitted_throws_error(): + # Test that passing a pre-fitted classifier and calling predict throws an + # error + knn = KNeighborsClassifier() + knn.fit(X_train, y_train) + st = SelfTrainingClassifier(knn) + with pytest.raises( + NotFittedError, + match="This SelfTrainingClassifier instance is not fitted yet", + ): + st.predict(X_train) + + +@pytest.mark.parametrize("max_iter", range(1, 5)) +def test_labeled_iter(max_iter): + # Check that the amount of datapoints labeled in iteration 0 is equal to + # the amount of labeled datapoints we passed. + st = SelfTrainingClassifier(KNeighborsClassifier(), max_iter=max_iter) + + st.fit(X_train, y_train_missing_labels) + amount_iter_0 = len(st.labeled_iter_[st.labeled_iter_ == 0]) + assert amount_iter_0 == n_labeled_samples + # Check that the max of the iterations is less than the total amount of + # iterations + assert np.max(st.labeled_iter_) <= st.n_iter_ <= max_iter + + +def test_no_unlabeled(): + # Test that training on a fully labeled dataset produces the same results + # as training the classifier by itself. + knn = KNeighborsClassifier() + knn.fit(X_train, y_train) + st = SelfTrainingClassifier(knn) + with pytest.warns(UserWarning, match="y contains no unlabeled samples"): + st.fit(X_train, y_train) + assert_array_equal(knn.predict(X_test), st.predict(X_test)) + # Assert that all samples were labeled in iteration 0 (since there were no + # unlabeled samples). + assert np.all(st.labeled_iter_ == 0) + assert st.termination_condition_ == "all_labeled" + + +def test_early_stopping(): + svc = SVC(gamma="scale", probability=True) + st = SelfTrainingClassifier(svc) + X_train_easy = [[1], [0], [1], [0.5]] + y_train_easy = [1, 0, -1, -1] + # X = [[0.5]] cannot be predicted on with a high confidence, so training + # stops early + st.fit(X_train_easy, y_train_easy) + assert st.n_iter_ == 1 + assert st.termination_condition_ == "no_change" + + +def test_strings_dtype(): + clf = SelfTrainingClassifier(KNeighborsClassifier()) + X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) + labels_multiclass = ["one", "two", "three"] + + y_strings = np.take(labels_multiclass, y) + + with pytest.raises(ValueError, match="dtype"): + clf.fit(X, y_strings) + + +@pytest.mark.parametrize("verbose", [True, False]) +def test_verbose(capsys, verbose): + clf = SelfTrainingClassifier(KNeighborsClassifier(), verbose=verbose) + clf.fit(X_train, y_train_missing_labels) + + captured = capsys.readouterr() + + if verbose: + assert "iteration" in captured.out + else: + assert "iteration" not in captured.out + + +def test_verbose_k_best(capsys): + st = SelfTrainingClassifier( + KNeighborsClassifier(n_neighbors=1), + criterion="k_best", + k_best=10, + verbose=True, + max_iter=None, + ) + + y_train_only_one_label = np.copy(y_train) + y_train_only_one_label[1:] = -1 + n_samples = y_train.shape[0] + + n_expected_iter = ceil((n_samples - 1) / 10) + st.fit(X_train, y_train_only_one_label) + + captured = capsys.readouterr() + + msg = "End of iteration {}, added {} new labels." 
+ for i in range(1, n_expected_iter): + assert msg.format(i, 10) in captured.out + + assert msg.format(n_expected_iter, (n_samples - 1) % 10) in captured.out + + +def test_k_best_selects_best(): + # Tests that the labels added by st really are the 10 best labels. + svc = SVC(gamma="scale", probability=True, random_state=0) + st = SelfTrainingClassifier(svc, criterion="k_best", max_iter=1, k_best=10) + has_label = y_train_missing_labels != -1 + st.fit(X_train, y_train_missing_labels) + + got_label = ~has_label & (st.transduction_ != -1) + + svc.fit(X_train[has_label], y_train_missing_labels[has_label]) + pred = svc.predict_proba(X_train[~has_label]) + max_proba = np.max(pred, axis=1) + + most_confident_svc = X_train[~has_label][np.argsort(max_proba)[-10:]] + added_by_st = X_train[np.where(got_label)].tolist() + + for row in most_confident_svc.tolist(): + assert row in added_by_st + + +def test_estimator_meta_estimator(): + # Check that a meta-estimator relying on an estimator implementing + # `predict_proba` will work even if it does not expose this method before being + # fitted. + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/19119 + + estimator = StackingClassifier( + estimators=[ + ("svc_1", SVC(probability=True)), + ("svc_2", SVC(probability=True)), + ], + final_estimator=SVC(probability=True), + cv=2, + ) + + assert hasattr(estimator, "predict_proba") + clf = SelfTrainingClassifier(estimator=estimator) + clf.fit(X_train, y_train_missing_labels) + clf.predict_proba(X_test) + + estimator = StackingClassifier( + estimators=[ + ("svc_1", SVC(probability=False)), + ("svc_2", SVC(probability=False)), + ], + final_estimator=SVC(probability=False), + cv=2, + ) + + assert not hasattr(estimator, "predict_proba") + clf = SelfTrainingClassifier(estimator=estimator) + with pytest.raises(AttributeError): + clf.fit(X_train, y_train_missing_labels) + + +def test_self_training_estimator_attribute_error(): + """Check that we raise the proper AttributeErrors when the `estimator` + does not implement the `predict_proba` method, which is called from within + `fit`, or `decision_function`, which is decorated with `available_if`. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/28108 + """ + # `SVC` with `probability=False` does not implement 'predict_proba' that + # is required internally in `fit` of `SelfTrainingClassifier`. We expect + # an AttributeError to be raised. 
+ estimator = SVC(probability=False, gamma="scale") + self_training = SelfTrainingClassifier(estimator) + + with pytest.raises(AttributeError, match="has no attribute 'predict_proba'"): + self_training.fit(X_train, y_train_missing_labels) + + # `DecisionTreeClassifier` does not implement 'decision_function' and + # should raise an AttributeError + self_training = SelfTrainingClassifier(estimator=DecisionTreeClassifier()) + + outer_msg = "This 'SelfTrainingClassifier' has no attribute 'decision_function'" + inner_msg = "'DecisionTreeClassifier' object has no attribute 'decision_function'" + with pytest.raises(AttributeError, match=outer_msg) as exec_info: + self_training.fit(X_train, y_train_missing_labels).decision_function(X_train) + assert isinstance(exec_info.value.__cause__, AttributeError) + assert inner_msg in str(exec_info.value.__cause__) + + +# TODO(1.8): remove in 1.8 +def test_deprecation_warning_base_estimator(): + warn_msg = "`base_estimator` has been deprecated in 1.6 and will be removed" + with pytest.warns(FutureWarning, match=warn_msg): + SelfTrainingClassifier(base_estimator=DecisionTreeClassifier()).fit( + X_train, y_train_missing_labels + ) + + error_msg = "You must pass an estimator to SelfTrainingClassifier" + with pytest.raises(ValueError, match=error_msg): + SelfTrainingClassifier().fit(X_train, y_train_missing_labels) + + error_msg = "You must pass only one estimator to SelfTrainingClassifier." + with pytest.raises(ValueError, match=error_msg): + SelfTrainingClassifier( + base_estimator=DecisionTreeClassifier(), estimator=DecisionTreeClassifier() + ).fit(X_train, y_train_missing_labels) + + +# Metadata routing tests +# ================================================================= + + +@pytest.mark.filterwarnings("ignore:y contains no unlabeled samples:UserWarning") +@pytest.mark.parametrize( + "method", ["decision_function", "predict_log_proba", "predict_proba", "predict"] +) +def test_routing_passed_metadata_not_supported(method): + """Test that the right error message is raised when metadata is passed while + not supported when `enable_metadata_routing=False`.""" + est = SelfTrainingClassifier(estimator=SimpleEstimator()) + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + est.fit([[1], [1]], [1, 1], sample_weight=[1], prop="a") + + est = SelfTrainingClassifier(estimator=SimpleEstimator()) + with pytest.raises( + ValueError, match="is only supported if enable_metadata_routing=True" + ): + # make sure that the estimator thinks it is already fitted + est.fitted_params_ = True + getattr(est, method)([[1]], sample_weight=[1], prop="a") + + +# End of routing tests +# ==================== diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/svm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a039d2e15abddf5aaca8faad462b1b951ec6e18a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/__init__.py @@ -0,0 +1,21 @@ +"""Support vector machine algorithms.""" + +# See http://scikit-learn.sourceforge.net/modules/svm.html for complete +# documentation. 
+ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from ._bounds import l1_min_c +from ._classes import SVC, SVR, LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM + +__all__ = [ + "SVC", + "SVR", + "LinearSVC", + "LinearSVR", + "NuSVC", + "NuSVR", + "OneClassSVM", + "l1_min_c", +] diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_base.py b/.venv/lib/python3.12/site-packages/sklearn/svm/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..db295e4e877b50e7dff639de4dd6bb98c95d7b91 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_base.py @@ -0,0 +1,1262 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import warnings +from abc import ABCMeta, abstractmethod +from numbers import Integral, Real + +import numpy as np +import scipy.sparse as sp + +from ..base import BaseEstimator, ClassifierMixin, _fit_context +from ..exceptions import ConvergenceWarning, NotFittedError +from ..preprocessing import LabelEncoder +from ..utils import check_array, check_random_state, column_or_1d, compute_class_weight +from ..utils._param_validation import Interval, StrOptions +from ..utils.extmath import safe_sparse_dot +from ..utils.metaestimators import available_if +from ..utils.multiclass import _ovr_decision_function, check_classification_targets +from ..utils.validation import ( + _check_large_sparse, + _check_sample_weight, + _num_samples, + check_consistent_length, + check_is_fitted, + validate_data, +) +from . import _liblinear as liblinear # type: ignore[attr-defined] + +# mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' +# (and same for other imports) +from . import _libsvm as libsvm # type: ignore[attr-defined] +from . import _libsvm_sparse as libsvm_sparse # type: ignore[attr-defined] + +LIBSVM_IMPL = ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"] + + +def _one_vs_one_coef(dual_coef, n_support, support_vectors): + """Generate primal coefficients from dual coefficients + for the one-vs-one multi class LibSVM in the case + of a linear kernel.""" + + # get 1vs1 weights for all n*(n-1) classifiers. + # this is somewhat messy. + # shape of dual_coef_ is nSV * (n_classes -1) + # see docs for details + n_class = dual_coef.shape[0] + 1 + + # XXX we could do preallocation of coef but + # would have to take care in the sparse case + coef = [] + sv_locs = np.cumsum(np.hstack([[0], n_support])) + for class1 in range(n_class): + # SVs for class1: + sv1 = support_vectors[sv_locs[class1] : sv_locs[class1 + 1], :] + for class2 in range(class1 + 1, n_class): + # SVs for class1: + sv2 = support_vectors[sv_locs[class2] : sv_locs[class2 + 1], :] + + # dual coef for class1 SVs: + alpha1 = dual_coef[class2 - 1, sv_locs[class1] : sv_locs[class1 + 1]] + # dual coef for class2 SVs: + alpha2 = dual_coef[class1, sv_locs[class2] : sv_locs[class2 + 1]] + # build weight for class1 vs class2 + + coef.append(safe_sparse_dot(alpha1, sv1) + safe_sparse_dot(alpha2, sv2)) + return coef + + +class BaseLibSVM(BaseEstimator, metaclass=ABCMeta): + """Base class for estimators that use libsvm as backing library. + + This implements support vector machine classification and regression. + + Parameter documentation is in the derived `SVC` class. 
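A small sketch (not part of the patch) of what `_one_vs_one_coef` produces for users: a multiclass linear-kernel SVC keeps one primal weight vector per one-vs-one pair, built from the dual coefficients. The synthetic dataset is only for illustration.

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=8, n_informative=6,
                           n_classes=4, random_state=0)
clf = SVC(kernel="linear").fit(X, y)
print(clf.coef_.shape)       # (6, 8): 4 * (4 - 1) / 2 one-vs-one weight vectors
print(clf.dual_coef_.shape)  # (3, n_SV): n_classes - 1 rows of dual coefficients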
+ """ + + _parameter_constraints: dict = { + "kernel": [ + StrOptions({"linear", "poly", "rbf", "sigmoid", "precomputed"}), + callable, + ], + "degree": [Interval(Integral, 0, None, closed="left")], + "gamma": [ + StrOptions({"scale", "auto"}), + Interval(Real, 0.0, None, closed="left"), + ], + "coef0": [Interval(Real, None, None, closed="neither")], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="right")], + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + "epsilon": [Interval(Real, 0.0, None, closed="left")], + "shrinking": ["boolean"], + "probability": ["boolean"], + "cache_size": [Interval(Real, 0, None, closed="neither")], + "class_weight": [StrOptions({"balanced"}), dict, None], + "verbose": ["verbose"], + "max_iter": [Interval(Integral, -1, None, closed="left")], + "random_state": ["random_state"], + } + + # The order of these must match the integer values in LibSVM. + # XXX These are actually the same in the dense case. Need to factor + # this out. + _sparse_kernels = ["linear", "poly", "rbf", "sigmoid", "precomputed"] + + @abstractmethod + def __init__( + self, + kernel, + degree, + gamma, + coef0, + tol, + C, + nu, + epsilon, + shrinking, + probability, + cache_size, + class_weight, + verbose, + max_iter, + random_state, + ): + if self._impl not in LIBSVM_IMPL: + raise ValueError( + "impl should be one of %s, %s was given" % (LIBSVM_IMPL, self._impl) + ) + + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.coef0 = coef0 + self.tol = tol + self.C = C + self.nu = nu + self.epsilon = epsilon + self.shrinking = shrinking + self.probability = probability + self.cache_size = cache_size + self.class_weight = class_weight + self.verbose = verbose + self.max_iter = max_iter + self.random_state = random_state + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + # Used by cross_val_score. + tags.input_tags.pairwise = self.kernel == "precomputed" + tags.input_tags.sparse = self.kernel != "precomputed" + return tags + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the SVM model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_samples) + Training vectors, where `n_samples` is the number of samples + and `n_features` is the number of features. + For kernel="precomputed", the expected shape of X is + (n_samples, n_samples). + + y : array-like of shape (n_samples,) + Target values (class labels in classification, real numbers in + regression). + + sample_weight : array-like of shape (n_samples,), default=None + Per-sample weights. Rescale C per sample. Higher weights + force the classifier to put more emphasis on these points. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + If X and y are not C-ordered and contiguous arrays of np.float64 and + X is not a scipy.sparse.csr_matrix, X and/or y may be copied. + + If X is a dense array, then the other methods will not support sparse + matrices as input. 
+ """ + rnd = check_random_state(self.random_state) + + sparse = sp.issparse(X) + if sparse and self.kernel == "precomputed": + raise TypeError("Sparse precomputed kernels are not supported.") + self._sparse = sparse and not callable(self.kernel) + + if callable(self.kernel): + check_consistent_length(X, y) + else: + X, y = validate_data( + self, + X, + y, + dtype=np.float64, + order="C", + accept_sparse="csr", + accept_large_sparse=False, + ) + + y = self._validate_targets(y) + + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) + solver_type = LIBSVM_IMPL.index(self._impl) + + # input validation + n_samples = _num_samples(X) + if solver_type != 2 and n_samples != y.shape[0]: + raise ValueError( + "X and y have incompatible shapes.\n" + + "X has %s samples, but y has %s." % (n_samples, y.shape[0]) + ) + + if self.kernel == "precomputed" and n_samples != X.shape[1]: + raise ValueError( + "Precomputed matrix must be a square matrix." + " Input is a {}x{} matrix.".format(X.shape[0], X.shape[1]) + ) + + if sample_weight.shape[0] > 0 and sample_weight.shape[0] != n_samples: + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (sample_weight.shape, X.shape) + ) + + kernel = "precomputed" if callable(self.kernel) else self.kernel + + if kernel == "precomputed": + # unused but needs to be a float for cython code that ignores + # it anyway + self._gamma = 0.0 + elif isinstance(self.gamma, str): + if self.gamma == "scale": + # var = E[X^2] - E[X]^2 if sparse + X_var = (X.multiply(X)).mean() - (X.mean()) ** 2 if sparse else X.var() + self._gamma = 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0 + elif self.gamma == "auto": + self._gamma = 1.0 / X.shape[1] + elif isinstance(self.gamma, Real): + self._gamma = self.gamma + + fit = self._sparse_fit if self._sparse else self._dense_fit + if self.verbose: + print("[LibSVM]", end="") + + seed = rnd.randint(np.iinfo("i").max) + fit(X, y, sample_weight, solver_type, kernel, random_seed=seed) + # see comment on the other call to np.iinfo in this file + + self.shape_fit_ = X.shape if hasattr(X, "shape") else (n_samples,) + + # In binary case, we need to flip the sign of coef, intercept and + # decision function. Use self._intercept_ and self._dual_coef_ + # internally. + self._intercept_ = self.intercept_.copy() + self._dual_coef_ = self.dual_coef_ + if self._impl in ["c_svc", "nu_svc"] and len(self.classes_) == 2: + self.intercept_ *= -1 + self.dual_coef_ = -self.dual_coef_ + + dual_coef = self._dual_coef_.data if self._sparse else self._dual_coef_ + intercept_finiteness = np.isfinite(self._intercept_).all() + dual_coef_finiteness = np.isfinite(dual_coef).all() + if not (intercept_finiteness and dual_coef_finiteness): + raise ValueError( + "The dual coefficients or intercepts are not finite." + " The input data may contain large values and need to be" + " preprocessed." + ) + + # Since, in the case of SVC and NuSVC, the number of models optimized by + # libSVM could be greater than one (depending on the input), `n_iter_` + # stores an ndarray. + # For the other sub-classes (SVR, NuSVR, and OneClassSVM), the number of + # models optimized by libSVM is always one, so `n_iter_` stores an + # integer. 
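A sketch (not part of the patch) of the `gamma="scale"` branch in `fit` above: for dense input it resolves to 1 / (n_features * X.var()), so passing that value explicitly should give an identical model on the same data.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
gamma_scale = 1.0 / (X.shape[1] * X.var())     # what gamma="scale" amounts to here
a = SVC(gamma="scale").fit(X, y)
b = SVC(gamma=gamma_scale).fit(X, y)
print(np.allclose(a.decision_function(X), b.decision_function(X)))  # True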
+ if self._impl in ["c_svc", "nu_svc"]: + self.n_iter_ = self._num_iter + else: + self.n_iter_ = self._num_iter.item() + + return self + + def _validate_targets(self, y): + """Validation of y and class_weight. + + Default implementation for SVR and one-class; overridden in BaseSVC. + """ + return column_or_1d(y, warn=True).astype(np.float64, copy=False) + + def _warn_from_fit_status(self): + assert self.fit_status_ in (0, 1) + if self.fit_status_ == 1: + warnings.warn( + "Solver terminated early (max_iter=%i)." + " Consider pre-processing your data with" + " StandardScaler or MinMaxScaler." % self.max_iter, + ConvergenceWarning, + ) + + def _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed): + if callable(self.kernel): + # you must store a reference to X to compute the kernel in predict + # TODO: add keyword copy to copy on demand + self.__Xfit = X + X = self._compute_kernel(X) + + if X.shape[0] != X.shape[1]: + raise ValueError("X.shape[0] should be equal to X.shape[1]") + + libsvm.set_verbosity_wrap(self.verbose) + + # we don't pass **self.get_params() to allow subclasses to + # add other parameters to __init__ + ( + self.support_, + self.support_vectors_, + self._n_support, + self.dual_coef_, + self.intercept_, + self._probA, + self._probB, + self.fit_status_, + self._num_iter, + ) = libsvm.fit( + X, + y, + svm_type=solver_type, + sample_weight=sample_weight, + class_weight=getattr(self, "class_weight_", np.empty(0)), + kernel=kernel, + C=self.C, + nu=self.nu, + probability=self.probability, + degree=self.degree, + shrinking=self.shrinking, + tol=self.tol, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + epsilon=self.epsilon, + max_iter=self.max_iter, + random_seed=random_seed, + ) + + self._warn_from_fit_status() + + def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + X.sort_indices() + + kernel_type = self._sparse_kernels.index(kernel) + + libsvm_sparse.set_verbosity_wrap(self.verbose) + + ( + self.support_, + self.support_vectors_, + dual_coef_data, + self.intercept_, + self._n_support, + self._probA, + self._probB, + self.fit_status_, + self._num_iter, + ) = libsvm_sparse.libsvm_sparse_train( + X.shape[1], + X.data, + X.indices, + X.indptr, + y, + solver_type, + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + sample_weight, + self.nu, + self.cache_size, + self.epsilon, + int(self.shrinking), + int(self.probability), + self.max_iter, + random_seed, + ) + + self._warn_from_fit_status() + + if hasattr(self, "classes_"): + n_class = len(self.classes_) - 1 + else: # regression + n_class = 1 + n_SV = self.support_vectors_.shape[0] + + dual_coef_indices = np.tile(np.arange(n_SV), n_class) + if not n_SV: + self.dual_coef_ = sp.csr_matrix([]) + else: + dual_coef_indptr = np.arange( + 0, dual_coef_indices.size + 1, dual_coef_indices.size / n_class + ) + self.dual_coef_ = sp.csr_matrix( + (dual_coef_data, dual_coef_indices, dual_coef_indptr), (n_class, n_SV) + ) + + def predict(self, X): + """Perform regression on samples in X. + + For an one-class model, +1 (inlier) or -1 (outlier) is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + The predicted values. 
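A short sketch (not part of the patch) of the precomputed-kernel shape contract stated in the `predict` docstring above: `fit` takes an (n_train, n_train) Gram matrix, `predict` an (n_test, n_train) one. The linear Gram matrices here are only an example choice of kernel.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=80, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = SVC(kernel="precomputed").fit(X_tr @ X_tr.T, y_tr)  # square (n_train, n_train) Gram matrix
print(clf.predict(X_te @ X_tr.T).shape)                   # (n_test,)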
+ """ + X = self._validate_for_predict(X) + predict = self._sparse_predict if self._sparse else self._dense_predict + return predict(X) + + def _dense_predict(self, X): + X = self._compute_kernel(X) + if X.ndim == 1: + X = check_array(X, order="C", accept_large_sparse=False) + + kernel = self.kernel + if callable(self.kernel): + kernel = "precomputed" + if X.shape[1] != self.shape_fit_[0]: + raise ValueError( + "X.shape[1] = %d should be equal to %d, " + "the number of samples at training time" + % (X.shape[1], self.shape_fit_[0]) + ) + + svm_type = LIBSVM_IMPL.index(self._impl) + + return libsvm.predict( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=svm_type, + kernel=kernel, + degree=self.degree, + coef0=self.coef0, + gamma=self._gamma, + cache_size=self.cache_size, + ) + + def _sparse_predict(self, X): + # Precondition: X is a csr_matrix of dtype np.float64. + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + C = 0.0 # C is not useful here + + return libsvm_sparse.libsvm_sparse_predict( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _compute_kernel(self, X): + """Return the data transformed by a callable kernel""" + if callable(self.kernel): + # in the case of precomputed kernel given as a function, we + # have to compute explicitly the kernel matrix + kernel = self.kernel(X, self.__Xfit) + if sp.issparse(kernel): + kernel = kernel.toarray() + X = np.asarray(kernel, dtype=np.float64, order="C") + return X + + def _decision_function(self, X): + """Evaluates the decision function for the samples in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + + Returns + ------- + X : array-like of shape (n_samples, n_class * (n_class-1) / 2) + Returns the decision function of the sample for each class + in the model. + """ + # NOTE: _validate_for_predict contains check for is_fitted + # hence must be placed before any other attributes are used. + X = self._validate_for_predict(X) + X = self._compute_kernel(X) + + if self._sparse: + dec_func = self._sparse_decision_function(X) + else: + dec_func = self._dense_decision_function(X) + + # In binary case, we need to flip the sign of coef, intercept and + # decision function. 
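A sketch (not part of the patch, and assuming the usual scikit-learn convention) of what the binary sign flip discussed above achieves: positive values of the public decision_function vote for classes_[1], consistent with predict.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
clf = SVC(kernel="linear").fit(X, y)
dec = clf.decision_function(X)  # shape (n_samples,) in the binary case
print(np.array_equal(clf.predict(X), clf.classes_[(dec > 0).astype(int)]))  # True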
+ if self._impl in ["c_svc", "nu_svc"] and len(self.classes_) == 2: + return -dec_func.ravel() + + return dec_func + + def _dense_decision_function(self, X): + X = check_array(X, dtype=np.float64, order="C", accept_large_sparse=False) + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + return libsvm.decision_function( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=LIBSVM_IMPL.index(self._impl), + kernel=kernel, + degree=self.degree, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + ) + + def _sparse_decision_function(self, X): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + + kernel = self.kernel + if hasattr(kernel, "__call__"): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + return libsvm_sparse.libsvm_sparse_decision_function( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _validate_for_predict(self, X): + check_is_fitted(self) + + if not callable(self.kernel): + X = validate_data( + self, + X, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + reset=False, + ) + + if self._sparse and not sp.issparse(X): + X = sp.csr_matrix(X) + if self._sparse: + X.sort_indices() + + if sp.issparse(X) and not self._sparse and not callable(self.kernel): + raise ValueError( + "cannot use sparse input in %r trained on dense data" + % type(self).__name__ + ) + + if self.kernel == "precomputed": + if X.shape[1] != self.shape_fit_[0]: + raise ValueError( + "X.shape[1] = %d should be equal to %d, " + "the number of samples at training time" + % (X.shape[1], self.shape_fit_[0]) + ) + # Fixes https://nvd.nist.gov/vuln/detail/CVE-2020-28975 + # Check that _n_support is consistent with support_vectors + sv = self.support_vectors_ + if not self._sparse and sv.size > 0 and self.n_support_.sum() != sv.shape[0]: + raise ValueError( + f"The internal representation of {self.__class__.__name__} was altered" + ) + return X + + @property + def coef_(self): + """Weights assigned to the features when `kernel="linear"`. + + Returns + ------- + ndarray of shape (n_features, n_classes) + """ + if self.kernel != "linear": + raise AttributeError("coef_ is only available when using a linear kernel") + + coef = self._get_coef() + + # coef_ being a read-only property, it's better to mark the value as + # immutable to avoid hiding potential bugs for the unsuspecting user. 
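A sketch (not part of the patch) of the `coef_` property above: it only exists for the linear kernel, the returned array is flagged read-only, and in the binary case it is simply dual_coef_ @ support_vectors_.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
clf = SVC(kernel="linear").fit(X, y)
print(np.allclose(clf.coef_, clf.dual_coef_ @ clf.support_vectors_))  # True in the binary case
try:
    clf.coef_[0, 0] = 0.0          # the returned array is marked non-writeable
except ValueError as exc:
    print(type(exc).__name__)      # ValueError
print(hasattr(SVC().fit(X, y), "coef_"))  # False: the rbf kernel exposes no primal coef_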
+ if sp.issparse(coef): + # sparse matrix do not have global flags + coef.data.flags.writeable = False + else: + # regular dense array + coef.flags.writeable = False + return coef + + def _get_coef(self): + return safe_sparse_dot(self._dual_coef_, self.support_vectors_) + + @property + def n_support_(self): + """Number of support vectors for each class.""" + try: + check_is_fitted(self) + except NotFittedError: + raise AttributeError + + svm_type = LIBSVM_IMPL.index(self._impl) + if svm_type in (0, 1): + return self._n_support + else: + # SVR and OneClass + # _n_support has size 2, we make it size 1 + return np.array([self._n_support[0]]) + + +class BaseSVC(ClassifierMixin, BaseLibSVM, metaclass=ABCMeta): + """ABC for LibSVM-based classifiers.""" + + _parameter_constraints: dict = { + **BaseLibSVM._parameter_constraints, + "decision_function_shape": [StrOptions({"ovr", "ovo"})], + "break_ties": ["boolean"], + } + for unused_param in ["epsilon", "nu"]: + _parameter_constraints.pop(unused_param) + + @abstractmethod + def __init__( + self, + kernel, + degree, + gamma, + coef0, + tol, + C, + nu, + shrinking, + probability, + cache_size, + class_weight, + verbose, + max_iter, + decision_function_shape, + random_state, + break_ties, + ): + self.decision_function_shape = decision_function_shape + self.break_ties = break_ties + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + epsilon=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + random_state=random_state, + ) + + def _validate_targets(self, y): + y_ = column_or_1d(y, warn=True) + check_classification_targets(y) + cls, y = np.unique(y_, return_inverse=True) + self.class_weight_ = compute_class_weight(self.class_weight, classes=cls, y=y_) + if len(cls) < 2: + raise ValueError( + "The number of classes has to be greater than one; got %d class" + % len(cls) + ) + + self.classes_ = cls + + return np.asarray(y, dtype=np.float64, order="C") + + def decision_function(self, X): + """Evaluate the decision function for the samples in X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input samples. + + Returns + ------- + X : ndarray of shape (n_samples, n_classes * (n_classes-1) / 2) + Returns the decision function of the sample for each class + in the model. + If decision_function_shape='ovr', the shape is (n_samples, + n_classes). + + Notes + ----- + If decision_function_shape='ovo', the function values are proportional + to the distance of the samples X to the separating hyperplane. If the + exact distances are required, divide the function values by the norm of + the weight vector (``coef_``). See also `this question + `_ for further details. + If decision_function_shape='ovr', the decision function is a monotonic + transformation of ovo decision function. + """ + dec = self._decision_function(X) + if self.decision_function_shape == "ovr" and len(self.classes_) > 2: + return _ovr_decision_function(dec < 0, -dec, len(self.classes_)) + return dec + + def predict(self, X): + """Perform classification on samples in X. + + For an one-class model, +1 or -1 is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). 
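A sketch (not part of the patch) of the `decision_function_shape` behaviour documented above, on an illustrative 4-class toy problem: 'ovo' yields n_classes * (n_classes - 1) / 2 columns, 'ovr' yields n_classes columns.

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=120, n_features=8, n_informative=6,
                           n_classes=4, random_state=0)
print(SVC(decision_function_shape="ovo").fit(X, y).decision_function(X).shape)  # (120, 6)
print(SVC(decision_function_shape="ovr").fit(X, y).decision_function(X).shape)  # (120, 4)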
+ + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Class labels for samples in X. + """ + check_is_fitted(self) + if self.break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when decision_function_shape is 'ovo'" + ) + + if ( + self.break_ties + and self.decision_function_shape == "ovr" + and len(self.classes_) > 2 + ): + y = np.argmax(self.decision_function(X), axis=1) + else: + y = super().predict(X) + return self.classes_.take(np.asarray(y, dtype=np.intp)) + + # Hacky way of getting predict_proba to raise an AttributeError when + # probability=False using properties. Do not use this in new code; when + # probabilities are not available depending on a setting, introduce two + # estimators. + def _check_proba(self): + if not self.probability: + raise AttributeError( + "predict_proba is not available when probability=False" + ) + if self._impl not in ("c_svc", "nu_svc"): + raise AttributeError("predict_proba only implemented for SVC and NuSVC") + return True + + @available_if(_check_proba) + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + The model needs to have probability information computed at training + time: fit with attribute `probability` set to True. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + T : ndarray of shape (n_samples, n_classes) + Returns the probability of the sample for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + + Notes + ----- + The probability model is created using cross validation, so + the results can be slightly different than those obtained by + predict. Also, it will produce meaningless results on very small + datasets. + """ + X = self._validate_for_predict(X) + if self.probA_.size == 0 or self.probB_.size == 0: + raise NotFittedError( + "predict_proba is not available when fitted with probability=False" + ) + pred_proba = ( + self._sparse_predict_proba if self._sparse else self._dense_predict_proba + ) + return pred_proba(X) + + @available_if(_check_proba) + def predict_log_proba(self, X): + """Compute log probabilities of possible outcomes for samples in X. + + The model need to have probability information computed at training + time: fit with attribute `probability` set to True. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + T : ndarray of shape (n_samples, n_classes) + Returns the log-probabilities of the sample for each class in + the model. The columns correspond to the classes in sorted + order, as they appear in the attribute :term:`classes_`. + + Notes + ----- + The probability model is created using cross validation, so + the results can be slightly different than those obtained by + predict. Also, it will produce meaningless results on very small + datasets. 
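A sketch (not part of the patch) of what the `available_if` gating and the `probability` flag described above mean in practice for users of `SVC`.

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
print(hasattr(SVC(probability=False), "predict_proba"))  # False: hidden by available_if
clf = SVC(probability=True, random_state=0).fit(X, y)    # enables internal cross-validated Platt scaling
print(clf.predict_proba(X[:3]).sum(axis=1))              # each row sums to 1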
+ """ + return np.log(self.predict_proba(X)) + + def _dense_predict_proba(self, X): + X = self._compute_kernel(X) + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + svm_type = LIBSVM_IMPL.index(self._impl) + pprob = libsvm.predict_proba( + X, + self.support_, + self.support_vectors_, + self._n_support, + self._dual_coef_, + self._intercept_, + self._probA, + self._probB, + svm_type=svm_type, + kernel=kernel, + degree=self.degree, + cache_size=self.cache_size, + coef0=self.coef0, + gamma=self._gamma, + ) + + return pprob + + def _sparse_predict_proba(self, X): + X.data = np.asarray(X.data, dtype=np.float64, order="C") + + kernel = self.kernel + if callable(kernel): + kernel = "precomputed" + + kernel_type = self._sparse_kernels.index(kernel) + + return libsvm_sparse.libsvm_sparse_predict_proba( + X.data, + X.indices, + X.indptr, + self.support_vectors_.data, + self.support_vectors_.indices, + self.support_vectors_.indptr, + self._dual_coef_.data, + self._intercept_, + LIBSVM_IMPL.index(self._impl), + kernel_type, + self.degree, + self._gamma, + self.coef0, + self.tol, + self.C, + getattr(self, "class_weight_", np.empty(0)), + self.nu, + self.epsilon, + self.shrinking, + self.probability, + self._n_support, + self._probA, + self._probB, + ) + + def _get_coef(self): + if self.dual_coef_.shape[0] == 1: + # binary classifier + coef = safe_sparse_dot(self.dual_coef_, self.support_vectors_) + else: + # 1vs1 classifier + coef = _one_vs_one_coef( + self.dual_coef_, self._n_support, self.support_vectors_ + ) + if sp.issparse(coef[0]): + coef = sp.vstack(coef).tocsr() + else: + coef = np.vstack(coef) + + return coef + + @property + def probA_(self): + """Parameter learned in Platt scaling when `probability=True`. + + Returns + ------- + ndarray of shape (n_classes * (n_classes - 1) / 2) + """ + return self._probA + + @property + def probB_(self): + """Parameter learned in Platt scaling when `probability=True`. + + Returns + ------- + ndarray of shape (n_classes * (n_classes - 1) / 2) + """ + return self._probB + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = self.kernel != "precomputed" + return tags + + +def _get_liblinear_solver_type(multi_class, penalty, loss, dual): + """Find the liblinear magic number for the solver. + + This number depends on the values of the following attributes: + - multi_class + - penalty + - loss + - dual + + The same number is also internally used by LibLinear to determine + which solver to use. 
+ """ + # nested dicts containing level 1: available loss functions, + # level2: available penalties for the given loss function, + # level3: whether the dual solver is available for the specified + # combination of loss function and penalty + _solver_type_dict = { + "logistic_regression": {"l1": {False: 6}, "l2": {False: 0, True: 7}}, + "hinge": {"l2": {True: 3}}, + "squared_hinge": {"l1": {False: 5}, "l2": {False: 2, True: 1}}, + "epsilon_insensitive": {"l2": {True: 13}}, + "squared_epsilon_insensitive": {"l2": {False: 11, True: 12}}, + "crammer_singer": 4, + } + + if multi_class == "crammer_singer": + return _solver_type_dict[multi_class] + elif multi_class != "ovr": + raise ValueError( + "`multi_class` must be one of `ovr`, `crammer_singer`, got %r" % multi_class + ) + + _solver_pen = _solver_type_dict.get(loss, None) + if _solver_pen is None: + error_string = "loss='%s' is not supported" % loss + else: + _solver_dual = _solver_pen.get(penalty, None) + if _solver_dual is None: + error_string = ( + "The combination of penalty='%s' and loss='%s' is not supported" + % (penalty, loss) + ) + else: + solver_num = _solver_dual.get(dual, None) + if solver_num is None: + error_string = ( + "The combination of penalty='%s' and " + "loss='%s' are not supported when dual=%s" % (penalty, loss, dual) + ) + else: + return solver_num + raise ValueError( + "Unsupported set of arguments: %s, Parameters: penalty=%r, loss=%r, dual=%r" + % (error_string, penalty, loss, dual) + ) + + +def _fit_liblinear( + X, + y, + C, + fit_intercept, + intercept_scaling, + class_weight, + penalty, + dual, + verbose, + max_iter, + tol, + random_state=None, + multi_class="ovr", + loss="logistic_regression", + epsilon=0.1, + sample_weight=None, +): + """Used by Logistic Regression (and CV) and LinearSVC/LinearSVR. + + Preprocessing is done in this function before supplying it to liblinear. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X + + C : float + Inverse of cross-validation parameter. The lower the C, the higher + the penalization. + + fit_intercept : bool + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: ``[x_1, ..., x_n, 1]``, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float + Liblinear internally penalizes the intercept, treating it like any + other term in the feature vector. To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. + + class_weight : dict or 'balanced', default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. 
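A sketch (not part of the patch) of how the solver table above surfaces in `LinearSVC`: unsupported penalty/loss combinations raise a ValueError, e.g. the l1 penalty is only available together with the squared hinge loss.

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
LinearSVC(penalty="l1", loss="squared_hinge", dual=False).fit(X, y)  # supported (solver 5 in the table)
try:
    LinearSVC(penalty="l1", loss="hinge").fit(X, y)                  # no entry in the table
except ValueError as exc:
    print(exc)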
+ + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + penalty : {'l1', 'l2'} + The norm of the penalty used in regularization. + + dual : bool + Dual or primal formulation, + + verbose : int + Set verbose to any positive number for verbosity. + + max_iter : int + Number of iterations. + + tol : float + Stopping condition. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + multi_class : {'ovr', 'crammer_singer'}, default='ovr' + `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer` + optimizes a joint objective over all classes. + While `crammer_singer` is interesting from an theoretical perspective + as it is consistent it is seldom used in practice and rarely leads to + better accuracy and is more expensive to compute. + If `crammer_singer` is chosen, the options loss, penalty and dual will + be ignored. + + loss : {'logistic_regression', 'hinge', 'squared_hinge', \ + 'epsilon_insensitive', 'squared_epsilon_insensitive}, \ + default='logistic_regression' + The loss function used to fit the model. + + epsilon : float, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. Note + that the value of this parameter depends on the scale of the target + variable y. If unsure, set epsilon=0. + + sample_weight : array-like of shape (n_samples,), default=None + Weights assigned to each sample. + + Returns + ------- + coef_ : ndarray of shape (n_features, n_features + 1) + The coefficient vector got by minimizing the objective function. + + intercept_ : float + The intercept term added to the vector. + + n_iter_ : array of int + Number of iterations run across for each class. + """ + if loss not in ["epsilon_insensitive", "squared_epsilon_insensitive"]: + enc = LabelEncoder() + y_ind = enc.fit_transform(y) + classes_ = enc.classes_ + if len(classes_) < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: %r" % classes_[0] + ) + class_weight_ = compute_class_weight( + class_weight, classes=classes_, y=y, sample_weight=sample_weight + ) + else: + class_weight_ = np.empty(0, dtype=np.float64) + y_ind = y + liblinear.set_verbosity_wrap(verbose) + rnd = check_random_state(random_state) + if verbose: + print("[LibLinear]", end="") + + # LinearSVC breaks when intercept_scaling is <= 0 + bias = -1.0 + if fit_intercept: + if intercept_scaling <= 0: + raise ValueError( + "Intercept scaling is %r but needs to be greater " + "than 0. To disable fitting an intercept," + " set fit_intercept=False." 
% intercept_scaling + ) + else: + bias = intercept_scaling + + libsvm.set_verbosity_wrap(verbose) + libsvm_sparse.set_verbosity_wrap(verbose) + liblinear.set_verbosity_wrap(verbose) + + # Liblinear doesn't support 64bit sparse matrix indices yet + if sp.issparse(X): + _check_large_sparse(X) + + # LibLinear wants targets as doubles, even for classification + y_ind = np.asarray(y_ind, dtype=np.float64).ravel() + y_ind = np.require(y_ind, requirements="W") + + sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64) + + solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual) + raw_coef_, n_iter_ = liblinear.train_wrap( + X, + y_ind, + sp.issparse(X), + solver_type, + tol, + bias, + C, + class_weight_, + max_iter, + rnd.randint(np.iinfo("i").max), + epsilon, + sample_weight, + ) + # Regarding rnd.randint(..) in the above signature: + # seed for srand in range [0..INT_MAX); due to limitations in Numpy + # on 32-bit platforms, we can't get to the UINT_MAX limit that + # srand supports + n_iter_max = max(n_iter_) + if n_iter_max >= max_iter: + warnings.warn( + "Liblinear failed to converge, increase the number of iterations.", + ConvergenceWarning, + ) + + if fit_intercept: + coef_ = raw_coef_[:, :-1] + intercept_ = intercept_scaling * raw_coef_[:, -1] + else: + coef_ = raw_coef_ + intercept_ = 0.0 + + return coef_, intercept_, n_iter_ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_bounds.py b/.venv/lib/python3.12/site-packages/sklearn/svm/_bounds.py new file mode 100644 index 0000000000000000000000000000000000000000..44923cb12976776507a9dc02502424832158391c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_bounds.py @@ -0,0 +1,98 @@ +"""Determination of parameter bounds""" + +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Real + +import numpy as np + +from ..preprocessing import LabelBinarizer +from ..utils._param_validation import Interval, StrOptions, validate_params +from ..utils.extmath import safe_sparse_dot +from ..utils.validation import check_array, check_consistent_length + + +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "y": ["array-like"], + "loss": [StrOptions({"squared_hinge", "log"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + }, + prefer_skip_nested_validation=True, +) +def l1_min_c(X, y, *, loss="squared_hinge", fit_intercept=True, intercept_scaling=1.0): + """Return the lowest bound for `C`. + + The lower bound for `C` is computed such that for `C` in `(l1_min_C, infinity)` + the model is guaranteed not to be empty. This applies to l1 penalized + classifiers, such as :class:`sklearn.svm.LinearSVC` with penalty='l1' and + :class:`sklearn.linear_model.LogisticRegression` with penalty='l1'. + + This value is valid if `class_weight` parameter in `fit()` is not set. + + For an example of how to use this function, see + :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + loss : {'squared_hinge', 'log'}, default='squared_hinge' + Specifies the loss function. + With 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss). + With 'log' it is the loss of logistic regression models. 
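A sketch (not part of the patch, assuming scikit-learn >= 1.3 for dual="auto") of the convergence check right after `liblinear.train_wrap` above: exhausting the iteration budget emits a ConvergenceWarning.

import warnings
from sklearn.datasets import make_classification
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, random_state=0)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    LinearSVC(max_iter=1, dual="auto").fit(X, y)   # deliberately far too few iterations
print(any(issubclass(w.category, ConvergenceWarning) for w in caught))  # True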
+ + fit_intercept : bool, default=True + Specifies if the intercept should be fitted by the model. + It must match the fit() method parameter. + + intercept_scaling : float, default=1.0 + When fit_intercept is True, instance vector x becomes + [x, intercept_scaling], + i.e. a "synthetic" feature with constant value equals to + intercept_scaling is appended to the instance vector. + It must match the fit() method parameter. + + Returns + ------- + l1_min_c : float + Minimum value for C. + + Examples + -------- + >>> from sklearn.svm import l1_min_c + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_samples=100, n_features=20, random_state=42) + >>> print(f"{l1_min_c(X, y, loss='squared_hinge', fit_intercept=True):.4f}") + 0.0044 + """ + + X = check_array(X, accept_sparse="csc") + check_consistent_length(X, y) + + Y = LabelBinarizer(neg_label=-1).fit_transform(y).T + # maximum absolute value over classes and features + den = np.max(np.abs(safe_sparse_dot(Y, X))) + if fit_intercept: + bias = np.full( + (np.size(y), 1), intercept_scaling, dtype=np.array(intercept_scaling).dtype + ) + den = max(den, abs(np.dot(Y, bias)).max()) + + if den == 0.0: + raise ValueError( + "Ill-posed l1_min_c calculation: l1 will always " + "select zero coefficients for this data" + ) + if loss == "squared_hinge": + return 0.5 / den + else: # loss == 'log': + return 2.0 / den diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_classes.py b/.venv/lib/python3.12/site-packages/sklearn/svm/_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..277da42893eaff6737f32fea006e719a2f00e4d0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_classes.py @@ -0,0 +1,1789 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +from numbers import Integral, Real + +import numpy as np + +from ..base import BaseEstimator, OutlierMixin, RegressorMixin, _fit_context +from ..linear_model._base import LinearClassifierMixin, LinearModel, SparseCoefMixin +from ..utils._param_validation import Interval, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import _num_samples, validate_data +from ._base import BaseLibSVM, BaseSVC, _fit_liblinear, _get_liblinear_solver_type + + +def _validate_dual_parameter(dual, loss, penalty, multi_class, X): + """Helper function to assign the value of dual parameter.""" + if dual == "auto": + if X.shape[0] < X.shape[1]: + try: + _get_liblinear_solver_type(multi_class, penalty, loss, True) + return True + except ValueError: # dual not supported for the combination + return False + else: + try: + _get_liblinear_solver_type(multi_class, penalty, loss, False) + return False + except ValueError: # primal not supported by the combination + return True + else: + return dual + + +class LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): + """Linear Support Vector Classification. + + Similar to SVC with parameter kernel='linear', but implemented in terms of + liblinear rather than libsvm, so it has more flexibility in the choice of + penalties and loss functions and should scale better to large numbers of + samples. + + The main differences between :class:`~sklearn.svm.LinearSVC` and + :class:`~sklearn.svm.SVC` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. 
+ + This class supports both dense and sparse input and the multiclass support + is handled according to a one-vs-the-rest scheme. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + penalty : {'l1', 'l2'}, default='l2' + Specifies the norm used in the penalization. The 'l2' + penalty is the standard used in SVC. The 'l1' leads to ``coef_`` + vectors that are sparse. + + loss : {'hinge', 'squared_hinge'}, default='squared_hinge' + Specifies the loss function. 'hinge' is the standard SVM loss + (used e.g. by the SVC class) while 'squared_hinge' is the + square of the hinge loss. The combination of ``penalty='l1'`` + and ``loss='hinge'`` is not supported. + + dual : "auto" or bool, default="auto" + Select the algorithm to either solve the dual or primal + optimization problem. Prefer dual=False when n_samples > n_features. + `dual="auto"` will choose the value of the parameter automatically, + based on the values of `n_samples`, `n_features`, `loss`, `multi_class` + and `penalty`. If `n_samples` < `n_features` and optimizer supports + chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, + otherwise it will be set to False. + + .. versionchanged:: 1.3 + The `"auto"` option is added in version 1.3 and will be the default + in version 1.5. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + For an intuitive visualization of the effects of scaling + the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + multi_class : {'ovr', 'crammer_singer'}, default='ovr' + Determines the multi-class strategy if `y` contains more than + two classes. + ``"ovr"`` trains n_classes one-vs-rest classifiers, while + ``"crammer_singer"`` optimizes a joint objective over all classes. + While `crammer_singer` is interesting from a theoretical perspective + as it is consistent, it is seldom used in practice as it rarely leads + to better accuracy and is more expensive to compute. + If ``"crammer_singer"`` is chosen, the options loss, penalty and dual + will be ignored. + + fit_intercept : bool, default=True + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float, default=1.0 + When `fit_intercept` is True, the instance vector x becomes ``[x_1, + ..., x_n, intercept_scaling]``, i.e. a "synthetic" feature with a + constant value equal to `intercept_scaling` is appended to the instance + vector. The intercept becomes intercept_scaling * synthetic feature + weight. Note that liblinear internally penalizes the intercept, + treating it like any other term in the feature vector. To reduce the + impact of the regularization on the intercept, the `intercept_scaling` + parameter can be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. 
+ + class_weight : dict or 'balanced', default=None + Set the parameter C of class i to ``class_weight[i]*C`` for + SVC. If not given, all classes are supposed to have + weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + verbose : int, default=0 + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in liblinear that, if enabled, may not work + properly in a multithreaded context. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + the dual coordinate descent (if ``dual=True``). When ``dual=False`` the + underlying implementation of :class:`LinearSVC` is not random and + ``random_state`` has no effect on the results. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_iter : int, default=1000 + The maximum number of iterations to be run. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 \ + else (n_classes, n_features) + Weights assigned to the features (coefficients in the primal + problem). + + ``coef_`` is a readonly property derived from ``raw_coef_`` that + follows the internal memory layout of liblinear. + + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Maximum number of iterations run across all classes. + + See Also + -------- + SVC : Implementation of Support Vector Machine classifier using libsvm: + the kernel can be non-linear but its SMO algorithm does not + scale to large number of samples as LinearSVC does. + + Furthermore SVC multi-class mode is implemented using one + vs one scheme while LinearSVC uses one vs the rest. It is + possible to implement one vs the rest with SVC by using the + :class:`~sklearn.multiclass.OneVsRestClassifier` wrapper. + + Finally SVC can fit dense data without memory copy if the input + is C-contiguous. Sparse data will still incur memory copy though. + + sklearn.linear_model.SGDClassifier : SGDClassifier can optimize the same + cost function as LinearSVC + by adjusting the penalty and loss parameters. In addition it requires + less memory, allows incremental (online) learning, and implements + various loss functions and regularization regimes. + + Notes + ----- + The underlying C implementation uses a random number generator to + select features when fitting the model. It is thus not uncommon + to have slightly different results for the same input data. If + that happens, try with a smaller ``tol`` parameter. + + The underlying implementation, liblinear, uses a sparse internal + representation for the data that will incur a memory copy. + + Predict output may not match that of standalone liblinear in certain + cases. See :ref:`differences from liblinear ` + in the narrative documentation. 
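A sketch (not part of the patch) of the multiclass contrast called out in the See Also section above: `LinearSVC` fits one-vs-rest weight vectors, while `SVC(kernel="linear")` fits one-vs-one pairs, which is directly visible in the fitted shapes.

from sklearn.datasets import make_classification
from sklearn.svm import SVC, LinearSVC

X, y = make_classification(n_samples=200, n_features=8, n_informative=6,
                           n_classes=4, random_state=0)
print(LinearSVC(dual="auto").fit(X, y).coef_.shape)  # (4, 8): one row per class (OvR)
print(SVC(kernel="linear").fit(X, y).coef_.shape)    # (6, 8): one row per class pair (OvO)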
+ + References + ---------- + `LIBLINEAR: A Library for Large Linear Classification + `__ + + Examples + -------- + >>> from sklearn.svm import LinearSVC + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.datasets import make_classification + >>> X, y = make_classification(n_features=4, random_state=0) + >>> clf = make_pipeline(StandardScaler(), + ... LinearSVC(random_state=0, tol=1e-5)) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvc', LinearSVC(random_state=0, tol=1e-05))]) + + >>> print(clf.named_steps['linearsvc'].coef_) + [[0.141 0.526 0.679 0.493]] + + >>> print(clf.named_steps['linearsvc'].intercept_) + [0.1693] + >>> print(clf.predict([[0, 0, 0, 0]])) + [1] + """ + + _parameter_constraints: dict = { + "penalty": [StrOptions({"l1", "l2"})], + "loss": [StrOptions({"hinge", "squared_hinge"})], + "dual": ["boolean", StrOptions({"auto"})], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="neither")], + "multi_class": [StrOptions({"ovr", "crammer_singer"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "class_weight": [None, dict, StrOptions({"balanced"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + penalty="l2", + loss="squared_hinge", + *, + dual="auto", + tol=1e-4, + C=1.0, + multi_class="ovr", + fit_intercept=True, + intercept_scaling=1, + class_weight=None, + verbose=0, + random_state=None, + max_iter=1000, + ): + self.dual = dual + self.tol = tol + self.C = C + self.multi_class = multi_class + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.class_weight = class_weight + self.verbose = verbose + self.random_state = random_state + self.max_iter = max_iter + self.penalty = penalty + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,), default=None + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + + .. versionadded:: 0.18 + + Returns + ------- + self : object + An instance of the estimator. 
+ """ + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + ) + check_classification_targets(y) + self.classes_ = np.unique(y) + + _dual = _validate_dual_parameter( + self.dual, self.loss, self.penalty, self.multi_class, X + ) + + self.coef_, self.intercept_, n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + self.class_weight, + self.penalty, + _dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + self.multi_class, + self.loss, + sample_weight=sample_weight, + ) + # Backward compatibility: _fit_liblinear is used both by LinearSVC/R + # and LogisticRegression but LogisticRegression sets a structured + # `n_iter_` attribute with information about the underlying OvR fits + # while LinearSVC/R only reports the maximum value. + self.n_iter_ = n_iter_.max().item() + + if self.multi_class == "crammer_singer" and len(self.classes_) == 2: + self.coef_ = (self.coef_[1] - self.coef_[0]).reshape(1, -1) + if self.fit_intercept: + intercept = self.intercept_[1] - self.intercept_[0] + self.intercept_ = np.array([intercept]) + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class LinearSVR(RegressorMixin, LinearModel): + """Linear Support Vector Regression. + + Similar to SVR with parameter kernel='linear', but implemented in terms of + liblinear rather than libsvm, so it has more flexibility in the choice of + penalties and loss functions and should scale better to large numbers of + samples. + + The main differences between :class:`~sklearn.svm.LinearSVR` and + :class:`~sklearn.svm.SVR` lie in the loss function used by default, and in + the handling of intercept regularization between those two implementations. + + This class supports both dense and sparse input. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.16 + + Parameters + ---------- + epsilon : float, default=0.0 + Epsilon parameter in the epsilon-insensitive loss function. Note + that the value of this parameter depends on the scale of the target + variable y. If unsure, set ``epsilon=0``. + + tol : float, default=1e-4 + Tolerance for stopping criteria. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + + loss : {'epsilon_insensitive', 'squared_epsilon_insensitive'}, \ + default='epsilon_insensitive' + Specifies the loss function. The epsilon-insensitive loss + (standard SVR) is the L1 loss, while the squared epsilon-insensitive + loss ('squared_epsilon_insensitive') is the L2 loss. + + fit_intercept : bool, default=True + Whether or not to fit an intercept. If set to True, the feature vector + is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where + 1 corresponds to the intercept. If set to False, no intercept will be + used in calculations (i.e. data is expected to be already centered). + + intercept_scaling : float, default=1.0 + When `fit_intercept` is True, the instance vector x becomes `[x_1, ..., + x_n, intercept_scaling]`, i.e. a "synthetic" feature with a constant + value equal to `intercept_scaling` is appended to the instance vector. + The intercept becomes intercept_scaling * synthetic feature weight. + Note that liblinear internally penalizes the intercept, treating it + like any other term in the feature vector. 
To reduce the impact of the + regularization on the intercept, the `intercept_scaling` parameter can + be set to a value greater than 1; the higher the value of + `intercept_scaling`, the lower the impact of regularization on it. + Then, the weights become `[w_x_1, ..., w_x_n, + w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent + the feature weights and the intercept weight is scaled by + `intercept_scaling`. This scaling allows the intercept term to have a + different regularization behavior compared to the other features. + + dual : "auto" or bool, default="auto" + Select the algorithm to either solve the dual or primal + optimization problem. Prefer dual=False when n_samples > n_features. + `dual="auto"` will choose the value of the parameter automatically, + based on the values of `n_samples`, `n_features` and `loss`. If + `n_samples` < `n_features` and optimizer supports chosen `loss`, + then dual will be set to True, otherwise it will be set to False. + + .. versionchanged:: 1.3 + The `"auto"` option is added in version 1.3 and will be the default + in version 1.5. + + verbose : int, default=0 + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in liblinear that, if enabled, may not work + properly in a multithreaded context. + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + max_iter : int, default=1000 + The maximum number of iterations to be run. + + Attributes + ---------- + coef_ : ndarray of shape (n_features) if n_classes == 2 \ + else (n_classes, n_features) + Weights assigned to the features (coefficients in the primal + problem). + + `coef_` is a readonly property derived from `raw_coef_` that + follows the internal memory layout of liblinear. + + intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Maximum number of iterations run across all classes. + + See Also + -------- + LinearSVC : Implementation of Support Vector Machine classifier using the + same library as this class (liblinear). + + SVR : Implementation of Support Vector Machine regression using libsvm: + the kernel can be non-linear but its SMO algorithm does not scale to + large number of samples as :class:`~sklearn.svm.LinearSVR` does. + + sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost + function as LinearSVR + by adjusting the penalty and loss parameters. In addition it requires + less memory, allows incremental (online) learning, and implements + various loss functions and regularization regimes. + + Examples + -------- + >>> from sklearn.svm import LinearSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.datasets import make_regression + >>> X, y = make_regression(n_features=4, random_state=0) + >>> regr = make_pipeline(StandardScaler(), + ... 
LinearSVR(random_state=0, tol=1e-5)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvr', LinearSVR(random_state=0, tol=1e-05))]) + + >>> print(regr.named_steps['linearsvr'].coef_) + [18.582 27.023 44.357 64.522] + >>> print(regr.named_steps['linearsvr'].intercept_) + [-4.] + >>> print(regr.predict([[0, 0, 0, 0]])) + [-2.384] + """ + + _parameter_constraints: dict = { + "epsilon": [Real], + "tol": [Interval(Real, 0.0, None, closed="neither")], + "C": [Interval(Real, 0.0, None, closed="neither")], + "loss": [StrOptions({"epsilon_insensitive", "squared_epsilon_insensitive"})], + "fit_intercept": ["boolean"], + "intercept_scaling": [Interval(Real, 0, None, closed="neither")], + "dual": ["boolean", StrOptions({"auto"})], + "verbose": ["verbose"], + "random_state": ["random_state"], + "max_iter": [Interval(Integral, 0, None, closed="left")], + } + + def __init__( + self, + *, + epsilon=0.0, + tol=1e-4, + C=1.0, + loss="epsilon_insensitive", + fit_intercept=True, + intercept_scaling=1.0, + dual="auto", + verbose=0, + random_state=None, + max_iter=1000, + ): + self.tol = tol + self.C = C + self.epsilon = epsilon + self.fit_intercept = fit_intercept + self.intercept_scaling = intercept_scaling + self.verbose = verbose + self.random_state = random_state + self.max_iter = max_iter + self.dual = dual + self.loss = loss + + @_fit_context(prefer_skip_nested_validation=True) + def fit(self, X, y, sample_weight=None): + """Fit the model according to the given training data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training vector, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) + Target vector relative to X. + + sample_weight : array-like of shape (n_samples,), default=None + Array of weights that are assigned to individual + samples. If not provided, + then each sample is given unit weight. + + .. versionadded:: 0.18 + + Returns + ------- + self : object + An instance of the estimator. + """ + X, y = validate_data( + self, + X, + y, + accept_sparse="csr", + dtype=np.float64, + order="C", + accept_large_sparse=False, + ) + penalty = "l2" # SVR only accepts l2 penalty + + _dual = _validate_dual_parameter(self.dual, self.loss, penalty, "ovr", X) + + self.coef_, self.intercept_, n_iter_ = _fit_liblinear( + X, + y, + self.C, + self.fit_intercept, + self.intercept_scaling, + None, + penalty, + _dual, + self.verbose, + self.max_iter, + self.tol, + self.random_state, + loss=self.loss, + epsilon=self.epsilon, + sample_weight=sample_weight, + ) + self.coef_ = self.coef_.ravel() + # Backward compatibility: _fit_liblinear is used both by LinearSVC/R + # and LogisticRegression but LogisticRegression sets a structured + # `n_iter_` attribute with information about the underlying OvR fits + # while LinearSVC/R only reports the maximum value. + self.n_iter_ = n_iter_.max().item() + + return self + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + + +class SVC(BaseSVC): + """C-Support Vector Classification. + + The implementation is based on libsvm. The fit time scales at least + quadratically with the number of samples and may be impractical + beyond tens of thousands of samples. 
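# Editor's note -- illustrative sketch, not library code: the `dual="auto"` rule
# documented for LinearSVC/LinearSVR above is resolved internally by
# `_validate_dual_parameter`, whose implementation is not part of this hunk.
# Based only on the documented behaviour ("If n_samples < n_features and optimizer
# supports chosen loss, then dual will be set to True"), a rough stand-in could
# look like this; the names `_choose_dual_sketch` and `DUAL_ONLY_LOSSES` are
# hypothetical.
DUAL_ONLY_LOSSES = {"hinge", "epsilon_insensitive"}  # assumption: losses liblinear only solves in the dual

def _choose_dual_sketch(n_samples, n_features, loss):
    """Approximate the documented dual='auto' decision (illustrative only)."""
    if loss in DUAL_ONLY_LOSSES:
        return True
    return n_samples < n_features

# e.g. _choose_dual_sketch(100_000, 20, "squared_epsilon_insensitive") -> False (prefer the primal)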
For large datasets + consider using :class:`~sklearn.svm.LinearSVC` or + :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a + :class:`~sklearn.kernel_approximation.Nystroem` transformer or + other :ref:`kernel_approximation`. + + The multiclass support is handled according to a one-vs-one scheme. + + For details on the precise mathematical formulation of the provided + kernel functions and how `gamma`, `coef0` and `degree` affect each + other, see the corresponding section in the narrative documentation: + :ref:`svm_kernels`. + + To learn how to tune SVC's hyperparameters, see the following example: + :ref:`sphx_glr_auto_examples_model_selection_plot_nested_cross_validation_iris.py` + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. The penalty + is a squared l2 penalty. For an intuitive visualization of the effects + of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. If + none is given, 'rbf' will be used. If a callable is given it is used to + pre-compute the kernel matrix from data matrices; that matrix should be + an array of shape ``(n_samples, n_samples)``. For an intuitive + visualization of different kernel types see + :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + probability : bool, default=False + Whether to enable probability estimates. This must be enabled prior + to calling `fit`, will slow down that method as it internally uses + 5-fold cross-validation, and `predict_proba` may be inconsistent with + `predict`. Read more in the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + class_weight : dict or 'balanced', default=None + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. 
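# Editor's note -- worked example for the "balanced" class_weight mode documented
# above: the per-class multipliers of C follow n_samples / (n_classes * np.bincount(y)).
# Plain numpy reproduces the numbers (sklearn.utils.class_weight.compute_class_weight
# gives the same values for integer-encoded labels).
import numpy as np

y = np.array([0, 0, 0, 0, 1, 1])              # 4 samples of class 0, 2 of class 1
weights = y.shape[0] / (2 * np.bincount(y))    # n_samples / (n_classes * counts)
print(weights)                                 # [0.75 1.5 ] -> the minority class gets the larger C multiplier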
+ + decision_function_shape : {'ovo', 'ovr'}, default='ovr' + Whether to return a one-vs-rest ('ovr') decision function of shape + (n_samples, n_classes) as all other classifiers, or the original + one-vs-one ('ovo') decision function of libsvm which has shape + (n_samples, n_classes * (n_classes - 1) / 2). However, note that + internally, one-vs-one ('ovo') is always used as a multi-class strategy + to train models; an ovr matrix is only constructed from the ovo matrix. + The parameter is ignored for binary classification. + + .. versionchanged:: 0.19 + decision_function_shape is 'ovr' by default. + + .. versionadded:: 0.17 + *decision_function_shape='ovr'* is recommended. + + .. versionchanged:: 0.17 + Deprecated *decision_function_shape='ovo' and None*. + + break_ties : bool, default=False + If true, ``decision_function_shape='ovr'``, and number of classes > 2, + :term:`predict` will break ties according to the confidence values of + :term:`decision_function`; otherwise the first class among the tied + classes is returned. Please note that breaking ties comes at a + relatively high computational cost compared to a simple predict. See + :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an + example of its usage with ``decision_function_shape='ovr'``. + + .. versionadded:: 0.22 + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when `probability` is False. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + class_weight_ : ndarray of shape (n_classes,) + Multipliers of parameter C for each class. + Computed based on the ``class_weight`` parameter. + + classes_ : ndarray of shape (n_classes,) + The classes labels. + + coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is a readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (n_classes -1, n_SV) + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. + For multiclass, coefficient for all 1-vs-1 classifiers. + The layout of the coefficients in the multiclass case is somewhat + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes * (n_classes - 1) // 2,) + Number of iterations run by the optimization routine to fit the model. + The shape of this attribute depends on the number of models optimized + which in turn depends on the number of classes. + + .. versionadded:: 1.1 + + support_ : ndarray of shape (n_SV) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. An empty array if kernel is precomputed. 
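# Editor's note -- small check of the shapes documented for decision_function_shape
# above: the one-vs-one ('ovo') decision function has n_classes * (n_classes - 1) / 2
# columns, while 'ovr' has n_classes columns. A 4-class toy problem makes the
# difference visible (6 vs 4 columns).
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.randn(40, 2)
y = np.repeat([0, 1, 2, 3], 10)

print(SVC(decision_function_shape="ovr").fit(X, y).decision_function(X[:2]).shape)  # (2, 4)
print(SVC(decision_function_shape="ovo").fit(X, y).decision_function(X[:2]).shape)  # (2, 6)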
+ + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2) + probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2) + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function + ``1 / (1 + exp(decision_value * probA_ + probB_))`` + where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For + more information on the multiclass case and training procedure see + section 8 of [1]_. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + See Also + -------- + SVR : Support Vector Machine for Regression implemented using libsvm. + + LinearSVC : Scalable Linear Support Vector Machine for classification + implemented using liblinear. Check the See Also section of + LinearSVC for more comparison element. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). "Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> import numpy as np + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.svm import SVC + >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto')) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svc', SVC(gamma='auto'))]) + + >>> print(clf.predict([[-0.8, -1]])) + [1] + + For a comparison of the SVC with other classifiers see: + :ref:`sphx_glr_auto_examples_classification_plot_classification_probability.py`. + """ + + _impl = "c_svc" + + def __init__( + self, + *, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) + + +class NuSVC(BaseSVC): + """Nu-Support Vector Classification. + + Similar to SVC but uses a parameter to control the number of support + vectors. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + nu : float, default=0.5 + An upper bound on the fraction of margin errors (see :ref:`User Guide + `) and a lower bound of the fraction of support vectors. + Should be in the interval (0, 1]. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. For an intuitive + visualization of different kernel types see + :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. 
Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + probability : bool, default=False + Whether to enable probability estimates. This must be enabled prior + to calling `fit`, will slow down that method as it internally uses + 5-fold cross-validation, and `predict_proba` may be inconsistent with + `predict`. Read more in the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + class_weight : {dict, 'balanced'}, default=None + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. The "balanced" mode uses the values of y to automatically + adjust weights inversely proportional to class frequencies as + ``n_samples / (n_classes * np.bincount(y))``. + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + decision_function_shape : {'ovo', 'ovr'}, default='ovr' + Whether to return a one-vs-rest ('ovr') decision function of shape + (n_samples, n_classes) as all other classifiers, or the original + one-vs-one ('ovo') decision function of libsvm which has shape + (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one + ('ovo') is always used as multi-class strategy. The parameter is + ignored for binary classification. + + .. versionchanged:: 0.19 + decision_function_shape is 'ovr' by default. + + .. versionadded:: 0.17 + *decision_function_shape='ovr'* is recommended. + + .. versionchanged:: 0.17 + Deprecated *decision_function_shape='ovo' and None*. + + break_ties : bool, default=False + If true, ``decision_function_shape='ovr'``, and number of classes > 2, + :term:`predict` will break ties according to the confidence values of + :term:`decision_function`; otherwise the first class among the tied + classes is returned. Please note that breaking ties comes at a + relatively high computational cost compared to a simple predict. + See :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an + example of its usage with ``decision_function_shape='ovr'``. + + .. versionadded:: 0.22 + + random_state : int, RandomState instance or None, default=None + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when `probability` is False. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + + Attributes + ---------- + class_weight_ : ndarray of shape (n_classes,) + Multipliers of parameter C of each class. + Computed based on the ``class_weight`` parameter. + + classes_ : ndarray of shape (n_classes,) + The unique classes labels. 
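# Editor's note -- illustrative check of the bound documented for `nu` above
# (an upper bound on the fraction of margin errors and a lower bound on the
# fraction of support vectors): on a toy two-blob problem the total number of
# support vectors should be at least about nu * n_samples.
import numpy as np
from sklearn.svm import NuSVC

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(50, 2) - 2, rng.randn(50, 2) + 2])
y = np.repeat([0, 1], 50)

clf = NuSVC(nu=0.3).fit(X, y)
print(clf.n_support_.sum() / X.shape[0])   # expected to be >= ~0.3, up to solver tolerance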
+ + coef_ : ndarray of shape (n_classes * (n_classes -1) / 2, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (n_classes - 1, n_SV) + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. + For multiclass, coefficient for all 1-vs-1 classifiers. + The layout of the coefficients in the multiclass case is somewhat + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. + + fit_status_ : int + 0 if correctly fitted, 1 if the algorithm did not converge. + + intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : ndarray of shape (n_classes * (n_classes - 1) // 2,) + Number of iterations run by the optimization routine to fit the model. + The shape of this attribute depends on the number of models optimized + which in turn depends on the number of classes. + + .. versionadded:: 1.1 + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + fit_status_ : int + 0 if correctly fitted, 1 if the algorithm did not converge. + + probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + + probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2,) + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function + ``1 / (1 + exp(decision_value * probA_ + probB_))`` + where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For + more information on the multiclass case and training procedure see + section 8 of [1]_. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + See Also + -------- + SVC : Support Vector Machine for classification using libsvm. + + LinearSVC : Scalable linear Support Vector Machine for classification using + liblinear. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). 
"Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.svm import NuSVC + >>> clf = make_pipeline(StandardScaler(), NuSVC()) + >>> clf.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())]) + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + + _impl = "nu_svc" + + _parameter_constraints: dict = { + **BaseSVC._parameter_constraints, + "nu": [Interval(Real, 0.0, 1.0, closed="right")], + } + _parameter_constraints.pop("C") + + def __init__( + self, + *, + nu=0.5, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=0.0, + nu=nu, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) + + +class SVR(RegressorMixin, BaseLibSVM): + """Epsilon-Support Vector Regression. + + The free parameters in the model are C and epsilon. + + The implementation is based on libsvm. The fit time complexity + is more than quadratic with the number of samples which makes it hard + to scale to datasets with more than a couple of 10000 samples. For large + datasets consider using :class:`~sklearn.svm.LinearSVR` or + :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a + :class:`~sklearn.kernel_approximation.Nystroem` transformer or + other :ref:`kernel_approximation`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + For an intuitive visualization of different kernel types + see :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + C : float, default=1.0 + Regularization parameter. The strength of the regularization is + inversely proportional to C. Must be strictly positive. + The penalty is a squared l2. For an intuitive visualization of the + effects of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. 
+ + epsilon : float, default=0.1 + Epsilon in the epsilon-SVR model. It specifies the epsilon-tube + within which no penalty is associated in the training loss function + with points predicted within a distance epsilon from the actual + value. Must be non-negative. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vector in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (1,), dtype=int32 + Number of support vectors. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + NuSVR : Support Vector Machine for regression implemented using libsvm + using a parameter to control the number of support vectors. + + LinearSVR : Scalable Linear Support Vector Machine for regression + implemented using liblinear. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). 
"Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> from sklearn.svm import SVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> rng = np.random.RandomState(0) + >>> y = rng.randn(n_samples) + >>> X = rng.randn(n_samples, n_features) + >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svr', SVR(epsilon=0.2))]) + """ + + _impl = "epsilon_svr" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["class_weight", "nu", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + C=1.0, + epsilon=0.1, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + epsilon=epsilon, + verbose=verbose, + shrinking=shrinking, + probability=False, + cache_size=cache_size, + class_weight=None, + max_iter=max_iter, + random_state=None, + ) + + +class NuSVR(RegressorMixin, BaseLibSVM): + """Nu Support Vector Regression. + + Similar to NuSVC, for regression, uses a parameter nu to control + the number of support vectors. However, unlike NuSVC, where nu + replaces C, here nu replaces the parameter epsilon of epsilon-SVR. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + nu : float, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. By + default 0.5 will be taken. + + C : float, default=1.0 + Penalty parameter C of the error term. For an intuitive visualization + of the effects of scaling the regularization parameter C, see + :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`. + + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + For an intuitive visualization of different kernel types see + See :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. 
Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vector in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constants in decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (1,), dtype=int32 + Number of support vectors. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + NuSVC : Support Vector Machine for classification implemented with libsvm + with a parameter to control the number of support vectors. + + SVR : Epsilon Support Vector Machine for regression implemented with + libsvm. + + References + ---------- + .. [1] `LIBSVM: A Library for Support Vector Machines + `_ + + .. [2] `Platt, John (1999). "Probabilistic Outputs for Support Vector + Machines and Comparisons to Regularized Likelihood Methods" + `_ + + Examples + -------- + >>> from sklearn.svm import NuSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> import numpy as np + >>> n_samples, n_features = 10, 5 + >>> np.random.seed(0) + >>> y = np.random.randn(n_samples) + >>> X = np.random.randn(n_samples, n_features) + >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1)) + >>> regr.fit(X, y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('nusvr', NuSVR(nu=0.1))]) + """ + + _impl = "nu_svr" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["class_weight", "epsilon", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + nu=0.5, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + tol=1e-3, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + epsilon=0.0, + shrinking=shrinking, + probability=False, + cache_size=cache_size, + class_weight=None, + verbose=verbose, + max_iter=max_iter, + random_state=None, + ) + + +class OneClassSVM(OutlierMixin, BaseLibSVM): + """Unsupervised Outlier Detection. + + Estimate the support of a high-dimensional distribution. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. 
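# Editor's note -- illustrative sketch of the NuSVR behaviour described above:
# nu replaces SVR's epsilon and acts as a lower bound on the fraction of support
# vectors, so raising nu should not shrink the support set on a fixed dataset.
import numpy as np
from sklearn.svm import NuSVR

rng = np.random.RandomState(0)
X = rng.randn(200, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(200)

for nu in (0.1, 0.5, 0.9):
    print(nu, NuSVR(nu=nu).fit(X, y).support_.shape[0])   # number of support vectors grows with nu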
+ + Parameters + ---------- + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \ + default='rbf' + Specifies the kernel type to be used in the algorithm. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + + degree : int, default=3 + Degree of the polynomial kernel function ('poly'). + Must be non-negative. Ignored by all other kernels. + + gamma : {'scale', 'auto'} or float, default='scale' + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features + - if float, must be non-negative. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. + + coef0 : float, default=0.0 + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, default=1e-3 + Tolerance for stopping criterion. + + nu : float, default=0.5 + An upper bound on the fraction of training + errors and a lower bound of the fraction of support + vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + shrinking : bool, default=True + Whether to use the shrinking heuristic. + See the :ref:`User Guide `. + + cache_size : float, default=200 + Specify the size of the kernel cache (in MB). + + verbose : bool, default=False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, default=-1 + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_`. + + dual_coef_ : ndarray of shape (1, n_SV) + Coefficients of the support vectors in the decision function. + + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + + intercept_ : ndarray of shape (1,) + Constant in the decision function. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + .. versionadded:: 1.0 + + n_iter_ : int + Number of iterations run by the optimization routine to fit the model. + + .. versionadded:: 1.1 + + n_support_ : ndarray of shape (n_classes,), dtype=int32 + Number of support vectors for each class. + + offset_ : float + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - `offset_`. + The offset is the opposite of `intercept_` and is provided for + consistency with other outlier detection algorithms. + + .. versionadded:: 0.20 + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + + support_ : ndarray of shape (n_SV,) + Indices of support vectors. + + support_vectors_ : ndarray of shape (n_SV, n_features) + Support vectors. + + See Also + -------- + sklearn.linear_model.SGDOneClassSVM : Solves linear One-Class SVM using + Stochastic Gradient Descent. + sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection using + Local Outlier Factor (LOF). 
+ sklearn.ensemble.IsolationForest : Isolation Forest Algorithm. + + Examples + -------- + >>> from sklearn.svm import OneClassSVM + >>> X = [[0], [0.44], [0.45], [0.46], [1]] + >>> clf = OneClassSVM(gamma='auto').fit(X) + >>> clf.predict(X) + array([-1, 1, 1, 1, -1]) + >>> clf.score_samples(X) + array([1.7798, 2.0547, 2.0556, 2.0561, 1.7332]) + + For a more extended example, + see :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py` + """ + + _impl = "one_class" + + _parameter_constraints: dict = {**BaseLibSVM._parameter_constraints} + for unused_param in ["C", "class_weight", "epsilon", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + nu=0.5, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): + super().__init__( + kernel, + degree, + gamma, + coef0, + tol, + 0.0, + nu, + 0.0, + shrinking, + False, + cache_size, + None, + verbose, + max_iter, + random_state=None, + ) + + def fit(self, X, y=None, sample_weight=None): + """Detect the soft boundary of the set of samples X. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Set of samples, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : Ignored + Not used, present for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + Per-sample weights. Rescale C per sample. Higher weights + force the classifier to put more emphasis on these points. + + Returns + ------- + self : object + Fitted estimator. + + Notes + ----- + If X is not a C-ordered contiguous array it is copied. + """ + super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight) + self.offset_ = -self._intercept_ + return self + + def decision_function(self, X): + """Signed distance to the separating hyperplane. + + Signed distance is positive for an inlier and negative for an outlier. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + dec : ndarray of shape (n_samples,) + Returns the decision function of the samples. + """ + dec = self._decision_function(X).ravel() + return dec + + def score_samples(self, X): + """Raw scoring function of the samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + + Returns + ------- + score_samples : ndarray of shape (n_samples,) + Returns the (unshifted) scoring function of the samples. + """ + return self.decision_function(X) + self.offset_ + + def predict(self, X): + """Perform classification on samples in X. + + For a one-class model, +1 or -1 is returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Class labels for samples in X. 
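# Editor's note -- quick check of the relations documented above for OneClassSVM:
# decision_function(X) equals score_samples(X) - offset_, and predict returns the
# sign of the decision function mapped to {+1, -1} (ties at exactly zero aside).
import numpy as np
from sklearn.svm import OneClassSVM

X = np.array([[0.0], [0.44], [0.45], [0.46], [1.0]])
clf = OneClassSVM(gamma="auto").fit(X)

print(np.allclose(clf.decision_function(X), clf.score_samples(X) - clf.offset_))  # True
print(clf.predict(X))                                                             # [-1  1  1  1 -1]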
+ """ + y = super().predict(X) + return np.asarray(y, dtype=np.intp) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pxi b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pxi new file mode 100644 index 0000000000000000000000000000000000000000..0df269b070f5cad415cbfcd3d3ccf8f30c75fe4d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pxi @@ -0,0 +1,43 @@ +from ..utils._typedefs cimport intp_t + +cdef extern from "_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + ctypedef void (*axpy_func)(int, double, const double*, int, double*, int) + ctypedef void (*scal_func)(int, double, const double*, int) + ctypedef double (*nrm2_func)(int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + axpy_func axpy + scal_func scal + nrm2_func nrm2 + + +cdef extern from "linear.h": + cdef struct feature_node + cdef struct problem + cdef struct model + cdef struct parameter + ctypedef problem* problem_const_ptr "problem const *" + ctypedef parameter* parameter_const_ptr "parameter const *" + ctypedef char* char_const_ptr "char const *" + char_const_ptr check_parameter(problem_const_ptr prob, parameter_const_ptr param) + model *train(problem_const_ptr prob, parameter_const_ptr param, BlasFunctions *blas_functions) nogil + int get_nr_feature (model *model) + int get_nr_class (model *model) + void get_n_iter (model *model, int *n_iter) + void free_and_destroy_model (model **) + void destroy_param (parameter *) + + +cdef extern from "liblinear_helper.c": + void copy_w(void *, model *, int) + parameter *set_parameter(int, double, double, int, char *, char *, int, int, double) + problem *set_problem (char *, int, int, int, int, double, char *, char *) + problem *csr_set_problem (char *, int, char *, char *, int, int, int, double, char *, char *) + + model *set_model(parameter *, char *, intp_t *, char *, double) + + double get_bias(model *) + void free_problem (problem *) + void free_parameter (parameter *) + void set_verbosity(int) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pyx new file mode 100644 index 0000000000000000000000000000000000000000..6d5347e746384d34876ca1d569204afa3573ac76 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_liblinear.pyx @@ -0,0 +1,147 @@ +""" +Wrapper for liblinear + +Author: fabian.pedregosa@inria.fr +""" + +import numpy as np + +from ..utils._cython_blas cimport _dot, _axpy, _scal, _nrm2 +from ..utils._typedefs cimport float32_t, float64_t, int32_t + +include "_liblinear.pxi" + + +def train_wrap( + object X, + const float64_t[::1] Y, + bint is_sparse, + int solver_type, + double eps, + double bias, + double C, + const float64_t[:] class_weight, + int max_iter, + unsigned random_seed, + double epsilon, + const float64_t[::1] sample_weight +): + cdef parameter *param + cdef problem *problem + cdef model *model + cdef char_const_ptr error_msg + cdef int len_w + cdef bint X_has_type_float64 = X.dtype == np.float64 + cdef char * X_data_bytes_ptr + cdef const float64_t[::1] X_data_64 + cdef const float32_t[::1] X_data_32 + cdef const int32_t[::1] X_indices + cdef const int32_t[::1] X_indptr + + if is_sparse: + X_indices = X.indices + X_indptr = X.indptr + if X_has_type_float64: + X_data_64 = X.data + X_data_bytes_ptr = &X_data_64[0] + else: + X_data_32 = X.data + X_data_bytes_ptr = &X_data_32[0] + + problem = csr_set_problem( + X_data_bytes_ptr, + 
X_has_type_float64, + &X_indices[0], + &X_indptr[0], + (X.shape[0]), + (X.shape[1]), + (X.nnz), + bias, + &sample_weight[0], + &Y[0] + ) + else: + X_as_1d_array = X.reshape(-1) + if X_has_type_float64: + X_data_64 = X_as_1d_array + X_data_bytes_ptr = &X_data_64[0] + else: + X_data_32 = X_as_1d_array + X_data_bytes_ptr = &X_data_32[0] + + problem = set_problem( + X_data_bytes_ptr, + X_has_type_float64, + (X.shape[0]), + (X.shape[1]), + (np.count_nonzero(X)), + bias, + &sample_weight[0], + &Y[0] + ) + + cdef int32_t[::1] class_weight_label = np.arange(class_weight.shape[0], dtype=np.intc) + param = set_parameter( + solver_type, + eps, + C, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + epsilon + ) + + error_msg = check_parameter(problem, param) + if error_msg: + free_problem(problem) + free_parameter(param) + raise ValueError(error_msg) + + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + blas_functions.axpy = _axpy[double] + blas_functions.scal = _scal[double] + blas_functions.nrm2 = _nrm2[double] + + # early return + with nogil: + model = train(problem, param, &blas_functions) + + # FREE + free_problem(problem) + free_parameter(param) + # destroy_param(param) don't call this or it will destroy class_weight_label and class_weight + + # coef matrix holder created as fortran since that's what's used in liblinear + cdef float64_t[::1, :] w + cdef int nr_class = get_nr_class(model) + + cdef int labels_ = nr_class + if nr_class == 2: + labels_ = 1 + cdef int32_t[::1] n_iter = np.zeros(labels_, dtype=np.intc) + get_n_iter(model, &n_iter[0]) + + cdef int nr_feature = get_nr_feature(model) + if bias > 0: + nr_feature = nr_feature + 1 + if nr_class == 2 and solver_type != 4: # solver is not Crammer-Singer + w = np.empty((1, nr_feature), order='F') + copy_w(&w[0, 0], model, nr_feature) + else: + len_w = (nr_class) * nr_feature + w = np.empty((nr_class, nr_feature), order='F') + copy_w(&w[0, 0], model, len_w) + + free_and_destroy_model(&model) + + return w.base, n_iter.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pxi b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pxi new file mode 100644 index 0000000000000000000000000000000000000000..74ddfd66c538e712e95ba183bcf34695f5b85a14 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pxi @@ -0,0 +1,75 @@ +################################################################################ +# Includes +from ..utils._typedefs cimport intp_t + +cdef extern from "_svm_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + + +cdef extern from "svm.h": + cdef struct svm_node + cdef struct svm_model + cdef struct svm_parameter: + int svm_type + int kernel_type + int degree # for poly + double gamma # for poly/rbf/sigmoid + double coef0 # for poly/sigmoid + + # these are for training only + double cache_size # in MB + double eps # stopping criteria + double C # for C_SVC, EPSILON_SVR and NU_SVR + int nr_weight # for C_SVC + int *weight_label # for C_SVC + double* weight # for C_SVC + double nu # for NU_SVC, ONE_CLASS, and NU_SVR + double p # for EPSILON_SVR + int shrinking # use the shrinking heuristics + int probability # do probability estimates + int 
max_iter # ceiling on Solver runtime + int random_seed # seed for random generator in probability estimation + + cdef struct svm_problem: + int l + double *y + svm_node *x + double *W # instance weights + + char *svm_check_parameter(svm_problem *, svm_parameter *) + svm_model *svm_train(svm_problem *, svm_parameter *, int *, BlasFunctions *) nogil + void svm_free_and_destroy_model(svm_model** model_ptr_ptr) + void svm_cross_validation(svm_problem *, svm_parameter *, int nr_fold, double *target, BlasFunctions *) nogil + + +cdef extern from "libsvm_helper.c": + # this file contains methods for accessing libsvm 'hidden' fields + svm_node **dense_to_sparse (char *, intp_t *) + void set_parameter (svm_parameter *, int , int , int , double, double , + double , double , double , double, + double, int, int, int, char *, char *, int, + int) + void set_problem (svm_problem *, char *, char *, char *, intp_t *, int) + + svm_model *set_model (svm_parameter *, int, char *, intp_t *, + char *, intp_t *, intp_t *, char *, + char *, char *, char *, char *) + + void copy_sv_coef (char *, svm_model *) + void copy_n_iter (char *, svm_model *) + void copy_intercept (char *, svm_model *, intp_t *) + void copy_SV (char *, svm_model *, intp_t *) + int copy_support (char *data, svm_model *model) + int copy_predict (char *, svm_model *, intp_t *, char *, BlasFunctions *) nogil + int copy_predict_proba (char *, svm_model *, intp_t *, char *, BlasFunctions *) nogil + int copy_predict_values(char *, svm_model *, intp_t *, char *, int, BlasFunctions *) nogil + void copy_nSV (char *, svm_model *) + void copy_probA (char *, svm_model *, intp_t *) + void copy_probB (char *, svm_model *, intp_t *) + intp_t get_l (svm_model *) + intp_t get_nr (svm_model *) + int free_problem (svm_problem *) + int free_model (svm_model *) + void set_verbosity(int) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pyx new file mode 100644 index 0000000000000000000000000000000000000000..be0a0826c3736469fdafbf5f42bff39d1205a6ec --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm.pyx @@ -0,0 +1,917 @@ +""" +Binding for libsvm_skl +---------------------- + +These are the bindings for libsvm_skl, which is a fork of libsvm[1] +that adds to libsvm some capabilities, like index of support vectors +and efficient representation of dense matrices. + +These are low-level routines, but can be used for flexibility or +performance reasons. See sklearn.svm for a higher-level API. + +Low-level memory management is done in libsvm_helper.c. If we happen +to run out of memory a MemoryError will be raised. In practice this is +not very helpful since high chances are malloc fails inside svm.cpp, +where no sort of memory checks are done. 
+ +[1] https://www.csie.ntu.edu.tw/~cjlin/libsvm/ + +Notes +----- +The signature mode='c' is somewhat superficial, since we already +check that arrays are C-contiguous in svm.py + +Authors +------- +2010: Fabian Pedregosa + Gael Varoquaux +""" + +import numpy as np +from libc.stdlib cimport free +from ..utils._cython_blas cimport _dot +from ..utils._typedefs cimport float64_t, int32_t, intp_t + +include "_libsvm.pxi" + +cdef extern from *: + ctypedef struct svm_parameter: + pass + + +################################################################################ +# Internal variables +LIBSVM_KERNEL_TYPES = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'] + + +################################################################################ +# Wrapper functions + +def fit( + const float64_t[:, ::1] X, + const float64_t[::1] Y, + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + double tol=1e-3, + double C=1.0, + double nu=0.5, + double epsilon=0.1, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + int shrinking=1, + int probability=0, + double cache_size=100., + int max_iter=-1, + int random_seed=0, +): + """ + Train the model using libsvm (low-level method) + + Parameters + ---------- + X : array-like, dtype=float64 of shape (n_samples, n_features) + + Y : array, dtype=float64 of shape (n_samples,) + target vector + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0 + Independent parameter in poly/sigmoid kernel. + + tol : float64, default=1e-3 + Numeric stopping criterion (WRITEME). + + C : float64, default=1 + C parameter in C-Support Vector Classification. + + nu : float64, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. + + epsilon : double, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. + + class_weight : array, dtype=float64, shape (n_classes,), \ + default=np.empty(0) + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + + sample_weight : array, dtype=float64, shape (n_samples,), \ + default=np.empty(0) + Weights assigned to each sample. + + shrinking : int, default=1 + Whether to use the shrinking heuristic. + + probability : int, default=0 + Whether to enable probability estimates. + + cache_size : float64, default=100 + Cache size for gram matrix columns (in megabytes). + + max_iter : int (-1 for no limit), default=-1 + Stop solver after this many iterations regardless of accuracy + (XXX Currently there is no API to know whether this kicked in.) + + random_seed : int, default=0 + Seed for the random number generator used for probability estimates. + + Returns + ------- + support : array of shape (n_support,) + Index of support vectors. + + support_vectors : array of shape (n_support, n_features) + Support vectors (equivalent to X[support]). Will return an + empty array in the case of precomputed kernel. 
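# Editor's note -- illustrative check tying the quantities returned by the
# low-level fit above (sv_coef, intercept, support vectors) to the decision
# function of the high-level estimator: for a binary SVC,
#   f(x) = sum_i dual_coef_[0, i] * K(sv_i, x) + intercept_[0]
# which is (roughly) what the copy_predict_values path evaluates in C.
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 2) - 1, rng.randn(20, 2) + 1])
y = np.repeat([0, 1], 20)

clf = SVC(kernel="rbf", gamma=0.5).fit(X, y)
K = rbf_kernel(X, clf.support_vectors_, gamma=0.5)        # shape (n_samples, n_SV)
manual = K @ clf.dual_coef_.ravel() + clf.intercept_[0]
print(np.allclose(manual, clf.decision_function(X)))       # True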
+ + n_class_SV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2,) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates, empty array for probability=False. + + n_iter : ndarray of shape (max(1, (n_class * (n_class - 1) // 2)),) + Number of iterations run by the optimization routine to fit the model. + """ + + cdef svm_parameter param + cdef svm_problem problem + cdef svm_model *model + cdef const char *error_msg + cdef intp_t SV_len + + if len(sample_weight) == 0: + sample_weight = np.ones(X.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == X.shape[0], ( + f"sample_weight and X have incompatible shapes: sample_weight has " + f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" + ) + + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + set_problem( + &problem, + &X[0, 0], + &Y[0], + &sample_weight[0], + X.shape, + kernel_index, + ) + if problem.x == NULL: + raise MemoryError("Seems we've run out of memory") + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + set_parameter( + ¶m, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + epsilon, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + ) + + error_msg = svm_check_parameter(&problem, ¶m) + if error_msg: + # for SVR: epsilon is called p in libsvm + error_repl = error_msg.decode('utf-8').replace("p < 0", "epsilon < 0") + raise ValueError(error_repl) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # this does the real work + cdef int fit_status = 0 + with nogil: + model = svm_train(&problem, ¶m, &fit_status, &blas_functions) + + # from here until the end, we just copy the data returned by + # svm_train + SV_len = get_l(model) + n_class = get_nr(model) + + cdef int[::1] n_iter = np.empty(max(1, n_class * (n_class - 1) // 2), dtype=np.intc) + copy_n_iter( &n_iter[0], model) + + cdef float64_t[:, ::1] sv_coef = np.empty((n_class-1, SV_len), dtype=np.float64) + copy_sv_coef( &sv_coef[0, 0] if sv_coef.size > 0 else NULL, model) + + # the intercept is just model.rho but with sign changed + cdef float64_t[::1] intercept = np.empty( + int((n_class*(n_class-1))/2), dtype=np.float64 + ) + copy_intercept( &intercept[0], model, intercept.shape) + + cdef int32_t[::1] support = np.empty(SV_len, dtype=np.int32) + copy_support( &support[0] if support.size > 0 else NULL, model) + + # copy model.SV + cdef float64_t[:, ::1] support_vectors + if kernel_index == 4: + # precomputed kernel + support_vectors = np.empty((0, 0), dtype=np.float64) + else: + support_vectors = np.empty((SV_len, X.shape[1]), dtype=np.float64) + copy_SV( + &support_vectors[0, 0] if support_vectors.size > 0 else NULL, + model, + support_vectors.shape, + ) + + cdef int32_t[::1] n_class_SV + if svm_type == 0 or svm_type == 1: + n_class_SV = np.empty(n_class, dtype=np.int32) + copy_nSV( &n_class_SV[0] if n_class_SV.size > 0 else NULL, model) + else: + # OneClass and SVR are considered to have 2 classes + n_class_SV = np.array([SV_len, SV_len], dtype=np.int32) + + cdef float64_t[::1] probA + cdef float64_t[::1] probB + if probability != 0: + if svm_type < 2: # SVC 
and NuSVC + probA = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) + probB = np.empty(int(n_class*(n_class-1)/2), dtype=np.float64) + copy_probB( &probB[0], model, probB.shape) + else: + probA = np.empty(1, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + copy_probA( &probA[0], model, probA.shape) + else: + probA = np.empty(0, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + + svm_free_and_destroy_model(&model) + free(problem.x) + + return ( + support.base, + support_vectors.base, + n_class_SV.base, + sv_coef.base, + intercept.base, + probA.base, + probB.base, + fit_status, + n_iter.base, + ) + + +cdef void set_predict_params( + svm_parameter *param, + int svm_type, + kernel, + int degree, + double gamma, + double coef0, + double cache_size, + int probability, + int nr_weight, + char *weight_label, + char *weight, +) except *: + """Fill param with prediction time-only parameters.""" + + # training-time only parameters + cdef double C = 0.0 + cdef double epsilon = 0.1 + cdef int max_iter = 0 + cdef double nu = 0.5 + cdef int shrinking = 0 + cdef double tol = 0.1 + cdef int random_seed = -1 + + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + + set_parameter( + param, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + epsilon, + shrinking, + probability, + nr_weight, + weight_label, + weight, + max_iter, + random_seed, + ) + + +def predict( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + const float64_t[:, ::1] sv_coef, + const float64_t[::1] intercept, + const float64_t[::1] probA=np.empty(0), + const float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict target values of X given a model (low-level method) + + Parameters + ---------- + X : array-like, dtype=float of shape (n_samples, n_features) + + support : array of shape (n_support,) + Index of support vectors in training set. + + SV : array of shape (n_support, n_features) + Support vectors. + + nSV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + Returns + ------- + dec_values : array + Predicted values. 
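# Editor's note -- usage sketch for kernel='precomputed' as documented in the
# estimator docstrings above: fit receives a Gram matrix of shape
# (n_samples_train, n_samples_train) and predict receives one of shape
# (n_samples_test, n_samples_train); no support vectors are stored in that case.
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X_train, X_test = rng.randn(30, 4), rng.randn(5, 4)
y_train = (X_train[:, 0] > 0).astype(int)

clf = SVC(kernel="precomputed").fit(rbf_kernel(X_train, X_train), y_train)
print(clf.predict(rbf_kernel(X_test, X_train)).shape)   # (5,)
print(clf.support_vectors_.shape)                        # (0, 0): empty, as documented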
+ """ + cdef float64_t[::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef int rv + + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 0, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0] if support.size > 0 else NULL, + support.shape, + sv_coef.strides, + &sv_coef[0, 0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # TODO: use check_model + try: + dec_values = np.empty(X.shape[0]) + with nogil: + rv = copy_predict( + &X[0, 0], + model, + X.shape, + &dec_values[0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def predict_proba( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + float64_t[:, ::1] sv_coef, + float64_t[::1] intercept, + float64_t[::1] probA=np.empty(0), + float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + float64_t[::1] class_weight=np.empty(0), + float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict probabilities + + svm_model stores all parameters needed to predict a given value. + + For speed, all real work is done at the C level in function + copy_predict (libsvm_helper.c). + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + See sklearn.svm.predict for a complete list of parameters. + + Parameters + ---------- + X : array-like, dtype=float of shape (n_samples, n_features) + + support : array of shape (n_support,) + Index of support vectors in training set. + + SV : array of shape (n_support, n_features) + Support vectors. + + nSV : array of shape (n_class,) + Number of support vectors in each class. + + sv_coef : array of shape (n_class-1, n_support) + Coefficients of support vectors in decision function. + + intercept : array of shape (n_class*(n_class-1)/2,) + Intercept in decision function. + + probA, probB : array of shape (n_class*(n_class-1)/2,) + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + Returns + ------- + dec_values : array + Predicted values. 
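    A minimal sketch under the same assumptions as the ``predict`` example
    above (module importable as ``sklearn.svm._libsvm``, trainer exposed as
    ``fit``). Probability estimates require fitting with ``probability=1`` so
    that ``probA``/``probB`` are populated:

        import numpy as np
        from sklearn.svm import _libsvm  # assumed module path

        X = np.arange(8, dtype=np.float64).reshape(8, 1)
        y = np.array([0., 0., 0., 0., 1., 1., 1., 1.])

        (support, SV, nSV, sv_coef, intercept,
         probA, probB, fit_status, n_iter) = _libsvm.fit(
            X, y, svm_type=0, kernel='linear', probability=1, random_seed=0)

        # one column per class; each row holds the probability estimates
        proba = _libsvm.predict_proba(X, support, SV, nSV, sv_coef, intercept,
                                      probA=probA, probB=probB,
                                      svm_type=0, kernel='linear')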
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + cdef int rv + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 1, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0], + support.shape, + sv_coef.strides, + &sv_coef[0, 0], + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + cdef intp_t n_class = get_nr(model) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + dec_values = np.empty((X.shape[0], n_class), dtype=np.float64) + with nogil: + rv = copy_predict_proba( + &X[0, 0], + model, + X.shape, + &dec_values[0, 0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def decision_function( + const float64_t[:, ::1] X, + const int32_t[::1] support, + const float64_t[:, ::1] SV, + const int32_t[::1] nSV, + const float64_t[:, ::1] sv_coef, + const float64_t[::1] intercept, + const float64_t[::1] probA=np.empty(0), + const float64_t[::1] probB=np.empty(0), + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + const float64_t[::1] class_weight=np.empty(0), + const float64_t[::1] sample_weight=np.empty(0), + double cache_size=100.0, +): + """ + Predict margin (libsvm name for this is predict_values) + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + Parameters + ---------- + X : array-like, dtype=float, size=[n_samples, n_features] + + support : array, shape=[n_support] + Index of support vectors in training set. + + SV : array, shape=[n_support, n_features] + Support vectors. + + nSV : array, shape=[n_class] + Number of support vectors in each class. + + sv_coef : array, shape=[n_class-1, n_support] + Coefficients of support vectors in decision function. + + intercept : array, shape=[n_class*(n_class-1)/2] + Intercept in decision function. + + probA, probB : array, shape=[n_class*(n_class-1)/2] + Probability estimates. + + svm_type : {0, 1, 2, 3, 4}, optional + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. 0 by default. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, optional + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. 'rbf' by default. + + degree : int32, optional + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial), 3 by default. + + gamma : float64, optional + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. 0.1 by default. + + coef0 : float64, optional + Independent parameter in poly/sigmoid kernel. 0 by default. + + Returns + ------- + dec_values : array + Predicted values. 
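    For classifiers (svm_type 0 or 1) the returned matrix has one column per
    one-vs-one pair, i.e. ``n_class * (n_class - 1) / 2`` columns; for
    OneClassSVM and the SVR variants it has a single column (see the body
    below). A minimal three-class sketch under the same assumptions as the
    ``predict`` example (module ``sklearn.svm._libsvm``, trainer ``fit``):

        import numpy as np
        from sklearn.svm import _libsvm  # assumed module path

        X = np.array([[0.], [.1], [1.], [1.1], [2.], [2.1]], dtype=np.float64)
        y = np.array([0., 0., 1., 1., 2., 2.])

        (support, SV, nSV, sv_coef, intercept,
         probA, probB, fit_status, n_iter) = _libsvm.fit(
            X, y, svm_type=0, kernel='linear')

        margins = _libsvm.decision_function(X, support, SV, nSV, sv_coef,
                                            intercept, svm_type=0,
                                            kernel='linear')
        assert margins.shape == (6, 3)  # 3 * (3 - 1) / 2 one-vs-one columns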
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter param + cdef svm_model *model + cdef intp_t n_class + + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + cdef int rv + + set_predict_params( + ¶m, + svm_type, + kernel, + degree, + gamma, + coef0, + cache_size, + 0, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + ) + + model = set_model( + ¶m, + nSV.shape[0], + &SV[0, 0] if SV.size > 0 else NULL, + SV.shape, + &support[0], + support.shape, + sv_coef.strides, + &sv_coef[0, 0], + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + if svm_type > 1: + n_class = 1 + else: + n_class = get_nr(model) + n_class = n_class * (n_class - 1) // 2 + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + dec_values = np.empty((X.shape[0], n_class), dtype=np.float64) + with nogil: + rv = copy_predict_values( + &X[0, 0], + model, + X.shape, + &dec_values[0, 0], + n_class, + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + finally: + free_model(model) + + return dec_values.base + + +def cross_validation( + const float64_t[:, ::1] X, + const float64_t[::1] Y, + int n_fold, + int svm_type=0, + kernel='rbf', + int degree=3, + double gamma=0.1, + double coef0=0.0, + double tol=1e-3, + double C=1.0, + double nu=0.5, + double epsilon=0.1, + float64_t[::1] class_weight=np.empty(0), + float64_t[::1] sample_weight=np.empty(0), + int shrinking=0, + int probability=0, + double cache_size=100.0, + int max_iter=-1, + int random_seed=0, +): + """ + Binding of the cross-validation routine (low-level routine) + + Parameters + ---------- + + X : array-like, dtype=float of shape (n_samples, n_features) + + Y : array, dtype=float of shape (n_samples,) + target vector + + n_fold : int32 + Number of folds for cross validation. + + svm_type : {0, 1, 2, 3, 4}, default=0 + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR + respectively. + + kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default='rbf' + Kernel to use in the model: linear, polynomial, RBF, sigmoid + or precomputed. + + degree : int32, default=3 + Degree of the polynomial kernel (only relevant if kernel is + set to polynomial). + + gamma : float64, default=0.1 + Gamma parameter in rbf, poly and sigmoid kernels. Ignored by other + kernels. + + coef0 : float64, default=0.0 + Independent parameter in poly/sigmoid kernel. + + tol : float64, default=1e-3 + Numeric stopping criterion (WRITEME). + + C : float64, default=1 + C parameter in C-Support Vector Classification. + + nu : float64, default=0.5 + An upper bound on the fraction of training errors and a lower bound of + the fraction of support vectors. Should be in the interval (0, 1]. + + epsilon : double, default=0.1 + Epsilon parameter in the epsilon-insensitive loss function. + + class_weight : array, dtype=float64, shape (n_classes,), \ + default=np.empty(0) + Set the parameter C of class i to class_weight[i]*C for + SVC. If not given, all classes are supposed to have + weight one. + + sample_weight : array, dtype=float64, shape (n_samples,), \ + default=np.empty(0) + Weights assigned to each sample. + + shrinking : int, default=1 + Whether to use the shrinking heuristic. + + probability : int, default=0 + Whether to enable probability estimates. 
+ + cache_size : float64, default=100 + Cache size for gram matrix columns (in megabytes). + + max_iter : int (-1 for no limit), default=-1 + Stop solver after this many iterations regardless of accuracy + (XXX Currently there is no API to know whether this kicked in.) + + random_seed : int, default=0 + Seed for the random number generator used for probability estimates. + + Returns + ------- + target : array, float + + """ + + cdef svm_parameter param + cdef svm_problem problem + cdef const char *error_msg + + if len(sample_weight) == 0: + sample_weight = np.ones(X.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == X.shape[0], ( + f"sample_weight and X have incompatible shapes: sample_weight has " + f"{sample_weight.shape[0]} samples while X has {X.shape[0]}" + ) + + if X.shape[0] < n_fold: + raise ValueError("Number of samples is less than number of folds") + + # set problem + kernel_index = LIBSVM_KERNEL_TYPES.index(kernel) + set_problem( + &problem, + &X[0, 0], + &Y[0], + &sample_weight[0] if sample_weight.size > 0 else NULL, + X.shape, + kernel_index, + ) + if problem.x == NULL: + raise MemoryError("Seems we've run out of memory") + cdef int32_t[::1] class_weight_label = np.arange( + class_weight.shape[0], dtype=np.int32 + ) + + # set parameters + set_parameter( + ¶m, + svm_type, + kernel_index, + degree, + gamma, + coef0, + nu, + cache_size, + C, + tol, + tol, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + max_iter, + random_seed, + ) + + error_msg = svm_check_parameter(&problem, ¶m) + if error_msg: + raise ValueError(error_msg) + + cdef float64_t[::1] target + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + try: + target = np.empty((X.shape[0]), dtype=np.float64) + with nogil: + svm_cross_validation( + &problem, + ¶m, + n_fold, + &target[0], + &blas_functions, + ) + finally: + free(problem.x) + + return target.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm_sparse.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm_sparse.pyx new file mode 100644 index 0000000000000000000000000000000000000000..529758061d299f095bbe3834d85e3f10e475c537 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_libsvm_sparse.pyx @@ -0,0 +1,550 @@ +import numpy as np +from scipy import sparse +from ..utils._cython_blas cimport _dot +from ..utils._typedefs cimport float64_t, int32_t, intp_t + +cdef extern from *: + ctypedef char* const_char_p "const char*" + +################################################################################ +# Includes + +cdef extern from "_svm_cython_blas_helpers.h": + ctypedef double (*dot_func)(int, const double*, int, const double*, int) + cdef struct BlasFunctions: + dot_func dot + +cdef extern from "svm.h": + cdef struct svm_csr_node + cdef struct svm_csr_model + cdef struct svm_parameter + cdef struct svm_csr_problem + char *svm_csr_check_parameter(svm_csr_problem *, svm_parameter *) + svm_csr_model *svm_csr_train(svm_csr_problem *, svm_parameter *, int *, BlasFunctions *) nogil + void svm_csr_free_and_destroy_model(svm_csr_model** model_ptr_ptr) + +cdef extern from "libsvm_sparse_helper.c": + # this file contains methods for accessing libsvm 'hidden' fields + svm_csr_problem * csr_set_problem ( + char *, intp_t *, char *, intp_t *, 
char *, char *, char *, int) + svm_csr_model *csr_set_model(svm_parameter *param, int nr_class, + char *SV_data, intp_t *SV_indices_dims, + char *SV_indices, intp_t *SV_intptr_dims, + char *SV_intptr, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) + svm_parameter *set_parameter (int , int , int , double, double , + double , double , double , double, + double, int, int, int, char *, char *, int, + int) + void copy_sv_coef (char *, svm_csr_model *) + void copy_n_iter (char *, svm_csr_model *) + void copy_support (char *, svm_csr_model *) + void copy_intercept (char *, svm_csr_model *, intp_t *) + int copy_predict (char *, svm_csr_model *, intp_t *, char *, BlasFunctions *) + int csr_copy_predict_values (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, int nr_class, BlasFunctions *) + int csr_copy_predict (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + int csr_copy_predict_proba (intp_t *data_size, char *data, intp_t *index_size, + char *index, intp_t *intptr_size, char *size, + svm_csr_model *model, char *dec_values, BlasFunctions *) nogil + + int copy_predict_values(char *, svm_csr_model *, intp_t *, char *, int, BlasFunctions *) + int csr_copy_SV (char *values, intp_t *n_indices, + char *indices, intp_t *n_indptr, char *indptr, + svm_csr_model *model, int n_features) + intp_t get_nonzero_SV (svm_csr_model *) + void copy_nSV (char *, svm_csr_model *) + void copy_probA (char *, svm_csr_model *, intp_t *) + void copy_probB (char *, svm_csr_model *, intp_t *) + intp_t get_l (svm_csr_model *) + intp_t get_nr (svm_csr_model *) + int free_problem (svm_csr_problem *) + int free_model (svm_csr_model *) + int free_param (svm_parameter *) + int free_model_SV(svm_csr_model *model) + void set_verbosity(int) + + +def libsvm_sparse_train (int n_features, + const float64_t[::1] values, + const int32_t[::1] indices, + const int32_t[::1] indptr, + const float64_t[::1] Y, + int svm_type, int kernel_type, int degree, double gamma, + double coef0, double eps, double C, + const float64_t[::1] class_weight, + const float64_t[::1] sample_weight, + double nu, double cache_size, double p, int + shrinking, int probability, int max_iter, + int random_seed): + """ + Wrap svm_train from libsvm using a scipy.sparse.csr matrix + + Work in progress. + + Parameters + ---------- + n_features : number of features. + XXX: can we retrieve this from any other parameter ? + + X : array-like, dtype=float, size=[N, D] + + Y : array, dtype=float, size=[N] + target vector + + ... + + Notes + ------------------- + See sklearn.svm.predict for a complete list of parameters. + + """ + + cdef svm_parameter *param + cdef svm_csr_problem *problem + cdef svm_csr_model *model + cdef const_char_p error_msg + + if len(sample_weight) == 0: + sample_weight = np.ones(Y.shape[0], dtype=np.float64) + else: + assert sample_weight.shape[0] == indptr.shape[0] - 1, \ + "sample_weight and X have incompatible shapes: " + \ + "sample_weight has %s samples while X has %s" % \ + (sample_weight.shape[0], indptr.shape[0] - 1) + + # we should never end up here with a precomputed kernel matrix, + # as this is always dense. 
+ assert(kernel_type != 4) + + # set libsvm problem + problem = csr_set_problem( + &values[0], + indices.shape, + &indices[0], + indptr.shape, + &indptr[0], + &Y[0], + &sample_weight[0], + kernel_type, + ) + + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + + # set parameters + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + cache_size, + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, max_iter, + random_seed, + ) + + # check parameters + if (param == NULL or problem == NULL): + raise MemoryError("Seems we've run out of memory") + error_msg = svm_csr_check_parameter(problem, param) + if error_msg: + free_problem(problem) + free_param(param) + raise ValueError(error_msg) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + # call svm_train, this does the real work + cdef int fit_status = 0 + with nogil: + model = svm_csr_train(problem, param, &fit_status, &blas_functions) + + cdef intp_t SV_len = get_l(model) + cdef intp_t n_class = get_nr(model) + + cdef int[::1] n_iter + n_iter = np.empty(max(1, n_class * (n_class - 1) // 2), dtype=np.intc) + copy_n_iter( &n_iter[0], model) + + # copy model.sv_coef + # we create a new array instead of resizing, otherwise + # it would not erase previous information + cdef float64_t[::1] sv_coef_data + sv_coef_data = np.empty((n_class-1)*SV_len, dtype=np.float64) + copy_sv_coef ( &sv_coef_data[0] if sv_coef_data.size > 0 else NULL, model) + + cdef int32_t[::1] support + support = np.empty(SV_len, dtype=np.int32) + copy_support( &support[0] if support.size > 0 else NULL, model) + + # copy model.rho into the intercept + # the intercept is just model.rho but with sign changed + cdef float64_t[::1]intercept + intercept = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + copy_intercept ( &intercept[0], model, intercept.shape) + + # copy model.SV + # we erase any previous information in SV + # TODO: custom kernel + cdef intp_t nonzero_SV + nonzero_SV = get_nonzero_SV (model) + + cdef float64_t[::1] SV_data + cdef int32_t[::1] SV_indices, SV_indptr + SV_data = np.empty(nonzero_SV, dtype=np.float64) + SV_indices = np.empty(nonzero_SV, dtype=np.int32) + SV_indptr = np.empty(SV_len + 1, dtype=np.int32) + csr_copy_SV( + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + model, + n_features, + ) + support_vectors_ = sparse.csr_matrix( + (SV_data, SV_indices, SV_indptr), (SV_len, n_features) + ) + + # copy model.nSV + # TODO: do only in classification + cdef int32_t[::1]n_class_SV + n_class_SV = np.empty(n_class, dtype=np.int32) + copy_nSV( &n_class_SV[0], model) + + # # copy probabilities + cdef float64_t[::1] probA, probB + if probability != 0: + if svm_type < 2: # SVC and NuSVC + probA = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + probB = np.empty(n_class*(n_class-1)//2, dtype=np.float64) + copy_probB( &probB[0], model, probB.shape) + else: + probA = np.empty(1, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + copy_probA( &probA[0], model, probA.shape) + else: + probA = np.empty(0, dtype=np.float64) + probB = np.empty(0, dtype=np.float64) + + svm_csr_free_and_destroy_model (&model) + free_problem(problem) + free_param(param) + + return ( + support.base, + 
support_vectors_, + sv_coef_data.base, + intercept.base, + n_class_SV.base, + probA.base, + probB.base, + fit_status, + n_iter.base, + ) + + +def libsvm_sparse_predict (const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int + shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB): + """ + Predict values T given a model. + + For speed, all real work is done at the C level in function + copy_predict (libsvm_helper.c). + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. + + See sklearn.svm.predict for a complete list of parameters. + + Parameters + ---------- + X : array-like, dtype=float + Y : array + target vector + + Returns + ------- + dec_values : array + predicted values. + """ + cdef float64_t[::1] dec_values + cdef svm_parameter *param + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + cdef int rv + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, # random seed has no effect on predict either + ) + + model = csr_set_model( + param, nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + # TODO: use check_model + dec_values = np.empty(T_indptr.shape[0]-1) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + with nogil: + rv = csr_copy_predict( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + return dec_values.base + + +def libsvm_sparse_predict_proba( + const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB, +): + """ + Predict values T given a model. 
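    The test matrix is passed as its three CSR components (``T_data``,
    ``T_indices``, ``T_indptr``); the result has ``len(T_indptr) - 1`` rows
    (one per sample) and one column per class. A minimal sketch of preparing
    those arrays from a scipy CSR matrix, with the dtype casts made explicit
    because the signatures above require float64/int32:

        import numpy as np
        from scipy import sparse

        T = sparse.csr_matrix(np.array([[0., 1.], [2., 0.]], dtype=np.float64))
        T_data = np.asarray(T.data, dtype=np.float64)
        T_indices = np.asarray(T.indices, dtype=np.int32)
        T_indptr = np.asarray(T.indptr, dtype=np.int32)
        # T_data, T_indices, T_indptr are the first three arguments here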
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter *param + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, # random seed has no effect on predict either + ) + + model = csr_set_model( + param, + nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + # TODO: use check_model + cdef intp_t n_class = get_nr(model) + cdef int rv + dec_values = np.empty((T_indptr.shape[0]-1, n_class), dtype=np.float64) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + with nogil: + rv = csr_copy_predict_proba( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0, 0], + &blas_functions, + ) + if rv < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + return dec_values.base + + +def libsvm_sparse_decision_function( + const float64_t[::1] T_data, + const int32_t[::1] T_indices, + const int32_t[::1] T_indptr, + const float64_t[::1] SV_data, + const int32_t[::1] SV_indices, + const int32_t[::1] SV_indptr, + const float64_t[::1] sv_coef, + const float64_t[::1] + intercept, int svm_type, int kernel_type, int + degree, double gamma, double coef0, double + eps, double C, + const float64_t[:] class_weight, + double nu, double p, int shrinking, int probability, + const int32_t[::1] nSV, + const float64_t[::1] probA, + const float64_t[::1] probB, +): + """ + Predict margin (libsvm name for this is predict_values) + + We have to reconstruct model and parameters to make sure we stay + in sync with the python object. 
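    The support vectors are passed back as the CSR components of the
    ``support_vectors_`` matrix returned by ``libsvm_sparse_train`` above. A
    minimal sketch, assuming ``support_vectors_`` came from a previous
    ``libsvm_sparse_train`` call:

        import numpy as np

        # support_vectors_: scipy CSR matrix returned by libsvm_sparse_train
        SV_data = np.asarray(support_vectors_.data, dtype=np.float64)
        SV_indices = np.asarray(support_vectors_.indices, dtype=np.int32)
        SV_indptr = np.asarray(support_vectors_.indptr, dtype=np.int32)
        # these fill the SV_data / SV_indices / SV_indptr parameters above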
+ """ + cdef float64_t[:, ::1] dec_values + cdef svm_parameter *param + cdef intp_t n_class + + cdef svm_csr_model *model + cdef int32_t[::1] \ + class_weight_label = np.arange(class_weight.shape[0], dtype=np.int32) + param = set_parameter( + svm_type, + kernel_type, + degree, + gamma, + coef0, + nu, + 100.0, # cache size has no effect on predict + C, + eps, + p, + shrinking, + probability, + class_weight.shape[0], + &class_weight_label[0] if class_weight_label.size > 0 else NULL, + &class_weight[0] if class_weight.size > 0 else NULL, + -1, + -1, + ) + + model = csr_set_model( + param, + nSV.shape[0], + &SV_data[0] if SV_data.size > 0 else NULL, + SV_indices.shape, + &SV_indices[0] if SV_indices.size > 0 else NULL, + SV_indptr.shape, + &SV_indptr[0] if SV_indptr.size > 0 else NULL, + &sv_coef[0] if sv_coef.size > 0 else NULL, + &intercept[0], + &nSV[0], + &probA[0] if probA.size > 0 else NULL, + &probB[0] if probB.size > 0 else NULL, + ) + + if svm_type > 1: + n_class = 1 + else: + n_class = get_nr(model) + n_class = n_class * (n_class - 1) // 2 + + dec_values = np.empty((T_indptr.shape[0] - 1, n_class), dtype=np.float64) + cdef BlasFunctions blas_functions + blas_functions.dot = _dot[double] + if csr_copy_predict_values( + T_data.shape, + &T_data[0], + T_indices.shape, + &T_indices[0], + T_indptr.shape, + &T_indptr[0], + model, + &dec_values[0, 0], + n_class, + &blas_functions, + ) < 0: + raise MemoryError("We've run out of memory") + # free model and param + free_model_SV(model) + free_model(model) + free_param(param) + + return dec_values.base + + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of libsvm library + """ + set_verbosity(verbosity) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.cpython-312-x86_64-linux-gnu.so b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..c7d9391102a41eeab0c670e966fcc2234b0b1af3 Binary files /dev/null and b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.cpython-312-x86_64-linux-gnu.so differ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.pyx b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.pyx new file mode 100644 index 0000000000000000000000000000000000000000..af543ed73286a06bfb0053807bc8b8c39bfc53c0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/_newrand.pyx @@ -0,0 +1,13 @@ +"""Wrapper for newrand.h""" + +cdef extern from "newrand.h": + void set_seed(unsigned int) + unsigned int bounded_rand_int(unsigned int) + + +def set_seed_wrap(unsigned int custom_seed): + set_seed(custom_seed) + + +def bounded_rand_int_wrap(unsigned int range_): + return bounded_rand_int(range_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/meson.build b/.venv/lib/python3.12/site-packages/sklearn/svm/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..6232d747d1feb220eb4656396314d7caddac9c52 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/meson.build @@ -0,0 +1,48 @@ +newrand_include = include_directories('src/newrand') +libsvm_include = include_directories('src/libsvm') +liblinear_include = include_directories('src/liblinear') + +_newrand = py.extension_module( + '_newrand', + cython_gen_cpp.process('_newrand.pyx'), + include_directories: [newrand_include], + subdir: 'sklearn/svm', + install: true +) + +libsvm_skl = static_library( + 'libsvm-skl', + ['src/libsvm/libsvm_template.cpp'], +) + +py.extension_module( + 
'_libsvm', + [cython_gen.process('_libsvm.pyx'), utils_cython_tree], + include_directories: [newrand_include, libsvm_include], + link_with: libsvm_skl, + subdir: 'sklearn/svm', + install: true +) + +py.extension_module( + '_libsvm_sparse', + [cython_gen.process('_libsvm_sparse.pyx'), utils_cython_tree], + include_directories: [newrand_include, libsvm_include], + link_with: libsvm_skl, + subdir: 'sklearn/svm', + install: true +) + +liblinear_skl = static_library( + 'liblinear-skl', + ['src/liblinear/linear.cpp', 'src/liblinear/tron.cpp'], +) + +py.extension_module( + '_liblinear', + [cython_gen.process('_liblinear.pyx'), utils_cython_tree], + include_directories: [newrand_include, liblinear_include], + link_with: [liblinear_skl], + subdir: 'sklearn/svm', + install: true +) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/COPYRIGHT b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/COPYRIGHT new file mode 100644 index 0000000000000000000000000000000000000000..94371bb4cfd3a117775792c38e8354e62c46dc8f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/COPYRIGHT @@ -0,0 +1,31 @@ + +Copyright (c) 2007-2014 The LIBLINEAR Project. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
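The meson.build above declares four Cython extension modules under
sklearn/svm: _newrand, _libsvm and _libsvm_sparse (the latter two linked
against the static libsvm-skl helper library), and _liblinear (linked against
liblinear-skl). A minimal smoke test of an installed build (a sketch only; the
import paths are assumed from the subdir: 'sklearn/svm' declarations):

    # import the built extensions and exercise the newrand wrapper
    from sklearn.svm import _libsvm, _libsvm_sparse, _liblinear, _newrand

    _newrand.set_seed_wrap(0)
    print(_newrand.bounded_rand_int_wrap(10))  # bounded pseudo-random int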
diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdec1a2f99eb9c0cd57f4e588e9b277ab5f93a6a
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/_cython_blas_helpers.h
@@ -0,0 +1,16 @@
+#ifndef _CYTHON_BLAS_HELPERS_H
+#define _CYTHON_BLAS_HELPERS_H
+
+typedef double (*dot_func)(int, const double*, int, const double*, int);
+typedef void (*axpy_func)(int, double, const double*, int, double*, int);
+typedef void (*scal_func)(int, double, const double*, int);
+typedef double (*nrm2_func)(int, const double*, int);
+
+typedef struct BlasFunctions{
+    dot_func dot;
+    axpy_func axpy;
+    scal_func scal;
+    nrm2_func nrm2;
+} BlasFunctions;
+
+#endif
diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c
new file mode 100644
index 0000000000000000000000000000000000000000..b66f08413e11b6af16d72a35d1e8e85a5addfd43
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/liblinear_helper.c
@@ -0,0 +1,236 @@
+#include <stdlib.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include "linear.h"
+
+
+/*
+ * Convert matrix to sparse representation suitable for liblinear. x is
+ * expected to be an array of length n_samples*n_features.
+ *
+ * Whether the matrix is densely or sparsely populated, the fastest way to
+ * convert it to liblinear's sparse format is to calculate the amount of memory
+ * needed and allocate a single big block.
+ *
+ * Special care must be taken with indices, since liblinear indices start at 1
+ * and not at 0.
+ *
+ * If bias is > 0, we append an item at the end.
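 *
 * For example (illustrative values), with n_features = 3 and bias = 1.0 the
 * dense row [0.5, 0.0, 2.0] maps to the feature_node sequence
 *     (index=1, value=0.5), (index=3, value=2.0)   zeros skipped, 1-based index
 *     (index=4, value=1.0)                         appended bias term
 *     (index=-1)                                   sentinel terminating the row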
+ */ +static struct feature_node **dense_to_sparse(char *x, int double_precision, + int n_samples, int n_features, int n_nonzero, double bias) +{ + float *x32 = (float *)x; + double *x64 = (double *)x; + struct feature_node **sparse; + int i, j; /* number of nonzero elements in row i */ + struct feature_node *T; /* pointer to the top of the stack */ + int have_bias = (bias > 0); + + sparse = malloc (n_samples * sizeof(struct feature_node *)); + if (sparse == NULL) + return NULL; + + n_nonzero += (have_bias+1) * n_samples; + T = malloc (n_nonzero * sizeof(struct feature_node)); + if (T == NULL) { + free(sparse); + return NULL; + } + + for (i=0; ivalue = *x64; + T->index = j; + ++ T; + } + ++ x64; /* go to next element */ + } else { + if (*x32 != 0) { + T->value = *x32; + T->index = j; + ++ T; + } + ++ x32; /* go to next element */ + } + } + + /* set bias element */ + if (have_bias) { + T->value = bias; + T->index = j; + ++ T; + } + + /* set sentinel */ + T->index = -1; + ++ T; + } + + return sparse; +} + + +/* + * Convert scipy.sparse.csr to liblinear's sparse data structure + */ +static struct feature_node **csr_to_sparse(char *x, int double_precision, + int *indices, int *indptr, int n_samples, int n_features, int n_nonzero, + double bias) +{ + float *x32 = (float *)x; + double *x64 = (double *)x; + struct feature_node **sparse; + int i, j=0, k=0, n; + struct feature_node *T; + int have_bias = (bias > 0); + + sparse = malloc (n_samples * sizeof(struct feature_node *)); + if (sparse == NULL) + return NULL; + + n_nonzero += (have_bias+1) * n_samples; + T = malloc (n_nonzero * sizeof(struct feature_node)); + if (T == NULL) { + free(sparse); + return NULL; + } + + for (i=0; ivalue = double_precision ? x64[k] : x32[k]; + T->index = indices[k] + 1; /* liblinear uses 1-based indexing */ + ++T; + ++k; + } + + if (have_bias) { + T->value = bias; + T->index = n_features + 1; + ++T; + ++j; + } + + /* set sentinel */ + T->index = -1; + ++T; + } + + return sparse; +} + +struct problem * set_problem(char *X, int double_precision_X, int n_samples, + int n_features, int n_nonzero, double bias, char* sample_weight, + char *Y) +{ + struct problem *problem; + /* not performant but simple */ + problem = malloc(sizeof(struct problem)); + if (problem == NULL) return NULL; + problem->l = n_samples; + problem->n = n_features + (bias > 0); + problem->y = (double *) Y; + problem->W = (double *) sample_weight; + problem->x = dense_to_sparse(X, double_precision_X, n_samples, n_features, + n_nonzero, bias); + problem->bias = bias; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + + return problem; +} + +struct problem * csr_set_problem (char *X, int double_precision_X, + char *indices, char *indptr, int n_samples, int n_features, + int n_nonzero, double bias, char *sample_weight, char *Y) +{ + struct problem *problem; + problem = malloc (sizeof (struct problem)); + if (problem == NULL) return NULL; + problem->l = n_samples; + problem->n = n_features + (bias > 0); + problem->y = (double *) Y; + problem->W = (double *) sample_weight; + problem->x = csr_to_sparse(X, double_precision_X, (int *) indices, + (int *) indptr, n_samples, n_features, n_nonzero, bias); + problem->bias = bias; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + + return problem; +} + + +/* Create a parameter struct with and return it */ +struct parameter *set_parameter(int solver_type, double eps, double C, + Py_ssize_t nr_weight, char *weight_label, + char *weight, int max_iter, unsigned seed, + double epsilon) 
+{ + struct parameter *param = malloc(sizeof(struct parameter)); + if (param == NULL) + return NULL; + + set_seed(seed); + param->solver_type = solver_type; + param->eps = eps; + param->C = C; + param->p = epsilon; // epsilon for epsilon-SVR + param->nr_weight = (int) nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->max_iter = max_iter; + return param; +} + +void copy_w(void *data, struct model *model, int len) +{ + memcpy(data, model->w, len * sizeof(double)); +} + +double get_bias(struct model *model) +{ + return model->bias; +} + +void free_problem(struct problem *problem) +{ + free(problem->x[0]); + free(problem->x); + free(problem); +} + +void free_parameter(struct parameter *param) +{ + free(param); +} + +/* rely on built-in facility to control verbose output */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s ,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + set_print_string_function(&print_string_stdout); + else + set_print_string_function(&print_null); +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.cpp new file mode 100644 index 0000000000000000000000000000000000000000..63648adbe2947de03449580f060a795fd4eb3cb6 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.cpp @@ -0,0 +1,3075 @@ +/* + Modified 2011: + + - Make labels sorted in group_classes, Dan Yamins. + + Modified 2012: + + - Changes roles of +1 and -1 to match scikit API, Andreas Mueller + See issue 546: https://github.com/scikit-learn/scikit-learn/pull/546 + - Also changed roles for pairwise class weights, Andreas Mueller + See issue 1491: https://github.com/scikit-learn/scikit-learn/pull/1491 + + Modified 2014: + + - Remove the hard-coded value of max_iter (1000), that allows max_iter + to be passed as a parameter from the classes LogisticRegression and + LinearSVC, Manoj Kumar + - Added function get_n_iter that exposes the number of iterations. + See issue 3499: https://github.com/scikit-learn/scikit-learn/issues/3499 + See pull 3501: https://github.com/scikit-learn/scikit-learn/pull/3501 + + Modified 2015: + - Patched liblinear for sample_weights - Manoj Kumar + See https://github.com/scikit-learn/scikit-learn/pull/5274 + + Modified 2020: + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, Schneider Electric + See + + */ + +#include +#include +#include +#include +#include +#include +#include "linear.h" +#include "tron.h" +#include +#include +#include "../newrand/newrand.h" + +typedef signed char schar; +template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif +template static inline void clone(T*& dst, S* src, int n) +{ + dst = new T[n]; + memcpy((void *)dst,(void *)src,sizeof(T)*n); +} +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) +#define INF HUGE_VAL + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +static void (*liblinear_print_string) (const char *) = &print_string_stdout; + +#if 1 +static void info(const char *fmt,...) 
+{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*liblinear_print_string)(buf); +} +#else +static void info(const char *fmt,...) {} +#endif + +class l2r_lr_fun: public function +{ +public: + l2r_lr_fun(const problem *prob, double *C); + ~l2r_lr_fun(); + + double fun(double *w); + void grad(double *w, double *g); + void Hv(double *s, double *Hs); + + int get_nr_variable(void); + +private: + void Xv(double *v, double *Xv); + void XTv(double *v, double *XTv); + + double *C; + double *z; + double *D; + const problem *prob; +}; + +l2r_lr_fun::l2r_lr_fun(const problem *prob, double *C) +{ + int l=prob->l; + + this->prob = prob; + + z = new double[l]; + D = new double[l]; + this->C = C; +} + +l2r_lr_fun::~l2r_lr_fun() +{ + delete[] z; + delete[] D; +} + + +double l2r_lr_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + Xv(w, z); + + for(i=0;i= 0) + f += C[i]*log(1 + exp(-yz)); + else + f += C[i]*(-yz+log(1 + exp(yz))); + } + + return(f); +} + +void l2r_lr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + for(i=0;in; +} + +void l2r_lr_fun::Hv(double *s, double *Hs) +{ + int i; + int l=prob->l; + int w_size=get_nr_variable(); + double *wa = new double[l]; + + Xv(s, wa); + for(i=0;il; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_lr_fun::XTv(double *v, double *XTv) +{ + int i; + int l=prob->l; + int w_size=get_nr_variable(); + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + XTv[s->index-1]+=v[i]*s->value; + s++; + } + } +} + +class l2r_l2_svc_fun: public function +{ +public: + l2r_l2_svc_fun(const problem *prob, double *C); + ~l2r_l2_svc_fun(); + + double fun(double *w); + void grad(double *w, double *g); + void Hv(double *s, double *Hs); + + int get_nr_variable(void); + +protected: + void Xv(double *v, double *Xv); + void subXv(double *v, double *Xv); + void subXTv(double *v, double *XTv); + + double *C; + double *z; + double *D; + int *I; + int sizeI; + const problem *prob; +}; + +l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, double *C) +{ + int l=prob->l; + + this->prob = prob; + + z = new double[l]; + D = new double[l]; + I = new int[l]; + this->C = C; +} + +l2r_l2_svc_fun::~l2r_l2_svc_fun() +{ + delete[] z; + delete[] D; + delete[] I; +} + +double l2r_l2_svc_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + Xv(w, z); + + for(i=0;i 0) + f += C[i]*d*d; + } + + return(f); +} + +void l2r_l2_svc_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + + sizeI = 0; + for (i=0;in; +} + +void l2r_l2_svc_fun::Hv(double *s, double *Hs) +{ + int i; + int w_size=get_nr_variable(); + double *wa = new double[sizeI]; + + subXv(s, wa); + for(i=0;il; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_l2_svc_fun::subXv(double *v, double *Xv) +{ + int i; + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + Xv[i]+=v[s->index-1]*s->value; + s++; + } + } +} + +void l2r_l2_svc_fun::subXTv(double *v, double *XTv) +{ + int i; + int w_size=get_nr_variable(); + feature_node **x=prob->x; + + for(i=0;iindex!=-1) + { + XTv[s->index-1]+=v[i]*s->value; + s++; + } + } +} + +class l2r_l2_svr_fun: public l2r_l2_svc_fun +{ +public: + l2r_l2_svr_fun(const problem 
*prob, double *C, double p); + + double fun(double *w); + void grad(double *w, double *g); + +private: + double p; +}; + +l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, double *C, double p): + l2r_l2_svc_fun(prob, C) +{ + this->p = p; +} + +double l2r_l2_svr_fun::fun(double *w) +{ + int i; + double f=0; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + double d; + + Xv(w, z); + + for(i=0;i p) + f += C[i]*(d-p)*(d-p); + } + + return(f); +} + +void l2r_l2_svr_fun::grad(double *w, double *g) +{ + int i; + double *y=prob->y; + int l=prob->l; + int w_size=get_nr_variable(); + double d; + + sizeI = 0; + for(i=0;i p) + { + z[sizeI] = C[i]*(d-p); + I[sizeI] = i; + sizeI++; + } + + } + subXTv(z, g); + + for(i=0;iw_size = prob->n; + this->l = prob->l; + this->nr_class = nr_class; + this->eps = eps; + this->max_iter = max_iter; + this->prob = prob; + this->B = new double[nr_class]; + this->G = new double[nr_class]; + this->C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + this->C[i] = prob->W[i] * weighted_C[(int)prob->y[i]]; +} + +Solver_MCSVM_CS::~Solver_MCSVM_CS() +{ + delete[] B; + delete[] G; + delete[] C; +} + +int compare_double(const void *a, const void *b) +{ + if(*(double *)a > *(double *)b) + return -1; + if(*(double *)a < *(double *)b) + return 1; + return 0; +} + +void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new) +{ + int r; + double *D; + + clone(D, B, active_i); + if(yi < active_i) + D[yi] += A_i*C_yi; + qsort(D, active_i, sizeof(double), compare_double); + + double beta = D[0] - A_i*C_yi; + for(r=1;ry[i] == m + // alpha[i*nr_class+m] <= 0 if prob->y[i] != m + // If initial alpha isn't zero, uncomment the for loop below to initialize w + for(i=0;ix[i]; + QD[i] = 0; + while(xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + + // Uncomment the for loop if initial alpha isn't zero + // for(m=0; mindex-1)*nr_class+m] += alpha[i*nr_class+m]*val; + xi++; + } + active_size_i[i] = nr_class; + y_index[i] = (int)prob->y[i]; + index[i] = i; + } + + while(iter < max_iter) + { + double stopping = -INF; + for(i=0;i 0) + { + for(m=0;mx[i]; + while(xi->index!= -1) + { + double *w_i = &w[(xi->index-1)*nr_class]; + for(m=0;mvalue); + xi++; + } + + double minG = INF; + double maxG = -INF; + for(m=0;m maxG) + maxG = G[m]; + } + if(y_index[i] < active_size_i[i]) + if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG) + minG = G[y_index[i]]; + + for(m=0;mm) + { + if(!be_shrunk(i, active_size_i[i], y_index[i], + alpha_i[alpha_index_i[active_size_i[i]]], minG)) + { + swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]); + swap(G[m], G[active_size_i[i]]); + if(y_index[i] == active_size_i[i]) + y_index[i] = m; + else if(y_index[i] == m) + y_index[i] = active_size_i[i]; + break; + } + active_size_i[i]--; + } + } + } + + if(active_size_i[i] <= 1) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + + if(maxG-minG <= 1e-12) + continue; + else + stopping = max(maxG - minG, stopping); + + for(m=0;m= 1e-12) + { + d_ind[nz_d] = alpha_index_i[m]; + d_val[nz_d] = d; + nz_d++; + } + } + + xi = prob->x[i]; + while(xi->index != -1) + { + double *w_i = &w[(xi->index-1)*nr_class]; + for(m=0;mvalue; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + { + info("."); + } + + if(stopping < eps_shrink) + { + if(stopping < eps && start_from_all == true) + break; + else + { + active_size = l; + for(i=0;i= max_iter) + info("\nWARNING: reaching max number of iterations\n"); 
+ + // calculate objective value + double v = 0; + int nSV = 0; + for(i=0;i 0) + nSV++; + } + for(i=0;iy[i]]; + info("Objective value = %lf\n",v); + info("nSV = %d\n",nSV); + + delete [] alpha; + delete [] alpha_new; + delete [] index; + delete [] QD; + delete [] d_ind; + delete [] d_val; + delete [] alpha_index; + delete [] y_index; + delete [] active_size_i; + return iter; +} + +// A coordinate descent algorithm for +// L1-loss and L2-loss SVM dual problems +// +// min_\alpha 0.5(\alpha^T (Q + D)\alpha) - e^T \alpha, +// s.t. 0 <= \alpha_i <= upper_bound_i, +// +// where Qij = yi yj xi^T xj and +// D is a diagonal matrix +// +// In L1-SVM case: +// upper_bound_i = Cp if y_i = 1 +// upper_bound_i = Cn if y_i = -1 +// D_ii = 0 +// In L2-SVM case: +// upper_bound_i = INF +// D_ii = 1/(2*Cp) if y_i = 1 +// D_ii = 1/(2*Cn) if y_i = -1 +// +// Given: +// x, y, Cp, Cn +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Algorithm 3 of Hsieh et al., ICML 2008 + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l2r_l1l2_svc( + const problem *prob, double *w, double eps, + double Cp, double Cn, int solver_type, int max_iter) +{ + int l = prob->l; + int w_size = prob->n; + int i, s, iter = 0; + double C, d, G; + double *QD = new double[l]; + int *index = new int[l]; + double *alpha = new double[l]; + schar *y = new schar[l]; + int active_size = l; + + // PG: projected gradient, for shrinking and stopping + double PG; + double PGmax_old = INF; + double PGmin_old = -INF; + double PGmax_new, PGmin_new; + + // default solver_type: L2R_L2LOSS_SVC_DUAL + double *diag = new double[l]; + double *upper_bound = new double[l]; + double *C_ = new double[l]; + for(i=0; iy[i]>0) + C_[i] = prob->W[i] * Cp; + else + C_[i] = prob->W[i] * Cn; + diag[i] = 0.5/C_[i]; + upper_bound[i] = INF; + } + if(solver_type == L2R_L1LOSS_SVC_DUAL) + { + for(i=0; iy[i] > 0) + { + y[i] = +1; + } + else + { + y[i] = -1; + } + } + + // Initial alpha can be set here. 
Note that + // 0 <= alpha[i] <= upper_bound[GETI(i)] + for(i=0; ix[i]; + while (xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + w[xi->index-1] += y[i]*alpha[i]*val; + xi++; + } + index[i] = i; + } + + while (iter < max_iter) + { + PGmax_new = -INF; + PGmin_new = INF; + + for (i=0; ix[i]; + while(xi->index!= -1) + { + G += w[xi->index-1]*(xi->value); + xi++; + } + G = G*yi-1; + + C = upper_bound[GETI(i)]; + G += alpha[i]*diag[GETI(i)]; + + PG = 0; + if (alpha[i] == 0) + { + if (G > PGmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + else if (G < 0) + PG = G; + } + else if (alpha[i] == C) + { + if (G < PGmin_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + else if (G > 0) + PG = G; + } + else + PG = G; + + PGmax_new = max(PGmax_new, PG); + PGmin_new = min(PGmin_new, PG); + + if(fabs(PG) > 1.0e-12) + { + double alpha_old = alpha[i]; + alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C); + d = (alpha[i] - alpha_old)*yi; + xi = prob->x[i]; + while (xi->index != -1) + { + w[xi->index-1] += d*xi->value; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + info("."); + + if(PGmax_new - PGmin_new <= eps) + { + if(active_size == l) + break; + else + { + active_size = l; + info("*"); + PGmax_old = INF; + PGmin_old = -INF; + continue; + } + } + PGmax_old = PGmax_new; + PGmin_old = PGmin_new; + if (PGmax_old <= 0) + PGmax_old = INF; + if (PGmin_old >= 0) + PGmin_old = -INF; + } + + info("\noptimization finished, #iter = %d\n",iter); + if (iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n"); + + // calculate objective value + + double v = 0; + int nSV = 0; + for(i=0; i 0) + ++nSV; + } + info("Objective value = %lf\n",v/2); + info("nSV = %d\n",nSV); + + delete [] QD; + delete [] alpha; + delete [] y; + delete [] index; + delete [] diag; + delete [] upper_bound; + delete [] C_; + return iter; +} + + +// A coordinate descent algorithm for +// L1-loss and L2-loss epsilon-SVR dual problem +// +// min_\beta 0.5\beta^T (Q + diag(lambda)) \beta - p \sum_{i=1}^l|\beta_i| + \sum_{i=1}^l yi\beta_i, +// s.t. 
-upper_bound_i <= \beta_i <= upper_bound_i, +// +// where Qij = xi^T xj and +// D is a diagonal matrix +// +// In L1-SVM case: +// upper_bound_i = C +// lambda_i = 0 +// In L2-SVM case: +// upper_bound_i = INF +// lambda_i = 1/(2*C) +// +// Given: +// x, y, p, C +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Algorithm 4 of Ho and Lin, 2012 + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l2r_l1l2_svr( + const problem *prob, double *w, const parameter *param, + int solver_type, int max_iter) +{ + int l = prob->l; + double C = param->C; + double p = param->p; + int w_size = prob->n; + double eps = param->eps; + int i, s, iter = 0; + int active_size = l; + int *index = new int[l]; + + double d, G, H; + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double *beta = new double[l]; + double *QD = new double[l]; + double *y = prob->y; + + // L2R_L2LOSS_SVR_DUAL + double *lambda = new double[l]; + double *upper_bound = new double[l]; + double *C_ = new double[l]; + for (i=0; iW[i] * C; + lambda[i] = 0.5/C_[i]; + upper_bound[i] = INF; + } + if(solver_type == L2R_L1LOSS_SVR_DUAL) + { + for (i=0; ix[i]; + while(xi->index != -1) + { + double val = xi->value; + QD[i] += val*val; + w[xi->index-1] += beta[i]*val; + xi++; + } + + index[i] = i; + } + + + while(iter < max_iter) + { + Gmax_new = 0; + Gnorm1_new = 0; + + for(i=0; ix[i]; + while(xi->index != -1) + { + int ind = xi->index-1; + double val = xi->value; + G += val*w[ind]; + xi++; + } + + double Gp = G+p; + double Gn = G-p; + double violation = 0; + if(beta[i] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old && Gn<-Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] >= upper_bound[GETI(i)]) + { + if(Gp > 0) + violation = Gp; + else if(Gp < -Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] <= -upper_bound[GETI(i)]) + { + if(Gn < 0) + violation = -Gn; + else if(Gn > Gmax_old) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(beta[i] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + + // obtain Newton direction d + if(Gp < H*beta[i]) + d = -Gp/H; + else if(Gn > H*beta[i]) + d = -Gn/H; + else + d = -beta[i]; + + if(fabs(d) < 1.0e-12) + continue; + + double beta_old = beta[i]; + beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]); + d = beta[i]-beta_old; + + if(d != 0) + { + xi = prob->x[i]; + while(xi->index != -1) + { + w[xi->index-1] += d*xi->value; + xi++; + } + } + } + + if(iter == 0) + Gnorm1_init = Gnorm1_new; + iter++; + if(iter % 10 == 0) + info("."); + + if(Gnorm1_new <= eps*Gnorm1_init) + { + if(active_size == l) + break; + else + { + active_size = l; + info("*"); + Gmax_old = INF; + continue; + } + } + + Gmax_old = Gmax_new; + } + + info("\noptimization finished, #iter = %d\n", iter); + if(iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 11 may be faster\n\n"); + + // calculate objective value + double v = 0; + int nSV = 0; + for(i=0; il; + int w_size = prob->n; + int i, s, iter = 0; + double *xTx = new double[l]; + int *index = new int[l]; + double *alpha = new double[2*l]; // store alpha and C - alpha + 
schar *y = new schar[l]; + int max_inner_iter = 100; // for inner Newton + double innereps = 1e-2; + double innereps_min = min(1e-8, eps); + double *upper_bound = new double [l]; + + for(i=0; iy[i] > 0) + { + upper_bound[i] = prob->W[i] * Cp; + y[i] = +1; + } + else + { + upper_bound[i] = prob->W[i] * Cn; + y[i] = -1; + } + } + + // Initial alpha can be set here. Note that + // 0 < alpha[i] < upper_bound[GETI(i)] + // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)] + for(i=0; ix[i]; + while (xi->index != -1) + { + double val = xi->value; + xTx[i] += val*val; + w[xi->index-1] += y[i]*alpha[2*i]*val; + xi++; + } + index[i] = i; + } + + while (iter < max_iter) + { + for (i=0; ix[i]; + while (xi->index != -1) + { + ywTx += w[xi->index-1]*xi->value; + xi++; + } + ywTx *= y[i]; + double a = xisq, b = ywTx; + + // Decide to minimize g_1(z) or g_2(z) + int ind1 = 2*i, ind2 = 2*i+1, sign = 1; + if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0) + { + ind1 = 2*i+1; + ind2 = 2*i; + sign = -1; + } + + // g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old) + double alpha_old = alpha[ind1]; + double z = alpha_old; + if(C - z < 0.5 * C) + z = 0.1*z; + double gp = a*(z-alpha_old)+sign*b+log(z/(C-z)); + Gmax = max(Gmax, fabs(gp)); + + // Newton method on the sub-problem + const double eta = 0.1; // xi in the paper + int inner_iter = 0; + while (inner_iter <= max_inner_iter) + { + if(fabs(gp) < innereps) + break; + double gpp = a + C/(C-z)/z; + double tmpz = z - gp/gpp; + if(tmpz <= 0) + z *= eta; + else // tmpz in (0, C) + z = tmpz; + gp = a*(z-alpha_old)+sign*b+log(z/(C-z)); + newton_iter++; + inner_iter++; + } + + if(inner_iter > 0) // update w + { + alpha[ind1] = z; + alpha[ind2] = C-z; + xi = prob->x[i]; + while (xi->index != -1) + { + w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value; + xi++; + } + } + } + + iter++; + if(iter % 10 == 0) + info("."); + + if(Gmax < eps) + break; + + if(newton_iter <= l/10) + innereps = max(innereps_min, 0.1*innereps); + + } + + info("\noptimization finished, #iter = %d\n",iter); + if (iter >= max_iter) + info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n"); + + // calculate objective value + + double v = 0; + for(i=0; il; + int w_size = prob_col->n; + int j, s, iter = 0; + int active_size = w_size; + int max_num_linesearch = 20; + + double sigma = 0.01; + double d, G_loss, G, H; + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double d_old, d_diff; + double loss_old, loss_new; + double appxcond, cond; + + int *index = new int[w_size]; + schar *y = new schar[l]; + double *b = new double[l]; // b = 1-ywTx + double *xj_sq = new double[w_size]; + feature_node *x; + + double *C = new double[l]; + + // Initial w can be set here. 
+ for(j=0; jy[j] > 0) + { + y[j] = 1; + C[j] = prob_col->W[j] * Cp; + } + else + { + y[j] = -1; + C[j] = prob_col->W[j] * Cn; + } + } + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + x->value *= y[ind]; // x->value stores yi*xij + double val = x->value; + b[ind] -= w[j]*val; + xj_sq[j] += C[GETI(ind)]*val*val; + x++; + } + } + + while(iter < max_iter) + { + Gmax_new = 0; + Gnorm1_new = 0; + + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + if(b[ind] > 0) + { + double val = x->value; + double tmp = C[GETI(ind)]*val; + G_loss -= tmp*b[ind]; + H += tmp*val; + } + x++; + } + G_loss *= 2; + + G = G_loss; + H *= 2; + H = max(H, 1e-12); + + double Gp = G+1; + double Gn = G-1; + double violation = 0; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + + // obtain Newton direction d + if(Gp < H*w[j]) + d = -Gp/H; + else if(Gn > H*w[j]) + d = -Gn/H; + else + d = -w[j]; + + if(fabs(d) < 1.0e-12) + continue; + + double delta = fabs(w[j]+d)-fabs(w[j]) + G*d; + d_old = 0; + int num_linesearch; + for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++) + { + d_diff = d_old - d; + cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta; + + appxcond = xj_sq[j]*d*d + G_loss*d + cond; + if(appxcond <= 0) + { + x = prob_col->x[j]; + while(x->index != -1) + { + b[x->index-1] += d_diff*x->value; + x++; + } + break; + } + + if(num_linesearch == 0) + { + loss_old = 0; + loss_new = 0; + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + if(b[ind] > 0) + loss_old += C[GETI(ind)]*b[ind]*b[ind]; + double b_new = b[ind] + d_diff*x->value; + b[ind] = b_new; + if(b_new > 0) + loss_new += C[GETI(ind)]*b_new*b_new; + x++; + } + } + else + { + loss_new = 0; + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + double b_new = b[ind] + d_diff*x->value; + b[ind] = b_new; + if(b_new > 0) + loss_new += C[GETI(ind)]*b_new*b_new; + x++; + } + } + + cond = cond + loss_new - loss_old; + if(cond <= 0) + break; + else + { + d_old = d; + d *= 0.5; + delta *= 0.5; + } + } + + w[j] += d; + + // recompute b[] if line search takes too many steps + if(num_linesearch >= max_num_linesearch) + { + info("#"); + for(int i=0; ix[i]; + while(x->index != -1) + { + b[x->index-1] -= w[i]*x->value; + x++; + } + } + } + } + + if(iter == 0) + Gnorm1_init = Gnorm1_new; + iter++; + if(iter % 10 == 0) + info("."); + + if(Gnorm1_new <= eps*Gnorm1_init) + { + if(active_size == w_size) + break; + else + { + active_size = w_size; + info("*"); + Gmax_old = INF; + continue; + } + } + + Gmax_old = Gmax_new; + } + + info("\noptimization finished, #iter = %d\n", iter); + if(iter >= max_iter) + info("\nWARNING: reaching max number of iterations\n"); + + // calculate objective value + + double v = 0; + int nnz = 0; + for(j=0; jx[j]; + while(x->index != -1) + { + x->value *= prob_col->y[x->index-1]; // restore x->value + x++; + } + if(w[j] != 0) + { + v += fabs(w[j]); + nnz++; + } + } + for(j=0; j 0) + v += C[GETI(j)]*b[j]*b[j]; + + info("Objective value = %lf\n", v); + info("#nonzeros/#features = %d/%d\n", nnz, w_size); + + delete [] index; + delete [] y; + delete [] b; + delete [] xj_sq; + delete [] C; + return iter; +} + +// A coordinate descent algorithm for 
+// L1-regularized logistic regression problems +// +// min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)), +// +// Given: +// x, y, Cp, Cn +// eps is the stopping tolerance +// +// solution will be put in w +// +// See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008) + +#undef GETI +#define GETI(i) (i) +// To support weights for instances, use GETI(i) (i) + +static int solve_l1r_lr( + const problem *prob_col, double *w, double eps, + double Cp, double Cn, int max_newton_iter) +{ + int l = prob_col->l; + int w_size = prob_col->n; + int j, s, newton_iter=0, iter=0; + int max_iter = 1000; + int max_num_linesearch = 20; + int active_size; + int QP_active_size; + int QP_no_change = 0; + + double nu = 1e-12; + double inner_eps = 1; + double sigma = 0.01; + double w_norm, w_norm_new; + double z, G, H; + double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration + double Gmax_old = INF; + double Gmax_new, Gnorm1_new; + double QP_Gmax_old = INF; + double QP_Gmax_new, QP_Gnorm1_new; + double delta, negsum_xTd, cond; + + int *index = new int[w_size]; + schar *y = new schar[l]; + double *Hdiag = new double[w_size]; + double *Grad = new double[w_size]; + double *wpd = new double[w_size]; + double *xjneg_sum = new double[w_size]; + double *xTd = new double[l]; + double *exp_wTx = new double[l]; + double *exp_wTx_new = new double[l]; + double *tau = new double[l]; + double *D = new double[l]; + feature_node *x; + + double *C = new double[l]; + + // Initial w can be set here. + for(j=0; jy[j] > 0) + { + y[j] = 1; + C[j] = prob_col->W[j] * Cp; + } + else + { + y[j] = -1; + C[j] = prob_col->W[j] * Cn; + } + + exp_wTx[j] = 0; + } + + w_norm = 0; + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + double val = x->value; + exp_wTx[ind] += w[j]*val; + if(y[ind] == -1) + xjneg_sum[j] += C[GETI(ind)]*val; + x++; + } + } + for(j=0; jx[j]; + while(x->index != -1) + { + int ind = x->index-1; + Hdiag[j] += x->value*x->value*D[ind]; + tmp += x->value*tau[ind]; + x++; + } + Grad[j] = -tmp + xjneg_sum[j]; + + double Gp = Grad[j]+1; + double Gn = Grad[j]-1; + double violation = 0; + if(w[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //outer-level shrinking + else if(Gp>Gmax_old/l && Gn<-Gmax_old/l) + { + active_size--; + swap(index[s], index[active_size]); + s--; + continue; + } + } + else if(w[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + Gmax_new = max(Gmax_new, violation); + Gnorm1_new += violation; + } + + if(newton_iter == 0) + Gnorm1_init = Gnorm1_new; + + // Break outer-loop if the accumulated violation is small. + // Also break if no update in QP inner-loop ten times in a row. 
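+		// The test is relative: Gnorm1_init is the total violation measured
+		// at the first Newton iteration, so eps controls how much the
+		// optimality violation must shrink from its initial value before
+		// stopping.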
+ if(Gnorm1_new <= eps*Gnorm1_init || QP_no_change >= 10) + break; + + QP_no_change++; + + iter = 0; + QP_Gmax_old = INF; + QP_active_size = active_size; + + for(int i=0; ix[j]; + G = Grad[j] + (wpd[j]-w[j])*nu; + while(x->index != -1) + { + int ind = x->index-1; + G += x->value*D[ind]*xTd[ind]; + x++; + } + + double Gp = G+1; + double Gn = G-1; + double violation = 0; + if(wpd[j] == 0) + { + if(Gp < 0) + violation = -Gp; + else if(Gn > 0) + violation = Gn; + //inner-level shrinking + else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l) + { + QP_active_size--; + swap(index[s], index[QP_active_size]); + s--; + continue; + } + } + else if(wpd[j] > 0) + violation = fabs(Gp); + else + violation = fabs(Gn); + + // obtain solution of one-variable problem + if(Gp < H*wpd[j]) + z = -Gp/H; + else if(Gn > H*wpd[j]) + z = -Gn/H; + else + z = -wpd[j]; + + if(fabs(z) < 1.0e-12) + continue; + z = min(max(z,-10.0),10.0); + + QP_no_change = 0; + QP_Gmax_new = max(QP_Gmax_new, violation); + QP_Gnorm1_new += violation; + + wpd[j] += z; + + x = prob_col->x[j]; + while(x->index != -1) + { + int ind = x->index-1; + xTd[ind] += x->value*z; + x++; + } + } + + iter++; + + if(QP_Gnorm1_new <= inner_eps*Gnorm1_init) + { + //inner stopping + if(QP_active_size == active_size) + break; + //active set reactivation + else + { + QP_active_size = active_size; + QP_Gmax_old = INF; + continue; + } + } + + QP_Gmax_old = QP_Gmax_new; + } + + if(iter >= max_iter) + info("WARNING: reaching max number of inner iterations\n"); + + delta = 0; + w_norm_new = 0; + for(j=0; j= max_num_linesearch) + { + for(int i=0; ix[i]; + while(x->index != -1) + { + exp_wTx[x->index-1] += w[i]*x->value; + x++; + } + } + + for(int i=0; i= max_newton_iter) + info("WARNING: reaching max number of iterations\n"); + + // calculate objective value + + double v = 0; + int nnz = 0; + for(j=0; jl; + int n = prob->n; + size_t nnz = 0; + size_t *col_ptr = new size_t [n+1]; + feature_node *x_space; + prob_col->l = l; + prob_col->n = n; + prob_col->y = new double[l]; + prob_col->x = new feature_node*[n]; + prob_col->W = new double[l]; + + for(i=0; iy[i] = prob->y[i]; + prob_col->W[i] = prob->W[i]; + } + + for(i=0; ix[i]; + while(x->index != -1) + { + nnz++; + col_ptr[x->index]++; + x++; + } + } + for(i=1; ix[i] = &x_space[col_ptr[i]]; + + for(i=0; ix[i]; + while(x->index != -1) + { + int ind = x->index-1; + x_space[col_ptr[ind]].index = i+1; // starts from 1 + x_space[col_ptr[ind]].value = x->value; + col_ptr[ind]++; + x++; + } + } + for(i=0; il; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + int *count = Malloc(int,max_nr_class); + int *data_label = Malloc(int,l); + int i; + + for(i=0;iy[i]; + int j; + for(j=0;j=0 && label[i] > this_label) + { + label[i+1] = label[i]; + count[i+1] = count[i]; + i--; + } + label[i+1] = this_label; + count[i+1] = this_count; + } + + for (i=0; i y[i]; + while(this_label != label[j]) + { + j++; + } + data_label[i] = j; + + } + + /* END MOD */ + +#if 0 + // + // Labels are ordered by their first occurrence in the training set. + // However, for two-class sets with -1/+1 labels and -1 appears first, + // we swap labels to ensure that internally the binary SVM has positive data corresponding to the +1 instances. 
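+	// (This block is compiled out here: the modification above keeps the
+	// labels sorted, so the class order no longer depends on which label
+	// happens to appear first in the data.)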
+ // + if (nr_class == 2 && label[0] == -1 && label[1] == 1) + { + swap(label[0],label[1]); + swap(count[0],count[1]); + for(i=0;ieps; + int max_iter=param->max_iter; + int pos = 0; + int neg = 0; + int n_iter = -1; + for(int i=0;il;i++) + if(prob->y[i] > 0) + pos++; + neg = prob->l - pos; + + double primal_solver_tol = eps*max(min(pos,neg), 1)/prob->l; + + function *fun_obj=NULL; + switch(param->solver_type) + { + case L2R_LR: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + { + if(prob->y[i] > 0) + C[i] = prob->W[i] * Cp; + else + C[i] = prob->W[i] * Cn; + } + + fun_obj=new l2r_lr_fun(prob, C); + TRON tron_obj(fun_obj, primal_solver_tol, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + } + case L2R_L2LOSS_SVC: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + { + if(prob->y[i] > 0) + C[i] = prob->W[i] * Cp; + else + C[i] = prob->W[i] * Cn; + } + fun_obj=new l2r_l2_svc_fun(prob, C); + TRON tron_obj(fun_obj, primal_solver_tol, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + } + case L2R_L2LOSS_SVC_DUAL: + n_iter=solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL, max_iter); + break; + case L2R_L1LOSS_SVC_DUAL: + n_iter=solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L1LOSS_SVC_DUAL, max_iter); + break; + case L1R_L2LOSS_SVC: + { + problem prob_col; + feature_node *x_space = NULL; + transpose(prob, &x_space ,&prob_col); + n_iter=solve_l1r_l2_svc(&prob_col, w, primal_solver_tol, Cp, Cn, max_iter); + delete [] prob_col.y; + delete [] prob_col.x; + delete [] prob_col.W; + delete [] x_space; + break; + } + case L1R_LR: + { + problem prob_col; + feature_node *x_space = NULL; + transpose(prob, &x_space ,&prob_col); + n_iter=solve_l1r_lr(&prob_col, w, primal_solver_tol, Cp, Cn, max_iter); + delete [] prob_col.y; + delete [] prob_col.x; + delete [] prob_col.W; + delete [] x_space; + break; + } + case L2R_LR_DUAL: + n_iter=solve_l2r_lr_dual(prob, w, eps, Cp, Cn, max_iter); + break; + case L2R_L2LOSS_SVR: + { + double *C = new double[prob->l]; + for(int i = 0; i < prob->l; i++) + C[i] = prob->W[i] * param->C; + + fun_obj=new l2r_l2_svr_fun(prob, C, param->p); + TRON tron_obj(fun_obj, param->eps, max_iter, blas_functions); + tron_obj.set_print_string(liblinear_print_string); + n_iter=tron_obj.tron(w); + delete fun_obj; + delete[] C; + break; + + } + case L2R_L1LOSS_SVR_DUAL: + n_iter=solve_l2r_l1l2_svr(prob, w, param, L2R_L1LOSS_SVR_DUAL, max_iter); + break; + case L2R_L2LOSS_SVR_DUAL: + n_iter=solve_l2r_l1l2_svr(prob, w, param, L2R_L2LOSS_SVR_DUAL, max_iter); + break; + default: + fprintf(stderr, "ERROR: unknown solver_type\n"); + break; + } + return n_iter; +} + +// +// Remove zero weighed data as libsvm and some liblinear solvers require C > 0. 
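+// Instances with W[i] <= 0 are dropped into a shallow copy of the problem;
+// the feature rows x[i] themselves are shared with the original problem.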
+// +static void remove_zero_weight(problem *newprob, const problem *prob) +{ + int i; + int l = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) l++; + *newprob = *prob; + newprob->l = l; + newprob->x = Malloc(feature_node*,l); + newprob->y = Malloc(double,l); + newprob->W = Malloc(double,l); + + int j = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) + { + newprob->x[j] = prob->x[i]; + newprob->y[j] = prob->y[i]; + newprob->W[j] = prob->W[i]; + j++; + } +} + +// +// Interface functions +// +model* train(const problem *prob, const parameter *param, BlasFunctions *blas_functions) +{ + problem newprob; + remove_zero_weight(&newprob, prob); + prob = &newprob; + int i,j; + int l = prob->l; + int n = prob->n; + int w_size = prob->n; + model *model_ = Malloc(model,1); + + if(prob->bias>=0) + model_->nr_feature=n-1; + else + model_->nr_feature=n; + model_->param = *param; + model_->bias = prob->bias; + + if(check_regression_model(model_)) + { + model_->w = Malloc(double, w_size); + model_->n_iter = Malloc(int, 1); + model_->nr_class = 2; + model_->label = NULL; + model_->n_iter[0] =train_one(prob, param, &model_->w[0], 0, 0, blas_functions); + } + else + { + int nr_class; + int *label = NULL; + int *start = NULL; + int *count = NULL; + int *perm = Malloc(int,l); + + // group training data of the same class + group_classes(prob,&nr_class,&label,&start,&count,perm); + + model_->nr_class=nr_class; + model_->label = Malloc(int,nr_class); + for(i=0;ilabel[i] = label[i]; + + // calculate weighted C + double *weighted_C = Malloc(double, nr_class); + for(i=0;iC; + for(i=0;inr_weight;i++) + { + for(j=0;jweight_label[i] == label[j]) + break; + if(j == nr_class) + fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]); + else + weighted_C[j] *= param->weight[i]; + } + + // constructing the subproblem + feature_node **x = Malloc(feature_node *,l); + for(i=0;ix[perm[i]]; + + int k; + problem sub_prob; + sub_prob.l = l; + sub_prob.n = n; + sub_prob.x = Malloc(feature_node *,sub_prob.l); + sub_prob.y = Malloc(double,sub_prob.l); + sub_prob.W = Malloc(double,sub_prob.l); + for(k=0; kW[perm[k]]; + } + + // multi-class svm by Crammer and Singer + if(param->solver_type == MCSVM_CS) + { + model_->w=Malloc(double, n*nr_class); + model_->n_iter=Malloc(int, 1); + for(i=0;ieps); + model_->n_iter[0]=Solver.Solve(model_->w); + } + else + { + if(nr_class == 2) + { + model_->w=Malloc(double, w_size); + model_->n_iter=Malloc(int, 1); + int e0 = start[0]+count[0]; + k=0; + for(; kn_iter[0]=train_one(&sub_prob, param, &model_->w[0], weighted_C[1], weighted_C[0], blas_functions); + } + else + { + model_->w=Malloc(double, w_size*nr_class); + double *w=Malloc(double, w_size); + model_->n_iter=Malloc(int, nr_class); + for(i=0;in_iter[i]=train_one(&sub_prob, param, w, weighted_C[i], param->C, blas_functions); + + for(int j=0;jw[j*nr_class+i] = w[j]; + } + free(w); + } + + } + + free(x); + free(label); + free(start); + free(count); + free(perm); + free(sub_prob.x); + free(sub_prob.y); + free(sub_prob.W); + free(weighted_C); + free(newprob.x); + free(newprob.y); + free(newprob.W); + } + return model_; +} + +#if 0 +void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target) +{ + int i; + int *fold_start; + int l = prob->l; + int *perm = Malloc(int,l); + if (nr_fold > l) + { + nr_fold = l; + fprintf(stderr,"WARNING: # folds > # data. 
Will use # folds = # data instead (i.e., leave-one-out cross validation)\n"); + } + fold_start = Malloc(int,nr_fold+1); + for(i=0;ibias; + subprob.n = prob->n; + subprob.l = l-(end-begin); + subprob.x = Malloc(struct feature_node*,subprob.l); + subprob.y = Malloc(double,subprob.l); + + k=0; + for(j=0;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + ++k; + } + for(j=end;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + ++k; + } + struct model *submodel = train(&subprob,param); + for(j=begin;jx[perm[j]]); + free_and_destroy_model(&submodel); + free(subprob.x); + free(subprob.y); + } + free(fold_start); + free(perm); +} + +double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values) +{ + int idx; + int n; + if(model_->bias>=0) + n=model_->nr_feature+1; + else + n=model_->nr_feature; + double *w=model_->w; + int nr_class=model_->nr_class; + int i; + int nr_w; + if(nr_class==2 && model_->param.solver_type != MCSVM_CS) + nr_w = 1; + else + nr_w = nr_class; + + const feature_node *lx=x; + for(i=0;iindex)!=-1; lx++) + { + // the dimension of testing data may exceed that of training + if(idx<=n) + for(i=0;ivalue; + } + + if(nr_class==2) + { + if(check_regression_model(model_)) + return dec_values[0]; + else + return (dec_values[0]>0)?model_->label[0]:model_->label[1]; + } + else + { + int dec_max_idx = 0; + for(i=1;i dec_values[dec_max_idx]) + dec_max_idx = i; + } + return model_->label[dec_max_idx]; + } +} + +double predict(const model *model_, const feature_node *x) +{ + double *dec_values = Malloc(double, model_->nr_class); + double label=predict_values(model_, x, dec_values); + free(dec_values); + return label; +} + +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates) +{ + if(check_probability_model(model_)) + { + int i; + int nr_class=model_->nr_class; + int nr_w; + if(nr_class==2) + nr_w = 1; + else + nr_w = nr_class; + + double label=predict_values(model_, x, prob_estimates); + for(i=0;inr_feature; + int n; + const parameter& param = model_->param; + + if(model_->bias>=0) + n=nr_feature+1; + else + n=nr_feature; + int w_size = n; + FILE *fp = fopen(model_file_name,"w"); + if(fp==NULL) return -1; + + char *old_locale = strdup(setlocale(LC_ALL, NULL)); + setlocale(LC_ALL, "C"); + + int nr_w; + if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS) + nr_w=1; + else + nr_w=model_->nr_class; + + fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]); + fprintf(fp, "nr_class %d\n", model_->nr_class); + + if(model_->label) + { + fprintf(fp, "label"); + for(i=0; inr_class; i++) + fprintf(fp, " %d", model_->label[i]); + fprintf(fp, "\n"); + } + + fprintf(fp, "nr_feature %d\n", nr_feature); + + fprintf(fp, "bias %.16g\n", model_->bias); + + fprintf(fp, "w\n"); + for(i=0; iw[i*nr_w+j]); + fprintf(fp, "\n"); + } + + setlocale(LC_ALL, old_locale); + free(old_locale); + + if (ferror(fp) != 0 || fclose(fp) != 0) return -1; + else return 0; +} + +struct model *load_model(const char *model_file_name) +{ + FILE *fp = fopen(model_file_name,"r"); + if(fp==NULL) return NULL; + + int i; + int nr_feature; + int n; + int nr_class; + double bias; + model *model_ = Malloc(model,1); + parameter& param = model_->param; + + model_->label = NULL; + + char *old_locale = strdup(setlocale(LC_ALL, NULL)); + setlocale(LC_ALL, "C"); + + char cmd[81]; + while(1) + { + fscanf(fp,"%80s",cmd); + if(strcmp(cmd,"solver_type")==0) + { + fscanf(fp,"%80s",cmd); + int i; + for(i=0;solver_type_table[i];i++) + { + 
if(strcmp(solver_type_table[i],cmd)==0) + { + param.solver_type=i; + break; + } + } + if(solver_type_table[i] == NULL) + { + fprintf(stderr,"unknown solver type.\n"); + + setlocale(LC_ALL, old_locale); + free(model_->label); + free(model_); + free(old_locale); + return NULL; + } + } + else if(strcmp(cmd,"nr_class")==0) + { + fscanf(fp,"%d",&nr_class); + model_->nr_class=nr_class; + } + else if(strcmp(cmd,"nr_feature")==0) + { + fscanf(fp,"%d",&nr_feature); + model_->nr_feature=nr_feature; + } + else if(strcmp(cmd,"bias")==0) + { + fscanf(fp,"%lf",&bias); + model_->bias=bias; + } + else if(strcmp(cmd,"w")==0) + { + break; + } + else if(strcmp(cmd,"label")==0) + { + int nr_class = model_->nr_class; + model_->label = Malloc(int,nr_class); + for(int i=0;ilabel[i]); + } + else + { + fprintf(stderr,"unknown text in model file: [%s]\n",cmd); + setlocale(LC_ALL, old_locale); + free(model_->label); + free(model_); + free(old_locale); + return NULL; + } + } + + nr_feature=model_->nr_feature; + if(model_->bias>=0) + n=nr_feature+1; + else + n=nr_feature; + int w_size = n; + int nr_w; + if(nr_class==2 && param.solver_type != MCSVM_CS) + nr_w = 1; + else + nr_w = nr_class; + + model_->w=Malloc(double, w_size*nr_w); + for(i=0; iw[i*nr_w+j]); + fscanf(fp, "\n"); + } + + setlocale(LC_ALL, old_locale); + free(old_locale); + + if (ferror(fp) != 0 || fclose(fp) != 0) return NULL; + + return model_; +} +#endif + +int get_nr_feature(const model *model_) +{ + return model_->nr_feature; +} + +int get_nr_class(const model *model_) +{ + return model_->nr_class; +} + +void get_labels(const model *model_, int* label) +{ + if (model_->label != NULL) + for(int i=0;inr_class;i++) + label[i] = model_->label[i]; +} + +void get_n_iter(const model *model_, int* n_iter) +{ + int labels; + labels = model_->nr_class; + if (labels == 2) + labels = 1; + + if (model_->n_iter != NULL) + for(int i=0;in_iter[i]; +} + +#if 0 +// use inline here for better performance (around 20% faster than the non-inline one) +static inline double get_w_value(const struct model *model_, int idx, int label_idx) +{ + int nr_class = model_->nr_class; + int solver_type = model_->param.solver_type; + const double *w = model_->w; + + if(idx < 0 || idx > model_->nr_feature) + return 0; + if(check_regression_model(model_)) + return w[idx]; + else + { + if(label_idx < 0 || label_idx >= nr_class) + return 0; + if(nr_class == 2 && solver_type != MCSVM_CS) + { + if(label_idx == 0) + return w[idx]; + else + return -w[idx]; + } + else + return w[idx*nr_class+label_idx]; + } +} + +// feat_idx: starting from 1 to nr_feature +// label_idx: starting from 0 to nr_class-1 for classification models; +// for regression models, label_idx is ignored. 
+double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx) +{ + if(feat_idx > model_->nr_feature) + return 0; + return get_w_value(model_, feat_idx-1, label_idx); +} + +double get_decfun_bias(const struct model *model_, int label_idx) +{ + int bias_idx = model_->nr_feature; + double bias = model_->bias; + if(bias <= 0) + return 0; + else + return bias*get_w_value(model_, bias_idx, label_idx); +} +#endif + +void free_model_content(struct model *model_ptr) +{ + if(model_ptr->w != NULL) + free(model_ptr->w); + if(model_ptr->label != NULL) + free(model_ptr->label); + if(model_ptr->n_iter != NULL) + free(model_ptr->n_iter); +} + +void free_and_destroy_model(struct model **model_ptr_ptr) +{ + struct model *model_ptr = *model_ptr_ptr; + if(model_ptr != NULL) + { + free_model_content(model_ptr); + free(model_ptr); + } +} + +void destroy_param(parameter* param) +{ + if(param->weight_label != NULL) + free(param->weight_label); + if(param->weight != NULL) + free(param->weight); +} + +const char *check_parameter(const problem *prob, const parameter *param) +{ + if(param->eps <= 0) + return "eps <= 0"; + + if(param->C <= 0) + return "C <= 0"; + + if(param->p < 0) + return "p < 0"; + + if(param->solver_type != L2R_LR + && param->solver_type != L2R_L2LOSS_SVC_DUAL + && param->solver_type != L2R_L2LOSS_SVC + && param->solver_type != L2R_L1LOSS_SVC_DUAL + && param->solver_type != MCSVM_CS + && param->solver_type != L1R_L2LOSS_SVC + && param->solver_type != L1R_LR + && param->solver_type != L2R_LR_DUAL + && param->solver_type != L2R_L2LOSS_SVR + && param->solver_type != L2R_L2LOSS_SVR_DUAL + && param->solver_type != L2R_L1LOSS_SVR_DUAL) + return "unknown solver type"; + + return NULL; +} + +#if 0 +int check_probability_model(const struct model *model_) +{ + return (model_->param.solver_type==L2R_LR || + model_->param.solver_type==L2R_LR_DUAL || + model_->param.solver_type==L1R_LR); +} +#endif + +int check_regression_model(const struct model *model_) +{ + return (model_->param.solver_type==L2R_L2LOSS_SVR || + model_->param.solver_type==L2R_L1LOSS_SVR_DUAL || + model_->param.solver_type==L2R_L2LOSS_SVR_DUAL); +} + +void set_print_string_function(void (*print_func)(const char*)) +{ + if (print_func == NULL) + liblinear_print_string = &print_string_stdout; + else + liblinear_print_string = print_func; +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.h new file mode 100644 index 0000000000000000000000000000000000000000..1dfc1c0ed014943bc797cd89689237761f41568b --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/linear.h @@ -0,0 +1,86 @@ +#ifndef _LIBLINEAR_H +#define _LIBLINEAR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "_cython_blas_helpers.h" + +struct feature_node +{ + int index; + double value; +}; + +struct problem +{ + int l, n; + double *y; + struct feature_node **x; + double bias; /* < 0 if no bias term */ + double *W; +}; + +enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL, L2R_L2LOSS_SVR = 11, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL }; /* solver_type */ + +struct parameter +{ + int solver_type; + + /* these are for training only */ + double eps; /* stopping criteria */ + double C; + int nr_weight; + int *weight_label; + double* weight; + int max_iter; + double p; +}; + +struct model +{ + struct parameter param; + int nr_class; /* number of classes */ + int 
nr_feature; + double *w; + int *label; /* label of each class */ + double bias; + int *n_iter; /* no. of iterations of each class */ +}; + +void set_seed(unsigned seed); + +struct model* train(const struct problem *prob, const struct parameter *param, BlasFunctions *blas_functions); +void cross_validation(const struct problem *prob, const struct parameter *param, int nr_fold, double *target); + +double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values); +double predict(const struct model *model_, const struct feature_node *x); +double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates); + +int save_model(const char *model_file_name, const struct model *model_); +struct model *load_model(const char *model_file_name); + +int get_nr_feature(const struct model *model_); +int get_nr_class(const struct model *model_); +void get_labels(const struct model *model_, int* label); +void get_n_iter(const struct model *model_, int* n_iter); +#if 0 +double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx); +double get_decfun_bias(const struct model *model_, int label_idx); +#endif + +void free_model_content(struct model *model_ptr); +void free_and_destroy_model(struct model **model_ptr_ptr); +void destroy_param(struct parameter *param); + +const char *check_parameter(const struct problem *prob, const struct parameter *param); +int check_probability_model(const struct model *model); +int check_regression_model(const struct model *model); +void set_print_string_function(void (*print_func) (const char*)); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBLINEAR_H */ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.cpp new file mode 100644 index 0000000000000000000000000000000000000000..168a62ca47a2f4850508f6a0130eee3b8bd09194 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.cpp @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include "tron.h" + +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif + +static void default_print(const char *buf) +{ + fputs(buf,stdout); + fflush(stdout); +} + +void TRON::info(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*tron_print_string)(buf); +} + +TRON::TRON(const function *fun_obj, double eps, int max_iter, BlasFunctions *blas) +{ + this->fun_obj=const_cast(fun_obj); + this->eps=eps; + this->max_iter=max_iter; + this->blas=blas; + tron_print_string = default_print; +} + +TRON::~TRON() +{ +} + +int TRON::tron(double *w) +{ + // Parameters for updating the iterates. + double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75; + + // Parameters for updating the trust region size delta. 
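+	// delta is shrunk when the actual reduction actred falls well short of
+	// the predicted reduction prered (the eta0/eta1 tests below), and is
+	// enlarged by at most a factor of sigma3 when the model predicts well.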
+ double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4; + + int n = fun_obj->get_nr_variable(); + int i, cg_iter; + double delta, snorm; + double alpha, f, fnew, prered, actred, gs; + int search = 1, iter = 1, inc = 1; + double *s = new double[n]; + double *r = new double[n]; + double *w_new = new double[n]; + double *g = new double[n]; + + for (i=0; ifun(w); + fun_obj->grad(w, g); + delta = blas->nrm2(n, g, inc); + double gnorm1 = delta; + double gnorm = gnorm1; + + if (gnorm <= eps*gnorm1) + search = 0; + + iter = 1; + + while (iter <= max_iter && search) + { + cg_iter = trcg(delta, g, s, r); + + memcpy(w_new, w, sizeof(double)*n); + blas->axpy(n, 1.0, s, inc, w_new, inc); + + gs = blas->dot(n, g, inc, s, inc); + prered = -0.5*(gs - blas->dot(n, s, inc, r, inc)); + fnew = fun_obj->fun(w_new); + + // Compute the actual reduction. + actred = f - fnew; + + // On the first iteration, adjust the initial step bound. + snorm = blas->nrm2(n, s, inc); + if (iter == 1) + delta = min(delta, snorm); + + // Compute prediction alpha*snorm of the step. + if (fnew - f - gs <= 0) + alpha = sigma3; + else + alpha = max(sigma1, -0.5*(gs/(fnew - f - gs))); + + // Update the trust region bound according to the ratio of actual to predicted reduction. + if (actred < eta0*prered) + delta = min(max(alpha, sigma1)*snorm, sigma2*delta); + else if (actred < eta1*prered) + delta = max(sigma1*delta, min(alpha*snorm, sigma2*delta)); + else if (actred < eta2*prered) + delta = max(sigma1*delta, min(alpha*snorm, sigma3*delta)); + else + delta = max(delta, min(alpha*snorm, sigma3*delta)); + + info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter); + + if (actred > eta0*prered) + { + iter++; + memcpy(w, w_new, sizeof(double)*n); + f = fnew; + fun_obj->grad(w, g); + + gnorm = blas->nrm2(n, g, inc); + if (gnorm <= eps*gnorm1) + break; + } + if (f < -1.0e+32) + { + info("WARNING: f < -1.0e+32\n"); + break; + } + if (fabs(actred) <= 0 && prered <= 0) + { + info("WARNING: actred and prered <= 0\n"); + break; + } + if (fabs(actred) <= 1.0e-12*fabs(f) && + fabs(prered) <= 1.0e-12*fabs(f)) + { + info("WARNING: actred and prered too small\n"); + break; + } + } + + delete[] g; + delete[] r; + delete[] w_new; + delete[] s; + return --iter; +} + +int TRON::trcg(double delta, double *g, double *s, double *r) +{ + int i, inc = 1; + int n = fun_obj->get_nr_variable(); + double *d = new double[n]; + double *Hd = new double[n]; + double rTr, rnewTrnew, alpha, beta, cgtol; + + for (i=0; inrm2(n, g, inc); + + int cg_iter = 0; + rTr = blas->dot(n, r, inc, r, inc); + while (1) + { + if (blas->nrm2(n, r, inc) <= cgtol) + break; + cg_iter++; + fun_obj->Hv(d, Hd); + + alpha = rTr / blas->dot(n, d, inc, Hd, inc); + blas->axpy(n, alpha, d, inc, s, inc); + if (blas->nrm2(n, s, inc) > delta) + { + info("cg reaches trust region boundary\n"); + alpha = -alpha; + blas->axpy(n, alpha, d, inc, s, inc); + + double std = blas->dot(n, s, inc, d, inc); + double sts = blas->dot(n, s, inc, s, inc); + double dtd = blas->dot(n, d, inc, d, inc); + double dsq = delta*delta; + double rad = sqrt(std*std + dtd*(dsq-sts)); + if (std >= 0) + alpha = (dsq - sts)/(std + rad); + else + alpha = (rad - std)/dtd; + blas->axpy(n, alpha, d, inc, s, inc); + alpha = -alpha; + blas->axpy(n, alpha, Hd, inc, r, inc); + break; + } + alpha = -alpha; + blas->axpy(n, alpha, Hd, inc, r, inc); + rnewTrnew = blas->dot(n, r, inc, r, inc); + beta = rnewTrnew/rTr; + blas->scal(n, beta, d, inc); + blas->axpy(n, 1.0, r, inc, d, 
inc); + rTr = rnewTrnew; + } + + delete[] d; + delete[] Hd; + + return(cg_iter); +} + +double TRON::norm_inf(int n, double *x) +{ + double dmax = fabs(x[0]); + for (int i=1; i= dmax) + dmax = fabs(x[i]); + return(dmax); +} + +void TRON::set_print_string(void (*print_string) (const char *buf)) +{ + tron_print_string = print_string; +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.h new file mode 100644 index 0000000000000000000000000000000000000000..735304ed16b6fc28c5900d2be2f41f47a32ccc9a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/liblinear/tron.h @@ -0,0 +1,37 @@ +#ifndef _TRON_H +#define _TRON_H + +#include "_cython_blas_helpers.h" + +class function +{ +public: + virtual double fun(double *w) = 0 ; + virtual void grad(double *w, double *g) = 0 ; + virtual void Hv(double *s, double *Hs) = 0 ; + + virtual int get_nr_variable(void) = 0 ; + virtual ~function(void){} +}; + +class TRON +{ +public: + TRON(const function *fun_obj, double eps = 0.1, int max_iter = 1000, BlasFunctions *blas = 0); + ~TRON(); + + int tron(double *w); + void set_print_string(void (*i_print) (const char *buf)); + +private: + int trcg(double delta, double *g, double *s, double *r); + double norm_inf(int n, double *x); + + double eps; + int max_iter; + function *fun_obj; + BlasFunctions *blas; + void info(const char *fmt,...); + void (*tron_print_string)(const char *buf); +}; +#endif diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES new file mode 100644 index 0000000000000000000000000000000000000000..663550b8ddd6fa905d3cec6e02be50faa43859c3 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/LIBSVM_CHANGES @@ -0,0 +1,11 @@ +Changes to Libsvm + +This is here mainly as checklist for incorporation of new versions of libsvm. + + * Add copyright to files svm.cpp and svm.h + * Add random_seed support and call to srand in fit function + * Improved random number generator (fix on windows, enhancement on other + platforms). See + * invoke scipy blas api for svm kernel function to improve performance with speedup rate of 1.5X to 2X for dense data only. See + * Expose the number of iterations run in optimization. 
See +The changes made with respect to upstream are detailed in the heading of svm.cpp diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..2548c7844d267ec631102ae1f44e48cab2b0a729 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/_svm_cython_blas_helpers.h @@ -0,0 +1,9 @@ +#ifndef _SVM_CYTHON_BLAS_HELPERS_H +#define _SVM_CYTHON_BLAS_HELPERS_H + +typedef double (*dot_func)(int, const double*, int, const double*, int); +typedef struct BlasFunctions{ + dot_func dot; +} BlasFunctions; + +#endif diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..b87b52a6fbdc244df315c6f03f80b3321c852fdc --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_helper.c @@ -0,0 +1,425 @@ +#include +#define PY_SSIZE_T_CLEAN +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" + + +#ifndef MAX + #define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#endif + + +/* + * Some helper methods for libsvm bindings. + * + * We need to access from python some parameters stored in svm_model + * but libsvm does not expose this structure, so we define it here + * along some utilities to convert from numpy arrays. + * + * Authors: The scikit-learn developers + * SPDX-License-Identifier: BSD-3-Clause + * + */ + + +/* + * Convert matrix to sparse representation suitable for libsvm. x is + * expected to be an array of length nrow*ncol. + * + * Typically the matrix will be dense, so we speed up the routine for + * this case. We create a temporary array temp that collects non-zero + * elements and after we just memcpy that to the proper array. + * + * Special care must be taken with indinces, since libsvm indices start + * at 1 and not at 0. + * + * Strictly speaking, the C standard does not require that structs are + * contiguous, but in practice its a reasonable assumption. + * + */ +struct svm_node *dense_to_libsvm (double *x, Py_ssize_t *dims) +{ + struct svm_node *node; + Py_ssize_t len_row = dims[1]; + double *tx = x; + int i; + + node = malloc (dims[0] * sizeof(struct svm_node)); + + if (node == NULL) return NULL; + for (i=0; isvm_type = svm_type; + param->kernel_type = kernel_type; + param->degree = degree; + param->coef0 = coef0; + param->nu = nu; + param->cache_size = cache_size; + param->C = C; + param->eps = eps; + param->p = p; + param->shrinking = shrinking; + param->probability = probability; + param->nr_weight = nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->gamma = gamma; + param->max_iter = max_iter; + param->random_seed = random_seed; +} + +/* + * Fill an svm_problem struct. problem->x will be malloc'd. + */ +void set_problem(struct svm_problem *problem, char *X, char *Y, char *sample_weight, Py_ssize_t *dims, int kernel_type) +{ + if (problem == NULL) return; + problem->l = (int) dims[0]; /* number of samples */ + problem->y = (double *) Y; + problem->x = dense_to_libsvm((double *) X, dims); /* implicit call to malloc */ + problem->W = (double *) sample_weight; +} + +/* + * Create and return an instance of svm_model. 
+ * + * The copy of model->sv_coef should be straightforward, but + * unfortunately to represent a matrix numpy and libsvm use different + * approaches, so it requires some iteration. + * + * Possible issue: on 64 bits, the number of columns that numpy can + * store is a long, but libsvm enforces this number (model->l) to be + * an int, so we might have numpy matrices that do not fit into libsvm's + * data structure. + * + */ +struct svm_model *set_model(struct svm_parameter *param, int nr_class, + char *SV, Py_ssize_t *SV_dims, + char *support, Py_ssize_t *support_dims, + Py_ssize_t *sv_coef_strides, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) +{ + struct svm_model *model; + double *dsv_coef = (double *) sv_coef; + int i, m; + + m = nr_class * (nr_class-1)/2; + + if ((model = malloc(sizeof(struct svm_model))) == NULL) + goto model_error; + if ((model->nSV = malloc(nr_class * sizeof(int))) == NULL) + goto nsv_error; + if ((model->label = malloc(nr_class * sizeof(int))) == NULL) + goto label_error; + if ((model->sv_coef = malloc((nr_class-1)*sizeof(double *))) == NULL) + goto sv_coef_error; + if ((model->rho = malloc( m * sizeof(double))) == NULL) + goto rho_error; + + // This is only allocated in dynamic memory while training. + model->n_iter = NULL; + + model->nr_class = nr_class; + model->param = *param; + model->l = (int) support_dims[0]; + + if (param->kernel_type == PRECOMPUTED) { + if ((model->SV = malloc ((model->l) * sizeof(struct svm_node))) == NULL) + goto SV_error; + for (i=0; il; ++i) { + model->SV[i].ind = ((int *) support)[i]; + model->SV[i].values = NULL; + } + } else { + model->SV = dense_to_libsvm((double *) SV, SV_dims); + } + /* + * regression and one-class does not use nSV, label. + * TODO: does this provoke memory leaks (we just malloc'ed them)? + */ + if (param->svm_type < 2) { + memcpy(model->nSV, nSV, model->nr_class * sizeof(int)); + for(i=0; i < model->nr_class; i++) + model->label[i] = i; + } + + for (i=0; i < model->nr_class-1; i++) { + model->sv_coef[i] = dsv_coef + i*(model->l); + } + + for (i=0; irho)[i] = -((double *) rho)[i]; + } + + /* + * just to avoid segfaults, these features are not wrapped but + * svm_destroy_model will try to free them. + */ + + if (param->probability) { + if ((model->probA = malloc(m * sizeof(double))) == NULL) + goto probA_error; + memcpy(model->probA, probA, m * sizeof(double)); + if ((model->probB = malloc(m * sizeof(double))) == NULL) + goto probB_error; + memcpy(model->probB, probB, m * sizeof(double)); + } else { + model->probA = NULL; + model->probB = NULL; + } + + /* We'll free SV ourselves */ + model->free_sv = 0; + return model; + +probB_error: + free(model->probA); +probA_error: + free(model->SV); +SV_error: + free(model->rho); +rho_error: + free(model->sv_coef); +sv_coef_error: + free(model->label); +label_error: + free(model->nSV); +nsv_error: + free(model); +model_error: + return NULL; +} + + + +/* + * Get the number of support vectors in a model. + */ +Py_ssize_t get_l(struct svm_model *model) +{ + return (Py_ssize_t) model->l; +} + +/* + * Get the number of classes in a model, = 2 in regression/one class + * svm. 
+ */ +Py_ssize_t get_nr(struct svm_model *model) +{ + return (Py_ssize_t) model->nr_class; +} + +/* + * Get the number of iterations run in optimization + */ +void copy_n_iter(char *data, struct svm_model *model) +{ + const int n_models = MAX(1, model->nr_class * (model->nr_class-1) / 2); + memcpy(data, model->n_iter, n_models * sizeof(int)); +} + +/* + * Some helpers to convert from libsvm sparse data structures + * model->sv_coef is a double **, whereas data is just a double *, + * so we have to do some stupid copying. + */ +void copy_sv_coef(char *data, struct svm_model *model) +{ + int i, len = model->nr_class-1; + double *temp = (double *) data; + for(i=0; isv_coef[i], sizeof(double) * model->l); + temp += model->l; + } +} + +void copy_intercept(char *data, struct svm_model *model, Py_ssize_t *dims) +{ + /* intercept = -rho */ + Py_ssize_t i, n = dims[0]; + double t, *ddata = (double *) data; + for (i=0; irho[i]; + /* we do this to avoid ugly -0.0 */ + *ddata = (t != 0) ? -t : 0; + ++ddata; + } +} + +/* + * This is a bit more complex since SV are stored as sparse + * structures, so we have to do the conversion on the fly and also + * iterate fast over data. + */ +void copy_SV(char *data, struct svm_model *model, Py_ssize_t *dims) +{ + int i, n = model->l; + double *tdata = (double *) data; + int dim = model->SV[0].dim; + for (i=0; iSV[i].values, dim * sizeof(double)); + tdata += dim; + } +} + +void copy_support (char *data, struct svm_model *model) +{ + memcpy (data, model->sv_ind, (model->l) * sizeof(int)); +} + +/* + * copy svm_model.nSV, an array with the number of SV for each class + * will be NULL in the case of SVR, OneClass + */ +void copy_nSV(char *data, struct svm_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->nSV, model->nr_class * sizeof(int)); +} + +void copy_probA(char *data, struct svm_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probA, dims[0] * sizeof(double)); +} + +void copy_probB(char *data, struct svm_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probB, dims[0] * sizeof(double)); +} + +/* + * Predict using model. + * + * It will return -1 if we run out of memory. 
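+ * The dense input is first converted to libsvm's node representation;
+ * dec_values then receives one prediction per row of predict.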
+ */ +int copy_predict(char *predict, struct svm_model *model, Py_ssize_t *predict_dims, + char *dec_values, BlasFunctions *blas_functions) +{ + double *t = (double *) dec_values; + struct svm_node *predict_nodes; + Py_ssize_t i; + + predict_nodes = dense_to_libsvm((double *) predict, predict_dims); + + if (predict_nodes == NULL) + return -1; + for(i=0; inr_class; + predict_nodes = dense_to_libsvm((double *) predict, predict_dims); + if (predict_nodes == NULL) + return -1; + for(i=0; iSV); + + /* We don't free sv_ind and n_iter, since we did not create them in + set_model */ + /* free(model->sv_ind); + * free(model->n_iter); + */ + free(model->sv_coef); + free(model->rho); + free(model->label); + free(model->probA); + free(model->probB); + free(model->nSV); + free(model); + + return 0; +} + +int free_param(struct svm_parameter *param) +{ + if (param == NULL) return -1; + free(param); + return 0; +} + + +/* borrowed from original libsvm code */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + svm_set_print_string_function(&print_string_stdout); + else + svm_set_print_string_function(&print_null); +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..0ba153647cb8c158de24cb41e69fad90f44b1fc8 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_sparse_helper.c @@ -0,0 +1,472 @@ +#include +#define PY_SSIZE_T_CLEAN +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" + + +#ifndef MAX + #define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#endif + + +/* + * Convert scipy.sparse.csr to libsvm's sparse data structure + */ +struct svm_csr_node **csr_to_libsvm (double *values, int* indices, int* indptr, int n_samples) +{ + struct svm_csr_node **sparse, *temp; + int i, j=0, k=0, n; + sparse = malloc (n_samples * sizeof(struct svm_csr_node *)); + + if (sparse == NULL) + return NULL; + + for (i=0; isvm_type = svm_type; + param->kernel_type = kernel_type; + param->degree = degree; + param->coef0 = coef0; + param->nu = nu; + param->cache_size = cache_size; + param->C = C; + param->eps = eps; + param->p = p; + param->shrinking = shrinking; + param->probability = probability; + param->nr_weight = nr_weight; + param->weight_label = (int *) weight_label; + param->weight = (double *) weight; + param->gamma = gamma; + param->max_iter = max_iter; + param->random_seed = random_seed; + return param; +} + + +/* + * Create and return a svm_csr_problem struct from a scipy.sparse.csr matrix. It is + * up to the user to free resulting structure. + * + * TODO: precomputed kernel. 
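+ * The number of samples is taken as n_indptr[0] - 1, i.e. one less than
+ * the length of the CSR indptr array.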
+ */ +struct svm_csr_problem * csr_set_problem (char *values, Py_ssize_t *n_indices, + char *indices, Py_ssize_t *n_indptr, char *indptr, char *Y, + char *sample_weight, int kernel_type) { + + struct svm_csr_problem *problem; + problem = malloc (sizeof (struct svm_csr_problem)); + if (problem == NULL) return NULL; + problem->l = (int) n_indptr[0] - 1; + problem->y = (double *) Y; + problem->x = csr_to_libsvm((double *) values, (int *) indices, + (int *) indptr, problem->l); + /* should be removed once we implement weighted samples */ + problem->W = (double *) sample_weight; + + if (problem->x == NULL) { + free(problem); + return NULL; + } + return problem; +} + + +struct svm_csr_model *csr_set_model(struct svm_parameter *param, int nr_class, + char *SV_data, Py_ssize_t *SV_indices_dims, + char *SV_indices, Py_ssize_t *SV_indptr_dims, + char *SV_intptr, + char *sv_coef, char *rho, char *nSV, + char *probA, char *probB) +{ + struct svm_csr_model *model; + double *dsv_coef = (double *) sv_coef; + int i, m; + + m = nr_class * (nr_class-1)/2; + + if ((model = malloc(sizeof(struct svm_csr_model))) == NULL) + goto model_error; + if ((model->nSV = malloc(nr_class * sizeof(int))) == NULL) + goto nsv_error; + if ((model->label = malloc(nr_class * sizeof(int))) == NULL) + goto label_error; + if ((model->sv_coef = malloc((nr_class-1)*sizeof(double *))) == NULL) + goto sv_coef_error; + if ((model->rho = malloc( m * sizeof(double))) == NULL) + goto rho_error; + + // This is only allocated in dynamic memory while training. + model->n_iter = NULL; + + /* in the case of precomputed kernels we do not use + dense_to_precomputed because we don't want the leading 0. As + indices start at 1 (not at 0) this will work */ + model->l = (int) SV_indptr_dims[0] - 1; + model->SV = csr_to_libsvm((double *) SV_data, (int *) SV_indices, + (int *) SV_intptr, model->l); + model->nr_class = nr_class; + model->param = *param; + + /* + * regression and one-class does not use nSV, label. + */ + if (param->svm_type < 2) { + memcpy(model->nSV, nSV, model->nr_class * sizeof(int)); + for(i=0; i < model->nr_class; i++) + model->label[i] = i; + } + + for (i=0; i < model->nr_class-1; i++) { + /* + * We cannot squash all this mallocs in a single call since + * svm_destroy_model will free each element of the array. + */ + if ((model->sv_coef[i] = malloc((model->l) * sizeof(double))) == NULL) { + int j; + for (j=0; jsv_coef[j]); + goto sv_coef_i_error; + } + memcpy(model->sv_coef[i], dsv_coef, (model->l) * sizeof(double)); + dsv_coef += model->l; + } + + for (i=0; irho)[i] = -((double *) rho)[i]; + } + + /* + * just to avoid segfaults, these features are not wrapped but + * svm_destroy_model will try to free them. 
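+ * probA/probB are therefore either copied here or set to NULL, so that
+ * freeing the model never touches memory it does not own.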
+ */ + + if (param->probability) { + if ((model->probA = malloc(m * sizeof(double))) == NULL) + goto probA_error; + memcpy(model->probA, probA, m * sizeof(double)); + if ((model->probB = malloc(m * sizeof(double))) == NULL) + goto probB_error; + memcpy(model->probB, probB, m * sizeof(double)); + } else { + model->probA = NULL; + model->probB = NULL; + } + + /* We'll free SV ourselves */ + model->free_sv = 0; + return model; + +probB_error: + free(model->probA); +probA_error: + for (i=0; i < model->nr_class-1; i++) + free(model->sv_coef[i]); +sv_coef_i_error: + free(model->rho); +rho_error: + free(model->sv_coef); +sv_coef_error: + free(model->label); +label_error: + free(model->nSV); +nsv_error: + free(model); +model_error: + return NULL; +} + + +/* + * Copy support vectors into a scipy.sparse.csr matrix + */ +int csr_copy_SV (char *data, Py_ssize_t *n_indices, + char *indices, Py_ssize_t *n_indptr, char *indptr, + struct svm_csr_model *model, int n_features) +{ + int i, j, k=0, index; + double *dvalues = (double *) data; + int *iindices = (int *) indices; + int *iindptr = (int *) indptr; + iindptr[0] = 0; + for (i=0; il; ++i) { /* iterate over support vectors */ + index = model->SV[i][0].index; + for(j=0; index >=0 ; ++j) { + iindices[k] = index - 1; + dvalues[k] = model->SV[i][j].value; + index = model->SV[i][j+1].index; + ++k; + } + iindptr[i+1] = k; + } + + return 0; +} + +/* get number of nonzero coefficients in support vectors */ +Py_ssize_t get_nonzero_SV (struct svm_csr_model *model) { + int i, j; + Py_ssize_t count=0; + for (i=0; il; ++i) { + j = 0; + while (model->SV[i][j].index != -1) { + ++j; + ++count; + } + } + return count; +} + + +/* + * Predict using a model, where data is expected to be encoded into a csr matrix. + */ +int csr_copy_predict (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, BlasFunctions *blas_functions) { + double *t = (double *) dec_values; + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < intptr_size[0] - 1; ++i) { + *t = svm_csr_predict(model, predict_nodes[i], blas_functions); + free(predict_nodes[i]); + ++t; + } + free(predict_nodes); + return 0; +} + +int csr_copy_predict_values (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, int nr_class, BlasFunctions *blas_functions) { + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < intptr_size[0] - 1; ++i) { + svm_csr_predict_values(model, predict_nodes[i], + ((double *) dec_values) + i*nr_class, + blas_functions); + free(predict_nodes[i]); + } + free(predict_nodes); + + return 0; +} + +int csr_copy_predict_proba (Py_ssize_t *data_size, char *data, Py_ssize_t *index_size, + char *index, Py_ssize_t *intptr_size, char *intptr, struct svm_csr_model *model, + char *dec_values, BlasFunctions *blas_functions) { + + struct svm_csr_node **predict_nodes; + Py_ssize_t i; + int m = model->nr_class; + + predict_nodes = csr_to_libsvm((double *) data, (int *) index, + (int *) intptr, intptr_size[0]-1); + + if (predict_nodes == NULL) + return -1; + for(i=0; i < 
intptr_size[0] - 1; ++i) { + svm_csr_predict_probability( + model, predict_nodes[i], ((double *) dec_values) + i*m, blas_functions); + free(predict_nodes[i]); + } + free(predict_nodes); + return 0; +} + + +Py_ssize_t get_nr(struct svm_csr_model *model) +{ + return (Py_ssize_t) model->nr_class; +} + +void copy_intercept(char *data, struct svm_csr_model *model, Py_ssize_t *dims) +{ + /* intercept = -rho */ + Py_ssize_t i, n = dims[0]; + double t, *ddata = (double *) data; + for (i=0; irho[i]; + /* we do this to avoid ugly -0.0 */ + *ddata = (t != 0) ? -t : 0; + ++ddata; + } +} + +void copy_support (char *data, struct svm_csr_model *model) +{ + memcpy (data, model->sv_ind, (model->l) * sizeof(int)); +} + +/* + * Some helpers to convert from libsvm sparse data structures + * model->sv_coef is a double **, whereas data is just a double *, + * so we have to do some stupid copying. + */ +void copy_sv_coef(char *data, struct svm_csr_model *model) +{ + int i, len = model->nr_class-1; + double *temp = (double *) data; + for(i=0; isv_coef[i], sizeof(double) * model->l); + temp += model->l; + } +} + +/* + * Get the number of iterations run in optimization + */ +void copy_n_iter(char *data, struct svm_csr_model *model) +{ + const int n_models = MAX(1, model->nr_class * (model->nr_class-1) / 2); + memcpy(data, model->n_iter, n_models * sizeof(int)); +} + +/* + * Get the number of support vectors in a model. + */ +Py_ssize_t get_l(struct svm_csr_model *model) +{ + return (Py_ssize_t) model->l; +} + +void copy_nSV(char *data, struct svm_csr_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->nSV, model->nr_class * sizeof(int)); +} + +/* + * same as above with model->label + * TODO: merge in the cython layer + */ +void copy_label(char *data, struct svm_csr_model *model) +{ + if (model->label == NULL) return; + memcpy(data, model->label, model->nr_class * sizeof(int)); +} + +void copy_probA(char *data, struct svm_csr_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probA, dims[0] * sizeof(double)); +} + +void copy_probB(char *data, struct svm_csr_model *model, Py_ssize_t * dims) +{ + memcpy(data, model->probB, dims[0] * sizeof(double)); +} + + +/* + * Some free routines. Some of them are nontrivial since a lot of + * sharing happens across objects (they *must* be called in the + * correct order) + */ +int free_problem(struct svm_csr_problem *problem) +{ + int i; + if (problem == NULL) return -1; + for (i=0; il; ++i) + free (problem->x[i]); + free (problem->x); + free (problem); + return 0; +} + +int free_model(struct svm_csr_model *model) +{ + /* like svm_free_and_destroy_model, but does not free sv_coef[i] */ + /* We don't free n_iter, since we did not create them in set_model. 
*/ + if (model == NULL) return -1; + free(model->SV); + free(model->sv_coef); + free(model->rho); + free(model->label); + free(model->probA); + free(model->probB); + free(model->nSV); + free(model); + + return 0; +} + +int free_param(struct svm_parameter *param) +{ + if (param == NULL) return -1; + free(param); + return 0; +} + + +int free_model_SV(struct svm_csr_model *model) +{ + int i; + for (i=model->l-1; i>=0; --i) free(model->SV[i]); + /* svn_destroy_model frees model->SV */ + for (i=0; i < model->nr_class-1 ; ++i) free(model->sv_coef[i]); + /* svn_destroy_model frees model->sv_coef */ + return 0; +} + + +/* borrowed from original libsvm code */ +static void print_null(const char *s) {} + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} + +/* provide convenience wrapper */ +void set_verbosity(int verbosity_flag){ + if (verbosity_flag) + svm_set_print_string_function(&print_string_stdout); + else + svm_set_print_string_function(&print_null); +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f6dbd0dfd9ecd81bdd79c74a19d7299e179389d --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/libsvm_template.cpp @@ -0,0 +1,8 @@ + +/* this is a hack to generate libsvm with both sparse and dense + methods in the same binary*/ + +#define _DENSE_REP +#include "svm.cpp" +#undef _DENSE_REP +#include "svm.cpp" diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/svm.cpp b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/svm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a6f191d6616c968e4e2a31e24a23536da329d873 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/src/libsvm/svm.cpp @@ -0,0 +1,3187 @@ +/* +Copyright (c) 2000-2009 Chih-Chung Chang and Chih-Jen Lin +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* + Modified 2010: + + - Support for dense data by Ming-Fang Weng + + - Return indices for support vectors, Fabian Pedregosa + + + - Fixes to avoid name collision, Fabian Pedregosa + + - Add support for instance weights, Fabian Pedregosa based on work + by Ming-Wei Chang, Hsuan-Tien Lin, Ming-Hen Tsai, Chia-Hua Ho and + Hsiang-Fu Yu, + . + + - Make labels sorted in svm_group_classes, Fabian Pedregosa. + + Modified 2020: + + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, Schneider Electric + see + + Modified 2021: + + - Exposed number of iterations run in optimization, Juan Martín Loyola. + See + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "svm.h" +#include "_svm_cython_blas_helpers.h" +#include "../newrand/newrand.h" + + +#ifndef _LIBSVM_CPP +typedef float Qfloat; +typedef signed char schar; +#ifndef min +template static inline T min(T x,T y) { return (x static inline T max(T x,T y) { return (x>y)?x:y; } +#endif +template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } +template static inline void clone(T*& dst, S* src, int n) +{ + dst = new T[n]; + memcpy((void *)dst,(void *)src,sizeof(T)*n); +} +static inline double powi(double base, int times) +{ + double tmp = base, ret = 1.0; + + for(int t=times; t>0; t/=2) + { + if(t%2==1) ret*=tmp; + tmp = tmp * tmp; + } + return ret; +} +#define INF HUGE_VAL +#define TAU 1e-12 +#define Malloc(type,n) (type *)malloc((n)*sizeof(type)) + +static void print_string_stdout(const char *s) +{ + fputs(s,stdout); + fflush(stdout); +} +static void (*svm_print_string) (const char *) = &print_string_stdout; + +static void info(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*svm_print_string)(buf); +} +#endif +#define _LIBSVM_CPP + + +/* yeah, this is ugly. 
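+(With _DENSE_REP defined, PREFIX(name) below expands to svm_##name inside
+namespace svm; without it, to svm_csr_##name inside namespace svm_csr. That is
+what lets libsvm_template.cpp include this file twice in one binary.)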
It helps us to have unique names for both sparse +and dense versions of this library */ +#ifdef _DENSE_REP + #ifdef PREFIX + #undef PREFIX + #endif + #ifdef NAMESPACE + #undef NAMESPACE + #endif + #define PREFIX(name) svm_##name + #define NAMESPACE svm + namespace svm { +#else + /* sparse representation */ + #ifdef PREFIX + #undef PREFIX + #endif + #ifdef NAMESPACE + #undef NAMESPACE + #endif + #define PREFIX(name) svm_csr_##name + #define NAMESPACE svm_csr + namespace svm_csr { +#endif + + +// +// Kernel Cache +// +// l is the number of total data items +// size is the cache size limit in bytes +// +class Cache +{ +public: + Cache(int l,long int size); + ~Cache(); + + // request data [0,len) + // return some position p where [p,len) need to be filled + // (p >= len if nothing needs to be filled) + int get_data(const int index, Qfloat **data, int len); + void swap_index(int i, int j); +private: + int l; + long int size; + struct head_t + { + head_t *prev, *next; // a circular list + Qfloat *data; + int len; // data[0,len) is cached in this entry + }; + + head_t *head; + head_t lru_head; + void lru_delete(head_t *h); + void lru_insert(head_t *h); +}; + +Cache::Cache(int l_,long int size_):l(l_),size(size_) +{ + head = (head_t *)calloc(l,sizeof(head_t)); // initialized to 0 + size /= sizeof(Qfloat); + size -= l * sizeof(head_t) / sizeof(Qfloat); + size = max(size, 2 * (long int) l); // cache must be large enough for two columns + lru_head.next = lru_head.prev = &lru_head; +} + +Cache::~Cache() +{ + for(head_t *h = lru_head.next; h != &lru_head; h=h->next) + free(h->data); + free(head); +} + +void Cache::lru_delete(head_t *h) +{ + // delete from current location + h->prev->next = h->next; + h->next->prev = h->prev; +} + +void Cache::lru_insert(head_t *h) +{ + // insert to last position + h->next = &lru_head; + h->prev = lru_head.prev; + h->prev->next = h; + h->next->prev = h; +} + +int Cache::get_data(const int index, Qfloat **data, int len) +{ + head_t *h = &head[index]; + if(h->len) lru_delete(h); + int more = len - h->len; + + if(more > 0) + { + // free old space + while(size < more) + { + head_t *old = lru_head.next; + lru_delete(old); + free(old->data); + size += old->len; + old->data = 0; + old->len = 0; + } + + // allocate new space + h->data = (Qfloat *)realloc(h->data,sizeof(Qfloat)*len); + size -= more; + swap(h->len,len); + } + + lru_insert(h); + *data = h->data; + return len; +} + +void Cache::swap_index(int i, int j) +{ + if(i==j) return; + + if(head[i].len) lru_delete(&head[i]); + if(head[j].len) lru_delete(&head[j]); + swap(head[i].data,head[j].data); + swap(head[i].len,head[j].len); + if(head[i].len) lru_insert(&head[i]); + if(head[j].len) lru_insert(&head[j]); + + if(i>j) swap(i,j); + for(head_t *h = lru_head.next; h!=&lru_head; h=h->next) + { + if(h->len > i) + { + if(h->len > j) + swap(h->data[i],h->data[j]); + else + { + // give up + lru_delete(h); + free(h->data); + size += h->len; + h->data = 0; + h->len = 0; + } + } + } +} + +// +// Kernel evaluation +// +// the static method k_function is for doing single kernel evaluation +// the constructor of Kernel prepares to calculate the l*l kernel matrix +// the member function get_Q is for getting one column from the Q Matrix +// +class QMatrix { +public: + virtual Qfloat *get_Q(int column, int len) const = 0; + virtual double *get_QD() const = 0; + virtual void swap_index(int i, int j) const = 0; + virtual ~QMatrix() {} +}; + +class Kernel: public QMatrix { +public: +#ifdef _DENSE_REP + Kernel(int l, PREFIX(node) * x, const 
svm_parameter& param, BlasFunctions *blas_functions); +#else + Kernel(int l, PREFIX(node) * const * x, const svm_parameter& param, BlasFunctions *blas_functions); +#endif + virtual ~Kernel(); + + static double k_function(const PREFIX(node) *x, const PREFIX(node) *y, + const svm_parameter& param, BlasFunctions *blas_functions); + virtual Qfloat *get_Q(int column, int len) const = 0; + virtual double *get_QD() const = 0; + virtual void swap_index(int i, int j) const // no so const... + { + swap(x[i],x[j]); + if(x_square) swap(x_square[i],x_square[j]); + } +protected: + + double (Kernel::*kernel_function)(int i, int j) const; + +private: +#ifdef _DENSE_REP + PREFIX(node) *x; +#else + const PREFIX(node) **x; +#endif + double *x_square; + // scipy blas pointer + BlasFunctions *m_blas; + + // svm_parameter + const int kernel_type; + const int degree; + const double gamma; + const double coef0; + + static double dot(const PREFIX(node) *px, const PREFIX(node) *py, BlasFunctions *blas_functions); +#ifdef _DENSE_REP + static double dot(const PREFIX(node) &px, const PREFIX(node) &py, BlasFunctions *blas_functions); +#endif + + double kernel_linear(int i, int j) const + { + return dot(x[i],x[j],m_blas); + } + double kernel_poly(int i, int j) const + { + return powi(gamma*dot(x[i],x[j],m_blas)+coef0,degree); + } + double kernel_rbf(int i, int j) const + { + return exp(-gamma*(x_square[i]+x_square[j]-2*dot(x[i],x[j],m_blas))); + } + double kernel_sigmoid(int i, int j) const + { + return tanh(gamma*dot(x[i],x[j],m_blas)+coef0); + } + double kernel_precomputed(int i, int j) const + { +#ifdef _DENSE_REP + return (x+i)->values[x[j].ind]; +#else + return x[i][(int)(x[j][0].value)].value; +#endif + } +}; + +#ifdef _DENSE_REP +Kernel::Kernel(int l, PREFIX(node) * x_, const svm_parameter& param, BlasFunctions *blas_functions) +#else +Kernel::Kernel(int l, PREFIX(node) * const * x_, const svm_parameter& param, BlasFunctions *blas_functions) +#endif +:kernel_type(param.kernel_type), degree(param.degree), + gamma(param.gamma), coef0(param.coef0) +{ + m_blas = blas_functions; + switch(kernel_type) + { + case LINEAR: + kernel_function = &Kernel::kernel_linear; + break; + case POLY: + kernel_function = &Kernel::kernel_poly; + break; + case RBF: + kernel_function = &Kernel::kernel_rbf; + break; + case SIGMOID: + kernel_function = &Kernel::kernel_sigmoid; + break; + case PRECOMPUTED: + kernel_function = &Kernel::kernel_precomputed; + break; + } + + clone(x,x_,l); + + if(kernel_type == RBF) + { + x_square = new double[l]; + for(int i=0;idim, py->dim); + sum = blas_functions->dot(dim, px->values, 1, py->values, 1); + return sum; +} + +double Kernel::dot(const PREFIX(node) &px, const PREFIX(node) &py, BlasFunctions *blas_functions) +{ + double sum = 0; + + int dim = min(px.dim, py.dim); + sum = blas_functions->dot(dim, px.values, 1, py.values, 1); + return sum; +} +#else +double Kernel::dot(const PREFIX(node) *px, const PREFIX(node) *py, BlasFunctions *blas_functions) +{ + double sum = 0; + while(px->index != -1 && py->index != -1) + { + if(px->index == py->index) + { + sum += px->value * py->value; + ++px; + ++py; + } + else + { + if(px->index > py->index) + ++py; + else + ++px; + } + } + return sum; +} +#endif + +double Kernel::k_function(const PREFIX(node) *x, const PREFIX(node) *y, + const svm_parameter& param, BlasFunctions *blas_functions) +{ + switch(param.kernel_type) + { + case LINEAR: + return dot(x,y,blas_functions); + case POLY: + return powi(param.gamma*dot(x,y,blas_functions)+param.coef0,param.degree); + 
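+		/* RBF: k(x,y) = exp(-gamma*||x-y||^2). The dense branch builds the
+		   difference vector and squares it with one BLAS dot (plus the tails
+		   when the dims differ); the sparse branch walks both index lists. */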
case RBF: + { + double sum = 0; +#ifdef _DENSE_REP + int dim = min(x->dim, y->dim), i; + double* m_array = (double*)malloc(sizeof(double)*dim); + for (i = 0; i < dim; i++) + { + m_array[i] = x->values[i] - y->values[i]; + } + sum = blas_functions->dot(dim, m_array, 1, m_array, 1); + free(m_array); + for (; i < x->dim; i++) + sum += x->values[i] * x->values[i]; + for (; i < y->dim; i++) + sum += y->values[i] * y->values[i]; +#else + while(x->index != -1 && y->index !=-1) + { + if(x->index == y->index) + { + double d = x->value - y->value; + sum += d*d; + ++x; + ++y; + } + else + { + if(x->index > y->index) + { + sum += y->value * y->value; + ++y; + } + else + { + sum += x->value * x->value; + ++x; + } + } + } + + while(x->index != -1) + { + sum += x->value * x->value; + ++x; + } + + while(y->index != -1) + { + sum += y->value * y->value; + ++y; + } +#endif + return exp(-param.gamma*sum); + } + case SIGMOID: + return tanh(param.gamma*dot(x,y,blas_functions)+param.coef0); + case PRECOMPUTED: //x: test (validation), y: SV + { +#ifdef _DENSE_REP + return x->values[y->ind]; +#else + return x[(int)(y->value)].value; +#endif + } + default: + return 0; // Unreachable + } +} +// An SMO algorithm in Fan et al., JMLR 6(2005), p. 1889--1918 +// Solves: +// +// min 0.5(\alpha^T Q \alpha) + p^T \alpha +// +// y^T \alpha = \delta +// y_i = +1 or -1 +// 0 <= alpha_i <= Cp for y_i = 1 +// 0 <= alpha_i <= Cn for y_i = -1 +// +// Given: +// +// Q, p, y, Cp, Cn, and an initial feasible point \alpha +// l is the size of vectors and matrices +// eps is the stopping tolerance +// +// solution will be put in \alpha, objective value will be put in obj +// + +class Solver { +public: + Solver() {}; + virtual ~Solver() {}; + + struct SolutionInfo { + double obj; + double rho; + double *upper_bound; + double r; // for Solver_NU + bool solve_timed_out; + int n_iter; + }; + + void Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, + double *alpha_, const double *C_, double eps, + SolutionInfo* si, int shrinking, int max_iter); +protected: + int active_size; + schar *y; + double *G; // gradient of objective function + enum { LOWER_BOUND, UPPER_BOUND, FREE }; + char *alpha_status; // LOWER_BOUND, UPPER_BOUND, FREE + double *alpha; + const QMatrix *Q; + const double *QD; + double eps; + double Cp,Cn; + double *C; + double *p; + int *active_set; + double *G_bar; // gradient, if we treat free variables as 0 + int l; + bool unshrink; // XXX + + double get_C(int i) + { + return C[i]; + } + void update_alpha_status(int i) + { + if(alpha[i] >= get_C(i)) + alpha_status[i] = UPPER_BOUND; + else if(alpha[i] <= 0) + alpha_status[i] = LOWER_BOUND; + else alpha_status[i] = FREE; + } + bool is_upper_bound(int i) { return alpha_status[i] == UPPER_BOUND; } + bool is_lower_bound(int i) { return alpha_status[i] == LOWER_BOUND; } + bool is_free(int i) { return alpha_status[i] == FREE; } + void swap_index(int i, int j); + void reconstruct_gradient(); + virtual int select_working_set(int &i, int &j); + virtual double calculate_rho(); + virtual void do_shrinking(); +private: + bool be_shrunk(int i, double Gmax1, double Gmax2); +}; + +void Solver::swap_index(int i, int j) +{ + Q->swap_index(i,j); + swap(y[i],y[j]); + swap(G[i],G[j]); + swap(alpha_status[i],alpha_status[j]); + swap(alpha[i],alpha[j]); + swap(p[i],p[j]); + swap(active_set[i],active_set[j]); + swap(G_bar[i],G_bar[j]); + swap(C[i], C[j]); +} + +void Solver::reconstruct_gradient() +{ + // reconstruct inactive elements of G from G_bar and free variables + + 
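+	// for an inactive i this recomputes
+	//     G[i] = G_bar[i] + p[i] + sum over free j of alpha[j]*Q_ij
+	// (upper-bounded variables are already accounted for in G_bar)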
if(active_size == l) return; + + int i,j; + int nr_free = 0; + + for(j=active_size;j 2*active_size*(l-active_size)) + { + for(i=active_size;iget_Q(i,active_size); + for(j=0;jget_Q(i,l); + double alpha_i = alpha[i]; + for(j=active_size;jl = l; + this->Q = &Q; + QD=Q.get_QD(); + clone(p, p_,l); + clone(y, y_,l); + clone(alpha,alpha_,l); + clone(C, C_, l); + this->eps = eps; + unshrink = false; + si->solve_timed_out = false; + + // initialize alpha_status + { + alpha_status = new char[l]; + for(int i=0;i= max_iter)) { + info("WARN: libsvm Solver reached max_iter"); + si->solve_timed_out = true; + break; + } + + // show progress and do shrinking + + if(--counter == 0) + { + counter = min(l,1000); + if(shrinking) do_shrinking(); + info("."); + } + + int i,j; + if(select_working_set(i,j)!=0) + { + // reconstruct the whole gradient + reconstruct_gradient(); + // reset active set size and check + active_size = l; + info("*"); + if(select_working_set(i,j)!=0) + break; + else + counter = 1; // do shrinking next iteration + } + + ++iter; + + // update alpha[i] and alpha[j], handle bounds carefully + + const Qfloat *Q_i = Q.get_Q(i,active_size); + const Qfloat *Q_j = Q.get_Q(j,active_size); + + double C_i = get_C(i); + double C_j = get_C(j); + + double old_alpha_i = alpha[i]; + double old_alpha_j = alpha[j]; + + if(y[i]!=y[j]) + { + double quad_coef = QD[i]+QD[j]+2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (-G[i]-G[j])/quad_coef; + double diff = alpha[i] - alpha[j]; + alpha[i] += delta; + alpha[j] += delta; + + if(diff > 0) + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = diff; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = -diff; + } + } + if(diff > C_i - C_j) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = C_i - diff; + } + } + else + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = C_j + diff; + } + } + } + else + { + double quad_coef = QD[i]+QD[j]-2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (G[i]-G[j])/quad_coef; + double sum = alpha[i] + alpha[j]; + alpha[i] -= delta; + alpha[j] += delta; + + if(sum > C_i) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = sum - C_i; + } + } + else + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = sum; + } + } + if(sum > C_j) + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = sum - C_j; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = sum; + } + } + } + + // update G + + double delta_alpha_i = alpha[i] - old_alpha_i; + double delta_alpha_j = alpha[j] - old_alpha_j; + + for(int k=0;krho = calculate_rho(); + + // calculate objective value + { + double v = 0; + int i; + for(i=0;iobj = v/2; + } + + // put back the solution + { + for(int i=0;iupper_bound[i] = C[i]; + + // store number of iterations + si->n_iter = iter; + + info("\noptimization finished, #iter = %d\n",iter); + + delete[] p; + delete[] y; + delete[] alpha; + delete[] alpha_status; + delete[] active_set; + delete[] G; + delete[] G_bar; + delete[] C; +} + +// return 1 if already optimal, return 0 otherwise +int Solver::select_working_set(int &out_i, int &out_j) +{ + // return i,j such that + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficient <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmax = -INF; + double Gmax2 = -INF; + int Gmax_idx = -1; + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmax) + { + Gmax = -G[t]; + 
Gmax_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmax) + { + Gmax = G[t]; + Gmax_idx = t; + } + } + + int i = Gmax_idx; + const Qfloat *Q_i = NULL; + if(i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1 + Q_i = Q->get_Q(i,active_size); + + for(int j=0;j= Gmax2) + Gmax2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff= Gmax-G[j]; + if (-G[j] >= Gmax2) + Gmax2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]+2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(Gmax+Gmax2 < eps || Gmin_idx == -1) + return 1; + + out_i = Gmax_idx; + out_j = Gmin_idx; + return 0; +} + +bool Solver::be_shrunk(int i, double Gmax1, double Gmax2) +{ + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax2); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax1); + } + else + return(false); +} + +void Solver::do_shrinking() +{ + int i; + double Gmax1 = -INF; // max { -y_i * grad(f)_i | i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | i in I_low(\alpha) } + + // find maximal violating pair first + for(i=0;i= Gmax1) + Gmax1 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax2) + Gmax2 = G[i]; + } + } + else + { + if(!is_upper_bound(i)) + { + if(-G[i] >= Gmax2) + Gmax2 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax1) + Gmax1 = G[i]; + } + } + } + + if(unshrink == false && Gmax1 + Gmax2 <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + info("*"); + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } +} + +double Solver::calculate_rho() +{ + double r; + int nr_free = 0; + double ub = INF, lb = -INF, sum_free = 0; + for(int i=0;i0) + r = sum_free/nr_free; + else + r = (ub+lb)/2; + + return r; +} + +// +// Solver for nu-svm classification and regression +// +// additional constraint: e^T \alpha = constant +// +class Solver_NU : public Solver +{ +public: + Solver_NU() {} + void Solve(int l, const QMatrix& Q, const double *p, const schar *y, + double *alpha, const double *C_, double eps, + SolutionInfo* si, int shrinking, int max_iter) + { + this->si = si; + Solver::Solve(l,Q,p,y,alpha,C_,eps,si,shrinking,max_iter); + } +private: + SolutionInfo *si; + int select_working_set(int &i, int &j); + double calculate_rho(); + bool be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4); + void do_shrinking(); +}; + +// return 1 if already optimal, return 0 otherwise +int Solver_NU::select_working_set(int &out_i, int &out_j) +{ + // return i,j such that y_i = y_j and + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficient <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmaxp = -INF; + double Gmaxp2 = -INF; + int Gmaxp_idx = -1; + + double Gmaxn = -INF; + double Gmaxn2 = -INF; + int Gmaxn_idx 
= -1; + + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmaxp) + { + Gmaxp = -G[t]; + Gmaxp_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmaxn) + { + Gmaxn = G[t]; + Gmaxn_idx = t; + } + } + + int ip = Gmaxp_idx; + int in = Gmaxn_idx; + const Qfloat *Q_ip = NULL; + const Qfloat *Q_in = NULL; + if(ip != -1) // NULL Q_ip not accessed: Gmaxp=-INF if ip=-1 + Q_ip = Q->get_Q(ip,active_size); + if(in != -1) + Q_in = Q->get_Q(in,active_size); + + for(int j=0;j= Gmaxp2) + Gmaxp2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[ip]+QD[j]-2*Q_ip[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff=Gmaxn-G[j]; + if (-G[j] >= Gmaxn2) + Gmaxn2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[in]+QD[j]-2*Q_in[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(max(Gmaxp+Gmaxp2,Gmaxn+Gmaxn2) < eps || Gmin_idx == -1) + return 1; + + if (y[Gmin_idx] == +1) + out_i = Gmaxp_idx; + else + out_i = Gmaxn_idx; + out_j = Gmin_idx; + + return 0; +} + +bool Solver_NU::be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4) +{ + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax4); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax3); + } + else + return(false); +} + +void Solver_NU::do_shrinking() +{ + double Gmax1 = -INF; // max { -y_i * grad(f)_i | y_i = +1, i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | y_i = +1, i in I_low(\alpha) } + double Gmax3 = -INF; // max { -y_i * grad(f)_i | y_i = -1, i in I_up(\alpha) } + double Gmax4 = -INF; // max { y_i * grad(f)_i | y_i = -1, i in I_low(\alpha) } + + // find maximal violating pair first + int i; + for(i=0;i Gmax1) Gmax1 = -G[i]; + } + else if(-G[i] > Gmax4) Gmax4 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(y[i]==+1) + { + if(G[i] > Gmax2) Gmax2 = G[i]; + } + else if(G[i] > Gmax3) Gmax3 = G[i]; + } + } + + if(unshrink == false && max(Gmax1+Gmax2,Gmax3+Gmax4) <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2, Gmax3, Gmax4)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } +} + +double Solver_NU::calculate_rho() +{ + int nr_free1 = 0,nr_free2 = 0; + double ub1 = INF, ub2 = INF; + double lb1 = -INF, lb2 = -INF; + double sum_free1 = 0, sum_free2 = 0; + + for(int i=0;i 0) + r1 = sum_free1/nr_free1; + else + r1 = (ub1+lb1)/2; + + if(nr_free2 > 0) + r2 = sum_free2/nr_free2; + else + r2 = (ub2+lb2)/2; + + si->r = (r1+r2)/2; + return (r1-r2)/2; +} + +// +// Q matrices for various formulations +// +class SVC_Q: public Kernel +{ +public: + SVC_Q(const PREFIX(problem)& prob, const svm_parameter& param, const schar *y_, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + clone(y,y_,prob.l); + cache = new Cache(prob.l,(long int)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i*kernel_function)(i,i); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int start, j; + 
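+		// get_data returns the first entry of column i that is not cached yet;
+		// positions [start, len) are filled with y_i*y_j*K(i,j) below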
if((start = cache->get_data(i,&data,len)) < len) + { + for(j=start;j*kernel_function)(i,j)); + } + return data; + } + + double *get_QD() const + { + return QD; + } + + void swap_index(int i, int j) const + { + cache->swap_index(i,j); + Kernel::swap_index(i,j); + swap(y[i],y[j]); + swap(QD[i],QD[j]); + } + + ~SVC_Q() + { + delete[] y; + delete cache; + delete[] QD; + } +private: + schar *y; + Cache *cache; + double *QD; +}; + +class ONE_CLASS_Q: public Kernel +{ +public: + ONE_CLASS_Q(const PREFIX(problem)& prob, const svm_parameter& param, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + cache = new Cache(prob.l,(long int)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i*kernel_function)(i,i); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int start, j; + if((start = cache->get_data(i,&data,len)) < len) + { + for(j=start;j*kernel_function)(i,j); + } + return data; + } + + double *get_QD() const + { + return QD; + } + + void swap_index(int i, int j) const + { + cache->swap_index(i,j); + Kernel::swap_index(i,j); + swap(QD[i],QD[j]); + } + + ~ONE_CLASS_Q() + { + delete cache; + delete[] QD; + } +private: + Cache *cache; + double *QD; +}; + +class SVR_Q: public Kernel +{ +public: + SVR_Q(const PREFIX(problem)& prob, const svm_parameter& param, BlasFunctions *blas_functions) + :Kernel(prob.l, prob.x, param, blas_functions) + { + l = prob.l; + cache = new Cache(l,(long int)(param.cache_size*(1<<20))); + QD = new double[2*l]; + sign = new schar[2*l]; + index = new int[2*l]; + for(int k=0;k*kernel_function)(k,k); + QD[k+l] = QD[k]; + } + buffer[0] = new Qfloat[2*l]; + buffer[1] = new Qfloat[2*l]; + next_buffer = 0; + } + + void swap_index(int i, int j) const + { + swap(sign[i],sign[j]); + swap(index[i],index[j]); + swap(QD[i],QD[j]); + } + + Qfloat *get_Q(int i, int len) const + { + Qfloat *data; + int j, real_i = index[i]; + if(cache->get_data(real_i,&data,l) < l) + { + for(j=0;j*kernel_function)(real_i,j); + } + + // reorder and copy + Qfloat *buf = buffer[next_buffer]; + next_buffer = 1 - next_buffer; + schar si = sign[i]; + for(j=0;jl; + double *minus_ones = new double[l]; + schar *y = new schar[l]; + double *C = new double[l]; + + int i; + + for(i=0;iy[i] > 0) + { + y[i] = +1; + C[i] = prob->W[i]*Cp; + } + else + { + y[i] = -1; + C[i] = prob->W[i]*Cn; + } + } + + Solver s; + s.Solve(l, SVC_Q(*prob,*param,y, blas_functions), minus_ones, y, + alpha, C, param->eps, si, param->shrinking, + param->max_iter); + + /* + double sum_alpha=0; + for(i=0;il)); + */ + + for(i=0;il; + double nu = param->nu; + + schar *y = new schar[l]; + double *C = new double[l]; + + for(i=0;iy[i]>0) + y[i] = +1; + else + y[i] = -1; + + C[i] = prob->W[i]; + } + + double nu_l = 0; + for(i=0;ieps, si, param->shrinking, param->max_iter); + double r = si->r; + + info("C = %f\n",1/r); + + for(i=0;iupper_bound[i] /= r; + } + + si->rho /= r; + si->obj /= (r*r); + + delete[] C; + delete[] y; + delete[] zeros; +} + +static void solve_one_class( + const PREFIX(problem) *prob, const svm_parameter *param, + double *alpha, Solver::SolutionInfo* si, BlasFunctions *blas_functions) +{ + int l = prob->l; + double *zeros = new double[l]; + schar *ones = new schar[l]; + double *C = new double[l]; + int i; + + double nu_l = 0; + + for(i=0;iW[i]; + nu_l += C[i] * param->nu; + } + + i = 0; + while(nu_l > 0) + { + alpha[i] = min(C[i],nu_l); + nu_l -= alpha[i]; + ++i; + } + for(;ieps, si, param->shrinking, param->max_iter); + + delete[] C; + delete[] zeros; + delete[] 
ones; +} + +static void solve_epsilon_svr( + const PREFIX(problem) *prob, const svm_parameter *param, + double *alpha, Solver::SolutionInfo* si, BlasFunctions *blas_functions) +{ + int l = prob->l; + double *alpha2 = new double[2*l]; + double *linear_term = new double[2*l]; + schar *y = new schar[2*l]; + double *C = new double[2*l]; + int i; + + for(i=0;ip - prob->y[i]; + y[i] = 1; + C[i] = prob->W[i]*param->C; + + alpha2[i+l] = 0; + linear_term[i+l] = param->p + prob->y[i]; + y[i+l] = -1; + C[i+l] = prob->W[i]*param->C; + } + + Solver s; + s.Solve(2*l, SVR_Q(*prob,*param,blas_functions), linear_term, y, + alpha2, C, param->eps, si, param->shrinking, param->max_iter); + + double sum_alpha = 0; + for(i=0;il; + double *C = new double[2*l]; + double *alpha2 = new double[2*l]; + double *linear_term = new double[2*l]; + schar *y = new schar[2*l]; + int i; + + double sum = 0; + for(i=0;iW[i]*param->C; + sum += C[i] * param->nu; + } + sum /= 2; + + for(i=0;iy[i]; + y[i] = 1; + + linear_term[i+l] = prob->y[i]; + y[i+l] = -1; + } + + Solver_NU s; + s.Solve(2*l, SVR_Q(*prob,*param,blas_functions), linear_term, y, + alpha2, C, param->eps, si, param->shrinking, param->max_iter); + + info("epsilon = %f\n",-si->r); + + for(i=0;il); + Solver::SolutionInfo si; + switch(param->svm_type) + { + case C_SVC: + si.upper_bound = Malloc(double,prob->l); + solve_c_svc(prob,param,alpha,&si,Cp,Cn,blas_functions); + break; + case NU_SVC: + si.upper_bound = Malloc(double,prob->l); + solve_nu_svc(prob,param,alpha,&si,blas_functions); + break; + case ONE_CLASS: + si.upper_bound = Malloc(double,prob->l); + solve_one_class(prob,param,alpha,&si,blas_functions); + break; + case EPSILON_SVR: + si.upper_bound = Malloc(double,2*prob->l); + solve_epsilon_svr(prob,param,alpha,&si,blas_functions); + break; + case NU_SVR: + si.upper_bound = Malloc(double,2*prob->l); + solve_nu_svr(prob,param,alpha,&si,blas_functions); + break; + } + + *status |= si.solve_timed_out; + + info("obj = %f, rho = %f\n",si.obj,si.rho); + + // output SVs + + int nSV = 0; + int nBSV = 0; + for(int i=0;il;i++) + { + if(fabs(alpha[i]) > 0) + { + ++nSV; + if(prob->y[i] > 0) + { + if(fabs(alpha[i]) >= si.upper_bound[i]) + ++nBSV; + } + else + { + if(fabs(alpha[i]) >= si.upper_bound[i]) + ++nBSV; + } + } + } + + free(si.upper_bound); + + info("nSV = %d, nBSV = %d\n",nSV,nBSV); + + decision_function f; + f.alpha = alpha; + f.rho = si.rho; + f.n_iter = si.n_iter; + return f; +} + +// Platt's binary SVM Probabilistic Output: an improvement from Lin et al. 
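+//
+// sigmoid_train fits P(y=1|f) = 1/(1+exp(A*f+B)) to the decision values f by
+// maximum likelihood: Newton steps on (A,B) with a backtracking line search,
+// a small ridge (sigma) on the Hessian for numerical stability, and smoothed
+// targets (N+ + 1)/(N+ + 2) and 1/(N- + 2) in place of hard 1/0 labels.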
+static void sigmoid_train( + int l, const double *dec_values, const double *labels, + double& A, double& B) +{ + double prior1=0, prior0 = 0; + int i; + + for (i=0;i 0) prior1+=1; + else prior0+=1; + + int max_iter=100; // Maximal number of iterations + double min_step=1e-10; // Minimal step taken in line search + double sigma=1e-12; // For numerically strict PD of Hessian + double eps=1e-5; + double hiTarget=(prior1+1.0)/(prior1+2.0); + double loTarget=1/(prior0+2.0); + double *t=Malloc(double,l); + double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize; + double newA,newB,newf,d1,d2; + int iter; + + // Initial Point and Initial Fun Value + A=0.0; B=log((prior0+1.0)/(prior1+1.0)); + double fval = 0.0; + + for (i=0;i0) t[i]=hiTarget; + else t[i]=loTarget; + fApB = dec_values[i]*A+B; + if (fApB>=0) + fval += t[i]*fApB + log(1+exp(-fApB)); + else + fval += (t[i] - 1)*fApB +log(1+exp(fApB)); + } + for (iter=0;iter= 0) + { + p=exp(-fApB)/(1.0+exp(-fApB)); + q=1.0/(1.0+exp(-fApB)); + } + else + { + p=1.0/(1.0+exp(fApB)); + q=exp(fApB)/(1.0+exp(fApB)); + } + d2=p*q; + h11+=dec_values[i]*dec_values[i]*d2; + h22+=d2; + h21+=dec_values[i]*d2; + d1=t[i]-p; + g1+=dec_values[i]*d1; + g2+=d1; + } + + // Stopping Criteria + if (fabs(g1)= min_step) + { + newA = A + stepsize * dA; + newB = B + stepsize * dB; + + // New function value + newf = 0.0; + for (i=0;i= 0) + newf += t[i]*fApB + log(1+exp(-fApB)); + else + newf += (t[i] - 1)*fApB +log(1+exp(fApB)); + } + // Check sufficient decrease + if (newf=max_iter) + info("Reaching maximal iterations in two-class probability estimates\n"); + free(t); +} + +static double sigmoid_predict(double decision_value, double A, double B) +{ + double fApB = decision_value*A+B; + // 1-p used later; avoid catastrophic cancellation + if (fApB >= 0) + return exp(-fApB)/(1.0+exp(-fApB)); + else + return 1.0/(1+exp(fApB)) ; +} + +// Method 2 from the multiclass_prob paper by Wu, Lin, and Weng +static void multiclass_probability(int k, double **r, double *p) +{ + int t,j; + int iter = 0, max_iter=max(100,k); + double **Q=Malloc(double *,k); + double *Qp=Malloc(double,k); + double pQp, eps=0.005/k; + + for (t=0;tmax_error) + max_error=error; + } + if (max_error=max_iter) + info("Exceeds max_iter in multiclass_prob\n"); + for(t=0;tl); + double *dec_values = Malloc(double,prob->l); + + // random shuffle + for(i=0;il;i++) perm[i]=i; + for(i=0;il;i++) + { + int j = i+bounded_rand_int(prob->l-i); + swap(perm[i],perm[j]); + } + for(i=0;il/nr_fold; + int end = (i+1)*prob->l/nr_fold; + int j,k; + struct PREFIX(problem) subprob; + + subprob.l = prob->l-(end-begin); +#ifdef _DENSE_REP + subprob.x = Malloc(struct PREFIX(node),subprob.l); +#else + subprob.x = Malloc(struct PREFIX(node)*,subprob.l); +#endif + subprob.y = Malloc(double,subprob.l); + subprob.W = Malloc(double,subprob.l); + + k=0; + for(j=0;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + for(j=end;jl;j++) + { + subprob.x[k] = prob->x[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + int p_count=0,n_count=0; + for(j=0;j0) + p_count++; + else + n_count++; + + if(p_count==0 && n_count==0) + for(j=begin;j 0 && n_count == 0) + for(j=begin;j 0) + for(j=begin;jx+perm[j]),&(dec_values[perm[j]]), blas_functions); +#else + PREFIX(predict_values)(submodel,prob->x[perm[j]],&(dec_values[perm[j]]), blas_functions); +#endif + // ensure +1 -1 order; reason not using CV subroutine + dec_values[perm[j]] *= submodel->label[0]; + } + 
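+			// out-of-fold decision values accumulate in dec_values[]; after the
+			// fold loop, sigmoid_train fits (probA, probB) on all of them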
PREFIX(free_and_destroy_model)(&submodel); + PREFIX(destroy_param)(&subparam); + } + free(subprob.x); + free(subprob.y); + free(subprob.W); + } + sigmoid_train(prob->l,dec_values,prob->y,probA,probB); + free(dec_values); + free(perm); +} + +// Return parameter of a Laplace distribution +static double svm_svr_probability( + const PREFIX(problem) *prob, const svm_parameter *param, BlasFunctions *blas_functions) +{ + int i; + int nr_fold = 5; + double *ymv = Malloc(double,prob->l); + double mae = 0; + + svm_parameter newparam = *param; + newparam.probability = 0; + newparam.random_seed = -1; // This is called from train, which already sets + // the seed. + PREFIX(cross_validation)(prob,&newparam,nr_fold,ymv, blas_functions); + for(i=0;il;i++) + { + ymv[i]=prob->y[i]-ymv[i]; + mae += fabs(ymv[i]); + } + mae /= prob->l; + double std=sqrt(2*mae*mae); + int count=0; + mae=0; + for(i=0;il;i++) + if (fabs(ymv[i]) > 5*std) + count=count+1; + else + mae+=fabs(ymv[i]); + mae /= (prob->l-count); + info("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma= %g\n",mae); + free(ymv); + return mae; +} + + + +// label: label name, start: begin of each class, count: #data of classes, perm: indices to the original data +// perm, length l, must be allocated before calling this subroutine +static void svm_group_classes(const PREFIX(problem) *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm) +{ + int l = prob->l; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + int *count = Malloc(int,max_nr_class); + int *data_label = Malloc(int,l); + int i, j, this_label, this_count; + + for(i=0;iy[i]; + for(j=0;j=0 && label[i] > this_label) + { + label[i+1] = label[i]; + count[i+1] = count[i]; + i--; + } + label[i+1] = this_label; + count[i+1] = this_count; + } + + for (i=0; iy[i]; + while(this_label != label[j]){ + j ++; + } + data_label[i] = j; + } + + int *start = Malloc(int,nr_class); + start[0] = 0; + for(i=1;i 0. 
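+// (it keeps only samples with W[i] > 0 and copies their x/y/W entries into
+// freshly allocated arrays for the reduced problem)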
+// +static void remove_zero_weight(PREFIX(problem) *newprob, const PREFIX(problem) *prob) +{ + int i; + int l = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) l++; + *newprob = *prob; + newprob->l = l; +#ifdef _DENSE_REP + newprob->x = Malloc(PREFIX(node),l); +#else + newprob->x = Malloc(PREFIX(node) *,l); +#endif + newprob->y = Malloc(double,l); + newprob->W = Malloc(double,l); + + int j = 0; + for(i=0;il;i++) + if(prob->W[i] > 0) + { + newprob->x[j] = prob->x[i]; + newprob->y[j] = prob->y[i]; + newprob->W[j] = prob->W[i]; + j++; + } +} + +// +// Interface functions +// +PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *param, + int *status, BlasFunctions *blas_functions) +{ + PREFIX(problem) newprob; + remove_zero_weight(&newprob, prob); + prob = &newprob; + + PREFIX(model) *model = Malloc(PREFIX(model),1); + model->param = *param; + model->free_sv = 0; // XXX + + if(param->random_seed >= 0) + { + set_seed(param->random_seed); + } + + if(param->svm_type == ONE_CLASS || + param->svm_type == EPSILON_SVR || + param->svm_type == NU_SVR) + { + // regression or one-class-svm + model->nr_class = 2; + model->label = NULL; + model->nSV = NULL; + model->probA = NULL; model->probB = NULL; + model->sv_coef = Malloc(double *,1); + + if(param->probability && + (param->svm_type == EPSILON_SVR || + param->svm_type == NU_SVR)) + { + model->probA = Malloc(double,1); + model->probA[0] = NAMESPACE::svm_svr_probability(prob,param,blas_functions); + } + + NAMESPACE::decision_function f = NAMESPACE::svm_train_one(prob,param,0,0, status,blas_functions); + model->rho = Malloc(double,1); + model->rho[0] = f.rho; + model->n_iter = Malloc(int,1); + model->n_iter[0] = f.n_iter; + + int nSV = 0; + int i; + for(i=0;il;i++) + if(fabs(f.alpha[i]) > 0) ++nSV; + model->l = nSV; +#ifdef _DENSE_REP + model->SV = Malloc(PREFIX(node),nSV); +#else + model->SV = Malloc(PREFIX(node) *,nSV); +#endif + model->sv_ind = Malloc(int, nSV); + model->sv_coef[0] = Malloc(double, nSV); + int j = 0; + for(i=0;il;i++) + if(fabs(f.alpha[i]) > 0) + { + model->SV[j] = prob->x[i]; + model->sv_ind[j] = i; + model->sv_coef[0][j] = f.alpha[i]; + ++j; + } + + free(f.alpha); + } + else + { + // classification + int l = prob->l; + int nr_class; + int *label = NULL; + int *start = NULL; + int *count = NULL; + int *perm = Malloc(int,l); + + // group training data of the same class + NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm); +#ifdef _DENSE_REP + PREFIX(node) *x = Malloc(PREFIX(node),l); +#else + PREFIX(node) **x = Malloc(PREFIX(node) *,l); +#endif + double *W = Malloc(double, l); + + int i; + for(i=0;ix[perm[i]]; + W[i] = prob->W[perm[i]]; + } + + // calculate weighted C + + double *weighted_C = Malloc(double, nr_class); + for(i=0;iC; + for(i=0;inr_weight;i++) + { + int j; + for(j=0;jweight_label[i] == label[j]) + break; + if(j == nr_class) + fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]); + else + weighted_C[j] *= param->weight[i]; + } + + // train k*(k-1)/2 models + + bool *nonzero = Malloc(bool,l); + for(i=0;iprobability) + { + probA=Malloc(double,nr_class*(nr_class-1)/2); + probB=Malloc(double,nr_class*(nr_class-1)/2); + } + + int p = 0; + for(i=0;iprobability) + NAMESPACE::svm_binary_svc_probability(&sub_prob,param,weighted_C[i],weighted_C[j],probA[p],probB[p], status, blas_functions); + + f[p] = NAMESPACE::svm_train_one(&sub_prob,param,weighted_C[i],weighted_C[j], status, blas_functions); + for(k=0;k 0) + nonzero[si+k] = true; + for(k=0;k 
0) + nonzero[sj+k] = true; + free(sub_prob.x); + free(sub_prob.y); + free(sub_prob.W); + ++p; + } + + // build output + + model->nr_class = nr_class; + + model->label = Malloc(int,nr_class); + for(i=0;ilabel[i] = label[i]; + + model->rho = Malloc(double,nr_class*(nr_class-1)/2); + model->n_iter = Malloc(int,nr_class*(nr_class-1)/2); + for(i=0;irho[i] = f[i].rho; + model->n_iter[i] = f[i].n_iter; + } + + if(param->probability) + { + model->probA = Malloc(double,nr_class*(nr_class-1)/2); + model->probB = Malloc(double,nr_class*(nr_class-1)/2); + for(i=0;iprobA[i] = probA[i]; + model->probB[i] = probB[i]; + } + } + else + { + model->probA=NULL; + model->probB=NULL; + } + + int total_sv = 0; + int *nz_count = Malloc(int,nr_class); + model->nSV = Malloc(int,nr_class); + for(i=0;inSV[i] = nSV; + nz_count[i] = nSV; + } + + info("Total nSV = %d\n",total_sv); + + model->l = total_sv; + model->sv_ind = Malloc(int, total_sv); +#ifdef _DENSE_REP + model->SV = Malloc(PREFIX(node),total_sv); +#else + model->SV = Malloc(PREFIX(node) *,total_sv); +#endif + p = 0; + for(i=0;iSV[p] = x[i]; + model->sv_ind[p] = perm[i]; + ++p; + } + } + + int *nz_start = Malloc(int,nr_class); + nz_start[0] = 0; + for(i=1;isv_coef = Malloc(double *,nr_class-1); + for(i=0;isv_coef[i] = Malloc(double,total_sv); + + p = 0; + for(i=0;isv_coef[j-1][q++] = f[p].alpha[k]; + q = nz_start[j]; + for(k=0;ksv_coef[i][q++] = f[p].alpha[ci+k]; + ++p; + } + + free(label); + free(probA); + free(probB); + free(count); + free(perm); + free(start); + free(W); + free(x); + free(weighted_C); + free(nonzero); + for(i=0;il; + int *perm = Malloc(int,l); + int nr_class; + if(param->random_seed >= 0) + { + set_seed(param->random_seed); + } + + // stratified cv may not give leave-one-out rate + // Each class to l folds -> some folds may have zero elements + if((param->svm_type == C_SVC || + param->svm_type == NU_SVC) && nr_fold < l) + { + int *start = NULL; + int *label = NULL; + int *count = NULL; + NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm); + + // random shuffle and then data grouped by fold using the array perm + int *fold_count = Malloc(int,nr_fold); + int c; + int *index = Malloc(int,l); + for(i=0;ix[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + for(j=end;jx[perm[j]]; + subprob.y[k] = prob->y[perm[j]]; + subprob.W[k] = prob->W[perm[j]]; + ++k; + } + int dummy_status = 0; // IGNORES TIMEOUT ERRORS + struct PREFIX(model) *submodel = PREFIX(train)(&subprob,param, &dummy_status, blas_functions); + if(param->probability && + (param->svm_type == C_SVC || param->svm_type == NU_SVC)) + { + double *prob_estimates=Malloc(double, PREFIX(get_nr_class)(submodel)); + for(j=begin;jx + perm[j]),prob_estimates, blas_functions); +#else + target[perm[j]] = PREFIX(predict_probability)(submodel,prob->x[perm[j]],prob_estimates, blas_functions); +#endif + free(prob_estimates); + } + else + for(j=begin;jx+perm[j],blas_functions); +#else + target[perm[j]] = PREFIX(predict)(submodel,prob->x[perm[j]],blas_functions); +#endif + PREFIX(free_and_destroy_model)(&submodel); + free(subprob.x); + free(subprob.y); + free(subprob.W); + } + free(fold_start); + free(perm); +} + + +int PREFIX(get_svm_type)(const PREFIX(model) *model) +{ + return model->param.svm_type; +} + +int PREFIX(get_nr_class)(const PREFIX(model) *model) +{ + return model->nr_class; +} + +void PREFIX(get_labels)(const PREFIX(model) *model, int* label) +{ + if (model->label != NULL) + for(int i=0;inr_class;i++) + label[i] = model->label[i]; 
+} + +double PREFIX(get_svr_probability)(const PREFIX(model) *model) +{ + if ((model->param.svm_type == EPSILON_SVR || model->param.svm_type == NU_SVR) && + model->probA!=NULL) + return model->probA[0]; + else + { + fprintf(stderr,"Model doesn't contain information for SVR probability inference\n"); + return 0; + } +} + +double PREFIX(predict_values)(const PREFIX(model) *model, const PREFIX(node) *x, double* dec_values, BlasFunctions *blas_functions) +{ + int i; + if(model->param.svm_type == ONE_CLASS || + model->param.svm_type == EPSILON_SVR || + model->param.svm_type == NU_SVR) + { + double *sv_coef = model->sv_coef[0]; + double sum = 0; + + for(i=0;il;i++) +#ifdef _DENSE_REP + sum += sv_coef[i] * NAMESPACE::Kernel::k_function(x,model->SV+i,model->param,blas_functions); +#else + sum += sv_coef[i] * NAMESPACE::Kernel::k_function(x,model->SV[i],model->param,blas_functions); +#endif + sum -= model->rho[0]; + *dec_values = sum; + + if(model->param.svm_type == ONE_CLASS) + return (sum>0)?1:-1; + else + return sum; + } + else + { + int nr_class = model->nr_class; + int l = model->l; + + double *kvalue = Malloc(double,l); + for(i=0;iSV+i,model->param,blas_functions); +#else + kvalue[i] = NAMESPACE::Kernel::k_function(x,model->SV[i],model->param,blas_functions); +#endif + + int *start = Malloc(int,nr_class); + start[0] = 0; + for(i=1;inSV[i-1]; + + int *vote = Malloc(int,nr_class); + for(i=0;inSV[i]; + int cj = model->nSV[j]; + + int k; + double *coef1 = model->sv_coef[j-1]; + double *coef2 = model->sv_coef[i]; + for(k=0;krho[p]; + dec_values[p] = sum; + + if(dec_values[p] > 0) + ++vote[i]; + else + ++vote[j]; + p++; + } + + int vote_max_idx = 0; + for(i=1;i vote[vote_max_idx]) + vote_max_idx = i; + + free(kvalue); + free(start); + free(vote); + return model->label[vote_max_idx]; + } +} + +double PREFIX(predict)(const PREFIX(model) *model, const PREFIX(node) *x, BlasFunctions *blas_functions) +{ + int nr_class = model->nr_class; + double *dec_values; + if(model->param.svm_type == ONE_CLASS || + model->param.svm_type == EPSILON_SVR || + model->param.svm_type == NU_SVR) + dec_values = Malloc(double, 1); + else + dec_values = Malloc(double, nr_class*(nr_class-1)/2); + double pred_result = PREFIX(predict_values)(model, x, dec_values, blas_functions); + free(dec_values); + return pred_result; +} + +double PREFIX(predict_probability)( + const PREFIX(model) *model, const PREFIX(node) *x, double *prob_estimates, BlasFunctions *blas_functions) +{ + if ((model->param.svm_type == C_SVC || model->param.svm_type == NU_SVC) && + model->probA!=NULL && model->probB!=NULL) + { + int i; + int nr_class = model->nr_class; + double *dec_values = Malloc(double, nr_class*(nr_class-1)/2); + PREFIX(predict_values)(model, x, dec_values, blas_functions); + + double min_prob=1e-7; + double **pairwise_prob=Malloc(double *,nr_class); + for(i=0;iprobA[k],model->probB[k]),min_prob),1-min_prob); + pairwise_prob[j][i]=1-pairwise_prob[i][j]; + k++; + } + NAMESPACE::multiclass_probability(nr_class,pairwise_prob,prob_estimates); + + int prob_max_idx = 0; + for(i=1;i prob_estimates[prob_max_idx]) + prob_max_idx = i; + for(i=0;ilabel[prob_max_idx]; + } + else + return PREFIX(predict)(model, x, blas_functions); +} + + +void PREFIX(free_model_content)(PREFIX(model)* model_ptr) +{ + if(model_ptr->free_sv && model_ptr->l > 0 && model_ptr->SV != NULL) +#ifdef _DENSE_REP + for (int i = 0; i < model_ptr->l; i++) + free(model_ptr->SV[i].values); +#else + free((void *)(model_ptr->SV[0])); +#endif + + if(model_ptr->sv_coef) + { + for(int 
i=0;inr_class-1;i++) + free(model_ptr->sv_coef[i]); + } + + free(model_ptr->SV); + model_ptr->SV = NULL; + + free(model_ptr->sv_coef); + model_ptr->sv_coef = NULL; + + free(model_ptr->sv_ind); + model_ptr->sv_ind = NULL; + + free(model_ptr->rho); + model_ptr->rho = NULL; + + free(model_ptr->label); + model_ptr->label= NULL; + + free(model_ptr->probA); + model_ptr->probA = NULL; + + free(model_ptr->probB); + model_ptr->probB= NULL; + + free(model_ptr->nSV); + model_ptr->nSV = NULL; + + free(model_ptr->n_iter); + model_ptr->n_iter = NULL; +} + +void PREFIX(free_and_destroy_model)(PREFIX(model)** model_ptr_ptr) +{ + if(model_ptr_ptr != NULL && *model_ptr_ptr != NULL) + { + PREFIX(free_model_content)(*model_ptr_ptr); + free(*model_ptr_ptr); + *model_ptr_ptr = NULL; + } +} + +void PREFIX(destroy_param)(svm_parameter* param) +{ + free(param->weight_label); + free(param->weight); +} + +const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_parameter *param) +{ + // svm_type + + int svm_type = param->svm_type; + if(svm_type != C_SVC && + svm_type != NU_SVC && + svm_type != ONE_CLASS && + svm_type != EPSILON_SVR && + svm_type != NU_SVR) + return "unknown svm type"; + + // kernel_type, degree + + int kernel_type = param->kernel_type; + if(kernel_type != LINEAR && + kernel_type != POLY && + kernel_type != RBF && + kernel_type != SIGMOID && + kernel_type != PRECOMPUTED) + return "unknown kernel type"; + + if(param->gamma < 0) + return "gamma < 0"; + + if(param->degree < 0) + return "degree of polynomial kernel < 0"; + + // cache_size,eps,C,nu,p,shrinking + + if(param->cache_size <= 0) + return "cache_size <= 0"; + + if(param->eps <= 0) + return "eps <= 0"; + + if(svm_type == C_SVC || + svm_type == EPSILON_SVR || + svm_type == NU_SVR) + if(param->C <= 0) + return "C <= 0"; + + if(svm_type == NU_SVC || + svm_type == ONE_CLASS || + svm_type == NU_SVR) + if(param->nu <= 0 || param->nu > 1) + return "nu <= 0 or nu > 1"; + + if(svm_type == EPSILON_SVR) + if(param->p < 0) + return "p < 0"; + + if(param->shrinking != 0 && + param->shrinking != 1) + return "shrinking != 0 and shrinking != 1"; + + if(param->probability != 0 && + param->probability != 1) + return "probability != 0 and probability != 1"; + + if(param->probability == 1 && + svm_type == ONE_CLASS) + return "one-class SVM probability output not supported yet"; + + + // check whether nu-svc is feasible + + if(svm_type == NU_SVC) + { + int l = prob->l; + int max_nr_class = 16; + int nr_class = 0; + int *label = Malloc(int,max_nr_class); + double *count = Malloc(double,max_nr_class); + + int i; + for(i=0;iy[i]; + int j; + for(j=0;jW[i]; + break; + } + if(j == nr_class) + { + if(nr_class == max_nr_class) + { + max_nr_class *= 2; + label = (int *)realloc(label,max_nr_class*sizeof(int)); + count = (double *)realloc(count,max_nr_class*sizeof(double)); + + } + label[nr_class] = this_label; + count[nr_class] = prob->W[i]; + ++nr_class; + } + } + + for(i=0;inu*(n1+n2)/2 > min(n1,n2)) + { + free(label); + free(count); + return "specified nu is infeasible"; + } + } + } + free(label); + free(count); + } + + if(svm_type == C_SVC || + svm_type == EPSILON_SVR || + svm_type == NU_SVR || + svm_type == ONE_CLASS) + { + PREFIX(problem) newprob; + // filter samples with negative and null weights + remove_zero_weight(&newprob, prob); + + // all samples were removed + if(newprob.l == 0) { + free(newprob.x); + free(newprob.y); + free(newprob.W); + return "Invalid input - all samples have zero or negative weights."; + } + else if(prob->l != newprob.l && 
+ svm_type == C_SVC) + { + bool only_one_label = true; + int first_label = newprob.y[0]; + for(int i=1;i + */ +#ifndef _NEWRAND_H +#define _NEWRAND_H + +#ifdef __cplusplus +#include // needed for cython to generate a .cpp file from newrand.h +extern "C" { +#endif + +// Scikit-Learn-specific random number generator replacing `rand()` originally +// used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux, +// with increased speed +// - (1) Init a `mt_rand` object +std::mt19937 mt_rand(std::mt19937::default_seed); + +// - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed. +void set_seed(unsigned custom_seed) { + mt_rand.seed(custom_seed); +} + +// - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. +inline uint32_t bounded_rand_int(uint32_t range) { + // "LibSVM / LibLinear Original way" - make a 31bit positive + // random number and use modulo to make it fit in the range + // return abs( (int)mt_rand()) % range; + + // "Better way": tweaked Lemire post-processor + // from http://www.pcg-random.org/posts/bounded-rands.html + uint32_t x = mt_rand(); + uint64_t m = uint64_t(x) * uint64_t(range); + uint32_t l = uint32_t(m); + if (l < range) { + uint32_t t = -range; + if (t >= range) { + t -= range; + if (t >= range) + t %= range; + } + while (l < t) { + x = mt_rand(); + m = uint64_t(x) * uint64_t(range); + l = uint32_t(m); + } + } + return m >> 32; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _NEWRAND_H */ diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_bounds.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_bounds.py new file mode 100644 index 0000000000000000000000000000000000000000..af7e8cfb1159d1c7520d4b506015727c80391cad --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_bounds.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest +from scipy import stats + +from sklearn.linear_model import LogisticRegression +from sklearn.svm import LinearSVC +from sklearn.svm._bounds import l1_min_c +from sklearn.svm._newrand import bounded_rand_int_wrap, set_seed_wrap +from sklearn.utils.fixes import CSR_CONTAINERS + +dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]] + +Y1 = [0, 1, 1, 1] +Y2 = [2, 1, 0, 0] + + +# TODO(1.8): remove filterwarnings after the deprecation of liblinear multiclass +# and maybe remove LogisticRegression from this test +@pytest.mark.filterwarnings( + "ignore:.*'liblinear' solver for multiclass classification is deprecated.*" +) +@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [np.array]) +@pytest.mark.parametrize("loss", ["squared_hinge", "log"]) +@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"]) +@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"]) +def test_l1_min_c(X_container, loss, Y_label, intercept_label): + Ys = {"two-classes": Y1, "multi-class": Y2} + intercepts = { + "no-intercept": {"fit_intercept": False}, + "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10}, + } + + X = X_container(dense_X) + Y = Ys[Y_label] + intercept_params = intercepts[intercept_label] + check_l1_min_c(X, Y, loss, **intercept_params) + + +def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=1.0): + min_c = 
l1_min_c( + X, + y, + loss=loss, + fit_intercept=fit_intercept, + intercept_scaling=intercept_scaling, + ) + + clf = { + "log": LogisticRegression(penalty="l1", solver="liblinear"), + "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False), + }[loss] + + clf.fit_intercept = fit_intercept + clf.intercept_scaling = intercept_scaling + + clf.C = min_c + clf.fit(X, y) + assert (np.asarray(clf.coef_) == 0).all() + assert (np.asarray(clf.intercept_) == 0).all() + + clf.C = min_c * 1.01 + clf.fit(X, y) + assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any() + + +def test_ill_posed_min_c(): + X = [[0, 0], [0, 0]] + y = [0, 1] + with pytest.raises(ValueError): + l1_min_c(X, y) + + +_MAX_UNSIGNED_INT = 4294967295 + + +def test_newrand_default(): + """Test that bounded_rand_int_wrap without seeding respects the range + + Note this test should pass either if executed alone, or in conjunctions + with other tests that call set_seed explicit in any order: it checks + invariants on the RNG instead of specific values. + """ + generated = [bounded_rand_int_wrap(100) for _ in range(10)] + assert all(0 <= x < 100 for x in generated) + assert not all(x == generated[0] for x in generated) + + +@pytest.mark.parametrize("seed, expected", [(0, 54), (_MAX_UNSIGNED_INT, 9)]) +def test_newrand_set_seed(seed, expected): + """Test that `set_seed` produces deterministic results""" + set_seed_wrap(seed) + generated = bounded_rand_int_wrap(100) + assert generated == expected + + +@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_set_seed_overflow(seed): + """Test that `set_seed_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + set_seed_wrap(seed) + + +@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)]) +def test_newrand_bounded_rand_int(range_, n_pts): + """Test that `bounded_rand_int` follows a uniform distribution""" + # XXX: this test is very seed sensitive: either it is wrong (too strict?) + # or the wrapped RNG is not uniform enough, at least on some platforms. + set_seed_wrap(42) + n_iter = 100 + ks_pvals = [] + uniform_dist = stats.uniform(loc=0, scale=range_) + # perform multiple samplings to make chance of outlier sampling negligible + for _ in range(n_iter): + # Deterministic random sampling + sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)] + res = stats.kstest(sample, uniform_dist.cdf) + ks_pvals.append(res.pvalue) + # Null hypothesis = samples come from an uniform distribution. + # Under the null hypothesis, p-values should be uniformly distributed + # and not concentrated on low values + # (this may seem counter-intuitive but is backed by multiple refs) + # So we can do two checks: + + # (1) check uniformity of p-values + uniform_p_vals_dist = stats.uniform(loc=0, scale=1) + res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf) + assert res_pvals.pvalue > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform." + " Details: the (meta) p-value of the test of uniform distribution" + f" of p-values is {res_pvals.pvalue} which is not > 0.05" + ) + + # (2) (safety belt) check that 90% of p-values are above 0.05 + min_10pct_pval = np.percentile(ks_pvals, q=10) + # lower 10th quantile pvalue <= 0.05 means that the test rejects the + # null hypothesis that the sample came from the uniform distribution + assert min_10pct_pval > 0.05, ( + "Null hypothesis rejected: generated random numbers are not uniform. 
" + f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05." + ) + + +@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1]) +def test_newrand_bounded_rand_int_limits(range_): + """Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints""" + with pytest.raises(OverflowError): + bounded_rand_int_wrap(range_) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_sparse.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..4e22c86a66cd8b5625f100990e441675c7f62e34 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_sparse.py @@ -0,0 +1,496 @@ +import numpy as np +import pytest +from scipy import sparse + +from sklearn import base, datasets, linear_model, svm +from sklearn.datasets import load_digits, make_blobs, make_classification +from sklearn.exceptions import ConvergenceWarning +from sklearn.svm.tests import test_svm +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + ignore_warnings, + skip_if_32bit, +) +from sklearn.utils.extmath import safe_sparse_dot +from sklearn.utils.fixes import ( + CSR_CONTAINERS, + DOK_CONTAINERS, + LIL_CONTAINERS, +) + +# test sample 1 +X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +Y = [1, 1, 1, 2, 2, 2] +T = np.array([[-1, -1], [2, 2], [3, 2]]) +true_result = [1, 2, 2] + +# test sample 2 +X2 = np.array( + [ + [0, 0, 0], + [1, 1, 1], + [2, 0, 0], + [0, 0, 2], + [3, 3, 3], + ] +) +Y2 = [1, 2, 2, 2, 3] +T2 = np.array([[-1, -1, -1], [1, 1, 1], [2, 2, 2]]) +true_result2 = [1, 2, 3] + +iris = datasets.load_iris() +rng = np.random.RandomState(0) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + +X_blobs, y_blobs = make_blobs(n_samples=100, centers=10, random_state=0) + + +def check_svm_model_equal(dense_svm, X_train, y_train, X_test): + # Use the original svm model for dense fit and clone an exactly same + # svm model for sparse fit + sparse_svm = base.clone(dense_svm) + + dense_svm.fit(X_train.toarray(), y_train) + if sparse.issparse(X_test): + X_test_dense = X_test.toarray() + else: + X_test_dense = X_test + sparse_svm.fit(X_train, y_train) + assert sparse.issparse(sparse_svm.support_vectors_) + assert sparse.issparse(sparse_svm.dual_coef_) + assert_allclose(dense_svm.support_vectors_, sparse_svm.support_vectors_.toarray()) + assert_allclose(dense_svm.dual_coef_, sparse_svm.dual_coef_.toarray()) + if dense_svm.kernel == "linear": + assert sparse.issparse(sparse_svm.coef_) + assert_array_almost_equal(dense_svm.coef_, sparse_svm.coef_.toarray()) + assert_allclose(dense_svm.support_, sparse_svm.support_) + assert_allclose(dense_svm.predict(X_test_dense), sparse_svm.predict(X_test)) + + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense), sparse_svm.decision_function(X_test) + ) + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense), + sparse_svm.decision_function(X_test_dense), + ) + if isinstance(dense_svm, svm.OneClassSVM): + msg = "cannot use sparse input in 'OneClassSVM' trained on dense data" + else: + assert_array_almost_equal( + dense_svm.predict_proba(X_test_dense), + sparse_svm.predict_proba(X_test), + decimal=4, + ) + msg = "cannot use sparse input in 'SVC' trained on dense data" + if sparse.issparse(X_test): + with pytest.raises(ValueError, match=msg): + dense_svm.predict(X_test) + + +@skip_if_32bit 
+@pytest.mark.parametrize( + "X_train, y_train, X_test", + [ + [X, Y, T], + [X2, Y2, T2], + [X_blobs[:80], y_blobs[:80], X_blobs[80:]], + [iris.data, iris.target, iris.data], + ], +) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf", "sigmoid"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +def test_svc(X_train, y_train, X_test, kernel, sparse_container): + """Check that sparse SVC gives the same result as SVC.""" + X_train = sparse_container(X_train) + + clf = svm.SVC( + gamma=1, + kernel=kernel, + probability=True, + random_state=0, + decision_function_shape="ovo", + ) + check_svm_model_equal(clf, X_train, y_train, X_test) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_unsorted_indices(csr_container): + # test that the result with sorted and unsorted indices in csr is the same + # we use a subset of digits as iris, blobs or make_classification didn't + # show the problem + X, y = load_digits(return_X_y=True) + X_test = csr_container(X[50:100]) + X, y = X[:50], y[:50] + tols = dict(rtol=1e-12, atol=1e-14) + + X_sparse = csr_container(X) + coef_dense = ( + svm.SVC(kernel="linear", probability=True, random_state=0).fit(X, y).coef_ + ) + sparse_svc = svm.SVC(kernel="linear", probability=True, random_state=0).fit( + X_sparse, y + ) + coef_sorted = sparse_svc.coef_ + # make sure dense and sparse SVM give the same result + assert_allclose(coef_dense, coef_sorted.toarray(), **tols) + + # reverse each row's indices + def scramble_indices(X): + new_data = [] + new_indices = [] + for i in range(1, len(X.indptr)): + row_slice = slice(*X.indptr[i - 1 : i + 1]) + new_data.extend(X.data[row_slice][::-1]) + new_indices.extend(X.indices[row_slice][::-1]) + return csr_container((new_data, new_indices, X.indptr), shape=X.shape) + + X_sparse_unsorted = scramble_indices(X_sparse) + X_test_unsorted = scramble_indices(X_test) + + assert not X_sparse_unsorted.has_sorted_indices + assert not X_test_unsorted.has_sorted_indices + + unsorted_svc = svm.SVC(kernel="linear", probability=True, random_state=0).fit( + X_sparse_unsorted, y + ) + coef_unsorted = unsorted_svc.coef_ + # make sure unsorted indices give same result + assert_allclose(coef_unsorted.toarray(), coef_sorted.toarray(), **tols) + assert_allclose( + sparse_svc.predict_proba(X_test_unsorted), + sparse_svc.predict_proba(X_test), + **tols, + ) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_svc_with_custom_kernel(lil_container): + def kfunc(x, y): + return safe_sparse_dot(x, y.T) + + X_sp = lil_container(X) + clf_lin = svm.SVC(kernel="linear").fit(X_sp, Y) + clf_mylin = svm.SVC(kernel=kfunc).fit(X_sp, Y) + assert_array_equal(clf_lin.predict(X_sp), clf_mylin.predict(X_sp)) + + +@skip_if_32bit +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf"]) +def test_svc_iris(csr_container, kernel): + # Test the sparse SVC with the iris dataset + iris_data_sp = csr_container(iris.data) + + sp_clf = svm.SVC(kernel=kernel).fit(iris_data_sp, iris.target) + clf = svm.SVC(kernel=kernel).fit(iris.data, iris.target) + + assert_allclose(clf.support_vectors_, sp_clf.support_vectors_.toarray()) + assert_allclose(clf.dual_coef_, sp_clf.dual_coef_.toarray()) + assert_allclose(clf.predict(iris.data), sp_clf.predict(iris_data_sp)) + if kernel == "linear": + assert_allclose(clf.coef_, sp_clf.coef_.toarray()) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_decision_function(csr_container): 
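+    # For a linear kernel the (ovo) decision function is an affine map of the
+    # input, so it can be recomputed directly from the fitted attributes as
+    # safe_sparse_dot(X, coef_.T) + intercept_, which is what this test checks
+    # against the libsvm implementation.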
+ # Test decision_function + + # Sanity check, test that decision_function implemented in python + # returns the same as the one in libsvm + + # multi class: + iris_data_sp = csr_container(iris.data) + svc = svm.SVC(kernel="linear", C=0.1, decision_function_shape="ovo") + clf = svc.fit(iris_data_sp, iris.target) + + dec = safe_sparse_dot(iris_data_sp, clf.coef_.T) + clf.intercept_ + + assert_allclose(dec, clf.decision_function(iris_data_sp)) + + # binary: + clf.fit(X, Y) + dec = np.dot(X, clf.coef_.T) + clf.intercept_ + prediction = clf.predict(X) + assert_allclose(dec.ravel(), clf.decision_function(X)) + assert_allclose( + prediction, clf.classes_[(clf.decision_function(X) > 0).astype(int).ravel()] + ) + expected = np.array([-1.0, -0.66, -1.0, 0.66, 1.0, 1.0]) + assert_array_almost_equal(clf.decision_function(X), expected, decimal=2) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_error(lil_container): + # Test that it gives proper exception on deficient input + clf = svm.SVC() + X_sp = lil_container(X) + + Y2 = Y[:-1] # wrong dimensions for labels + with pytest.raises(ValueError): + clf.fit(X_sp, Y2) + + clf.fit(X_sp, Y) + assert_array_equal(clf.predict(T), true_result) + + +@pytest.mark.parametrize( + "lil_container, dok_container", zip(LIL_CONTAINERS, DOK_CONTAINERS) +) +def test_linearsvc(lil_container, dok_container): + # Similar to test_SVC + X_sp = lil_container(X) + X2_sp = dok_container(X2) + + clf = svm.LinearSVC(random_state=0).fit(X, Y) + sp_clf = svm.LinearSVC(random_state=0).fit(X_sp, Y) + + assert sp_clf.fit_intercept + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=4) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=4) + + assert_allclose(clf.predict(X), sp_clf.predict(X_sp)) + + clf.fit(X2, Y2) + sp_clf.fit(X2_sp, Y2) + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=4) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=4) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_linearsvc_iris(csr_container): + # Test the sparse LinearSVC with the iris dataset + iris_data_sp = csr_container(iris.data) + + sp_clf = svm.LinearSVC(random_state=0).fit(iris_data_sp, iris.target) + clf = svm.LinearSVC(random_state=0).fit(iris.data, iris.target) + + assert clf.fit_intercept == sp_clf.fit_intercept + + assert_array_almost_equal(clf.coef_, sp_clf.coef_, decimal=1) + assert_array_almost_equal(clf.intercept_, sp_clf.intercept_, decimal=1) + assert_allclose(clf.predict(iris.data), sp_clf.predict(iris_data_sp)) + + # check decision_function + pred = np.argmax(sp_clf.decision_function(iris_data_sp), axis=1) + assert_allclose(pred, clf.predict(iris.data)) + + # sparsify the coefficients on both models and check that they still + # produce the same results + clf.sparsify() + assert_array_equal(pred, clf.predict(iris_data_sp)) + sp_clf.sparsify() + assert_array_equal(pred, sp_clf.predict(iris_data_sp)) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_weight(csr_container): + # Test class weights + X_, y_ = make_classification( + n_samples=200, n_features=100, weights=[0.833, 0.167], random_state=0 + ) + + X_ = csr_container(X_) + for clf in ( + linear_model.LogisticRegression(), + svm.LinearSVC(random_state=0), + svm.SVC(), + ): + clf.set_params(class_weight={0: 5}) + clf.fit(X_[:180], y_[:180]) + y_pred = clf.predict(X_[180:]) + assert np.sum(y_pred == y_[180:]) >= 11 + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def 
test_sample_weights(lil_container): + # Test weights on individual samples + X_sp = lil_container(X) + + clf = svm.SVC() + clf.fit(X_sp, Y) + assert_array_equal(clf.predict([X[2]]), [1.0]) + + sample_weight = [0.1] * 3 + [10] * 3 + clf.fit(X_sp, Y, sample_weight=sample_weight) + assert_array_equal(clf.predict([X[2]]), [2.0]) + + +def test_sparse_liblinear_intercept_handling(): + # Test that sparse liblinear honours intercept_scaling param + test_svm.test_dense_liblinear_intercept_handling(svm.LinearSVC) + + +@pytest.mark.parametrize( + "X_train, y_train, X_test", + [ + [X, None, T], + [X2, None, T2], + [X_blobs[:80], None, X_blobs[80:]], + [iris.data, None, iris.data], + ], +) +@pytest.mark.parametrize("kernel", ["linear", "poly", "rbf", "sigmoid"]) +@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + LIL_CONTAINERS) +@skip_if_32bit +def test_sparse_oneclasssvm(X_train, y_train, X_test, kernel, sparse_container): + # Check that sparse OneClassSVM gives the same result as dense OneClassSVM + X_train = sparse_container(X_train) + + clf = svm.OneClassSVM(gamma=1, kernel=kernel) + check_svm_model_equal(clf, X_train, y_train, X_test) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_realdata(csr_container): + # Test on a subset from the 20newsgroups dataset. + # This catches some bugs if input is not correctly converted into + # sparse format or weights are not correctly initialized. + data = np.array([0.03771744, 0.1003567, 0.01174647, 0.027069]) + + # SVC does not support large sparse, so we specify int32 indices + # In this case, `csr_matrix` automatically uses int32 regardless of the dtypes of + # `indices` and `indptr` but `csr_array` may or may not use the same dtype as + # `indices` and `indptr`, which would be int64 if not specified + indices = np.array([6, 5, 35, 31], dtype=np.int32) + indptr = np.array([0] * 8 + [1] * 32 + [2] * 38 + [4] * 3, dtype=np.int32) + + X = csr_container((data, indices, indptr)) + y = np.array( + [ + 1.0, + 0.0, + 2.0, + 2.0, + 1.0, + 1.0, + 1.0, + 2.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 0.0, + 2.0, + 0.0, + 3.0, + 0.0, + 3.0, + 0.0, + 1.0, + 1.0, + 3.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 1.0, + 0.0, + 2.0, + 1.0, + 2.0, + 0.0, + 1.0, + 0.0, + 2.0, + 3.0, + 1.0, + 3.0, + 0.0, + 1.0, + 0.0, + 0.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 2.0, + 1.0, + 2.0, + 3.0, + 2.0, + 2.0, + 0.0, + 1.0, + 0.0, + 1.0, + 2.0, + 3.0, + 0.0, + 0.0, + 2.0, + 2.0, + 1.0, + 3.0, + 1.0, + 1.0, + 0.0, + 1.0, + 2.0, + 1.0, + 1.0, + 3.0, + ] + ) + + clf = svm.SVC(kernel="linear").fit(X.toarray(), y) + sp_clf = svm.SVC(kernel="linear").fit(X.tocoo(), y) + + assert_array_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) + assert_array_equal(clf.dual_coef_, sp_clf.dual_coef_.toarray()) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_sparse_svc_clone_with_callable_kernel(lil_container): + # Test that the "dense_fit" is called even though we use sparse input + # meaning that everything works fine. 
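+    # With a callable kernel the sparse input is expected to be handled by the
+    # dense code path internally, so cloning and refitting on a sparse matrix
+    # should match the dense estimator fitted below.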
+ a = svm.SVC(C=1, kernel=lambda x, y: x @ y.T, probability=True, random_state=0) + b = base.clone(a) + + X_sp = lil_container(X) + b.fit(X_sp, Y) + pred = b.predict(X_sp) + b.predict_proba(X_sp) + + dense_svm = svm.SVC( + C=1, kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0 + ) + pred_dense = dense_svm.fit(X, Y).predict(X) + assert_array_equal(pred_dense, pred) + # b.decision_function(X_sp) # XXX : should be supported + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_timeout(lil_container): + sp = svm.SVC( + C=1, kernel=lambda x, y: x @ y.T, probability=True, random_state=0, max_iter=1 + ) + warning_msg = ( + r"Solver terminated early \(max_iter=1\). Consider pre-processing " + r"your data with StandardScaler or MinMaxScaler." + ) + with pytest.warns(ConvergenceWarning, match=warning_msg): + sp.fit(lil_container(X), Y) + + +def test_consistent_proba(): + a = svm.SVC(probability=True, max_iter=1, random_state=0) + with ignore_warnings(category=ConvergenceWarning): + proba_1 = a.fit(X, Y).predict_proba(X) + a = svm.SVC(probability=True, max_iter=1, random_state=0) + with ignore_warnings(category=ConvergenceWarning): + proba_2 = a.fit(X, Y).predict_proba(X) + assert_allclose(proba_1, proba_2) diff --git a/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_svm.py b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_svm.py new file mode 100644 index 0000000000000000000000000000000000000000..62396451e736d02fffce21dd1f7219eba2614199 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/svm/tests/test_svm.py @@ -0,0 +1,1440 @@ +""" +Testing for Support Vector Machine module (sklearn.svm) + +TODO: remove hard coded numerical results when possible +""" + +import numpy as np +import pytest +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) + +from sklearn import base, datasets, linear_model, metrics, svm +from sklearn.datasets import make_blobs, make_classification, make_regression +from sklearn.exceptions import ( + ConvergenceWarning, + NotFittedError, +) +from sklearn.metrics import f1_score +from sklearn.metrics.pairwise import rbf_kernel +from sklearn.model_selection import train_test_split +from sklearn.multiclass import OneVsRestClassifier + +# mypy error: Module 'sklearn.svm' has no attribute '_libsvm' +from sklearn.svm import ( # type: ignore[attr-defined] + SVR, + LinearSVC, + LinearSVR, + NuSVR, + OneClassSVM, + _libsvm, +) +from sklearn.svm._classes import _validate_dual_parameter +from sklearn.utils import check_random_state, shuffle +from sklearn.utils.fixes import _IS_32BIT, CSR_CONTAINERS, LIL_CONTAINERS +from sklearn.utils.validation import _num_samples + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +Y = [1, 1, 1, 2, 2, 2] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [1, 2, 2] + +# also load the iris dataset +iris = datasets.load_iris() +rng = check_random_state(42) +perm = rng.permutation(iris.target.size) +iris.data = iris.data[perm] +iris.target = iris.target[perm] + + +def test_libsvm_parameters(): + # Test parameters on classes that make use of libsvm. + clf = svm.SVC(kernel="linear").fit(X, Y) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.support_, [1, 3]) + assert_array_equal(clf.support_vectors_, (X[1], X[3])) + assert_array_equal(clf.intercept_, [0.0]) + assert_array_equal(clf.predict(X), Y) + + +def test_libsvm_iris(): + # Check consistency on dataset iris. 
+ + # shuffle the dataset so that labels are not ordered + for k in ("linear", "rbf"): + clf = svm.SVC(kernel=k).fit(iris.data, iris.target) + assert np.mean(clf.predict(iris.data) == iris.target) > 0.9 + assert hasattr(clf, "coef_") == (k == "linear") + + assert_array_equal(clf.classes_, np.sort(clf.classes_)) + + # check also the low-level API + # We unpack the values to create a dictionary with some of the return values + # from Libsvm's fit. + ( + libsvm_support, + libsvm_support_vectors, + libsvm_n_class_SV, + libsvm_sv_coef, + libsvm_intercept, + libsvm_probA, + libsvm_probB, + # libsvm_fit_status and libsvm_n_iter won't be used below. + libsvm_fit_status, + libsvm_n_iter, + ) = _libsvm.fit(iris.data, iris.target.astype(np.float64)) + + model_params = { + "support": libsvm_support, + "SV": libsvm_support_vectors, + "nSV": libsvm_n_class_SV, + "sv_coef": libsvm_sv_coef, + "intercept": libsvm_intercept, + "probA": libsvm_probA, + "probB": libsvm_probB, + } + pred = _libsvm.predict(iris.data, **model_params) + assert np.mean(pred == iris.target) > 0.95 + + # We unpack the values to create a dictionary with some of the return values + # from Libsvm's fit. + ( + libsvm_support, + libsvm_support_vectors, + libsvm_n_class_SV, + libsvm_sv_coef, + libsvm_intercept, + libsvm_probA, + libsvm_probB, + # libsvm_fit_status and libsvm_n_iter won't be used below. + libsvm_fit_status, + libsvm_n_iter, + ) = _libsvm.fit(iris.data, iris.target.astype(np.float64), kernel="linear") + + model_params = { + "support": libsvm_support, + "SV": libsvm_support_vectors, + "nSV": libsvm_n_class_SV, + "sv_coef": libsvm_sv_coef, + "intercept": libsvm_intercept, + "probA": libsvm_probA, + "probB": libsvm_probB, + } + pred = _libsvm.predict(iris.data, **model_params, kernel="linear") + assert np.mean(pred == iris.target) > 0.95 + + pred = _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + assert np.mean(pred == iris.target) > 0.95 + + # If random_seed >= 0, the libsvm rng is seeded (by calling `srand`), hence + # we should get deterministic results (assuming that there is no other + # thread calling this wrapper calling `srand` concurrently). + pred2 = _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + assert_array_equal(pred, pred2) + + +def test_precomputed(): + # SVC with a precomputed kernel. + # We test it with a toy dataset and with iris. + clf = svm.SVC(kernel="precomputed") + # Gram matrix for train data (square matrix) + # (we use just a linear kernel) + K = np.dot(X, np.array(X).T) + clf.fit(K, Y) + # Gram matrix for test data (rectangular matrix) + KT = np.dot(T, np.array(X).T) + pred = clf.predict(KT) + with pytest.raises(ValueError): + clf.predict(KT.T) + + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.support_, [1, 3]) + assert_array_equal(clf.intercept_, [0]) + assert_array_almost_equal(clf.support_, [1, 3]) + assert_array_equal(pred, true_result) + + # Gram matrix for test data but compute KT[i,j] + # for support vectors j only. + KT = np.zeros_like(KT) + for i in range(len(T)): + for j in clf.support_: + KT[i, j] = np.dot(T[i], X[j]) + + pred = clf.predict(KT) + assert_array_equal(pred, true_result) + + # same as before, but using a callable function instead of the kernel + # matrix. 
kernel is just a linear kernel + + def kfunc(x, y): + return np.dot(x, y.T) + + clf = svm.SVC(kernel=kfunc) + clf.fit(np.array(X), Y) + pred = clf.predict(T) + + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.intercept_, [0]) + assert_array_almost_equal(clf.support_, [1, 3]) + assert_array_equal(pred, true_result) + + # test a precomputed kernel with the iris dataset + # and check parameters against a linear SVC + clf = svm.SVC(kernel="precomputed") + clf2 = svm.SVC(kernel="linear") + K = np.dot(iris.data, iris.data.T) + clf.fit(K, iris.target) + clf2.fit(iris.data, iris.target) + pred = clf.predict(K) + assert_array_almost_equal(clf.support_, clf2.support_) + assert_array_almost_equal(clf.dual_coef_, clf2.dual_coef_) + assert_array_almost_equal(clf.intercept_, clf2.intercept_) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + # Gram matrix for test data but compute KT[i,j] + # for support vectors j only. + K = np.zeros_like(K) + for i in range(len(iris.data)): + for j in clf.support_: + K[i, j] = np.dot(iris.data[i], iris.data[j]) + + pred = clf.predict(K) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + clf = svm.SVC(kernel=kfunc) + clf.fit(iris.data, iris.target) + assert_almost_equal(np.mean(pred == iris.target), 0.99, decimal=2) + + +def test_svr(): + # Test Support Vector Regression + + diabetes = datasets.load_diabetes() + for clf in ( + svm.NuSVR(kernel="linear", nu=0.4, C=1.0), + svm.NuSVR(kernel="linear", nu=0.4, C=10.0), + svm.SVR(kernel="linear", C=10.0), + svm.LinearSVR(C=10.0), + svm.LinearSVR(C=10.0), + ): + clf.fit(diabetes.data, diabetes.target) + assert clf.score(diabetes.data, diabetes.target) > 0.02 + + # non-regression test; previously, BaseLibSVM would check that + # len(np.unique(y)) < 2, which must only be done for SVC + svm.SVR().fit(diabetes.data, np.ones(len(diabetes.data))) + svm.LinearSVR().fit(diabetes.data, np.ones(len(diabetes.data))) + + +def test_linearsvr(): + # check that SVR(kernel='linear') and LinearSVC() give + # comparable results + diabetes = datasets.load_diabetes() + lsvr = svm.LinearSVR(C=1e3).fit(diabetes.data, diabetes.target) + score1 = lsvr.score(diabetes.data, diabetes.target) + + svr = svm.SVR(kernel="linear", C=1e3).fit(diabetes.data, diabetes.target) + score2 = svr.score(diabetes.data, diabetes.target) + + assert_allclose(np.linalg.norm(lsvr.coef_), np.linalg.norm(svr.coef_), 1, 0.0001) + assert_almost_equal(score1, score2, 2) + + +def test_linearsvr_fit_sampleweight(): + # check correct result when sample_weight is 1 + # check that SVR(kernel='linear') and LinearSVC() give + # comparable results + diabetes = datasets.load_diabetes() + n_samples = len(diabetes.target) + unit_weight = np.ones(n_samples) + lsvr = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target, sample_weight=unit_weight + ) + score1 = lsvr.score(diabetes.data, diabetes.target) + + lsvr_no_weight = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target + ) + score2 = lsvr_no_weight.score(diabetes.data, diabetes.target) + + assert_allclose( + np.linalg.norm(lsvr.coef_), np.linalg.norm(lsvr_no_weight.coef_), 1, 0.0001 + ) + assert_almost_equal(score1, score2, 2) + + # check that fit(X) = fit([X1, X2, X3], sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + random_state = check_random_state(0) + random_weight = random_state.randint(0, 10, n_samples) + lsvr_unflat = 
svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit( + diabetes.data, diabetes.target, sample_weight=random_weight + ) + score3 = lsvr_unflat.score( + diabetes.data, diabetes.target, sample_weight=random_weight + ) + + X_flat = np.repeat(diabetes.data, random_weight, axis=0) + y_flat = np.repeat(diabetes.target, random_weight, axis=0) + lsvr_flat = svm.LinearSVR(C=1e3, tol=1e-12, max_iter=10000).fit(X_flat, y_flat) + score4 = lsvr_flat.score(X_flat, y_flat) + + assert_almost_equal(score3, score4, 2) + + +def test_svr_errors(): + X = [[0.0], [1.0]] + y = [0.0, 0.5] + + # Bad kernel + clf = svm.SVR(kernel=lambda x, y: np.array([[1.0]])) + clf.fit(X, y) + with pytest.raises(ValueError): + clf.predict(X) + + +def test_oneclass(): + # Test OneClassSVM + clf = svm.OneClassSVM() + clf.fit(X) + pred = clf.predict(T) + + assert_array_equal(pred, [1, -1, -1]) + assert pred.dtype == np.dtype("intp") + assert_array_almost_equal(clf.intercept_, [-1.218], decimal=3) + assert_array_almost_equal(clf.dual_coef_, [[0.750, 0.750, 0.750, 0.750]], decimal=3) + with pytest.raises(AttributeError): + (lambda: clf.coef_)() + + +def test_oneclass_decision_function(): + # Test OneClassSVM decision function + clf = svm.OneClassSVM() + rnd = check_random_state(2) + + # Generate train data + X = 0.3 * rnd.randn(100, 2) + X_train = np.r_[X + 2, X - 2] + + # Generate some regular novel observations + X = 0.3 * rnd.randn(20, 2) + X_test = np.r_[X + 2, X - 2] + # Generate some abnormal novel observations + X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2)) + + # fit the model + clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1) + clf.fit(X_train) + + # predict things + y_pred_test = clf.predict(X_test) + assert np.mean(y_pred_test == 1) > 0.9 + y_pred_outliers = clf.predict(X_outliers) + assert np.mean(y_pred_outliers == -1) > 0.9 + dec_func_test = clf.decision_function(X_test) + assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1) + dec_func_outliers = clf.decision_function(X_outliers) + assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1) + + +def test_oneclass_score_samples(): + X_train = [[1, 1], [1, 2], [2, 1]] + clf = svm.OneClassSVM(gamma=1).fit(X_train) + assert_array_equal( + clf.score_samples([[2.0, 2.0]]), + clf.decision_function([[2.0, 2.0]]) + clf.offset_, + ) + + +def test_tweak_params(): + # Make sure some tweaking of parameters works. + # We change clf.dual_coef_ at run time and expect .predict() to change + # accordingly. Notice that this is not trivial since it involves a lot + # of C/Python copying in the libsvm bindings. + # The success of this test ensures that the mapping between libsvm and + # the python classifier is complete. + clf = svm.SVC(kernel="linear", C=1.0) + clf.fit(X, Y) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) + assert_array_equal(clf.predict([[-0.1, -0.1]]), [1]) + clf._dual_coef_ = np.array([[0.0, 1.0]]) + assert_array_equal(clf.predict([[-0.1, -0.1]]), [2]) + + +def test_probability(): + # Predict probabilities using SVC + # This uses cross validation, so we use a slightly bigger testing set. 
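+    # With probability=True, libsvm fits a Platt-style probability calibration
+    # using internal cross-validation, so predict_proba is not guaranteed to
+    # agree exactly with predict; the checks below therefore only require a
+    # high (> 0.9) agreement between argmax(predict_proba) and predict.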
+ + for clf in ( + svm.SVC(probability=True, random_state=0, C=1.0), + svm.NuSVC(probability=True, random_state=0), + ): + clf.fit(iris.data, iris.target) + + prob_predict = clf.predict_proba(iris.data) + assert_array_almost_equal(np.sum(prob_predict, 1), np.ones(iris.data.shape[0])) + assert np.mean(np.argmax(prob_predict, 1) == clf.predict(iris.data)) > 0.9 + + assert_almost_equal( + clf.predict_proba(iris.data), np.exp(clf.predict_log_proba(iris.data)), 8 + ) + + +def test_decision_function(): + # Test decision_function + # Sanity check, test that decision_function implemented in python + # returns the same as the one in libsvm + # multi class: + clf = svm.SVC(kernel="linear", C=0.1, decision_function_shape="ovo").fit( + iris.data, iris.target + ) + + dec = np.dot(iris.data, clf.coef_.T) + clf.intercept_ + + assert_array_almost_equal(dec, clf.decision_function(iris.data)) + + # binary: + clf.fit(X, Y) + dec = np.dot(X, clf.coef_.T) + clf.intercept_ + prediction = clf.predict(X) + assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) + assert_array_almost_equal( + prediction, clf.classes_[(clf.decision_function(X) > 0).astype(int)] + ) + expected = np.array([-1.0, -0.66, -1.0, 0.66, 1.0, 1.0]) + assert_array_almost_equal(clf.decision_function(X), expected, 2) + + # kernel binary: + clf = svm.SVC(kernel="rbf", gamma=1, decision_function_shape="ovo") + clf.fit(X, Y) + + rbfs = rbf_kernel(X, clf.support_vectors_, gamma=clf.gamma) + dec = np.dot(rbfs, clf.dual_coef_.T) + clf.intercept_ + assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) + + +@pytest.mark.parametrize("SVM", (svm.SVC, svm.NuSVC)) +def test_decision_function_shape(SVM): + # check that decision_function_shape='ovr' or 'ovo' gives + # correct shape and is consistent with predict + + clf = SVM(kernel="linear", decision_function_shape="ovr").fit( + iris.data, iris.target + ) + dec = clf.decision_function(iris.data) + assert dec.shape == (len(iris.data), 3) + assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1)) + + # with five classes: + X, y = make_blobs(n_samples=80, centers=5, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + clf = SVM(kernel="linear", decision_function_shape="ovr").fit(X_train, y_train) + dec = clf.decision_function(X_test) + assert dec.shape == (len(X_test), 5) + assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1)) + + # check shape of ovo_decition_function=True + clf = SVM(kernel="linear", decision_function_shape="ovo").fit(X_train, y_train) + dec = clf.decision_function(X_train) + assert dec.shape == (len(X_train), 10) + + +def test_svr_predict(): + # Test SVR's decision_function + # Sanity check, test that predict implemented in python + # returns the same as the one in libsvm + + X = iris.data + y = iris.target + + # linear kernel + reg = svm.SVR(kernel="linear", C=0.1).fit(X, y) + + dec = np.dot(X, reg.coef_.T) + reg.intercept_ + assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel()) + + # rbf kernel + reg = svm.SVR(kernel="rbf", gamma=1).fit(X, y) + + rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma) + dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_ + assert_array_almost_equal(dec.ravel(), reg.predict(X).ravel()) + + +def test_weight(): + # Test class weights + clf = svm.SVC(class_weight={1: 0.1}) + # we give a small weights to class 1 + clf.fit(X, Y) + # so all predicted values belong to class 2 + assert_array_almost_equal(clf.predict(X), [2] * 6) + + X_, y_ = make_classification( + 
n_samples=200, n_features=10, weights=[0.833, 0.167], random_state=2 + ) + + for clf in ( + linear_model.LogisticRegression(), + svm.LinearSVC(random_state=0), + svm.SVC(), + ): + clf.set_params(class_weight={0: 0.1, 1: 10}) + clf.fit(X_[:100], y_[:100]) + y_pred = clf.predict(X_[100:]) + assert f1_score(y_[100:], y_pred) > 0.3 + + +@pytest.mark.parametrize("estimator", [svm.SVC(C=1e-2), svm.NuSVC()]) +def test_svm_classifier_sided_sample_weight(estimator): + # fit a linear SVM and check that giving more weight to opposed samples + # in the space will flip the decision toward these samples. + X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] + estimator.set_params(kernel="linear") + + # check that with unit weights, a sample is supposed to be predicted on + # the boundary + sample_weight = [1] * 6 + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred == pytest.approx(0) + + # give more weights to opposed samples + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred < 0 + + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.decision_function([[-1.0, 1.0]]) + assert y_pred > 0 + + +@pytest.mark.parametrize("estimator", [svm.SVR(C=1e-2), svm.NuSVR(C=1e-2)]) +def test_svm_regressor_sided_sample_weight(estimator): + # similar test to test_svm_classifier_sided_sample_weight but for + # SVM regressors + X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] + estimator.set_params(kernel="linear") + + # check that with unit weights, a sample is supposed to be predicted on + # the boundary + sample_weight = [1] * 6 + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred == pytest.approx(1.5) + + # give more weights to opposed samples + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred < 1.5 + + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] + estimator.fit(X, Y, sample_weight=sample_weight) + y_pred = estimator.predict([[-1.0, 1.0]]) + assert y_pred > 1.5 + + +def test_svm_equivalence_sample_weight_C(): + # test that rescaling all samples is the same as changing C + clf = svm.SVC() + clf.fit(X, Y) + dual_coef_no_weight = clf.dual_coef_ + clf.set_params(C=100) + clf.fit(X, Y, sample_weight=np.repeat(0.01, len(X))) + assert_allclose(dual_coef_no_weight, clf.dual_coef_) + + +@pytest.mark.parametrize( + "Estimator, err_msg", + [ + (svm.SVC, "Invalid input - all samples have zero or negative weights."), + (svm.NuSVC, "(negative dimensions are not allowed|nu is infeasible)"), + (svm.SVR, "Invalid input - all samples have zero or negative weights."), + (svm.NuSVR, "Invalid input - all samples have zero or negative weights."), + (svm.OneClassSVM, "Invalid input - all samples have zero or negative weights."), + ], + ids=["SVC", "NuSVC", "SVR", "NuSVR", "OneClassSVM"], +) +@pytest.mark.parametrize( + "sample_weight", + [[0] * len(Y), [-0.3] * len(Y)], + ids=["weights-are-zero", "weights-are-negative"], +) +def test_negative_sample_weights_mask_all_samples(Estimator, err_msg, sample_weight): + est = Estimator(kernel="linear") + with pytest.raises(ValueError, match=err_msg): + est.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "Classifier, err_msg", + [ + ( + 
svm.SVC, + ( + "Invalid input - all samples with positive weights belong to the same" + " class" + ), + ), + (svm.NuSVC, "specified nu is infeasible"), + ], + ids=["SVC", "NuSVC"], +) +@pytest.mark.parametrize( + "sample_weight", + [[0, -0.5, 0, 1, 1, 1], [1, 1, 1, 0, -0.1, -0.3]], + ids=["mask-label-1", "mask-label-2"], +) +def test_negative_weights_svc_leave_just_one_label(Classifier, err_msg, sample_weight): + clf = Classifier(kernel="linear") + with pytest.raises(ValueError, match=err_msg): + clf.fit(X, Y, sample_weight=sample_weight) + + +@pytest.mark.parametrize( + "Classifier, model", + [ + (svm.SVC, {"when-left": [0.3998, 0.4], "when-right": [0.4, 0.3999]}), + (svm.NuSVC, {"when-left": [0.3333, 0.3333], "when-right": [0.3333, 0.3333]}), + ], + ids=["SVC", "NuSVC"], +) +@pytest.mark.parametrize( + "sample_weight, mask_side", + [([1, -0.5, 1, 1, 1, 1], "when-left"), ([1, 1, 1, 0, 1, 1], "when-right")], + ids=["partial-mask-label-1", "partial-mask-label-2"], +) +def test_negative_weights_svc_leave_two_labels( + Classifier, model, sample_weight, mask_side +): + clf = Classifier(kernel="linear") + clf.fit(X, Y, sample_weight=sample_weight) + assert_allclose(clf.coef_, [model[mask_side]], rtol=1e-3) + + +@pytest.mark.parametrize( + "Estimator", [svm.SVC, svm.NuSVC, svm.NuSVR], ids=["SVC", "NuSVC", "NuSVR"] +) +@pytest.mark.parametrize( + "sample_weight", + [[1, -0.5, 1, 1, 1, 1], [1, 1, 1, 0, 1, 1]], + ids=["partial-mask-label-1", "partial-mask-label-2"], +) +def test_negative_weight_equal_coeffs(Estimator, sample_weight): + # model generates equal coefficients + est = Estimator(kernel="linear") + est.fit(X, Y, sample_weight=sample_weight) + coef = np.abs(est.coef_).ravel() + assert coef[0] == pytest.approx(coef[1], rel=1e-3) + + +def test_auto_weight(): + # Test class weights for imbalanced data + from sklearn.linear_model import LogisticRegression + + # We take as dataset the two-dimensional projection of iris so + # that it is not separable and remove half of predictors from + # class 1. + # We add one to the targets as a non-regression test: + # class_weight="balanced" + # used to work only when the labels where a range [0..K). + from sklearn.utils import compute_class_weight + + X, y = iris.data[:, :2], iris.target + 1 + unbalanced = np.delete(np.arange(y.size), np.where(y > 2)[0][::2]) + + classes = np.unique(y[unbalanced]) + class_weights = compute_class_weight("balanced", classes=classes, y=y[unbalanced]) + assert np.argmax(class_weights) == 2 + + for clf in ( + svm.SVC(kernel="linear"), + svm.LinearSVC(random_state=0), + LogisticRegression(), + ): + # check that score is better when class='balanced' is set. + y_pred = clf.fit(X[unbalanced], y[unbalanced]).predict(X) + clf.set_params(class_weight="balanced") + y_pred_balanced = clf.fit( + X[unbalanced], + y[unbalanced], + ).predict(X) + assert metrics.f1_score(y, y_pred, average="macro") <= metrics.f1_score( + y, y_pred_balanced, average="macro" + ) + + +@pytest.mark.parametrize("lil_container", LIL_CONTAINERS) +def test_bad_input(lil_container): + # Test dimensions for labels + Y2 = Y[:-1] # wrong dimensions for labels + with pytest.raises(ValueError): + svm.SVC().fit(X, Y2) + + # Test with arrays that are non-contiguous. 
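+    # np.asfortranarray yields a Fortran-ordered (non C-contiguous) X, and the
+    # column sliced out of the tiled array below is neither C- nor F-contiguous;
+    # the estimators are expected to cope with such inputs (copying if needed).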
+ for clf in (svm.SVC(), svm.LinearSVC(random_state=0)): + Xf = np.asfortranarray(X) + assert not Xf.flags["C_CONTIGUOUS"] + yf = np.ascontiguousarray(np.tile(Y, (2, 1)).T) + yf = yf[:, -1] + assert not yf.flags["F_CONTIGUOUS"] + assert not yf.flags["C_CONTIGUOUS"] + clf.fit(Xf, yf) + assert_array_equal(clf.predict(T), true_result) + + # error for precomputed kernelsx + clf = svm.SVC(kernel="precomputed") + with pytest.raises(ValueError): + clf.fit(X, Y) + + # predict with sparse input when trained with dense + clf = svm.SVC().fit(X, Y) + with pytest.raises(ValueError): + clf.predict(lil_container(X)) + + Xt = np.array(X).T + clf.fit(np.dot(X, Xt), Y) + with pytest.raises(ValueError): + clf.predict(X) + + clf = svm.SVC() + clf.fit(X, Y) + with pytest.raises(ValueError): + clf.predict(Xt) + + +def test_svc_nonfinite_params(): + # Check SVC throws ValueError when dealing with non-finite parameter values + rng = np.random.RandomState(0) + n_samples = 10 + fmax = np.finfo(np.float64).max + X = fmax * rng.uniform(size=(n_samples, 2)) + y = rng.randint(0, 2, size=n_samples) + + clf = svm.SVC() + msg = "The dual coefficients or intercepts are not finite" + with pytest.raises(ValueError, match=msg): + clf.fit(X, y) + + +def test_unicode_kernel(): + # Test that a unicode kernel name does not cause a TypeError + clf = svm.SVC(kernel="linear", probability=True) + clf.fit(X, Y) + clf.predict_proba(T) + _libsvm.cross_validation( + iris.data, iris.target.astype(np.float64), 5, kernel="linear", random_seed=0 + ) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_precomputed(csr_container): + clf = svm.SVC(kernel="precomputed") + sparse_gram = csr_container([[1, 0], [0, 1]]) + with pytest.raises(TypeError, match="Sparse precomputed"): + clf.fit(sparse_gram, [0, 1]) + + +@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) +def test_sparse_fit_support_vectors_empty(csr_container): + # Regression test for #14893 + X_train = csr_container([[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]]) + y_train = np.array([0.04, 0.04, 0.10, 0.16]) + model = svm.SVR(kernel="linear") + model.fit(X_train, y_train) + assert not model.support_vectors_.data.size + assert not model.dual_coef_.data.size + + +@pytest.mark.parametrize("loss", ["hinge", "squared_hinge"]) +@pytest.mark.parametrize("penalty", ["l1", "l2"]) +@pytest.mark.parametrize("dual", [True, False]) +def test_linearsvc_parameters(loss, penalty, dual): + # Test possible parameter combinations in LinearSVC + # Generate list of possible parameter combinations + X, y = make_classification(n_samples=5, n_features=5, random_state=0) + + clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual, random_state=0) + if ( + (loss, penalty) == ("hinge", "l1") + or (loss, penalty, dual) == ("hinge", "l2", False) + or (penalty, dual) == ("l1", True) + ): + with pytest.raises( + ValueError, + match="Unsupported set of arguments.*penalty='%s.*loss='%s.*dual=%s" + % (penalty, loss, dual), + ): + clf.fit(X, y) + else: + clf.fit(X, y) + + +def test_linearsvc(): + # Test basic routines using LinearSVC + clf = svm.LinearSVC(random_state=0).fit(X, Y) + + # by default should have intercept + assert clf.fit_intercept + + assert_array_equal(clf.predict(T), true_result) + assert_array_almost_equal(clf.intercept_, [0], decimal=3) + + # the same with l1 penalty + clf = svm.LinearSVC( + penalty="l1", loss="squared_hinge", dual=False, random_state=0 + ).fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # l2 penalty with dual formulation 
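+    # (with the l2 penalty, the squared hinge loss can be solved in either the
+    # primal or the dual; the plain hinge loss used next is only supported with
+    # dual=True)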
+ clf = svm.LinearSVC(penalty="l2", dual=True, random_state=0).fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # l2 penalty, l1 loss + clf = svm.LinearSVC(penalty="l2", loss="hinge", dual=True, random_state=0) + clf.fit(X, Y) + assert_array_equal(clf.predict(T), true_result) + + # test also decision function + dec = clf.decision_function(T) + res = (dec > 0).astype(int) + 1 + assert_array_equal(res, true_result) + + +def test_linearsvc_crammer_singer(): + # Test LinearSVC with crammer_singer multi-class svm + ovr_clf = svm.LinearSVC(random_state=0).fit(iris.data, iris.target) + cs_clf = svm.LinearSVC(multi_class="crammer_singer", random_state=0) + cs_clf.fit(iris.data, iris.target) + + # similar prediction for ovr and crammer-singer: + assert (ovr_clf.predict(iris.data) == cs_clf.predict(iris.data)).mean() > 0.9 + + # classifiers shouldn't be the same + assert (ovr_clf.coef_ != cs_clf.coef_).all() + + # test decision function + assert_array_equal( + cs_clf.predict(iris.data), + np.argmax(cs_clf.decision_function(iris.data), axis=1), + ) + dec_func = np.dot(iris.data, cs_clf.coef_.T) + cs_clf.intercept_ + assert_array_almost_equal(dec_func, cs_clf.decision_function(iris.data)) + + +def test_linearsvc_fit_sampleweight(): + # check correct result when sample_weight is 1 + n_samples = len(X) + unit_weight = np.ones(n_samples) + clf = svm.LinearSVC(random_state=0).fit(X, Y) + clf_unitweight = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X, Y, sample_weight=unit_weight + ) + + # check if same as sample_weight=None + assert_array_equal(clf_unitweight.predict(T), clf.predict(T)) + assert_allclose(clf.coef_, clf_unitweight.coef_, 1, 0.0001) + + # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where + # X = X1 repeated n1 times, X2 repeated n2 times and so forth + + random_state = check_random_state(0) + random_weight = random_state.randint(0, 10, n_samples) + lsvc_unflat = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X, Y, sample_weight=random_weight + ) + + pred1 = lsvc_unflat.predict(T) + + X_flat = np.repeat(X, random_weight, axis=0) + y_flat = np.repeat(Y, random_weight, axis=0) + lsvc_flat = svm.LinearSVC(random_state=0, tol=1e-12, max_iter=1000).fit( + X_flat, y_flat + ) + pred2 = lsvc_flat.predict(T) + + assert_array_equal(pred1, pred2) + assert_allclose(lsvc_unflat.coef_, lsvc_flat.coef_, 1, 0.0001) + + +def test_crammer_singer_binary(): + # Test Crammer-Singer formulation in the binary case + X, y = make_classification(n_classes=2, random_state=0) + + for fit_intercept in (True, False): + acc = ( + svm.LinearSVC( + fit_intercept=fit_intercept, + multi_class="crammer_singer", + random_state=0, + ) + .fit(X, y) + .score(X, y) + ) + assert acc > 0.9 + + +def test_linearsvc_iris(): + # Test that LinearSVC gives plausible predictions on the iris dataset + # Also, test symbolic class names (classes_). 
+ target = iris.target_names[iris.target] + clf = svm.LinearSVC(random_state=0).fit(iris.data, target) + assert set(clf.classes_) == set(iris.target_names) + assert np.mean(clf.predict(iris.data) == target) > 0.8 + + dec = clf.decision_function(iris.data) + pred = iris.target_names[np.argmax(dec, 1)] + assert_array_equal(pred, clf.predict(iris.data)) + + +def test_dense_liblinear_intercept_handling(classifier=svm.LinearSVC): + # Test that dense liblinear honours intercept_scaling param + X = [[2, 1], [3, 1], [1, 3], [2, 3]] + y = [0, 0, 1, 1] + clf = classifier( + fit_intercept=True, + penalty="l1", + loss="squared_hinge", + dual=False, + C=4, + tol=1e-7, + random_state=0, + ) + assert clf.intercept_scaling == 1, clf.intercept_scaling + assert clf.fit_intercept + + # when intercept_scaling is low the intercept value is highly "penalized" + # by regularization + clf.intercept_scaling = 1 + clf.fit(X, y) + assert_almost_equal(clf.intercept_, 0, decimal=5) + + # when intercept_scaling is sufficiently high, the intercept value + # is not affected by regularization + clf.intercept_scaling = 100 + clf.fit(X, y) + intercept1 = clf.intercept_ + assert intercept1 < -1 + + # when intercept_scaling is sufficiently high, the intercept value + # doesn't depend on intercept_scaling value + clf.intercept_scaling = 1000 + clf.fit(X, y) + intercept2 = clf.intercept_ + assert_array_almost_equal(intercept1, intercept2, decimal=2) + + +def test_liblinear_set_coef(): + # multi-class case + clf = svm.LinearSVC().fit(iris.data, iris.target) + values = clf.decision_function(iris.data) + clf.coef_ = clf.coef_.copy() + clf.intercept_ = clf.intercept_.copy() + values2 = clf.decision_function(iris.data) + assert_array_almost_equal(values, values2) + + # binary-class case + X = [[2, 1], [3, 1], [1, 3], [2, 3]] + y = [0, 0, 1, 1] + + clf = svm.LinearSVC().fit(X, y) + values = clf.decision_function(X) + clf.coef_ = clf.coef_.copy() + clf.intercept_ = clf.intercept_.copy() + values2 = clf.decision_function(X) + assert_array_equal(values, values2) + + +def test_immutable_coef_property(): + # Check that primal coef modification are not silently ignored + svms = [ + svm.SVC(kernel="linear").fit(iris.data, iris.target), + svm.NuSVC(kernel="linear").fit(iris.data, iris.target), + svm.SVR(kernel="linear").fit(iris.data, iris.target), + svm.NuSVR(kernel="linear").fit(iris.data, iris.target), + svm.OneClassSVM(kernel="linear").fit(iris.data), + ] + for clf in svms: + with pytest.raises(AttributeError): + clf.__setattr__("coef_", np.arange(3)) + with pytest.raises((RuntimeError, ValueError)): + clf.coef_.__setitem__((0, 0), 0) + + +def test_linearsvc_verbose(): + # stdout: redirect + import os + + stdout = os.dup(1) # save original stdout + os.dup2(os.pipe()[1], 1) # replace it + + # actual call + clf = svm.LinearSVC(verbose=1) + clf.fit(X, Y) + + # stdout: restore + os.dup2(stdout, 1) # restore original stdout + + +def test_svc_clone_with_callable_kernel(): + # create SVM with callable linear kernel, check that results are the same + # as with built-in linear kernel + svm_callable = svm.SVC( + kernel=lambda x, y: np.dot(x, y.T), + probability=True, + random_state=0, + decision_function_shape="ovr", + ) + # clone for checking clonability with lambda functions.. 
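+    # base.clone only copies constructor parameters via get_params/set_params,
+    # carrying over no fitted state, so the lambda kernel has to survive that
+    # round trip for the cloned estimator to match the built-in linear kernel.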
+ svm_cloned = base.clone(svm_callable) + svm_cloned.fit(iris.data, iris.target) + + svm_builtin = svm.SVC( + kernel="linear", probability=True, random_state=0, decision_function_shape="ovr" + ) + svm_builtin.fit(iris.data, iris.target) + + assert_array_almost_equal(svm_cloned.dual_coef_, svm_builtin.dual_coef_) + assert_array_almost_equal(svm_cloned.intercept_, svm_builtin.intercept_) + assert_array_equal(svm_cloned.predict(iris.data), svm_builtin.predict(iris.data)) + + assert_array_almost_equal( + svm_cloned.predict_proba(iris.data), + svm_builtin.predict_proba(iris.data), + decimal=4, + ) + assert_array_almost_equal( + svm_cloned.decision_function(iris.data), + svm_builtin.decision_function(iris.data), + ) + + +def test_svc_bad_kernel(): + svc = svm.SVC(kernel=lambda x, y: x) + with pytest.raises(ValueError): + svc.fit(X, Y) + + +def test_libsvm_convergence_warnings(): + a = svm.SVC( + kernel=lambda x, y: np.dot(x, y.T), probability=True, random_state=0, max_iter=2 + ) + warning_msg = ( + r"Solver terminated early \(max_iter=2\). Consider pre-processing " + r"your data with StandardScaler or MinMaxScaler." + ) + with pytest.warns(ConvergenceWarning, match=warning_msg): + a.fit(np.array(X), Y) + assert np.all(a.n_iter_ == 2) + + +def test_unfitted(): + X = "foo!" # input validation not required when SVM not fitted + + clf = svm.SVC() + with pytest.raises(Exception, match=r".*\bSVC\b.*\bnot\b.*\bfitted\b"): + clf.predict(X) + + clf = svm.NuSVR() + with pytest.raises(Exception, match=r".*\bNuSVR\b.*\bnot\b.*\bfitted\b"): + clf.predict(X) + + +# ignore convergence warnings from max_iter=1 +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +def test_consistent_proba(): + a = svm.SVC(probability=True, max_iter=1, random_state=0) + proba_1 = a.fit(X, Y).predict_proba(X) + a = svm.SVC(probability=True, max_iter=1, random_state=0) + proba_2 = a.fit(X, Y).predict_proba(X) + assert_array_almost_equal(proba_1, proba_2) + + +def test_linear_svm_convergence_warnings(): + # Test that warnings are raised if model does not converge + + lsvc = svm.LinearSVC(random_state=0, max_iter=2) + warning_msg = "Liblinear failed to converge, increase the number of iterations." + with pytest.warns(ConvergenceWarning, match=warning_msg): + lsvc.fit(X, Y) + # Check that we have an n_iter_ attribute with int type as opposed to a + # numpy array or an np.int32 so as to match the docstring. + assert isinstance(lsvc.n_iter_, int) + assert lsvc.n_iter_ == 2 + + lsvr = svm.LinearSVR(random_state=0, max_iter=2) + with pytest.warns(ConvergenceWarning, match=warning_msg): + lsvr.fit(iris.data, iris.target) + assert isinstance(lsvr.n_iter_, int) + assert lsvr.n_iter_ == 2 + + +def test_svr_coef_sign(): + # Test that SVR(kernel="linear") has coef_ with the right sign. + # Non-regression test for #2933. 
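+    # For linear-kernel regressors, predictions should match the explicit
+    # affine form X @ coef_.ravel() + intercept_; a sign flip in coef_ would
+    # break this identity.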
+ X = np.random.RandomState(21).randn(10, 3) + y = np.random.RandomState(12).randn(10) + + for svr in [ + svm.SVR(kernel="linear"), + svm.NuSVR(kernel="linear"), + svm.LinearSVR(), + ]: + svr.fit(X, y) + assert_array_almost_equal( + svr.predict(X), np.dot(X, svr.coef_.ravel()) + svr.intercept_ + ) + + +def test_lsvc_intercept_scaling_zero(): + # Test that intercept_scaling is ignored when fit_intercept is False + + lsvc = svm.LinearSVC(fit_intercept=False) + lsvc.fit(X, Y) + assert lsvc.intercept_ == 0.0 + + +def test_hasattr_predict_proba(): + # Method must be (un)available before or after fit, switched by + # `probability` param + + G = svm.SVC(probability=True) + assert hasattr(G, "predict_proba") + G.fit(iris.data, iris.target) + assert hasattr(G, "predict_proba") + + G = svm.SVC(probability=False) + assert not hasattr(G, "predict_proba") + G.fit(iris.data, iris.target) + assert not hasattr(G, "predict_proba") + + # Switching to `probability=True` after fitting should make + # predict_proba available, but calling it must not work: + G.probability = True + assert hasattr(G, "predict_proba") + msg = "predict_proba is not available when fitted with probability=False" + + with pytest.raises(NotFittedError, match=msg): + G.predict_proba(iris.data) + + +def test_decision_function_shape_two_class(): + for n_classes in [2, 3]: + X, y = make_blobs(centers=n_classes, random_state=0) + for estimator in [svm.SVC, svm.NuSVC]: + clf = OneVsRestClassifier(estimator(decision_function_shape="ovr")).fit( + X, y + ) + assert len(clf.predict(X)) == len(y) + + +def test_ovr_decision_function(): + # One point from each quadrant represents one class + X_train = np.array([[1, 1], [-1, 1], [-1, -1], [1, -1]]) + y_train = [0, 1, 2, 3] + + # First point is closer to the decision boundaries than the second point + base_points = np.array([[5, 5], [10, 10]]) + + # For all the quadrants (classes) + X_test = np.vstack( + ( + base_points * [1, 1], # Q1 + base_points * [-1, 1], # Q2 + base_points * [-1, -1], # Q3 + base_points * [1, -1], # Q4 + ) + ) + + y_test = [0] * 2 + [1] * 2 + [2] * 2 + [3] * 2 + + clf = svm.SVC(kernel="linear", decision_function_shape="ovr") + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + + # Test if the prediction is the same as y + assert_array_equal(y_pred, y_test) + + deci_val = clf.decision_function(X_test) + + # Assert that the predicted class has the maximum value + assert_array_equal(np.argmax(deci_val, axis=1), y_pred) + + # Get decision value at test points for the predicted class + pred_class_deci_val = deci_val[range(8), y_pred].reshape((4, 2)) + + # Assert pred_class_deci_val > 0 here + assert np.min(pred_class_deci_val) > 0.0 + + # Test if the first point has lower decision value on every quadrant + # compared to the second point + assert np.all(pred_class_deci_val[:, 0] < pred_class_deci_val[:, 1]) + + +@pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC]) +def test_svc_invalid_break_ties_param(SVCClass): + X, y = make_blobs(random_state=42) + + svm = SVCClass( + kernel="linear", decision_function_shape="ovo", break_ties=True, random_state=42 + ).fit(X, y) + + with pytest.raises(ValueError, match="break_ties must be False"): + svm.predict(y) + + +@pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC]) +def test_svc_ovr_tie_breaking(SVCClass): + """Test if predict breaks ties in OVR mode. + Related issue: https://github.com/scikit-learn/scikit-learn/issues/8277 + """ + if SVCClass.__name__ == "NuSVC" and _IS_32BIT: + # XXX: known failure to be investigated. 
Either the code needs to be + # fixed or the test itself might need to be made less sensitive to + # random changes in test data and rounding errors more generally. + # https://github.com/scikit-learn/scikit-learn/issues/29633 + pytest.xfail("Failing test on 32bit OS") + + X, y = make_blobs(random_state=0, n_samples=20, n_features=2) + + xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100) + ys = np.linspace(X[:, 1].min(), X[:, 1].max(), 100) + xx, yy = np.meshgrid(xs, ys) + + common_params = dict( + kernel="rbf", gamma=1e6, random_state=42, decision_function_shape="ovr" + ) + svm = SVCClass( + break_ties=False, + **common_params, + ).fit(X, y) + pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) + dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) + assert not np.all(pred == np.argmax(dv, axis=1)) + + svm = SVCClass( + break_ties=True, + **common_params, + ).fit(X, y) + pred = svm.predict(np.c_[xx.ravel(), yy.ravel()]) + dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()]) + assert np.all(pred == np.argmax(dv, axis=1)) + + +def test_gamma_scale(): + X, y = [[0.0], [1.0]], [0, 1] + + clf = svm.SVC() + clf.fit(X, y) + assert_almost_equal(clf._gamma, 4) + + +@pytest.mark.parametrize( + "SVM, params", + [ + (LinearSVC, {"penalty": "l1", "loss": "squared_hinge", "dual": False}), + (LinearSVC, {"penalty": "l2", "loss": "squared_hinge", "dual": True}), + (LinearSVC, {"penalty": "l2", "loss": "squared_hinge", "dual": False}), + (LinearSVC, {"penalty": "l2", "loss": "hinge", "dual": True}), + (LinearSVR, {"loss": "epsilon_insensitive", "dual": True}), + (LinearSVR, {"loss": "squared_epsilon_insensitive", "dual": True}), + (LinearSVR, {"loss": "squared_epsilon_insensitive", "dual": True}), + ], +) +def test_linearsvm_liblinear_sample_weight(SVM, params): + X = np.array( + [ + [1, 3], + [1, 3], + [1, 3], + [1, 3], + [2, 1], + [2, 1], + [2, 1], + [2, 1], + [3, 3], + [3, 3], + [3, 3], + [3, 3], + [4, 1], + [4, 1], + [4, 1], + [4, 1], + ], + dtype=np.dtype("float"), + ) + y = np.array( + [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype("int") + ) + + X2 = np.vstack([X, X]) + y2 = np.hstack([y, 3 - y]) + sample_weight = np.ones(shape=len(y) * 2) + sample_weight[len(y) :] = 0 + X2, y2, sample_weight = shuffle(X2, y2, sample_weight, random_state=0) + + base_estimator = SVM(random_state=42) + base_estimator.set_params(**params) + base_estimator.set_params(tol=1e-12, max_iter=1000) + est_no_weight = base.clone(base_estimator).fit(X, y) + est_with_weight = base.clone(base_estimator).fit( + X2, y2, sample_weight=sample_weight + ) + + for method in ("predict", "decision_function"): + if hasattr(base_estimator, method): + X_est_no_weight = getattr(est_no_weight, method)(X) + X_est_with_weight = getattr(est_with_weight, method)(X) + assert_allclose(X_est_no_weight, X_est_with_weight) + + +@pytest.mark.parametrize("Klass", (OneClassSVM, SVR, NuSVR)) +def test_n_support(Klass): + # Make n_support is correct for oneclass and SVR (used to be + # non-initialized) + # this is a non regression test for issue #14774 + X = np.array([[0], [0.44], [0.45], [0.46], [1]]) + y = np.arange(X.shape[0]) + est = Klass() + assert not hasattr(est, "n_support_") + est.fit(X, y) + assert est.n_support_[0] == est.support_vectors_.shape[0] + assert est.n_support_.size == 1 + + +@pytest.mark.parametrize("Estimator", [svm.SVC, svm.SVR]) +def test_custom_kernel_not_array_input(Estimator): + """Test using a custom kernel that is not fed with array-like for floats""" + data = ["A A", "A", "B", "B B", "A B"] + X = 
np.array([[2, 0], [1, 0], [0, 1], [0, 2], [1, 1]]) # count encoding + y = np.array([1, 1, 2, 2, 1]) + + def string_kernel(X1, X2): + assert isinstance(X1[0], str) + n_samples1 = _num_samples(X1) + n_samples2 = _num_samples(X2) + K = np.zeros((n_samples1, n_samples2)) + for ii in range(n_samples1): + for jj in range(ii, n_samples2): + K[ii, jj] = X1[ii].count("A") * X2[jj].count("A") + K[ii, jj] += X1[ii].count("B") * X2[jj].count("B") + K[jj, ii] = K[ii, jj] + return K + + K = string_kernel(data, data) + assert_array_equal(np.dot(X, X.T), K) + + svc1 = Estimator(kernel=string_kernel).fit(data, y) + svc2 = Estimator(kernel="linear").fit(X, y) + svc3 = Estimator(kernel="precomputed").fit(K, y) + + assert svc1.score(data, y) == svc3.score(K, y) + assert svc1.score(data, y) == svc2.score(X, y) + if hasattr(svc1, "decision_function"): # classifier + assert_allclose(svc1.decision_function(data), svc2.decision_function(X)) + assert_allclose(svc1.decision_function(data), svc3.decision_function(K)) + assert_array_equal(svc1.predict(data), svc2.predict(X)) + assert_array_equal(svc1.predict(data), svc3.predict(K)) + else: # regressor + assert_allclose(svc1.predict(data), svc2.predict(X)) + assert_allclose(svc1.predict(data), svc3.predict(K)) + + +def test_svc_raises_error_internal_representation(): + """Check that SVC raises error when internal representation is altered. + + Non-regression test for #18891 and https://nvd.nist.gov/vuln/detail/CVE-2020-28975 + """ + clf = svm.SVC(kernel="linear").fit(X, Y) + clf._n_support[0] = 1000000 + + msg = "The internal representation of SVC was altered" + with pytest.raises(ValueError, match=msg): + clf.predict(X) + + +@pytest.mark.parametrize( + "estimator, expected_n_iter_type", + [ + (svm.SVC, np.ndarray), + (svm.NuSVC, np.ndarray), + (svm.SVR, int), + (svm.NuSVR, int), + (svm.OneClassSVM, int), + ], +) +@pytest.mark.parametrize( + "dataset", + [ + make_classification(n_classes=2, n_informative=2, random_state=0), + make_classification(n_classes=3, n_informative=3, random_state=0), + make_classification(n_classes=4, n_informative=4, random_state=0), + ], +) +def test_n_iter_libsvm(estimator, expected_n_iter_type, dataset): + # Check that the type of n_iter_ is correct for the classes that inherit + # from BaseSVC. + # Note that for SVC, and NuSVC this is an ndarray; while for SVR, NuSVR, and + # OneClassSVM, it is an int. + # For SVC and NuSVC also check the shape of n_iter_. 
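+    # In the one-vs-one scheme libsvm solves one binary problem per pair of
+    # classes, so n_iter_ holds n_classes * (n_classes - 1) / 2 entries for the
+    # classifiers, while the single-problem estimators report a plain int.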
+ X, y = dataset + n_iter = estimator(kernel="linear").fit(X, y).n_iter_ + assert type(n_iter) == expected_n_iter_type + if estimator in [svm.SVC, svm.NuSVC]: + n_classes = len(np.unique(y)) + assert n_iter.shape == (n_classes * (n_classes - 1) // 2,) + + +@pytest.mark.parametrize("loss", ["squared_hinge", "squared_epsilon_insensitive"]) +def test_dual_auto(loss): + # OvR, L2, N > M (6,2) + dual = _validate_dual_parameter("auto", loss, "l2", "ovr", np.asarray(X)) + assert dual is False + # OvR, L2, N < M (2,6) + dual = _validate_dual_parameter("auto", loss, "l2", "ovr", np.asarray(X).T) + assert dual is True + + +def test_dual_auto_edge_cases(): + # Hinge, OvR, L2, N > M (6,2) + dual = _validate_dual_parameter("auto", "hinge", "l2", "ovr", np.asarray(X)) + assert dual is True # only supports True + dual = _validate_dual_parameter( + "auto", "epsilon_insensitive", "l2", "ovr", np.asarray(X) + ) + assert dual is True # only supports True + # SqHinge, OvR, L1, N < M (2,6) + dual = _validate_dual_parameter( + "auto", "squared_hinge", "l1", "ovr", np.asarray(X).T + ) + assert dual is False # only supports False + + +@pytest.mark.parametrize( + "Estimator, make_dataset", + [(svm.SVC, make_classification), (svm.SVR, make_regression)], +) +@pytest.mark.parametrize("C_inf", [np.inf, float("inf")]) +def test_svm_with_infinite_C(Estimator, make_dataset, C_inf, global_random_seed): + """Check that we can pass `C=inf` that is equivalent to a very large C value. + + Non-regression test for + https://github.com/scikit-learn/scikit-learn/issues/29772 + """ + X, y = make_dataset(random_state=global_random_seed) + estimator_C_inf = Estimator(C=C_inf).fit(X, y) + estimator_C_large = Estimator(C=1e10).fit(X, y) + + assert_allclose(estimator_C_large.predict(X), estimator_C_inf.predict(X)) diff --git a/.venv/lib/python3.12/site-packages/sklearn/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/tests/metadata_routing_common.py b/.venv/lib/python3.12/site-packages/sklearn/tests/metadata_routing_common.py new file mode 100644 index 0000000000000000000000000000000000000000..f4dd79581db9097bd45d99b2f11e80f90862d58f --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/tests/metadata_routing_common.py @@ -0,0 +1,584 @@ +import inspect +from collections import defaultdict +from functools import partial + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + MetaEstimatorMixin, + RegressorMixin, + TransformerMixin, + clone, +) +from sklearn.metrics._scorer import _Scorer, mean_squared_error +from sklearn.model_selection import BaseCrossValidator +from sklearn.model_selection._split import GroupsConsumerMixin +from sklearn.utils._metadata_requests import ( + SIMPLE_METHODS, +) +from sklearn.utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + process_routing, +) +from sklearn.utils.multiclass import _check_partial_fit_first_call + + +def record_metadata(obj, record_default=True, **kwargs): + """Utility function to store passed metadata to a method of obj. + + If record_default is False, kwargs whose values are "default" are skipped. + This is so that checks on keyword arguments whose default was not changed + are skipped. 
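+
+    Records are appended to ``obj._records[callee][caller]`` as one dict of
+    kwargs per call, where ``callee`` is the method that received the metadata
+    and ``caller`` is the function that invoked it; ``check_recorded_metadata``
+    reads them back for the assertions.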
+ + """ + stack = inspect.stack() + callee = stack[1].function + caller = stack[2].function + if not hasattr(obj, "_records"): + obj._records = defaultdict(lambda: defaultdict(list)) + if not record_default: + kwargs = { + key: val + for key, val in kwargs.items() + if not isinstance(val, str) or (val != "default") + } + obj._records[callee][caller].append(kwargs) + + +def check_recorded_metadata(obj, method, parent, split_params=tuple(), **kwargs): + """Check whether the expected metadata is passed to the object's method. + + Parameters + ---------- + obj : estimator object + sub-estimator to check routed params for + method : str + sub-estimator's method where metadata is routed to, or otherwise in + the context of metadata routing referred to as 'callee' + parent : str + the parent method which should have called `method`, or otherwise in + the context of metadata routing referred to as 'caller' + split_params : tuple, default=empty + specifies any parameters which are to be checked as being a subset + of the original values + **kwargs : dict + passed metadata + """ + all_records = ( + getattr(obj, "_records", dict()).get(method, dict()).get(parent, list()) + ) + for record in all_records: + # first check that the names of the metadata passed are the same as + # expected. The names are stored as keys in `record`. + assert set(kwargs.keys()) == set(record.keys()), ( + f"Expected {kwargs.keys()} vs {record.keys()}" + ) + for key, value in kwargs.items(): + recorded_value = record[key] + # The following condition is used to check for any specified parameters + # being a subset of the original values + if key in split_params and recorded_value is not None: + assert np.isin(recorded_value, value).all() + else: + if isinstance(recorded_value, np.ndarray): + assert_array_equal(recorded_value, value) + else: + assert recorded_value is value, ( + f"Expected {recorded_value} vs {value}. Method: {method}" + ) + + +record_metadata_not_default = partial(record_metadata, record_default=False) + + +def assert_request_is_empty(metadata_request, exclude=None): + """Check if a metadata request dict is empty. + + One can exclude a method or a list of methods from the check using the + ``exclude`` parameter. If metadata_request is a MetadataRouter, then + ``exclude`` can be of the form ``{"object" : [method, ...]}``. + """ + if isinstance(metadata_request, MetadataRouter): + for name, route_mapping in metadata_request: + if exclude is not None and name in exclude: + _exclude = exclude[name] + else: + _exclude = None + assert_request_is_empty(route_mapping.router, exclude=_exclude) + return + + exclude = [] if exclude is None else exclude + for method in SIMPLE_METHODS: + if method in exclude: + continue + mmr = getattr(metadata_request, method) + props = [ + prop + for prop, alias in mmr.requests.items() + if isinstance(alias, str) or alias is not None + ] + assert not props + + +def assert_request_equal(request, dictionary): + for method, requests in dictionary.items(): + mmr = getattr(request, method) + assert mmr.requests == requests + + empty_methods = [method for method in SIMPLE_METHODS if method not in dictionary] + for method in empty_methods: + assert not len(getattr(request, method).requests) + + +class _Registry(list): + # This list is used to get a reference to the sub-estimators, which are not + # necessarily stored on the metaestimator. 
We need to override __deepcopy__ + # because the sub-estimators are probably cloned, which would result in a + # new copy of the list, but we need copy and deep copy both to return the + # same instance. + def __deepcopy__(self, memo): + return self + + def __copy__(self): + return self + + +class ConsumingRegressor(RegressorMixin, BaseEstimator): + """A regressor consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def partial_fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self + + def predict(self, X, y=None, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return np.zeros(shape=(len(X),)) + + def score(self, X, y, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return 1 + + +class NonConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def __init__(self, alpha=0.0): + self.alpha = alpha + + def fit(self, X, y): + self.classes_ = np.unique(y) + self.coef_ = np.ones_like(X) + return self + + def partial_fit(self, X, y, classes=None): + return self + + def decision_function(self, X): + return self.predict(X) + + def predict(self, X): + y_pred = np.empty(shape=(len(X),)) + y_pred[: len(X) // 2] = 0 + y_pred[len(X) // 2 :] = 1 + return y_pred + + def predict_proba(self, X): + # dummy probabilities to support predict_proba + y_proba = np.empty(shape=(len(X), len(self.classes_)), dtype=np.float32) + # each row sums up to 1.0: + y_proba[:] = np.random.dirichlet(alpha=np.ones(len(self.classes_)), size=len(X)) + return y_proba + + def predict_log_proba(self, X): + # dummy probabilities to support predict_log_proba + return self.predict_proba(X) + + +class NonConsumingRegressor(RegressorMixin, BaseEstimator): + """A classifier which accepts no metadata on any method.""" + + def fit(self, X, y): + return self + + def partial_fit(self, X, y): + return self + + def predict(self, X): + return np.ones(len(X)) # pragma: no cover + + +class ConsumingClassifier(ClassifierMixin, BaseEstimator): + """A classifier consuming metadata. + + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + + alpha : float, default=0 + This parameter is only used to test the ``*SearchCV`` objects, and + doesn't do anything. 
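+
+    A rough usage sketch (illustrative only, mirroring how these tests exercise
+    consuming estimators)::
+
+        registry = _Registry()
+        est = ConsumingClassifier(registry=registry)
+        est.fit(X, y, sample_weight=sw, metadata=md)
+        assert registry[-1] is est  # the estimator registered itself
+        # whatever ``fit`` received is stored on ``est._records`` and can be
+        # verified with ``check_recorded_metadata``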
+ """ + + def __init__(self, registry=None, alpha=0.0): + self.alpha = alpha + self.registry = registry + + def partial_fit( + self, X, y, classes=None, sample_weight="default", metadata="default" + ): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + _check_partial_fit_first_call(self, classes) + return self + + def fit(self, X, y, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + + self.classes_ = np.unique(y) + self.coef_ = np.ones_like(X) + return self + + def predict(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_score = np.empty(shape=(len(X),), dtype="int8") + y_score[len(X) // 2 :] = 0 + y_score[: len(X) // 2] = 1 + return y_score + + def predict_proba(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_proba = np.empty(shape=(len(X), len(self.classes_)), dtype=np.float32) + # each row sums up to 1.0: + y_proba[:] = np.random.dirichlet(alpha=np.ones(len(self.classes_)), size=len(X)) + return y_proba + + def predict_log_proba(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self.predict_proba(X) + + def decision_function(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + y_score = np.empty(shape=(len(X),)) + y_score[len(X) // 2 :] = 0 + y_score[: len(X) // 2] = 1 + return y_score + + def score(self, X, y, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return 1 + + +class ConsumingClassifierWithoutPredictProba(ConsumingClassifier): + """ConsumingClassifier without a predict_proba method, but with predict_log_proba. + + Used to mimic dynamic method selection such as in the `_parallel_predict_proba()` + function called by `BaggingClassifier`. + """ + + @property + def predict_proba(self): + raise AttributeError("This estimator does not support predict_proba") + + +class ConsumingClassifierWithoutPredictLogProba(ConsumingClassifier): + """ConsumingClassifier without a predict_log_proba method, but with predict_proba. + + Used to mimic dynamic method selection such as in + `BaggingClassifier.predict_log_proba()`. + """ + + @property + def predict_log_proba(self): + raise AttributeError("This estimator does not support predict_log_proba") + + +class ConsumingClassifierWithOnlyPredict(ConsumingClassifier): + """ConsumingClassifier with only a predict method. + + Used to mimic dynamic method selection such as in + `BaggingClassifier.predict_log_proba()`. + """ + + @property + def predict_proba(self): + raise AttributeError("This estimator does not support predict_proba") + + @property + def predict_log_proba(self): + raise AttributeError("This estimator does not support predict_log_proba") + + +class ConsumingTransformer(TransformerMixin, BaseEstimator): + """A transformer which accepts metadata on fit and transform. 
+ + Parameters + ---------- + registry : list, default=None + If a list, the estimator will append itself to the list in order to have + a reference to the estimator later on. Since that reference is not + required in all tests, registration can be skipped by leaving this value + as None. + """ + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + self.fitted_ = True + return self + + def transform(self, X, sample_weight="default", metadata="default"): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return X + 1 + + def fit_transform(self, X, y, sample_weight="default", metadata="default"): + # implementing ``fit_transform`` is necessary since + # ``TransformerMixin.fit_transform`` doesn't route any metadata to + # ``transform``, while here we want ``transform`` to receive + # ``sample_weight`` and ``metadata``. + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return self.fit(X, y, sample_weight=sample_weight, metadata=metadata).transform( + X, sample_weight=sample_weight, metadata=metadata + ) + + def inverse_transform(self, X, sample_weight=None, metadata=None): + record_metadata_not_default( + self, sample_weight=sample_weight, metadata=metadata + ) + return X - 1 + + +class ConsumingNoFitTransformTransformer(BaseEstimator): + """A metadata consuming transformer that doesn't inherit from + TransformerMixin, and thus doesn't implement `fit_transform`. Note that + TransformerMixin's `fit_transform` doesn't route metadata to `transform`.""" + + def __init__(self, registry=None): + self.registry = registry + + def fit(self, X, y=None, sample_weight=None, metadata=None): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight, metadata=metadata) + + return self + + def transform(self, X, sample_weight=None, metadata=None): + record_metadata(self, sample_weight=sample_weight, metadata=metadata) + return X + + +class ConsumingScorer(_Scorer): + def __init__(self, registry=None): + super().__init__( + score_func=mean_squared_error, sign=1, kwargs={}, response_method="predict" + ) + self.registry = registry + + def _score(self, method_caller, clf, X, y, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, **kwargs) + + sample_weight = kwargs.get("sample_weight", None) + return super()._score(method_caller, clf, X, y, sample_weight=sample_weight) + + +class ConsumingSplitter(GroupsConsumerMixin, BaseCrossValidator): + def __init__(self, registry=None): + self.registry = registry + + def split(self, X, y=None, groups="default", metadata="default"): + if self.registry is not None: + self.registry.append(self) + + record_metadata_not_default(self, groups=groups, metadata=metadata) + + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices, train_indices + yield train_indices, test_indices + + def get_n_splits(self, X=None, y=None, groups=None, metadata=None): + return 2 + + def _iter_test_indices(self, X=None, y=None, groups=None): + split_index = len(X) // 2 + train_indices = list(range(0, split_index)) + test_indices = list(range(split_index, len(X))) + yield test_indices + 
yield train_indices + + +class MetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is only a router.""" + + def __init__(self, estimator): + self.estimator = estimator + + def fit(self, X, y, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + + def get_metadata_routing(self): + router = MetadataRouter(owner=self.__class__.__name__).add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + return router + + +class WeightedMetaRegressor(MetaEstimatorMixin, RegressorMixin, BaseEstimator): + """A meta-regressor which is also a consumer.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **fit_params): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **fit_params) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def predict(self, X, **predict_params): + params = process_routing(self, "predict", **predict_params) + return self.estimator_.predict(X, **params.estimator.predict) + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self.estimator, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="predict", callee="predict"), + ) + ) + return router + + +class WeightedMetaClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): + """A meta-estimator which also consumes sample_weight itself in ``fit``.""" + + def __init__(self, estimator, registry=None): + self.estimator = estimator + self.registry = registry + + def fit(self, X, y, sample_weight=None, **kwargs): + if self.registry is not None: + self.registry.append(self) + + record_metadata(self, sample_weight=sample_weight) + params = process_routing(self, "fit", sample_weight=sample_weight, **kwargs) + self.estimator_ = clone(self.estimator).fit(X, y, **params.estimator.fit) + return self + + def get_metadata_routing(self): + router = ( + MetadataRouter(owner=self.__class__.__name__) + .add_self_request(self) + .add( + estimator=self.estimator, + method_mapping=MethodMapping().add(caller="fit", callee="fit"), + ) + ) + return router + + +class MetaTransformer(MetaEstimatorMixin, TransformerMixin, BaseEstimator): + """A simple meta-transformer.""" + + def __init__(self, transformer): + self.transformer = transformer + + def fit(self, X, y=None, **fit_params): + params = process_routing(self, "fit", **fit_params) + self.transformer_ = clone(self.transformer).fit(X, y, **params.transformer.fit) + return self + + def transform(self, X, y=None, **transform_params): + params = process_routing(self, "transform", **transform_params) + return self.transformer_.transform(X, **params.transformer.transform) + + def get_metadata_routing(self): + return MetadataRouter(owner=self.__class__.__name__).add( + transformer=self.transformer, + method_mapping=MethodMapping() + .add(caller="fit", callee="fit") + .add(caller="transform", callee="transform"), + ) diff --git a/.venv/lib/python3.12/site-packages/sklearn/tests/test_base.py b/.venv/lib/python3.12/site-packages/sklearn/tests/test_base.py new file mode 100644 index 
0000000000000000000000000000000000000000..0842cf0c82b485b16717ac19c78b4d51098769eb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/tests/test_base.py @@ -0,0 +1,1081 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import pickle +import re +import warnings + +import numpy as np +import pytest +import scipy.sparse as sp +from numpy.testing import assert_allclose + +import sklearn +from sklearn import config_context, datasets +from sklearn.base import ( + BaseEstimator, + OutlierMixin, + TransformerMixin, + clone, + is_classifier, + is_clusterer, + is_outlier_detector, + is_regressor, +) +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import IsolationForest +from sklearn.exceptions import InconsistentVersionWarning +from sklearn.metrics import get_scorer +from sklearn.model_selection import GridSearchCV, KFold +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC, SVR +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.utils._mocking import MockDataFrame +from sklearn.utils._set_output import _get_output_config +from sklearn.utils._testing import ( + _convert_container, + assert_array_equal, +) +from sklearn.utils.validation import _check_n_features, validate_data + + +############################################################################# +# A few test classes +class MyEstimator(BaseEstimator): + def __init__(self, l1=0, empty=None): + self.l1 = l1 + self.empty = empty + + +class K(BaseEstimator): + def __init__(self, c=None, d=None): + self.c = c + self.d = d + + +class T(BaseEstimator): + def __init__(self, a=None, b=None): + self.a = a + self.b = b + + +class NaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags + + +class NoNaNTag(BaseEstimator): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class OverrideTag(NaNTag): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = False + return tags + + +class DiamondOverwriteTag(NaNTag, NoNaNTag): + pass + + +class InheritDiamondOverwriteTag(DiamondOverwriteTag): + pass + + +class ModifyInitParams(BaseEstimator): + """Deprecated behavior. + Equal parameters but with a type cast. + Doesn't fulfill a is a + """ + + def __init__(self, a=np.array([0])): + self.a = a.copy() + + +class Buggy(BaseEstimator): + "A buggy estimator that does not set its parameters right." + + def __init__(self, a=None): + self.a = 1 + + +class NoEstimator: + def __init__(self): + pass + + def fit(self, X=None, y=None): + return self + + def predict(self, X=None): + return None + + +class VargEstimator(BaseEstimator): + """scikit-learn estimators shouldn't have vargs.""" + + def __init__(self, *vargs): + pass + + +############################################################################# +# The tests + + +def test_clone(): + # Tests that clone creates a correct deep copy. + # We create an estimator, make a copy of its original state + # (which, in this case, is the current state of the estimator), + # and check that the obtained copy is a correct deep copy. 
+ + from sklearn.feature_selection import SelectFpr, f_classif + + selector = SelectFpr(f_classif, alpha=0.1) + new_selector = clone(selector) + assert selector is not new_selector + assert selector.get_params() == new_selector.get_params() + + selector = SelectFpr(f_classif, alpha=np.zeros((10, 2))) + new_selector = clone(selector) + assert selector is not new_selector + + +def test_clone_2(): + # Tests that clone doesn't copy everything. + # We first create an estimator, give it an own attribute, and + # make a copy of its original state. Then we check that the copy doesn't + # have the specific attribute we manually added to the initial estimator. + + from sklearn.feature_selection import SelectFpr, f_classif + + selector = SelectFpr(f_classif, alpha=0.1) + selector.own_attribute = "test" + new_selector = clone(selector) + assert not hasattr(new_selector, "own_attribute") + + +def test_clone_buggy(): + # Check that clone raises an error on buggy estimators. + buggy = Buggy() + buggy.a = 2 + with pytest.raises(RuntimeError): + clone(buggy) + + no_estimator = NoEstimator() + with pytest.raises(TypeError): + clone(no_estimator) + + varg_est = VargEstimator() + with pytest.raises(RuntimeError): + clone(varg_est) + + est = ModifyInitParams() + with pytest.raises(RuntimeError): + clone(est) + + +def test_clone_empty_array(): + # Regression test for cloning estimators with empty arrays + clf = MyEstimator(empty=np.array([])) + clf2 = clone(clf) + assert_array_equal(clf.empty, clf2.empty) + + clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]]))) + clf2 = clone(clf) + assert_array_equal(clf.empty.data, clf2.empty.data) + + +def test_clone_nan(): + # Regression test for cloning estimators with default parameter as np.nan + clf = MyEstimator(empty=np.nan) + clf2 = clone(clf) + + assert clf.empty is clf2.empty + + +def test_clone_dict(): + # test that clone creates a clone of a dict + orig = {"a": MyEstimator()} + cloned = clone(orig) + assert orig["a"] is not cloned["a"] + + +def test_clone_sparse_matrices(): + sparse_matrix_classes = [ + cls + for name in dir(sp) + if name.endswith("_matrix") and type(cls := getattr(sp, name)) is type + ] + + for cls in sparse_matrix_classes: + sparse_matrix = cls(np.eye(5)) + clf = MyEstimator(empty=sparse_matrix) + clf_cloned = clone(clf) + assert clf.empty.__class__ is clf_cloned.empty.__class__ + assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray()) + + +def test_clone_estimator_types(): + # Check that clone works for parameters that are types rather than + # instances + clf = MyEstimator(empty=MyEstimator) + clf2 = clone(clf) + + assert clf.empty is clf2.empty + + +def test_clone_class_rather_than_instance(): + # Check that clone raises expected error message when + # cloning class rather than instance + msg = "You should provide an instance of scikit-learn estimator" + with pytest.raises(TypeError, match=msg): + clone(MyEstimator) + + +def test_repr(): + # Smoke test the repr of the base estimator. 
+ my_estimator = MyEstimator() + repr(my_estimator) + test = T(K(), K()) + assert repr(test) == "T(a=K(), b=K())" + + some_est = T(a=["long_params"] * 1000) + assert len(repr(some_est)) == 485 + + +def test_str(): + # Smoke test the str of the base estimator + my_estimator = MyEstimator() + str(my_estimator) + + +def test_get_params(): + test = T(K(), K) + + assert "a__d" in test.get_params(deep=True) + assert "a__d" not in test.get_params(deep=False) + + test.set_params(a__d=2) + assert test.a.d == 2 + + with pytest.raises(ValueError): + test.set_params(a__a=2) + + +# TODO(1.8): Remove this test when the deprecation is removed +def test_is_estimator_type_class(): + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_classifier(SVC) + + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_regressor(SVR) + + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_clusterer(KMeans) + + with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"): + assert is_outlier_detector(IsolationForest) + + +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (SVC(), True), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), True), + (Pipeline([("svc", SVC())]), True), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), True), + (SVR(), False), + (GridSearchCV(SVR(), {"C": [0.1, 1]}), False), + (Pipeline([("svr", SVR())]), False), + (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), False), + ], +) +def test_is_classifier(estimator, expected_result): + assert is_classifier(estimator) == expected_result + + +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (SVR(), True), + (GridSearchCV(SVR(), {"C": [0.1, 1]}), True), + (Pipeline([("svr", SVR())]), True), + (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), True), + (SVC(), False), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), + (Pipeline([("svc", SVC())]), False), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), + ], +) +def test_is_regressor(estimator, expected_result): + assert is_regressor(estimator) == expected_result + + +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (KMeans(), True), + (GridSearchCV(KMeans(), {"n_clusters": [3, 8]}), True), + (Pipeline([("km", KMeans())]), True), + (Pipeline([("km_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True), + (SVC(), False), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), + (Pipeline([("svc", SVC())]), False), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), + ], +) +def test_is_clusterer(estimator, expected_result): + assert is_clusterer(estimator) == expected_result + + +def test_set_params(): + # test nested estimator parameter setting + clf = Pipeline([("svc", SVC())]) + + # non-existing parameter in svc + with pytest.raises(ValueError): + clf.set_params(svc__stupid_param=True) + + # non-existing parameter of pipeline + with pytest.raises(ValueError): + clf.set_params(svm__stupid_param=True) + + # we don't currently catch if the things in pipeline are estimators + # bad_pipeline = Pipeline([("bad", NoEstimator())]) + # with pytest.raises(AttributeError): + # bad_pipeline.set_params(bad__stupid_param=True) + + +def test_set_params_passes_all_parameters(): + # Make sure all parameters are passed together to set_params + # of nested estimator. 
Regression test for #9944 + + class TestDecisionTree(DecisionTreeClassifier): + def set_params(self, **kwargs): + super().set_params(**kwargs) + # expected_kwargs is in test scope + assert kwargs == expected_kwargs + return self + + expected_kwargs = {"max_depth": 5, "min_samples_leaf": 2} + for est in [ + Pipeline([("estimator", TestDecisionTree())]), + GridSearchCV(TestDecisionTree(), {}), + ]: + est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2) + + +def test_set_params_updates_valid_params(): + # Check that set_params tries to set SVC().C, not + # DecisionTreeClassifier().C + gscv = GridSearchCV(DecisionTreeClassifier(), {}) + gscv.set_params(estimator=SVC(), estimator__C=42.0) + assert gscv.estimator.C == 42.0 + + +@pytest.mark.parametrize( + "tree,dataset", + [ + ( + DecisionTreeClassifier(max_depth=2, random_state=0), + datasets.make_classification(random_state=0), + ), + ( + DecisionTreeRegressor(max_depth=2, random_state=0), + datasets.make_regression(random_state=0), + ), + ], +) +def test_score_sample_weight(tree, dataset): + rng = np.random.RandomState(0) + # check that the score with and without sample weights are different + X, y = dataset + + tree.fit(X, y) + # generate random sample weights + sample_weight = rng.randint(1, 10, size=len(y)) + score_unweighted = tree.score(X, y) + score_weighted = tree.score(X, y, sample_weight=sample_weight) + msg = "Unweighted and weighted scores are unexpectedly equal" + assert score_unweighted != score_weighted, msg + + +def test_clone_pandas_dataframe(): + class DummyEstimator(TransformerMixin, BaseEstimator): + """This is a dummy class for generating numerical features + + This feature extractor extracts numerical features from pandas data + frame. + + Parameters + ---------- + + df: pandas data frame + The pandas data frame parameter. 
+ + Notes + ----- + """ + + def __init__(self, df=None, scalar_param=1): + self.df = df + self.scalar_param = scalar_param + + def fit(self, X, y=None): + pass + + def transform(self, X): + pass + + # build and clone estimator + d = np.arange(10) + df = MockDataFrame(d) + e = DummyEstimator(df, scalar_param=1) + cloned_e = clone(e) + + # the test + assert (e.df == cloned_e.df).values.all() + assert e.scalar_param == cloned_e.scalar_param + + +def test_clone_protocol(): + """Checks that clone works with `__sklearn_clone__` protocol.""" + + class FrozenEstimator(BaseEstimator): + def __init__(self, fitted_estimator): + self.fitted_estimator = fitted_estimator + + def __getattr__(self, name): + return getattr(self.fitted_estimator, name) + + def __sklearn_clone__(self): + return self + + def fit(self, *args, **kwargs): + return self + + def fit_transform(self, *args, **kwargs): + return self.fitted_estimator.transform(*args, **kwargs) + + X = np.array([[-1, -1], [-2, -1], [-3, -2]]) + pca = PCA().fit(X) + components = pca.components_ + + frozen_pca = FrozenEstimator(pca) + assert_allclose(frozen_pca.components_, components) + + # Calling PCA methods such as `get_feature_names_out` still works + assert_array_equal(frozen_pca.get_feature_names_out(), pca.get_feature_names_out()) + + # Fitting on a new data does not alter `components_` + X_new = np.asarray([[-1, 2], [3, 4], [1, 2]]) + frozen_pca.fit(X_new) + assert_allclose(frozen_pca.components_, components) + + # `fit_transform` does not alter state + frozen_pca.fit_transform(X_new) + assert_allclose(frozen_pca.components_, components) + + # Cloning estimator is a no-op + clone_frozen_pca = clone(frozen_pca) + assert clone_frozen_pca is frozen_pca + assert_allclose(clone_frozen_pca.components_, components) + + +def test_pickle_version_warning_is_not_raised_with_matching_version(): + iris = datasets.load_iris() + tree = DecisionTreeClassifier().fit(iris.data, iris.target) + tree_pickle = pickle.dumps(tree) + assert b"_sklearn_version" in tree_pickle + + with warnings.catch_warnings(): + warnings.simplefilter("error") + tree_restored = pickle.loads(tree_pickle) + + # test that we can predict with the restored decision tree classifier + score_of_original = tree.score(iris.data, iris.target) + score_of_restored = tree_restored.score(iris.data, iris.target) + assert score_of_original == score_of_restored + + +class TreeBadVersion(DecisionTreeClassifier): + def __getstate__(self): + return dict(self.__dict__.items(), _sklearn_version="something") + + +pickle_error_message = ( + "Trying to unpickle estimator {estimator} from " + "version {old_version} when using version " + "{current_version}. This might " + "lead to breaking code or invalid results. " + "Use at your own risk." 
+) + + +def test_pickle_version_warning_is_issued_upon_different_version(): + iris = datasets.load_iris() + tree = TreeBadVersion().fit(iris.data, iris.target) + tree_pickle_other = pickle.dumps(tree) + message = pickle_error_message.format( + estimator="TreeBadVersion", + old_version="something", + current_version=sklearn.__version__, + ) + with pytest.warns(UserWarning, match=message) as warning_record: + pickle.loads(tree_pickle_other) + + message = warning_record.list[0].message + assert isinstance(message, InconsistentVersionWarning) + assert message.estimator_name == "TreeBadVersion" + assert message.original_sklearn_version == "something" + assert message.current_sklearn_version == sklearn.__version__ + + +class TreeNoVersion(DecisionTreeClassifier): + def __getstate__(self): + return self.__dict__ + + +def test_pickle_version_warning_is_issued_when_no_version_info_in_pickle(): + iris = datasets.load_iris() + # TreeNoVersion has no getstate, like pre-0.18 + tree = TreeNoVersion().fit(iris.data, iris.target) + + tree_pickle_noversion = pickle.dumps(tree) + assert b"_sklearn_version" not in tree_pickle_noversion + message = pickle_error_message.format( + estimator="TreeNoVersion", + old_version="pre-0.18", + current_version=sklearn.__version__, + ) + # check we got the warning about using pre-0.18 pickle + with pytest.warns(UserWarning, match=message): + pickle.loads(tree_pickle_noversion) + + +def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator(): + iris = datasets.load_iris() + tree = TreeNoVersion().fit(iris.data, iris.target) + tree_pickle_noversion = pickle.dumps(tree) + try: + module_backup = TreeNoVersion.__module__ + TreeNoVersion.__module__ = "notsklearn" + + with warnings.catch_warnings(): + warnings.simplefilter("error") + + pickle.loads(tree_pickle_noversion) + finally: + TreeNoVersion.__module__ = module_backup + + +class DontPickleAttributeMixin: + def __getstate__(self): + data = self.__dict__.copy() + data["_attribute_not_pickled"] = None + return data + + def __setstate__(self, state): + state["_restored"] = True + self.__dict__.update(state) + + +class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator): + def __init__(self, attribute_pickled=5): + self.attribute_pickled = attribute_pickled + self._attribute_not_pickled = None + + +def test_pickling_when_getstate_is_overwritten_by_mixin(): + estimator = MultiInheritanceEstimator() + estimator._attribute_not_pickled = "this attribute should not be pickled" + + serialized = pickle.dumps(estimator) + estimator_restored = pickle.loads(serialized) + assert estimator_restored.attribute_pickled == 5 + assert estimator_restored._attribute_not_pickled is None + assert estimator_restored._restored + + +def test_pickling_when_getstate_is_overwritten_by_mixin_outside_of_sklearn(): + try: + estimator = MultiInheritanceEstimator() + text = "this attribute should not be pickled" + estimator._attribute_not_pickled = text + old_mod = type(estimator).__module__ + type(estimator).__module__ = "notsklearn" + + serialized = estimator.__getstate__() + assert serialized == {"_attribute_not_pickled": None, "attribute_pickled": 5} + + serialized["attribute_pickled"] = 4 + estimator.__setstate__(serialized) + assert estimator.attribute_pickled == 4 + assert estimator._restored + finally: + type(estimator).__module__ = old_mod + + +class SingleInheritanceEstimator(BaseEstimator): + def __init__(self, attribute_pickled=5): + self.attribute_pickled = attribute_pickled + self._attribute_not_pickled = None + + 
def __getstate__(self): + state = super().__getstate__() + state["_attribute_not_pickled"] = None + return state + + +def test_pickling_works_when_getstate_is_overwritten_in_the_child_class(): + estimator = SingleInheritanceEstimator() + estimator._attribute_not_pickled = "this attribute should not be pickled" + + serialized = pickle.dumps(estimator) + estimator_restored = pickle.loads(serialized) + assert estimator_restored.attribute_pickled == 5 + assert estimator_restored._attribute_not_pickled is None + + +def test_tag_inheritance(): + # test that changing tags by inheritance is not allowed + + nan_tag_est = NaNTag() + no_nan_tag_est = NoNaNTag() + assert nan_tag_est.__sklearn_tags__().input_tags.allow_nan + assert not no_nan_tag_est.__sklearn_tags__().input_tags.allow_nan + + redefine_tags_est = OverrideTag() + assert not redefine_tags_est.__sklearn_tags__().input_tags.allow_nan + + diamond_tag_est = DiamondOverwriteTag() + assert diamond_tag_est.__sklearn_tags__().input_tags.allow_nan + + inherit_diamond_tag_est = InheritDiamondOverwriteTag() + assert inherit_diamond_tag_est.__sklearn_tags__().input_tags.allow_nan + + +def test_raises_on_get_params_non_attribute(): + class MyEstimator(BaseEstimator): + def __init__(self, param=5): + pass + + def fit(self, X, y=None): + return self + + est = MyEstimator() + msg = "'MyEstimator' object has no attribute 'param'" + + with pytest.raises(AttributeError, match=msg): + est.get_params() + + +def test_repr_mimebundle_(): + # Checks the display configuration flag controls the json output + tree = DecisionTreeClassifier() + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" in output + + with config_context(display="text"): + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" not in output + + +def test_repr_html_wraps(): + # Checks the display configuration flag controls the html output + tree = DecisionTreeClassifier() + + output = tree._repr_html_() + assert "" + f"" + f'
+            f"<pre>{html.escape(estimator_str)}</pre><b>{fallback_msg}</b>"
" + + out.write(html_end) + + html_output = out.getvalue() + return html_output diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.css b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.css new file mode 100644 index 0000000000000000000000000000000000000000..df815f966ffcfe3544b5c73a9c87c0f7f2d256f5 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.css @@ -0,0 +1,63 @@ +.estimator-table summary { + padding: .5rem; + font-family: monospace; + cursor: pointer; +} + +.estimator-table details[open] { + padding-left: 0.1rem; + padding-right: 0.1rem; + padding-bottom: 0.3rem; +} + +.estimator-table .parameters-table { + margin-left: auto !important; + margin-right: auto !important; +} + +.estimator-table .parameters-table tr:nth-child(odd) { + background-color: #fff; +} + +.estimator-table .parameters-table tr:nth-child(even) { + background-color: #f6f6f6; +} + +.estimator-table .parameters-table tr:hover { + background-color: #e0e0e0; +} + +.estimator-table table td { + border: 1px solid rgba(106, 105, 104, 0.232); +} + +.user-set td { + color:rgb(255, 94, 0); + text-align: left; +} + +.user-set td.value pre { + color:rgb(255, 94, 0) !important; + background-color: transparent !important; +} + +.default td { + color: black; + text-align: left; +} + +.user-set td i, +.default td i { + color: black; +} + +.copy-paste-icon { + background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCA0NDggNTEyIj48IS0tIUZvbnQgQXdlc29tZSBGcmVlIDYuNy4yIGJ5IEBmb250YXdlc29tZSAtIGh0dHBzOi8vZm9udGF3ZXNvbWUuY29tIExpY2Vuc2UgLSBodHRwczovL2ZvbnRhd2Vzb21lLmNvbS9saWNlbnNlL2ZyZWUgQ29weXJpZ2h0IDIwMjUgRm9udGljb25zLCBJbmMuLS0+PHBhdGggZD0iTTIwOCAwTDMzMi4xIDBjMTIuNyAwIDI0LjkgNS4xIDMzLjkgMTQuMWw2Ny45IDY3LjljOSA5IDE0LjEgMjEuMiAxNC4xIDMzLjlMNDQ4IDMzNmMwIDI2LjUtMjEuNSA0OC00OCA0OGwtMTkyIDBjLTI2LjUgMC00OC0yMS41LTQ4LTQ4bDAtMjg4YzAtMjYuNSAyMS41LTQ4IDQ4LTQ4ek00OCAxMjhsODAgMCAwIDY0LTY0IDAgMCAyNTYgMTkyIDAgMC0zMiA2NCAwIDAgNDhjMCAyNi41LTIxLjUgNDgtNDggNDhMNDggNTEyYy0yNi41IDAtNDgtMjEuNS00OC00OEwwIDE3NmMwLTI2LjUgMjEuNS00OCA0OC00OHoiLz48L3N2Zz4=); + background-repeat: no-repeat; + background-size: 14px 14px; + background-position: 0; + display: inline-block; + width: 14px; + height: 14px; + cursor: pointer; +} diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.py b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.py new file mode 100644 index 0000000000000000000000000000000000000000..d85bf1280a8fcb16bd61895a96d9959d735fc5e0 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/params.py @@ -0,0 +1,83 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +import html +import reprlib +from collections import UserDict + +from sklearn.utils._repr_html.base import ReprHTMLMixin + + +def _read_params(name, value, non_default_params): + """Categorizes parameters as 'default' or 'user-set' and formats their values. + Escapes or truncates parameter values for display safety and readability. 
+ """ + r = reprlib.Repr() + r.maxlist = 2 # Show only first 2 items of lists + r.maxtuple = 1 # Show only first item of tuples + r.maxstring = 50 # Limit string length + cleaned_value = html.escape(r.repr(value)) + + param_type = "user-set" if name in non_default_params else "default" + + return {"param_type": param_type, "param_name": name, "param_value": cleaned_value} + + +def _params_html_repr(params): + """Generate HTML representation of estimator parameters. + + Creates an HTML table with parameter names and values, wrapped in a + collapsible details element. Parameters are styled differently based + on whether they are default or user-set values. + """ + HTML_TEMPLATE = """ +
        <div class="estimator-table">
+            <details>
+                <summary>Parameters</summary>
+                <table class="parameters-table">
+                  <tbody>
+                    {rows}
+                  </tbody>
+                </table>
+            </details>
+        </div>
+ """ + ROW_TEMPLATE = """ + + + {param_name}  + {param_value} + + """ + + rows = [ + ROW_TEMPLATE.format(**_read_params(name, value, params.non_default)) + for name, value in params.items() + ] + + return HTML_TEMPLATE.format(rows="\n".join(rows)) + + +class ParamsDict(ReprHTMLMixin, UserDict): + """Dictionary-like class to store and provide an HTML representation. + + It builds an HTML structure to be used with Jupyter notebooks or similar + environments. It allows storing metadata to track non-default parameters. + + Parameters + ---------- + params : dict, default=None + The original dictionary of parameters and their values. + + non_default : tuple + The list of non-default parameters. + """ + + _html_repr = _params_html_repr + + def __init__(self, params=None, non_default=tuple()): + super().__init__(params or {}) + self.non_default = non_default diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/__init__.py b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/test_estimator.py b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/test_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..cc975d854ed8f416d3f48e16dfd0fbaa10a9352a --- /dev/null +++ b/.venv/lib/python3.12/site-packages/sklearn/utils/_repr_html/tests/test_estimator.py @@ -0,0 +1,616 @@ +import html +import locale +import re +import types +from contextlib import closing +from functools import partial +from io import StringIO +from unittest.mock import patch + +import numpy as np +import pytest + +from sklearn import config_context +from sklearn.base import BaseEstimator +from sklearn.cluster import AgglomerativeClustering, Birch +from sklearn.compose import ColumnTransformer, make_column_transformer +from sklearn.datasets import load_iris +from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.ensemble import StackingClassifier, StackingRegressor, VotingClassifier +from sklearn.feature_selection import SelectPercentile +from sklearn.gaussian_process.kernels import ExpSineSquared +from sklearn.impute import SimpleImputer +from sklearn.kernel_ridge import KernelRidge +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import RandomizedSearchCV +from sklearn.multiclass import OneVsOneClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler +from sklearn.svm import LinearSVC, LinearSVR +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils._repr_html.base import _HTMLDocumentationLinkMixin +from sklearn.utils._repr_html.estimator import ( + _get_css_style, + _get_visual_block, + _write_label_html, + estimator_html_repr, +) +from sklearn.utils.fixes import parse_version + + +def dummy_function(x, y): + return x + y # pragma: nocover + + +@pytest.mark.parametrize("checked", [True, False]) +def test_write_label_html(checked): + # Test checking logic and labeling + name = "LogisticRegression" + params = "" + tool_tip = "hello-world" + + with closing(StringIO()) as out: + _write_label_html(out, params, name, tool_tip, checked=checked) + html_label = out.getvalue() + + p = ( + r'